---
title: "soilVAE Workflow"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{soilVAE Workflow}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
bibliography: references.bib
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "figures/workflow-",
  fig.ext  = "png",
  fig.width = 7,
  fig.height = 4
)

set.seed(19101991)
```

This vignette mirrors the main package example (see `README`) and compares a **PLS** baseline to **soilVAE** on the included dataset `datsoilspc`.

> **CRAN/CI note**: this vignette is written to build on systems **without** Python/TensorFlow.  
> The **VAE training** chunks are automatically skipped unless TensorFlow/Keras are available.

## Packages

```{r, message=FALSE}
# IMPORTANT: do NOT use devtools in vignettes (it is not available on CRAN/CI runners).
# Only load packages that are in Imports/Suggests.

for (p in c("prospectr", "pls", "reticulate", "soilVAE")) {
  if (!requireNamespace(p, quietly = TRUE)) {
    stop("Package '", p, "' is required to build this vignette. ",
         "Please install it (it should be installed automatically from Suggests/Imports during checks).")
  }
}

library(prospectr)
library(pls)
library(reticulate)
library(soilVAE)
```

## Data

```{r}
data("datsoilspc", package = "soilVAE")
str(datsoilspc, max.level = 1)
```

## Utility: evaluation metrics (base R)

```{r}
eval_quant <- function(y, yhat) {
  y <- as.numeric(y); yhat <- as.numeric(yhat)
  ok <- is.finite(y) & is.finite(yhat)
  y <- y[ok]; yhat <- yhat[ok]

  if (length(y) < 3) {
    return(list(n = length(y), ME = NA_real_, MAE = NA_real_, RMSE = NA_real_,
                R2 = NA_real_, RPIQ = NA_real_, RPD = NA_real_))
  }

  err <- yhat - y
  me <- mean(err)
  mae <- mean(abs(err))
  rmse <- sqrt(mean(err^2))

  ss_res <- sum((y - yhat)^2)
  ss_tot <- sum((y - mean(y))^2)
  r2 <- if (ss_tot == 0) NA_real_ else 1 - ss_res / ss_tot

  rpiq <- stats::IQR(y) / rmse
  rpd  <- stats::sd(y) / rmse

  list(n = length(y), ME = me, MAE = mae, RMSE = rmse, R2 = r2, RPIQ = rpiq, RPD = rpd)
}

as_df_metrics <- function(x) {
  data.frame(
    n = x$n,
    ME = round(x$ME, 2),
    MAE = round(x$MAE, 2),
    RMSE = round(x$RMSE, 2),
    R2 = round(x$R2, 2),
    RPIQ = round(x$RPIQ, 2),
    RPD = round(x$RPD, 2),
    stringsAsFactors = FALSE
  )
}
```

## Spectra preprocessing (reflectance → absorbance → resample → SNV → movav)

```{r}
# Reflectance → absorbance
spcA <- log(1 / as.matrix(datsoilspc$spc))

# Resample to 5 nm
oldWavs <- as.numeric(colnames(spcA))
newWavs <- seq(min(oldWavs), max(oldWavs), by = 5)

spcARs <- prospectr::resample(
  X = spcA,
  wav = oldWavs,
  new.wav = newWavs,
  interpol = "linear"
)

# SNV + moving average
spcASnv   <- prospectr::standardNormalVariate(spcARs)
spcAMovav <- prospectr::movav(spcASnv, w = 11)

# Store in object to match book-style workflows
datsoilspc$spcAMovav <- spcAMovav
```

```{r spectra-plot, fig.alt="Preprocessed spectra (SNV + movav)", fig.cap="Preprocessed spectra (SNV + movav)"}
matplot(
  x = as.numeric(colnames(datsoilspc$spcAMovav)),
  y = t(datsoilspc$spcAMovav),
  xlab = "Wavelength / nm",
  ylab = "Absorbance (SNV + movav)",
  type = "l", lty = 1,
  col = rgb(0.5, 0.5, 0.5, alpha = 0.25)
)
```

## Split: calibration (datC) vs test (datV)

```{r}
set.seed(19101991)
calId <- sample(seq_len(nrow(datsoilspc)), size = round(0.75 * nrow(datsoilspc)))

datC <- datsoilspc[calId, ]
datV <- datsoilspc[-calId, ]
```

## Baseline: PLS (train on datC, evaluate on datV)

```{r pls, fig.alt="PLS CV performance", fig.cap="PLS CV performance"}
maxc <- 30
pls_fit <- pls::plsr(
  TotalCarbon ~ spcAMovav,
  data = datC,
  method = "oscorespls",
  ncomp = maxc,
  validation = "CV"
)

plot(pls_fit, "val", main = "PLS CV performance", xlab = "Number of components")
```

```{r}
nc <- 14

pls_pred_C <- as.numeric(predict(pls_fit, ncomp = nc, newdata = datC$spcAMovav))
pls_pred_V <- as.numeric(predict(pls_fit, ncomp = nc, newdata = datV$spcAMovav))

pls_cal <- eval_quant(datC$TotalCarbon, pls_pred_C)
pls_tst <- eval_quant(datV$TotalCarbon, pls_pred_V)

rbind(
  cbind(Model = "PLS", Split = "Calibration (datC)", as_df_metrics(pls_cal)),
  cbind(Model = "PLS", Split = "TEST (datV)",        as_df_metrics(pls_tst))
)
```

## soilVAE: supervised VAE regression (skips automatically if TF/Keras unavailable)

```{r}
has_py <- reticulate::py_available(initialize = FALSE)
has_tf <- FALSE
if (has_py) {
  try(reticulate::py_config(), silent = TRUE)
  has_tf <- reticulate::py_module_available("tensorflow") &&
            reticulate::py_module_available("keras")
}

has_py
has_tf
```

### Prepare matrices (scale **X** using train stats; keep **y** in original units)

```{r}
# Predictors
X_tr <- scale(as.matrix(datC$spcAMovav))
X_center <- attr(X_tr, "scaled:center")
X_scale  <- attr(X_tr, "scaled:scale")
X_te <- scale(as.matrix(datV$spcAMovav), center = X_center, scale = X_scale)

# Target (no transform, keep original units like the PLS baseline)
y_tr <- as.numeric(datC$TotalCarbon)
y_te <- as.numeric(datV$TotalCarbon)

dim(X_tr)
length(y_tr)
```

### Fit and evaluate (only runs when TF/Keras is available)

```{r, eval=has_tf}
# If needed, pin an env *before* importing TF/Keras:
# soilVAE::vae_configure(conda = "soilvae-tf")

# Internal split from datC -> train/val for early stopping + model selection
set.seed(19101991)
idx <- seq_len(nrow(X_tr))
val_id <- sample(idx, size = max(1L, round(0.20 * length(idx))))
tr_id  <- setdiff(idx, val_id)

X_int_tr <- X_tr[tr_id, , drop = FALSE]
y_int_tr <- y_tr[tr_id]

X_int_va <- X_tr[val_id, , drop = FALSE]
y_int_va <- y_tr[val_id]

# A small-but-useful grid
grid_vae <- data.frame(
  latent_dim = c(8L, 16L, 32L, 64L),
  dropout    = c(0.2, 0.3),
  lr         = c(5e-4),
  beta_kl    = c(0.01),
  alpha_y    = c(5),
  epochs     = c(500L),
  batch_size = c(64L, 128L),
  patience   = c(50L),
  stringsAsFactors = FALSE
)
grid_vae$hidden_enc <- list(c(512L, 256L, 128L))
grid_vae$hidden_dec <- list(c(128L, 256L, 512L))

tuned <- soilVAE::tune_vae_train_val(
  X_tr = X_int_tr, y_tr = y_int_tr,
  X_va = X_int_va, y_va = y_int_va,
  seed = 19101991,
  grid_vae = grid_vae
)

best <- soilVAE::select_best_from_grid(tuned$tuning_df, selection_metric = "euclid")
cfg <- best$best

m <- soilVAE::vae_build(
  input_dim  = ncol(X_tr),
  hidden_enc = as.integer(strsplit(cfg$hidden_enc_str, "-")[[1]]),
  hidden_dec = as.integer(strsplit(cfg$hidden_dec_str, "-")[[1]]),
  latent_dim = as.integer(cfg$latent_dim),
  dropout    = as.numeric(cfg$dropout),
  lr         = as.numeric(cfg$lr),
  beta_kl    = as.numeric(cfg$beta_kl),
  alpha_y    = as.numeric(cfg$alpha_y)
)

soilVAE::vae_fit(
  model = m,
  X = X_int_tr, y = y_int_tr,
  X_val = X_int_va, y_val = y_int_va,
  epochs = as.integer(cfg$epochs),
  batch_size = as.integer(cfg$batch_size),
  patience = as.integer(cfg$patience),
  verbose = 0L
)

# Predict on: internal train, internal val, and external TEST (datV)
yhat_int_tr <- soilVAE::vae_predict(m, X_int_tr)
yhat_int_va <- soilVAE::vae_predict(m, X_int_va)
yhat_te     <- soilVAE::vae_predict(m, X_te)

vae_tr <- eval_quant(y_int_tr, yhat_int_tr)
vae_va <- eval_quant(y_int_va, yhat_int_va)
vae_te <- eval_quant(y_te, yhat_te)

rbind(
  cbind(Model = "soilVAE", Split = "Train (internal)", as_df_metrics(vae_tr)),
  cbind(Model = "soilVAE", Split = "Val (internal)",   as_df_metrics(vae_va)),
  cbind(Model = "soilVAE", Split = "TEST (datV)",      as_df_metrics(vae_te))
)
```

## Final comparison table (PLS vs soilVAE)

```{r, eval=has_tf}
tab <- rbind(
  cbind(Model = "PLS",    Split = "Calibration (datC)", as_df_metrics(pls_cal)),
  cbind(Model = "PLS",    Split = "TEST (datV)",        as_df_metrics(pls_tst)),
  cbind(Model = "soilVAE",Split = "Train (internal)",   as_df_metrics(vae_tr)),
  cbind(Model = "soilVAE",Split = "Val (internal)",     as_df_metrics(vae_va)),
  cbind(Model = "soilVAE",Split = "TEST (datV)",        as_df_metrics(vae_te))
)

row.names(tab) <- NULL
tab
```