## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)
library(prospectr)
Sys.setenv(OMP_NUM_THREADS = 2)


## -----------------------------------------------------------------------------
#| label: data-prep
#| message: false
library(resemble)
library(prospectr)

data(NIRsoil)

# Wavelengths
wavs <- as.numeric(colnames(NIRsoil$spc))

# Preprocess: detrend + first derivative
NIRsoil$spc_pr <- savitzkyGolay(
  detrend(NIRsoil$spc, wav = wavs),
  m = 1, p = 1, w = 7
)

# Split into training and test sets
# Note: missing values in the response are allowed in liblex
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]
test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y  <- NIRsoil$Ciso[NIRsoil$train == 0]

cat("Training set:", nrow(train_x), "observations\n")
cat("Test set:", nrow(test_x), "observations\n")


## -----------------------------------------------------------------------------
#| label: neighbors-example1
# Fixed neighborhood sizes to evaluate
neighbors_k(k = seq(40, 120, by = 20))


## -----------------------------------------------------------------------------
#| label: neighbors-example2
# Threshold-based selection with bounds
neighbors_diss(
  threshold = seq(0.05, 0.5, length.out = 10), 
  k_min = 40, 
  k_max = 150
)


## -----------------------------------------------------------------------------
#| label: diss-example
# Moving-window correlation dissimilarity (recommended for spectra)
diss_correlation(ws = 37, scale = TRUE)

# PCA-based Mahalanobis distance with optimized components
diss_pca()


## -----------------------------------------------------------------------------
#| label: fit-method-example
# waPLS with modified PLS algorithm and scaling
fit_wapls(
  min_ncomp = 3, 
  max_ncomp = 15, 
  scale = TRUE, 
  method = "mpls"
)


## -----------------------------------------------------------------------------
#| label: control-example
#| eval: false
# # Build library with hyperparameter tuning
# liblex_control(mode = "build", tune = TRUE)


## -----------------------------------------------------------------------------
#| eval: false
# ciso_lib_k <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_k(seq(40, 80, by = 20)),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   control = liblex_control(tune = TRUE)
# )
# 
# ciso_lib_k


## -----------------------------------------------------------------------------
#| label: liblex-fixed-k
#| cache: true
#| echo: false
#| message: false
#| warning: false
ciso_lib_k <- liblex(
  Xr = train_x, 
  Yr = train_y, 
  neighbors = neighbors_k(seq(40, 80, by = 20)),
  diss_method = diss_correlation(ws = 37, scale = TRUE),
  fit_method = fit_wapls(
    min_ncomp = 3,
    max_ncomp = 15,
    scale = TRUE,
    method = "mpls"
  ),
  control = liblex_control(tune = TRUE),
  verbose = FALSE
)

ciso_lib_k


## -----------------------------------------------------------------------------
#| label: fig-liblex
#| fig-cap: "Top: Best model obtained for each neighborhood size (based on the RMSE). Bottom: Centroids of the neighborhoods, usled later in prediction to selet the models to be used."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 6.5
plot(ciso_lib_k)


## -----------------------------------------------------------------------------
#| label: liblex-threshold
#| cache: true
#| echo: false
ciso_lib_thr <- liblex(
  Xr = train_x, 
  Yr = train_y, 
  neighbors = neighbors_diss(
    threshold = seq(0.05, 0.5, length.out = 5), 
    k_min = 40, 
    k_max = 150
  ),
  diss_method = diss_correlation(ws = 37, scale = TRUE),
  fit_method = fit_wapls(
    min_ncomp = 3,
    max_ncomp = 15,
    scale = TRUE,
    method = "mpls"
  ),
  control = liblex_control(tune = TRUE),
  verbose = FALSE
)

ciso_lib_thr


## -----------------------------------------------------------------------------
#| cache: true
#| eval: false
# ciso_lib_thr <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_diss(
#     threshold = seq(0.05, 0.5, length.out = 5),
#     k_min = 40,
#     k_max = 150
#   ),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   control = liblex_control(tune = TRUE)
# )
# 
# ciso_lib_thr


## -----------------------------------------------------------------------------
#| label: fig-centroids
#| fig-cap: "Neighborhood centroids (blue) compared to test spectra (red)"
#| fig-width: 8
#| fig-height: 5
#| fig-align: center
wavs_pr <- as.numeric(colnames(ciso_lib_k$scaling$local_x_center))

matplot(
  wavs_pr,
  t(test_x),
  col = rgb(1, 0, 0, 0.3),
  lty = 1,
  type = "l",
  xlab = "Wavelength (nm)",
  ylab = "First derivative detrended absorbance"
)

matlines(
  wavs_pr,
  t(ciso_lib_k$scaling$local_x_center),
  col = rgb(0, 0, 1, 0.3),
  lty = 1
)

grid(lty = 1)

legend(
  "topright",
  legend = c("Samples to predict", "Neighborhood centroids"),
  col = c(rgb(1, 0, 0, 0.8), rgb(0, 0, 1, 0.8)),
  lty = 1,
  lwd = 2,
  bty = "n"
)


## -----------------------------------------------------------------------------
#| label: fig-regression-coefficients
#| fig-cap: "Regression coefficients (top) and intercept distribution (bottom) of the local experts"
#| fig-width: 8
#| fig-height: 7
#| fig-align: center

par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))

# Regression coefficients across wavelengths
models <- ciso_lib_k$coefficients

matplot(
  x = wavs_pr,
  y = t(models$B),
  type = "l",
  lty = 1,
  col = rgb(0.23, 0.51, 0.96, 0.15),
  xlab = "Wavelength (nm)",
  ylab = "Regression coefficient",
  main = paste0("Regression coefficients (", nrow(models$B), " experts)")
)

# Add mean coefficient profile
lines(wavs_pr, colMeans(models$B), col = "red", lwd = 2)
grid(lty = 1)
legend(
  "topright",
  legend = c("Individual experts", "Mean"),
  col = c(rgb(0.23, 0.51, 0.96, 0.6), "red"),
  lty = 1,
  lwd = c(1, 2),
  bty = "n"
)

# Distribution of intercepts
plot(
  density(models$B0, na.rm = TRUE),
  col = rgb(0.23, 0.51, 0.96),
  lwd = 2,
  xlab = "Intercept value",
  ylab = "Density",
  main = "Distribution of intercepts"
)
polygon(
  density(models$B0, na.rm = TRUE),
  col = rgb(0.23, 0.51, 0.96, 0.2),
  border = NA
)
abline(v = mean(models$B0, na.rm = TRUE), col = "red", lty = 2, lwd = 2)
grid(lty = 1)
legend(
  "topright",
  legend = c("Density", "Mean"),
  col = c(rgb(0.23, 0.51, 0.96), "red"),
  lty = c(1, 2),
  lwd = 2,
  bty = "n"
)

par(mfrow = c(1, 1))


## -----------------------------------------------------------------------------
#| label: kmeans-anchors
# Select 350 representative anchors using k-means sampling
# on the first 20 principal components
set.seed(1124)
kms <- naes(
  train_x, 
  k = 350, 
  pc = 20, 
  iter.max = 100,
  .center = TRUE, 
  .scale = TRUE
)

anchor_km <- kms$model

cat("Selected", length(anchor_km), "anchors via k-means\n")


## -----------------------------------------------------------------------------
#| label: liblex-anchored
#| cache: true
#| echo: false
ciso_lib_anchored <- liblex(
  Xr = train_x, 
  Yr = train_y, 
  neighbors = neighbors_k(seq(40, 80, by = 20)),
  diss_method = diss_correlation(ws = 37, scale = TRUE),
  fit_method = fit_wapls(
    min_ncomp = 3,
    max_ncomp = 15,
    scale = TRUE,
    method = "mpls"
  ),
  anchor_indices = anchor_km,
  control = liblex_control(tune = TRUE),
  verbose = FALSE
)

ciso_lib_anchored


## -----------------------------------------------------------------------------
#| cache: true
#| eval: false
# ciso_lib_anchored <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_k(seq(40, 80, by = 20)),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   anchor_indices = anchor_km,
#   control = liblex_control(tune = TRUE)
# )
# 
# ciso_lib_anchored


## -----------------------------------------------------------------------------
# Number of experts in the anchored library
n_experts <- nrow(ciso_lib_anchored$coefficients$B)
cat("Number of experts in the anchored library:", n_experts, "\n")


## -----------------------------------------------------------------------------
#| label: fig-centroids-anchored
#| fig-cap: "Neighborhood centroids from anchored library compared to test spectra"
#| fig-width: 8
#| fig-height: 5
#| fig-align: center

wavs_pr <- as.numeric(colnames(ciso_lib_anchored$scaling$local_x_center))
n_experts <- nrow(ciso_lib_anchored$scaling$local_x_center)
n_test <- nrow(test_x)

# Plot test spectra first (background)
matplot(
  wavs_pr,
  t(test_x),
  col = rgb(0.8, 0.2, 0.2, 0.2),
  lty = 1,
  type = "l",
  xlab = "Wavelength (nm)",
  ylab = "First derivative detrended absorbance",
  main = paste0("Coverage: ", n_experts, " experts vs ", n_test, " test samples")
)

# Overlay centroids
matlines(
  wavs_pr,
  t(ciso_lib_anchored$scaling$local_x_center),
  col = rgb(0.23, 0.51, 0.96, 0.3),
  lty = 1
)
grid(lty = 1)

legend(
  "topright",
  legend = c(
    paste0("Test samples (n = ", n_test, ")"),
    paste0("Centroids (n = ", n_experts, ")")
  ),
  col = c(
    rgb(0.8, 0.2, 0.2, 0.5),
    rgb(0.23, 0.51, 0.96, 0.5)
  ),
  lty = 1,
  lwd = c(1, 1, 2, 2),
  bty = "n"
)


## -----------------------------------------------------------------------------
#| label: predict-basic
ciso_pred <- predict(ciso_lib_k, newdata = test_x, verbose = FALSE)

# Prediction output structure
names(ciso_pred)

# Main predictions with uncertainty
head(ciso_pred$predictions)


## -----------------------------------------------------------------------------
#| label: weighting-options
#| eval: false
# # Gaussian kernel (default)
# predict(ciso_lib_k, newdata = test_x, weighting = "gaussian")
# 
# # Tricubic kernel (similar to LOCAL algorithm)
# predict(ciso_lib_k, newdata = test_x, weighting = "tricube")
# 
# # Equal weights
# predict(ciso_lib_k, newdata = test_x, weighting = "none")


## -----------------------------------------------------------------------------
#| label: enforce-indices
#| eval: false
# # Always include specific experts in predictions
# predict(ciso_lib_k, newdata = test_x, enforce_indices = c(1, 5, 10))


## -----------------------------------------------------------------------------
#| label: evaluation
pred_values <- ciso_pred$predictions$pred
pred_sd <- ciso_pred$predictions$pred_sd

# Performance metrics (excluding missing values)
complete <- !is.na(test_y)
r2 <- cor(pred_values[complete], test_y[complete])^2
rmse <- sqrt(mean((pred_values[complete] - test_y[complete])^2))

cat("R²:  ", round(r2, 3), "\n")
cat("RMSE:", round(rmse, 3), "\n")


## -----------------------------------------------------------------------------
#| label: uncertainty-filter
# Filter predictions by uncertainty threshold
unc_threshold <- quantile(pred_sd[complete], 0.75)
reliable <- complete & (pred_sd < unc_threshold)

r2_filtered <- cor(pred_values[reliable], test_y[reliable])^2
rmse_filtered <- sqrt(mean((pred_values[reliable] - test_y[reliable])^2))

cat("After filtering high-uncertainty predictions:\n")
cat("  Retained:", sum(reliable), "/", sum(complete), "observations\n")
cat("  R²:      ", round(r2_filtered, 3), "\n")
cat("  RMSE:    ", round(rmse_filtered, 3), "\n")


## -----------------------------------------------------------------------------
#| label: fig-validation
#| fig-cap: "Predicted versus observed values"
#| fig-width: 6
#| fig-height: 5
#| fig-align: center
lims <- range(pred_values[complete], test_y[complete], na.rm = TRUE)

plot(
  pred_values[complete],
  test_y[complete],
  pch = 16,
  col = rgb(0, 0, 0, 0.5),
  xlab = "Predicted",
  ylab = "Observed",
  xlim = lims,
  ylim = lims
)

abline(0, 1, col = "red", lwd = 2)
grid(lty = 1)


## -----------------------------------------------------------------------------
#| label: parallel-example
#| eval: false
# library(doParallel)
# 
# # Register parallel backend
# n_cores <- min(parallel::detectCores() - 1, 4)
# cl <- makeCluster(n_cores)
# registerDoParallel(cl)
# 
# # Build library with parallel processing
# ciso_lib_parallel <- liblex(
#   Xr = train_x,
#   Yr = train_y,
#   neighbors = neighbors_k(seq(40, 120, by = 20)),
#   diss_method = diss_correlation(ws = 37, scale = TRUE),
#   fit_method = fit_wapls(
#     min_ncomp = 3,
#     max_ncomp = 15,
#     scale = TRUE,
#     method = "mpls"
#   ),
#   control = liblex_control(
#     tune = TRUE,
#     allow_parallel = TRUE,
#     chunk_size = 10
#   )
# )
# 
# # Clean up
# stopCluster(cl)
# registerDoSEQ()