## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)
Sys.setenv(OMP_NUM_THREADS = 2)


## -----------------------------------------------------------------------------
#| message: false
library(prospectr)
data(NIRsoil)

# Preprocess spectra
NIRsoil$spc_pr <- savitzkyGolay(
  detrend(NIRsoil$spc, wav = as.numeric(colnames(NIRsoil$spc))),
  m = 1, p = 1, w = 7
)

# Missing values in the response are allowed
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]
test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y  <- NIRsoil$Ciso[NIRsoil$train == 0]

md <- dissimilarity(
  test_x,
  apply(test_x, 2, median),
  diss_method = diss_correlation(ws = 101, center = T, scale = T)
)

threshold <- 0.4


## -----------------------------------------------------------------------------
#| label: fig-diss-threshold
#| fig-cap: "Left: Dissimilarity of the test spectra to the median test spectrum. Samples above the threshold (dashed line and red) were excluded from the target set as a simple attempt to reduce spectral heterogeneity and potential non-linearities in the spectra-response relationship. Right: Spectra of the samples above (red, to be excluded) and below (blue, to be kept) the dissimilarity threshold."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 4.5
op <- par(mfrow = c(1, 2), mar = c(4.5, 4.5, 2, 1))

plot(
  seq_along(md$dissimilarity),
  md$dissimilarity, 
  ylim = c(0, 1), 
  col = "steelblue",
  pch = 16,
  cex = 1.5,
  xlab = "Sample index", 
  ylab = "Dissimilarity to median spectrum"
)
points(
  which(md$dissimilarity >= threshold),
  md$dissimilarity[md$dissimilarity >= threshold],
  col = "tomato",
  pch = 16,
  cex = 1.5
)
abline(h = threshold, col = "tomato", lty = 2, lwd = 2)
grid(lty = 1)

matplot(
  as.numeric(colnames(train_x)),
  t(test_x[md$dissimilarity >= threshold, ]),
  col = "tomato",
  ylab = "Preprocessed spectra",
  xlab = "Wavelengths, nm",
  lty = 1,
  type = "l"
)
grid(lty = 1)
matlines(
  as.numeric(colnames(train_x)),
  t(test_x[md$dissimilarity < threshold, ]),
  col = "steelblue",
  lty = 1,
  type = "l"
)

par(op)


## -----------------------------------------------------------------------------
keep <- md$dissimilarity < threshold

cat("Number of samples retained:", sum(keep))

test_x_test <- test_x[keep, ]
test_y_test <- test_y[keep]

# sample a very small subset to guide the search, 
# and use the rest for testing the final model
set.seed(1124)
ref_ind <- sample(sum(keep), 8)
test_x_ref <- test_x_test[ref_ind, ]
test_y_ref <- test_y_test[ref_ind]

test_x_test <- test_x_test[-ref_ind, ]
test_y_test <- test_y_test[-ref_ind]


## -----------------------------------------------------------------------------
set.seed(1124)

pls_model <- model(
  Xr = rbind(
    train_x[!is.na(train_y), ],
    test_x_ref[!is.na(test_y_ref), ]
  ), 
  Yr = c(
    train_y[!is.na(train_y)],
    test_y_ref[!is.na(test_y_ref)]
  ),
  fit_method = fit_pls(ncomp = 15, method = "simpls", scale = TRUE),
  control = model_control(validation_type = "lgo", number = 10)
)

best_ncomp <- which(pls_model$cv_results$optimal)

pred <- predict(pls_model, test_x_test, ncomp = best_ncomp)


## -----------------------------------------------------------------------------
reg_metrics <- function(obs, pred, na.rm = TRUE) {
  if (na.rm) {
    ok <- complete.cases(obs, pred)
    obs <- obs[ok]
    pred <- pred[ok]
  }
  c(RMSE = sqrt(mean((obs - pred)^2)), R2 = cor(obs, pred)^2)
}


## -----------------------------------------------------------------------------
reg_metrics(test_y_test, pred)


## -----------------------------------------------------------------------------
my_control <- gesearch_control(
  retain_by = "probability",
  tune = FALSE,
  number = 10L,
  p = 0.75
)


## -----------------------------------------------------------------------------
#| eval: true
#| echo: false 
gs <- gesearch(
  Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
  Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
  Xu = test_x_ref, # the available target domain spectra
  Yu = test_y_ref, # the available target domain response (accepts NAs)
  k = 50, b = 100, retain = 0.95,
  target_size = 120,
  fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
  optimization = c("reconstruction", "similarity", "response"),
  control = my_control,
  seed = 1124, 
  verbose = FALSE
)


## -----------------------------------------------------------------------------
#| eval: false
# gs <- gesearch(
#   Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
#   Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
#   Xu = test_x_ref, # the available target domain spectra
#   Yu = test_y_ref, # the available target domain response (accepts NAs)
#   k = 50, b = 100, retain = 0.95,
#   target_size = 120,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity", "response"),
#   control = my_control,
#   seed = 1124
# )


## -----------------------------------------------------------------------------
#| eval: false
# plot(gs)


## -----------------------------------------------------------------------------
#| label: fig-evolution-gs
#| echo: false 
#| fig-cap: "Evolution of the three weakness metrics (response, reconstruction and dissimilarity) used in the search."
#| fig-align: "center"
#| fig-width: 8.5
#| fig-height: 5.5
plot(gs, main = NA)


## -----------------------------------------------------------------------------
# the internal leave-group out CV results for the data found
gs$validation_results[[1]]$results$train


## -----------------------------------------------------------------------------
# the CV on the target samples that drove the search (note that this is not a proper test set, as these samples were used to guide the search, but it can provide some insight into the performance of the final model on the target domain)
gs$validation_results[[1]]$results$test


## -----------------------------------------------------------------------------
best_ncomp <- which.min(gs$validation_results[[1]]$results$train$rmse_cv)


## -----------------------------------------------------------------------------
pred_gs <- predict(gs, test_x_test)[[1]]

reg_metrics(test_y_test, pred_gs[, best_ncomp])


## -----------------------------------------------------------------------------
#| label: fig-pred-comparison
#| fig-cap: "Comparison of observed versus predicted total carbon values. Left: model fitted using all available training samples. Right: model fitted using the subset of samples selected by gesearch()."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 4.5
op <- par(mfrow = c(1, 2), mar = c(4.5, 4.5, 3, 1))

rng <- range(pred, pred_gs, test_y_test, na.rm = TRUE)

plot(
  pred, test_y_test,
  xlim = rng,
  ylim = rng,
  xlab = "Predicted Total Carbon, %",
  ylab = "Observed Total Carbon, %",
  main = "Model with all training \nsamples",
  cex = 1.5,
  pch = 16,
  col = rgb(0.5, 0.5, 0.5, 0.6)
)
grid(lty = 1)
abline(0, 1, col = "red")

plot(
  pred_gs[, best_ncomp], test_y_test,
  xlim = rng,
  ylim = rng,
  xlab = "Predicted Total Carbon, %",
  ylab = "Observed Total Carbon, %",
  main = "Model with gesearch-selected \nsamples",
  cex = 1.5,
  pch = 16,
  col = rgb(0.5, 0.5, 0.5, 0.6)
)
grid(lty = 1)
abline(0, 1, col = "red")

par(op)


## -----------------------------------------------------------------------------
#| eval: true
#| echo: false 
gs_nr <- gesearch(
  Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
  Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
  Xu = test_x_ref, # the available target domain spectra
  Yu = NULL, # NO available target domain response (accepts NAs)
  k = 50, b = 80, retain = 0.98,
  target_size = 160,
  fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
  optimization = c("reconstruction", "similarity"),
  control = my_control,
  seed = 1124, 
  verbose = FALSE
)


## -----------------------------------------------------------------------------
#| eval: false
# gs_nr <- gesearch(
#   Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
#   Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
#   Xu = test_x_ref, # the available target domain spectra
#   Yu = NULL, # NO available target domain response (accepts NAs)
#   k = 50, b = 80, retain = 0.98,
#   target_size = 160,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity"),
#   control = my_control,
#   seed = 1124
# )


## -----------------------------------------------------------------------------
best_ncomp_nr <- which.min(gs_nr$validation_results[[1]]$results$train$rmse_cv)
best_ncomp_nr
pred_gs_nr <- predict(gs_nr, test_x_test)[[1]]

reg_metrics(test_y_test, pred_gs_nr[, best_ncomp_nr])


## -----------------------------------------------------------------------------
#| eval: true
#| echo: false 
gs_nr2 <- gesearch(
  Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
  Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
  Xu = test_x_ref, # the available target domain spectra
  Yu = NULL, # NO available target domain response (accepts NAs)
  Yu_lims = c(0, 5), # the response range in the target domain
  k = 50, b = 80, retain = 0.98,
  target_size = 160,
  fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
  optimization = c("reconstruction", "similarity", "range"),
  control = my_control,
  seed = 1124, 
  verbose = FALSE
)


## -----------------------------------------------------------------------------
#| eval: false
# gs_nr2 <- gesearch(
#   Xr = train_x[!is.na(train_y), ], # the spectra of the reference "set/library"
#   Yr = train_y[!is.na(train_y)],   # the response of the reference "set/library"
#   Xu = test_x_ref, # the available target domain spectra
#   Yu = NULL, # NO available target domain response (accepts NAs)
#   Yu_lims = c(0, 5), # the response range in the target domain
#   k = 50, b = 80, retain = 0.98,
#   target_size = 160,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity", "range"),
#   control = my_control,
#   seed = 1124
# )


## -----------------------------------------------------------------------------
best_ncomp_nr2 <- which.min(gs_nr2$validation_results[[1]]$results$train$rmse_cv)
best_ncomp_nr2
pred_gs_nr2 <- predict(gs_nr2, test_x_test)[[1]]

reg_metrics(test_y_test, pred_gs_nr2[, best_ncomp_nr2])


## -----------------------------------------------------------------------------
#| label: parallel-gesearch
#| eval: false
# # Running gesearch() using multiple cores
# 
# # Execute with two cores, if available, ...
# n_cores <- 2
# 
# # ... if not then go with 1 core
# if (parallel::detectCores() < 2) {
#   n_cores <- 1
# }
# 
# # Set the number of cores
# library(doParallel)
# clust <- makeCluster(n_cores)
# registerDoParallel(clust)
# 
# # Alternatively:
# # library(doSNOW)
# # clust <- makeCluster(n_cores, type = "SOCK")
# # registerDoSNOW(clust)
# # getDoParWorkers()
# 
# my_control <- gesearch_control(
#   retain_by = "probability",
#   tune = FALSE,
#   number = 10L,
#   p = 0.75
# )
# 
# gs_p <- gesearch(
#   Xr = train_x[!is.na(train_y), ],
#   Yr = train_y[!is.na(train_y)],
#   Xu = test_x_ref,
#   Yu = test_y_ref,
#   k = 50,
#   b = 100,
#   retain = 0.98,
#   target_size = 80,
#   fit_method = fit_pls(ncomp = 20, method = "simpls", scale = TRUE),
#   optimization = c("reconstruction", "similarity", "response"),
#   control = my_control,
#   seed = 1124,
#   pchunks = 2L
# )
# 
# # Go back to sequential processing
# registerDoSEQ()
# try(stopCluster(clust))
# 
# gs_p