---
title: "ProxiMate: Read and recalibrate applications"
author: 
 - name: Leonardo Ramirez-Lopez and Claudio Orellano
   email: ramirez-lopez.l@buchi.com
   affiliation: Data Science Department, BUCHI Labortechnik AG, Flawil, Switzerland
date: today
clean: true
bibliography: ["proximetricsR.bib"]
biblio-style: "apalike"
link-citations: true
format:
  html:
    toc: true
    toc-depth: 3
    toc-location: left
    number-sections: true
    code-overflow: wrap
    smooth-scroll: true
    html-math-method: mathjax
vignette: >
  %\VignetteIndexEntry{Read and recalibrate applications}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{quarto::html}
---


```{r}
#| label: setup
#| include: false

# Rendering options for the vignette.
# options() returns the previous values of the options it changes, so both
# settings are captured in old_options and restored together by the
# options(old_options) call in the final chunk.
old_options <- options(
  cli.num_colors = 1, # disable ANSI colours for vignette rendering
  digits = 3 # number of significant digits used when printing
)

# Blank the IDE environment variables for the rendering session (presumably
# they take part in colour detection)
Sys.setenv("RSTUDIO" = "", "POSITRON" = "")
```

# Summary

This vignette presents a set of experimental functions that facilitate reading,
exploring and recalibrating ProxiMate applications.

```{r}
#| label: loadlib
#| results: hide
#| include: false

# Attach proximetricsR. When no installed copy is found (e.g. while the
# package is in development), load it with devtools::load_all() first.
pkg_is_installed <- requireNamespace("proximetricsR", quietly = TRUE)
if (!pkg_is_installed) {
  devtools::load_all()
}
library("proximetricsR")
```

```{r}
#| label: loadlib2
#| eval: false

# Attach the packages (this chunk is displayed but not evaluated)
library("prospectr")
library("proximetricsR")
```


## Calibrate an application and write a .nax file

Here we use two demo files containing spectral data of soybean meal samples
measured with two different BUCHI ProxiMate devices (up-view mode). These
datasets are available in a public repository of BUCHI demo data.

```{r}
#| label: setawd
#| eval: false

# In practice, set this to the directory containing your data files, e.g.:
# my_working_dir <- "C:/Users/YourName/Downloads"
# my_working_dir <- "~/Downloads"
# The path below is a placeholder: replace it before running this chunk
# (the chunk is not evaluated when the vignette is rendered).
my_working_dir <- "path/to/your/data"
setwd(my_working_dir)
```

```{r}
#| label: read

# Base URL of the public repository with the demo data
data_loc <- "https://raw.githubusercontent.com/buchi-labortechnik-ag/demo_data/main/data/"

# Full locations of the TSV files containing the spectral data of soybean meal
my_file_1 <- paste0(data_loc, "SoybeanMeal_file1.tsv")
my_file_2 <- paste0(data_loc, "SoybeanMeal_file2.tsv")

# Read both files and merge them into a single data object
mdata_1 <- proximate_read_data(my_file_1)
mdata_2 <- proximate_read_data(my_file_2)
mdata <- proximate_merge(list(mdata_1, mdata_2))

# Resample the spectra to wavelengths with constant resolution (900 to 1700
# in steps of 2) and replace the spc matrix with the resampled matrix.
# For working with proximetricsR, spectra must be stored in an spc matrix
# inside the data.
mdata$spc <- process(mdata$spc, prep_resample(c(900, 1700, 2)))

# Wavelengths of the resampled spectra, taken from the column names of spc
final_wavs <- as.numeric(colnames(mdata$spc))
```

```{r}
#| label: plotspectra2
#| fig-cap: "Merged spectra."
#| fig-cap-style: "Image Caption"
#| fig-align: center
#| fig-width: 8
#| fig-height: 5
#| echo: true
#| fig-retina: 0.85

# Draw every spectrum as a semi-transparent grey line over the wavelengths.
# matplot() plots the columns of y, so spc is transposed to place one
# spectrum in each column.
line_colour <- rgb(0.5, 0.5, 0.5, 0.3)
matplot(
  final_wavs,
  t(mdata$spc),
  type = "l",
  lty = 1,
  col = line_colour,
  xlab = "Wavelengths, nm",
  ylab = "Absorbance"
)
grid()
```

```{r}
#| label: properties2

# Names of all variables stored in the data
names(mdata)

# Column names of the response variables (properties)
y_names <- extract_property_names(mdata)

# Positions of the response variables among the columns of the data
is_response <- colnames(mdata) %in% y_names
ys_indices <- which(is_response)

# Number of samples with a (non-missing) reference value for each response
has_reference <- !is.na(mdata[, y_names])
colSums(has_reference)
```

Of the `r length(y_names)` properties, the examples that follow use only the
first two: `r paste0(y_names[1:2], collapse = " and ")`.

```{r}
#| label: properties3
#| results: hide

# Keep only the first two response variables for the examples that follow
y_names <- y_names[seq_len(2)]
```

```{r}
#| label: define
#| results: hide

# Define the necessary objects for creating an application.
# One formula is built per response variable, each of the form
# "<response> ~ spc".
app_formulas <- lapply(paste0(y_names, " ~ spc"), FUN = formula)
app_formulas

# Define the metadata of each model, in the same order as app_formulas
models_metadata <- list(
  # For the first property
  add_model_metadata(decimal_places = 2, unit = "%"),
  # For the second property
  add_model_metadata(decimal_places = 2, unit = "%")
)

# Three candidate pre-processing recipes are defined below. They share the
# resampling and standard normal variate steps and differ only in the
# settings of the derivative step.

# Recipe with:
# - spline/resampling: spectral range between 900 and 1700 in steps of 4
# - standard normal variate
# - first derivative: gap parameter 5 and smoothing factor 9
my_precipe_1 <- preprocess_recipe(
  prep_resample(c(900, 1700, 4)),
  prep_snv(),
  prep_derivative(m = 1, w = 5, p = 9, algorithm = "nwp"), 
  device = "proximate"
)

# Inspect the preprocessing_order element of the recipe
my_precipe_1$preprocessing_order

# Recipe with:
# - spline/resampling: spectral range between 900 and 1700 in steps of 4
# - standard normal variate
# - second derivative: gap parameter 7 and smoothing factor 11
my_precipe_2 <- preprocess_recipe(
  prep_resample(c(900, 1700, 4)),
  prep_snv(),
  prep_derivative(m = 2, w = 7, p = 11, algorithm = "nwp"), 
  device = "proximate"
)

my_precipe_2$preprocessing_order

# Recipe with:
# - spline/resampling: spectral range between 900 and 1700 in steps of 4
# - standard normal variate
# - second derivative: gap parameter 9 and smoothing factor 13
my_precipe_3 <- preprocess_recipe(
  prep_resample(c(900, 1700, 4)),
  prep_snv(),
  prep_derivative(m = 2, w = 9, p = 13, algorithm = "nwp"), 
  device = "proximate"
)

my_precipe_3$preprocessing_order

# Collect the candidate recipes in a list for the calibration step
my_precipes <- list(my_precipe_1, my_precipe_2, my_precipe_3)
```


```{r}
#| label: rmethod

# Use the modified PLS regression method, equivalent to the one
# implemented in NIRWise PLUS.
# We use a maximum of 11 components.
my_pls_method <- fit_plsr(ncomp = 11, type = "nwp")

# Print the method definition
my_pls_method
```


```{r}
#| label: calcontrol
# Control some aspects of how the calibration is built and optimized.
# We use k-fold cross-validation with selection of folds based on the order of
# the samples in the data table ("sequential").
# We also specify the number of times a model must be re-fitted after outlier
# removal, e.g. 0 means no re-fitting, i.e. no outlier removal; 1 means a model
# is built, then it is used to identify and remove outliers, and finally the
# final model is re-fitted; a value of 5 would mean that the model is refitted
# 4 times for 4 outlier removal iterations (see the documentation of
# calibration_control() for the exact counting).
my_control <- calibration_control(
  validation_type = "kfold", 
  number = 4,
  folds = "sequential", 
  remove_outliers = 1 # the number of iterations of outlier removal
)
```

Finally, calibrate the models:

```{r}
#| label: calibration
# Calibrate the models from the formulas, data, candidate recipes, regression
# method, control settings and model metadata defined in the previous chunks
optimized_app <- calibrate_models(
  formulas = app_formulas,
  data = mdata,
  preprocess_recipes = my_precipes,
  methods = list(my_pls_method),
  control = my_control, 
  metadata_list = models_metadata, 
  save_all = TRUE
)
```


```{r}
#| label: showoptimize2
# Print the calibration result
optimized_app
```
Printing the object displays a table with the validation results and the
suggested pre-processing methods for each model:

```{r} 
#| label: showoptimize1
#| echo: false
# Print the same object again, passing a custom separator to the print method
print(optimized_app, separator = " >\n ")
```

Write the application file:

```{r}
#| label: mymodelslist
#| eval: true
#| results: hide
# Take the final models from the calibration result and add some important
# metadata to the application/model list (view, name and description)
my_models <- add_application_metadata(
  optimized_app$final_models,
  view = "Up",
  name = "my_application",
  description = "created with proximetricsR"
)

# Write the application file to the current working directory
proximate_write_nax(
  object = my_models,
  path = getwd()
)
```

# Read ProxiMate application files (.nax)

An existing application file can be read with a single call:

```{r}
#| label: readnax 
#| echo: true
#| eval: true
#| results: hide
# Read the application file from the working directory
my_nax <- proximate_read_nax("my_application.nax")
```

# Recalibrate application

## Just re-fit the models 

Passing the application that was read to `proximate_recalibrate_nax()` re-fits its models:

```{r}
#| label: recal
#| eval: true
#| results: hide
# Recalibrate the application that was read, without supplying new data
my_recalnax <- proximate_recalibrate_nax(my_nax)
```

## Recalibrate based on new data

New measurements can be read and added to the application before it is recalibrated:
```{r}
#| label: recal2
#| eval: true
#| results: hide
# Location of the file with the new measurements
my_pred_file <- paste0(data_loc, "SoybeanMeal_file3.tsv")

# Read the new data
pdata <- proximate_read_data(my_pred_file)

# Prepare the new data so that it can be added to the nax
to_add <- proximate_add2nax(data = pdata)

# Recalibrate the application with the new data included
my_recalnax2 <- proximate_recalibrate_nax(my_nax, add = to_add)
```

```{r cleanup, include = FALSE}
# Restore the options saved in the setup chunk
options(old_options)

# Remove the application file that was written to the working directory while
# rendering, so that the vignette leaves no artefacts behind. unlink() does
# nothing when the file is absent.
unlink("my_application.nax")
```