## ----------------------------------------------------------------------------- #| label: setup #| include: false old_options <- options(digits = 3) ## ----loadlib, results = 'hide', include = FALSE------------------------------- # Load package, or use devtools::load_all() if in development if (!requireNamespace("proximetricsR", quietly = TRUE)) { devtools::load_all() } library("proximetricsR") ## ----loadlib2, eval = FALSE--------------------------------------------------- # library("prospectr") # library("proximetricsR") ## ----read, results = 'show'--------------------------------------------------- # data location data_loc <- "https://raw.githubusercontent.com/buchi-labortechnik-ag/demo_data/main/data/" # location of the tsv file containing the spectral data (of soybean meal) my_file_1 <- "SoybeanMeal_file1.tsv" my_file_2 <- "SoybeanMeal_file2.tsv" my_file_1 <- paste0(data_loc, my_file_1) my_file_2 <- paste0(data_loc, my_file_2) # read the files mdata_1 <- proximate_read_data(my_file_1) mdata_2 <- proximate_read_data(my_file_2) # explore the structure of the objects of the imported data sets str(mdata_1, vec.len = 0, give.attr = FALSE) str(mdata_2, vec.len = 0, give.attr = FALSE) ## ----spc, results = 'hide'---------------------------------------------------- mdata_1$spc mdata_2$spc ## ----wavelengths, results = 'hide'-------------------------------------------- original_wavs_1 <- as.numeric(colnames(mdata_1$spc)) original_wavs_2 <- as.numeric(colnames(mdata_2$spc)) ## ----plotspectra, fig.cap = "Spectra loaded. Left: datastet 1; Right: Dataset 2", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 8, fig.height = 8, echo = TRUE, fig.retina = 0.85---- old_par <- par(no.readonly = TRUE) par(mfrow = c(2, 1), mar = c(4, 4, 1, 4)) y_max <- max(c(max(mdata_1$spc), max(mdata_2$spc))) matplot( x = original_wavs_1, y = t(mdata_1$spc), xlab = "Wavelengths, nm", ylab = "Absorbance", xlim = c(400, 1700), ylim = c(0, y_max), type = "l", lty = 1, col = rgb(1, 0.5, 0.5, 0.3) ) grid() matplot( x = original_wavs_2, y = t(mdata_2$spc), xlab = "Wavelengths, nm", ylab = "Absorbance", xlim = c(400, 1700), ylim = c(0, y_max), type = "l", lty = 1, col = rgb(0.9, 0.8, 0.5, 0.3) ) grid() ## ----nwpmerge, results = 'hide'----------------------------------------------- mdata <- proximate_merge(list(mdata_1, mdata_2)) ## ----resample----------------------------------------------------------------- # define the vector of new wavelengths (with constant resolution) # (replace the psc matrix with the new resampled maxtrix) # for working with proximetricsR we need to have our spectra in a spc matrix inside # the data matrix mdata$spc <- process(mdata$spc, prep_resample(c(900, 1700, 2)), device = "proximate") ## ----finalwavs, results = 'hide'--------------------------------------------- final_wavs <- as.numeric(colnames(mdata$spc)) ## ----plotspectra2, fig.cap = "Merged spectra.", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 8, fig.height = 5, echo = TRUE, fig.retina = 0.85---- par(old_par) matplot( x = final_wavs, y = t(mdata$spc), xlab = "Wavelengths, nm", ylab = "Absorbance", type = "l", lty = 1, col = rgb(0.5, 0.5, 0.5, 0.3) ) grid() ## ----settoflase, results = 'hide'-------------------------------------------- # a vector with the row indices of the samples to be deactivated deact <- c(5, 10, 50) # deactivate mdata$Check[deact] <- "False" ## ----settotrue, results = 'hide'--------------------------------------------- mdata$Check <- "True" ## ----properties, results = 'hide'--------------------------------------------- # this gets the names of all the variables in the data names(mdata) # this returns the column names of all responses y_names <- extract_property_names(mdata) # the indices of all the response variables ys_indices <- which(colnames(mdata) %in% y_names) # samples with reference values colSums(!is.na(mdata[, y_names])) ## ----properties3, results = 'hide'-------------------------------------------- #get the names of the response variables y_names <- y_names[1:2] ## ----formula1, results = 'hide'----------------------------------------------- f_simple <- Protein ~ spc # or f_long <- as.formula(paste0("Protein ~ ", paste0(final_wavs, collapse = " + "))) ## ----formulas1, results = 'hide'---------------------------------------------- app_formulas <- list(Moisture ~ spc, Protein ~ spc) ## ----formulas2, results = 'hide'---------------------------------------------- app_formulas <- lapply(paste0(y_names, " ~ spc"), FUN = formula) app_formulas ## ----metadata, results = 'hide'----------------------------------------------- # Define the metadata of each model (in the same order they are in app_formulas) models_metadata <- list( # for the first property add_model_metadata(decimal_places = 2, unit = "%"), # for the second property add_model_metadata(decimal_places = 2, unit = "%") ) ## ----recipe, results = 'show'------------------------------------------------- # here a recipe with the following pre-processing steps is created: # - spline/resampling: with spectral range between 900 and 1700 in steps of 4 # - StandardNormalVariate # - First derivative: with a gap parameter of 5 and a smoothing factor of 9 my_precipe_1 <- preprocess_recipe( prep_resample(c(900, 1700, 4)), prep_snv(), prep_derivative(m = 1, w = 5, p = 9, algorithm = "nwp"), device = "proximate" ) my_precipe_1$preprocessing_order ## ----recipe2, results = 'show'------------------------------------------------ # here a recipe with the following pre-processing steps is created: # - spline/resampling: with spectral range between 900 and 1700 in steps of 5 # - StandardNormalVariate # - First derivative: with a gap parameter of 7 and a smoothing factor of 11 my_precipe_2 <- preprocess_recipe( prep_resample(c(900, 1700, 4)), prep_snv(), prep_derivative(m = 2, w = 7, p = 11, algorithm = "nwp"), device = "proximate" ) my_precipe_2$preprocessing_order # here a recipe with the following pre-processing steps is created: # - spline/resampling: with spectral range between 900 and 1700 in steps of 4 # - StandardNormalVariate # - Second derivative: with a gap parameter of 9 and a smoothing factor of 13 my_precipe_3 <- preprocess_recipe( prep_resample(c(900, 1700, 4)), prep_snv(), prep_derivative(m = 2, w = 9, p = 13, algorithm = "nwp"), device = "proximate" ) my_precipe_3$preprocessing_order # here a recipe with the following pre-processing steps is created: # - spline/resampling: with spectral range between 900 and 1700 in steps of 8 # - StandardNormalVariate # - Second derivative: with a gap parameter of 5 and a smoothing factor of 5 my_precipe_4 <- preprocess_recipe( prep_resample(c(900, 1700, 5)), prep_snv(), prep_derivative(m = 2, w = 5, p = 5, algorithm = "nwp"), device = "proximate" ) my_precipe_4$preprocessing_order ## ----recipelist, results = 'show'--------------------------------------------- my_precipes <- list(my_precipe_1, my_precipe_2, my_precipe_3, my_precipe_4) ## ----rmethod, results = 'show'------------------------------------------------ # here we use the modified pls regression method (equivalent to the one # implemented in NIRWise PLUS) # We use a maximum 11 components my_pls_method <- fit_plsr(ncomp = 11, type = "nwp") my_pls_method ## ----rmethodxls, results = 'show'--------------------------------------------- # here we use the modified pls regression method (equivalent to the one # implemented in NIRWise PLUS) # We use a maximum 11 components my_xls_method <- fit_xlsr(ncomp = 11) my_xls_method ## ----calcontrol, results = 'show'--------------------------------------------- # Control some aspects of how the calibration is built and optimized # We use k-fold cross-validation with selection of folds based on the order of # the samples in the data table ("sequential") # We also specify the number of times a model must be re-fitted after outlier # removal, e.g. 0 means no re-fiting i.e. no outlier removal; 1 means a model is # built, then it is used to identify and remove outliers and finally a the final # model is refitted; a value of 5 would mean that the model is refitted 4 times # for 4 outlier removal iterations. my_control <- calibration_control( validation_type = "kfold", number = 4, folds = "sequential", remove_outliers = 1 # the number of iterations of outlier removal ) ## ----calibration, results = 'hide'------------------------------------------- optimized_app <- calibrate_models( formulas = app_formulas, data = mdata, preprocess_recipes = my_precipes, methods = list(my_pls_method, my_xls_method), control = my_control, metadata_list = models_metadata, save_all = TRUE ) ## ----showoptimize2, eval = FALSE---------------------------------------------- # optimized_app ## ----showoptimize1, eval = TRUE, echo = FALSE--------------------------------- print(optimized_app, separator = " >\n") ## ----finalmodels1, eval = FALSE---------------------------------------------- # optimized_app$final_models ## ----finalmodels2, eval = TRUE----------------------------------------------- names(optimized_app$final_models) ## ----prepro1, echo = FALSE, eval = TRUE, comment = NA------------------------ print(optimized_app$final_models[[1]]$preprocess, separator = " >\n") ## ----prepro2, echo = FALSE, eval = TRUE, comment = NA------------------------ print(optimized_app$final_models[[2]]$preprocess, separator = " >\n") ## ----prepro3, eval = FALSE--------------------------------------------------- # optimized_app$final_models$`Protein ~ spc` # # and # optimized_app$final_models$`Moisture ~ spc` ## ----modelplots, eval = FALSE------------------------------------------------ # plot(optimized_app$final_models$`Protein ~ spc`, selection = "all") # # and # plot(optimized_app$final_models$`Moisture ~ spc`, selection = "all") ## ----summaryopt, eval = FALSE------------------------------------------------ # optimized_app$results_grid ## ----summaryopt2, eval = TRUE------------------------------------------------ optimized_app$results_grid[optimized_app$results_grid$selection, ] ## ----othermodels, eval = FALSE----------------------------------------------- # optimized_app$all_models ## ----othermodels2, eval = FALSE---------------------------------------------- # my_other_models <- optimized_app$all_models[c(1, 6)] ## ----preds, eval = TRUE, results = 'hide'------------------------------------ my_pred_file <- "SoybeanMeal_file3.tsv" my_pred_file <- paste0(data_loc, my_pred_file) pdata <- proximate_read_data(my_pred_file) original_wavs_pred <- as.numeric(colnames(pdata$spc)) ## ----plotspectra3, fig.cap = "Merged spectra.", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 8, fig.height = 5, echo = TRUE, fig.retina = 0.85---- matplot( x = original_wavs_pred, y = t(pdata$spc), xlab = "Wavelengths, nm", ylab = "Absorbance", type = "l", lty = 1, col = rgb(0.3, 0.9, 0.5, 0.5) ) grid() ## ----preds2, eval = TRUE, results = 'hide'----------------------------------- preds <- predict(optimized_app, newdata = pdata) # matrix of predictions preds$predictions # info on the models used preds$model_information ## ----preds3, fig.cap = "Validation plots for the preditions of the optimal models found", fig.cap.style = "Image Caption", fig.align = "center", fig.width = 7, fig.height = 3.5, echo = TRUE, fig.retina = 0.85---- old_par <- par(no.readonly = TRUE) par(mfrow = c(1, 2), mar = c(4, 4, 1, 1)) # Moisture plot plot( x = preds$predictions[, "Moisture"], y = pdata$Moisture, xlab = "Predicted moisture, %", ylab = "Measured moisture, %", xlim = c(10.5, 12.5), ylim = c(10.5, 12.5), pch = 16, cex = 1.5, col = "grey" ) grid() abline(0, 1, col = "red") # Protein plot plot( x = preds$predictions[, "Protein"], y = pdata$Protein, xlab = "Predicted protein, %", ylab = "Measured protein, %", xlim = c(44, 48), ylim = c(44, 48), pch = 16, cex = 1.5, col = "grey" ) grid() abline(0, 1, col = "red") par(old_par) ## ----preds4, eval = FALSE, results = 'hide'---------------------------------- # # squared correlations # cor(x = preds$predictions[, "Moisture"], y = pdata$Moisture)^2 # cor(x = preds$predictions[, "Protein"], y = pdata$Protein)^2 # # # rmses # mean((preds$predictions[, "Moisture"] - pdata$Moisture)^2)^0.5 # mean((x = preds$predictions[, "Protein"] - pdata$Protein)^2)^0.5 ## ----mymodelslist, eval = TRUE, results = 'hide'------------------------------ my_models <- optimized_app$final_models # add some important metadata to the application/model list my_models <- add_application_metadata( my_models, view = "Up", name = "my_application", description = "created with proximetricsR" ) ## ----appfile, eval = FALSE, results = 'hide'---------------------------------- # proximate_write_nax( # path = getwd(), # object = my_models # ) ## ----protein, eval = FALSE, results = 'hide'---------------------------------- # my_p_recipe <- preprocess_recipe( # prep_resample(c(900, 1700, 2)), # prep_snv(), # prep_derivative(m = 1, w = 5, p = 3, algorithm = "nwp"), # device = "proximate" # ) # # protein_model <- calibrate( # Protein ~ spc, # data = mdata, # preprocess = my_p_recipe, # here we can use only one recipe # method = fit_xlsr(5, type = "nwp"), # control = my_control # ) # protein_model # # # add the units and the number of decimal places to the model # protein_model <- add_model_metadata( # protein_model, # unit = "%", # decimal_places = 2 # ) # # plot(protein_model, selection = "all") ## ----predictions, results = 'hide', eval = FALSE------------------------------ # # add the units and the number of decimal places to the model # my_protein_predictions <- predict(protein_model, newdata = pdata$spc) # my_protein_predictions$model_information # my_protein_predictions$predictions ## ----writenax, results = 'hide', eval = FALSE--------------------------------- # # create a list with all the models separately calibrated # # (in this case we only have one) # a_model <- list(protein_model) # # a_model <- add_application_metadata( # a_model, # view = "Up", # name = "a_protein_application", # description = "created with proximetricsR" # ) # # proximate_write_nax( # path = getwd(), # object = a_model # ) ## ----replacemodel, results = 'hide', eval = FALSE----------------------------- # my_models <- optimized_app$final_models # my_models$`Protein ~ spc` <- protein_model ## ----writetsv, eval = FALSE, results = 'hide'--------------------------------- # # let's write down the calibration data into a tsv file # # define a name for the tsv to be written # tsv_name <- "merged_cal_data.tsv" # # # write the tsv # proximate_write_data( # x = mdata, # file = tsv_name, # id = mdata$ID, # spc = "spc", # spc_round = 8, # barcode = "", # properties = y_names, # note = "merged_file_1_and_file_2", # recipe = "SoybeanMeal up-view", # created = mdata$Date, # snr = mdata$SNR # ) ## ----prjcal, eval = FALSE, results = 'hide'----------------------------------- # proximate_write_model( # object = list(protein_model), # path = getwd(), # tsv_paths = tsv_name, # application_name = "Untitled", # cal = TRUE, # prj = TRUE, # rtf = TRUE # ) ## ----cleanup, include = FALSE------------------------------------------------- options(old_options)