--- title: "Advanced Modelling Workflows with NRMstatsML" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > %\VignetteIndexEntry{Advanced Modelling Workflows with NRMstatsML} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 4.5, message = FALSE, warning = FALSE ) ``` ## Overview This vignette covers advanced workflows built on top of the core NRMstatsML modules, including: 1. Combining trend and uncertainty analysis 2. Multi-variable response surface exploration 3. Integrating bootstrap uncertainty into forecasting 4. Reading data from the bundled CSV (reproducible pipelines) 5. Extending NRMstatsML with custom `stat_fn` closures --- ## 1. Loading Data from CSV The package ships a plain-text version of `nrm_example` under `inst/extdata` for users who want to demonstrate a full import-to-analysis pipeline: ```{r csv-import} library(NRMstatsML) csv_path <- system.file("extdata", "nrm_example.csv", package = "NRMstatsML") d <- read.csv(csv_path, stringsAsFactors = FALSE) nrm_data_check(d, time_var = "year", verbose = TRUE) ``` --- ## 2. Combining Trend Detection and Uncertainty A common question in long-term NRM studies is: *is the observed trend in yield real, or could it arise by chance?* Bootstrap sampling answers this by resampling the dataset many times and computing Sen's slope on each resample. ```{r trend-bootstrap} # Define a statistic function: Sen's slope on crop_yield sens_stat <- function(df) { nrm_sens_slope(df, time_var = "year", value_var = "crop_yield")$slope } # Bootstrap the slope — 500 replicates for speed bs_slope <- nrm_bootstrap(d, stat_fn = sens_stat, n_iter = 500) print(bs_slope) ``` If the 95% CI for the slope excludes zero, the trend is significant. This approach is more robust than a single-pass Mann-Kendall test because it directly quantifies slope uncertainty. --- ## 3. Multi-Variable Response Surface When two inputs interact (e.g. N × rainfall), a full response surface reveals the joint optimum. We build this by iterating `nrm_response_curve()` across a grid and collecting predicted yields. ```{r response-surface} # Fit a quadratic surface: yield ~ N + rainfall (manual grid approach) mv_surface <- nrm_multivariate(d, formula = crop_yield ~ N + I(N^2) + rainfall + I(rainfall^2) + N:rainfall, scale = FALSE) # Prediction grid N_seq <- seq(60, 120, length.out = 30) R_seq <- seq(610, 720, length.out = 30) grid <- expand.grid(N = N_seq, rainfall = R_seq) grid$predicted_yield <- stats::predict(mv_surface$model, newdata = grid) # Simple contour summary (base graphics — no extra dependencies) with( reshape(grid, idvar = "N", timevar = "rainfall", direction = "wide")[, 1:10], # abbreviated for display message(sprintf("Predicted yield range: %.2f to %.2f t/ha", min(grid$predicted_yield), max(grid$predicted_yield))) ) ``` ```{r surface-plot, fig.height=5} # ggplot2 visualisation of the response surface library(ggplot2) ggplot(grid, aes(x = N, y = rainfall, fill = predicted_yield)) + geom_tile() + scale_fill_viridis_c(name = "Yield\n(t/ha)", option = "C") + geom_contour(aes(z = predicted_yield), colour = "white", alpha = 0.5, linewidth = 0.4) + labs( title = "Predicted Yield Response Surface", subtitle = "Quadratic model: crop_yield ~ N × rainfall", x = "Nitrogen applied (kg/ha)", y = "Rainfall (mm)" ) + theme_minimal(base_size = 13) ``` --- ## 4. Uncertainty-Adjusted Forecasting Instead of a single ARIMA forecast, we propagate parametric uncertainty by running `nrm_forecast()` on bootstrap resamples of the historical record. ```{r boot-forecast} # Collect horizon-1 point forecasts from 200 bootstrap resamples set.seed(42) boot_forecasts <- replicate(200, { idx <- sample(nrow(d), replace = TRUE) boot_d <- d[sort(idx), ] # keep time order approximately boot_d$year <- d$year # restore exact year sequence fc <- nrm_forecast(boot_d, value_var = "crop_yield", horizon = 1, method = "auto_arima") as.numeric(fc$forecast$mean) }, simplify = TRUE) cat(sprintf( "Bootstrap forecast (h=1):\n Mean = %.3f t/ha\n 95%% CI: [%.3f, %.3f]\n", mean(boot_forecasts), quantile(boot_forecasts, 0.025), quantile(boot_forecasts, 0.975) )) ``` --- ## 5. Custom `stat_fn` Closures `nrm_uncertainty()` and `nrm_bootstrap()` accept any function of the form `f(data) → scalar`. This makes them highly extensible. ### Example A — R-squared of an OLS model ```{r custom-rsq} rsq_fn <- function(df) { m <- lm(crop_yield ~ N + P + K, data = df) summary(m)$r.squared } bs_rsq <- nrm_bootstrap(d, stat_fn = rsq_fn, n_iter = 500) print(bs_rsq) ``` ### Example B — Optimum N from a quadratic response curve ```{r custom-optN} opt_N_fn <- function(df) { rc <- nrm_response_curve(df, input_var = "N", response_var = "crop_yield", type = "quadratic") opt <- nrm_optimize_input(rc, price_ratio = 0.6) opt$economic_optimum } bs_optN <- nrm_bootstrap(d, stat_fn = opt_N_fn, n_iter = 300) cat(sprintf( "Bootstrapped economic optimum N:\n Mean = %.1f kg/ha\n 95%% CI: [%.1f, %.1f]\n", bs_optN$mean, bs_optN$ci[1], bs_optN$ci[2] )) ``` ### Example C — Mann-Kendall tau for soil OC ```{r custom-tau} tau_fn <- function(df) { nrm_mann_kendall(df, time_var = "year", value_var = "soil_OC")$tau } mc_tau <- nrm_monte_carlo(d, stat_fn = tau_fn, n_iter = 300, noise_sd = 0.05) print(mc_tau) ``` --- ## 6. Multi-Metric Model Comparison Compare OLS, quadratic OLS, and PLS side-by-side using `nrm_benchmark()` on a simple hold-out split. ```{r benchmark} set.seed(123) n_d <- nrow(d) idx <- sample(n_d, floor(0.75 * n_d)) train <- d[ idx, ] test <- d[-idx, ] # Fit three candidate models m_ols <- lm(crop_yield ~ N + P + K + rainfall, data = train) m_quad <- lm(crop_yield ~ N + I(N^2) + rainfall, data = train) m_full <- lm(crop_yield ~ N + P + K + rainfall + soil_OC, data = train) bm <- nrm_benchmark( models = list(ols = m_ols, quadratic = m_quad, full_ols = m_full), test_data = test, response_var = "crop_yield" ) print(bm) ``` --- ## 7. Complete Reproducible Pipeline Putting it all together — a single self-contained script suitable for a supplementary materials section: ```{r full-pipeline, eval = FALSE} library(NRMstatsML) # 1. Import d <- read.csv(system.file("extdata", "nrm_example.csv", package = "NRMstatsML")) # 2. Validate nrm_data_check(d) # 3. Trend tr <- nrm_trend(d, time_var = "year", value_var = "crop_yield") nrm_plot(tr) # 4. Multivariate mv <- nrm_multivariate(d, crop_yield ~ N + P + K + rainfall) nrm_summary(mv) # 5. Response curve and optimum rc <- nrm_response_curve(d, "N", "crop_yield", type = "quadratic") opt <- nrm_optimize_input(rc, price_ratio = 0.6) nrm_plot(rc) # 6. Forecast fc <- nrm_forecast(d, "crop_yield", horizon = 5) nrm_plot(fc) # 7. Uncertainty unc <- nrm_uncertainty(d, stat_fn = function(x) mean(x$crop_yield), method = "bootstrap", n_iter = 1000) print(unc) ``` --- ## Session Info ```{r session} sessionInfo() ```