Advanced Modelling Workflows with NRMstatsML

Overview

This vignette covers advanced workflows built on top of the core NRMstatsML modules, including:

  1. Combining trend and uncertainty analysis
  2. Multi-variable response surface exploration
  3. Integrating bootstrap uncertainty into forecasting
  4. Reading data from the bundled CSV (reproducible pipelines)
  5. Extending NRMstatsML with custom stat_fn closures

1. Loading Data from CSV

The package ships a plain-text version of nrm_example under inst/extdata for users who want to demonstrate a full import-to-analysis pipeline:

library(NRMstatsML)

csv_path <- system.file("extdata", "nrm_example.csv",
                        package = "NRMstatsML")
d <- read.csv(csv_path, stringsAsFactors = FALSE)

nrm_data_check(d, time_var = "year", verbose = TRUE)
#> === NRMstatsML: Data Check ===
#>   Rows       : 20
#>   Columns    : 10
#>   Numeric    : year, crop_yield, soil_OC, N, P, K, rainfall, runoff, biomass
#>   Status     : OK

2. Combining Trend Detection and Uncertainty

A common question in long-term NRM studies is: is the observed trend in yield real, or could it arise by chance? Bootstrap sampling answers this by resampling the dataset many times and computing Sen’s slope on each resample.

# Define a statistic function: Sen's slope on crop_yield
sens_stat <- function(df) {
  nrm_sens_slope(df, time_var = "year", value_var = "crop_yield")$slope
}

# Bootstrap the slope — 500 replicates for speed
bs_slope <- nrm_bootstrap(d, stat_fn = sens_stat, n_iter = 500)
print(bs_slope)
#> -- Bootstrap --
#>   Mean : -0.0005  SD : 0.0254  CI : [-0.0494, 0.0519]

If the 95% CI for the slope excludes zero, the trend is significant. This approach is more robust than a single-pass Mann-Kendall test because it directly quantifies slope uncertainty.


3. Multi-Variable Response Surface

When two inputs interact (e.g. N × rainfall), a full response surface reveals the joint optimum. We build this by iterating nrm_response_curve() across a grid and collecting predicted yields.

# Fit a quadratic surface: yield ~ N + rainfall (manual grid approach)
mv_surface <- nrm_multivariate(d,
  formula = crop_yield ~ N + I(N^2) + rainfall + I(rainfall^2) + N:rainfall,
  scale   = FALSE)

# Prediction grid
N_seq   <- seq(60, 120, length.out = 30)
R_seq   <- seq(610, 720, length.out = 30)
grid    <- expand.grid(N = N_seq, rainfall = R_seq)

grid$predicted_yield <- stats::predict(mv_surface$model, newdata = grid)

# Simple contour summary (base graphics — no extra dependencies)
with(
  reshape(grid, idvar = "N", timevar = "rainfall",
          direction = "wide")[, 1:10],   # abbreviated for display
  message(sprintf("Predicted yield range: %.2f to %.2f t/ha",
                  min(grid$predicted_yield),
                  max(grid$predicted_yield)))
)
# ggplot2 visualisation of the response surface
library(ggplot2)

ggplot(grid, aes(x = N, y = rainfall, fill = predicted_yield)) +
  geom_tile() +
  scale_fill_viridis_c(name = "Yield\n(t/ha)", option = "C") +
  geom_contour(aes(z = predicted_yield), colour = "white",
               alpha = 0.5, linewidth = 0.4) +
  labs(
    title    = "Predicted Yield Response Surface",
    subtitle = "Quadratic model: crop_yield ~ N × rainfall",
    x        = "Nitrogen applied (kg/ha)",
    y        = "Rainfall (mm)"
  ) +
  theme_minimal(base_size = 13)


4. Uncertainty-Adjusted Forecasting

Instead of a single ARIMA forecast, we propagate parametric uncertainty by running nrm_forecast() on bootstrap resamples of the historical record.

# Collect horizon-1 point forecasts from 200 bootstrap resamples
set.seed(42)
boot_forecasts <- replicate(200, {
  idx      <- sample(nrow(d), replace = TRUE)
  boot_d   <- d[sort(idx), ]          # keep time order approximately
  boot_d$year <- d$year               # restore exact year sequence
  fc <- nrm_forecast(boot_d, value_var = "crop_yield",
                     horizon = 1, method = "auto_arima")
  as.numeric(fc$forecast$mean)
}, simplify = TRUE)

cat(sprintf(
  "Bootstrap forecast (h=1):\n  Mean = %.3f t/ha\n  95%% CI: [%.3f, %.3f]\n",
  mean(boot_forecasts),
  quantile(boot_forecasts, 0.025),
  quantile(boot_forecasts, 0.975)
))
#> Bootstrap forecast (h=1):
#>   Mean = 5.448 t/ha
#>   95% CI: [5.157, 5.548]

5. Custom stat_fn Closures

nrm_uncertainty() and nrm_bootstrap() accept any function of the form f(data) → scalar. This makes them highly extensible.

Example A — R-squared of an OLS model

rsq_fn <- function(df) {
  m  <- lm(crop_yield ~ N + P + K, data = df)
  summary(m)$r.squared
}

bs_rsq <- nrm_bootstrap(d, stat_fn = rsq_fn, n_iter = 500)
print(bs_rsq)
#> -- Bootstrap --
#>   Mean : 0.9995  SD : 0.0002  CI : [0.9990, 0.9998]

Example B — Optimum N from a quadratic response curve

opt_N_fn <- function(df) {
  rc <- nrm_response_curve(df, input_var = "N",
                            response_var = "crop_yield",
                            type = "quadratic")
  opt <- nrm_optimize_input(rc, price_ratio = 0.6)
  opt$economic_optimum
}

bs_optN <- nrm_bootstrap(d, stat_fn = opt_N_fn, n_iter = 300)
cat(sprintf(
  "Bootstrapped economic optimum N:\n  Mean = %.1f kg/ha\n  95%% CI: [%.1f, %.1f]\n",
  bs_optN$mean, bs_optN$ci[1], bs_optN$ci[2]
))
#> Bootstrapped economic optimum N:
#>   Mean = -98660.8 kg/ha
#>   95% CI: [-279854.7, -11328.2]

Example C — Mann-Kendall tau for soil OC

tau_fn <- function(df) {
  nrm_mann_kendall(df, time_var = "year", value_var = "soil_OC")$tau
}

mc_tau <- nrm_monte_carlo(d, stat_fn = tau_fn, n_iter = 300, noise_sd = 0.05)
print(mc_tau)
#> -- Monte Carlo --
#>   Mean : 0.9972  SD : 0.0054  CI : [0.9789, 1.0000]

6. Multi-Metric Model Comparison

Compare OLS, quadratic OLS, and PLS side-by-side using nrm_benchmark() on a simple hold-out split.

set.seed(123)
n_d   <- nrow(d)
idx   <- sample(n_d, floor(0.75 * n_d))
train <- d[ idx, ]
test  <- d[-idx, ]

# Fit three candidate models
m_ols  <- lm(crop_yield ~ N + P + K + rainfall,          data = train)
m_quad <- lm(crop_yield ~ N + I(N^2) + rainfall,         data = train)
m_full <- lm(crop_yield ~ N + P + K + rainfall + soil_OC, data = train)

bm <- nrm_benchmark(
  models       = list(ols = m_ols, quadratic = m_quad, full_ols = m_full),
  test_data    = test,
  response_var = "crop_yield"
)
print(bm)
#> === NRMstatsML: Model Benchmark ===
#> 
#>      model   RMSE    MAE    Rsq
#>   full_ols 0.0279 0.0200 0.9974
#>        ols 0.0285 0.0191 0.9973
#>  quadratic 0.0435 0.0380 0.9937

7. Complete Reproducible Pipeline

Putting it all together — a single self-contained script suitable for a supplementary materials section:

library(NRMstatsML)

# 1. Import
d <- read.csv(system.file("extdata", "nrm_example.csv",
                           package = "NRMstatsML"))

# 2. Validate
nrm_data_check(d)

# 3. Trend
tr  <- nrm_trend(d, time_var = "year", value_var = "crop_yield")
nrm_plot(tr)

# 4. Multivariate
mv  <- nrm_multivariate(d, crop_yield ~ N + P + K + rainfall)
nrm_summary(mv)

# 5. Response curve and optimum
rc  <- nrm_response_curve(d, "N", "crop_yield", type = "quadratic")
opt <- nrm_optimize_input(rc, price_ratio = 0.6)
nrm_plot(rc)

# 6. Forecast
fc  <- nrm_forecast(d, "crop_yield", horizon = 5)
nrm_plot(fc)

# 7. Uncertainty
unc <- nrm_uncertainty(d,
        stat_fn = function(x) mean(x$crop_yield),
        method  = "bootstrap", n_iter = 1000)
print(unc)

Session Info

sessionInfo()
#> R version 4.2.1 (2022-06-23 ucrt)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 26200)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=C                          
#> [2] LC_CTYPE=English_United States.utf8   
#> [3] LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.utf8    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] ggplot2_4.0.2    NRMstatsML_0.1.4
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.1.1          urca_1.3-4          pillar_1.11.1      
#>  [4] bslib_0.10.0        compiler_4.2.1      RColorBrewer_1.1-3 
#>  [7] jquerylib_0.1.4     tools_4.2.1         trend_1.1.6        
#> [10] boot_1.3-32         digest_0.6.39       lattice_0.20-45    
#> [13] nlme_3.1-168        viridisLite_0.4.3   tibble_3.3.1       
#> [16] jsonlite_2.0.0      evaluate_1.0.5      lifecycle_1.0.5    
#> [19] gtable_0.3.6        pkgconfig_2.0.3     rlang_1.1.7        
#> [22] cli_3.6.5           rstudioapi_0.18.0   Kendall_2.2.2      
#> [25] parallel_4.2.1      yaml_2.3.12         xfun_0.57          
#> [28] fastmap_1.2.0       withr_3.0.2         extraDistr_1.10.0.2
#> [31] dplyr_1.2.0         knitr_1.51          generics_0.1.4     
#> [34] sass_0.4.10         vctrs_0.7.2         forecast_9.0.2     
#> [37] isoband_0.3.0       tidyselect_1.2.1    grid_4.2.1         
#> [40] glue_1.7.0          R6_2.6.1            otel_0.2.0         
#> [43] rmarkdown_2.31      farver_2.1.2        magrittr_2.0.3     
#> [46] scales_1.4.0        htmltools_0.5.9     timeDate_4052.112  
#> [49] colorspace_2.1-2    fracdiff_1.5-3      labeling_0.4.3     
#> [52] S7_0.2.1            cachem_1.1.0        zoo_1.8-15