--- title: "Framework Integration" author: "Gilles Colling" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Framework Integration} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) library(BORG) # Check package availability has_caret <- requireNamespace("caret", quietly = TRUE) has_recipes <- requireNamespace("recipes", quietly = TRUE) has_rsample <- requireNamespace("rsample", quietly = TRUE) has_mlr3 <- requireNamespace("mlr3", quietly = TRUE) ``` BORG integrates with major R machine learning frameworks. This guide shows how to validate workflows and use BORG-guarded CV functions in each ecosystem. ## Base R The simplest integration - manual index-based splitting: ```{r base-r} # Create data data <- iris set.seed(42) n <- nrow(data) train_idx <- sample(n, 0.7 * n) test_idx <- setdiff(1:n, train_idx) # Validate the split borg(data, train_idx = train_idx, test_idx = test_idx) ``` ### Safe Preprocessing Pattern ```{r preprocessing-pattern} # CORRECT: Fit preprocessing on training data only train_data <- data[train_idx, ] train_means <- colMeans(train_data[, 1:4]) train_sds <- apply(train_data[, 1:4], 2, sd) # Apply train statistics to both sets scaled_train <- scale(data[train_idx, 1:4], center = train_means, scale = train_sds) scaled_test <- scale(data[test_idx, 1:4], center = train_means, scale = train_sds) ``` --- ## caret BORG can validate `preProcess` and `trainControl` objects, and provides a guarded wrapper. ### Validating preProcess Objects ```{r caret-preprocess, eval = has_caret} library(caret) data(mtcars) train_idx <- 1:25 test_idx <- 26:32 # BAD: preProcess on full data (LEAKS!) 
pp_bad <- preProcess(mtcars[, -1], method = c("center", "scale"))
borg_inspect(pp_bad, train_idx, test_idx, data = mtcars)

# GOOD: preProcess on training data only
pp_good <- preProcess(mtcars[train_idx, -1], method = c("center", "scale"))
borg_inspect(pp_good, train_idx, test_idx, data = mtcars)
```

### BORG-Guarded trainControl

Use `borg_trainControl()` to automatically block random resampling when
dependencies are detected:

```{r caret-traincontrol, eval = FALSE}
# Standard caret workflow with spatial data
spatial_data <- data.frame(
  lon = runif(200, 0, 100),
  lat = runif(200, 0, 100),
  response = rnorm(200)
)

# This will warn/error if random CV is inappropriate
ctrl <- borg_trainControl(
  data = spatial_data,
  coords = c("lon", "lat"),
  method = "cv",
  number = 5
)

# If spatial autocorrelation detected, blocks random CV
# Use auto_block = TRUE to automatically switch to spatial blocking
```

---

## tidymodels (rsample + recipes)

### Validating Recipe Objects

```{r tidymodels-recipes, eval = has_recipes && has_rsample}
library(recipes)
library(rsample)

data(mtcars)
set.seed(123)
split <- initial_split(mtcars, prop = 0.8)
train_idx <- split$in_id
test_idx <- setdiff(seq_len(nrow(mtcars)), train_idx)

# BAD: Recipe prepped on full data
rec_bad <- recipe(mpg ~ ., data = mtcars) |>
  step_normalize(all_numeric_predictors()) |>
  prep()  # Uses full mtcars!
borg_inspect(rec_bad, train_idx, test_idx, data = mtcars)

# GOOD: Recipe prepped on training only
rec_good <- recipe(mpg ~ ., data = training(split)) |>
  step_normalize(all_numeric_predictors()) |>
  prep()

borg_inspect(rec_good, train_idx, test_idx, data = mtcars)
```

### BORG-Guarded rsample Functions

BORG provides drop-in replacements for rsample functions that respect data
dependencies:

```{r borg-vfold, eval = FALSE}
# Standard rsample
folds <- vfold_cv(data, v = 5)  # Random folds

# BORG-guarded version
folds <- borg_vfold_cv(
  data = spatial_data,
  coords = c("lon", "lat"),
  v = 5,
  auto_block = TRUE  # Switches to spatial blocking if needed
)
```

```{r borg-group-vfold, eval = FALSE}
# For grouped data
folds <- borg_group_vfold_cv(
  data = clinical_data,
  group = patient_id,
  v = 5
)
```

```{r borg-initial-split, eval = FALSE}
# For temporal data - enforces chronological ordering
split <- borg_initial_split(
  data = ts_data,
  time = "date",
  prop = 0.8
)
```

### Validating rsample Objects

```{r rsample-validation, eval = has_rsample}
# Load rsample here as well: this chunk runs whenever rsample is
# available, even if the recipes-gated chunk above was skipped.
library(rsample)

# Validate existing rsample objects
ts_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = 200),
  value = cumsum(rnorm(200))
)

rolling <- rolling_origin(
  data = ts_data,
  initial = 100,
  assess = 20,
  cumulative = FALSE
)

# Check for temporal leakage
borg_inspect(rolling, train_idx = NULL, test_idx = NULL)
```

---

## mlr3

Validate mlr3 tasks and resamplings:

```{r mlr3-example, eval = has_mlr3}
library(mlr3)

# Create task
task <- TaskClassif$new("iris", iris, target = "Species")

# Create resampling
resampling <- rsmp("cv", folds = 5)
resampling$instantiate(task)

# Validate first fold
train_idx <- resampling$train_set(1)
test_idx <- resampling$test_set(1)
borg_inspect(task, train_idx, test_idx)
```

---

## Temporal Data Workflows

For time series and panel data, temporal ordering is critical.
### Basic Temporal Validation

```{r temporal-basic}
set.seed(123)
n <- 365
ts_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = n),
  value = cumsum(rnorm(n)),
  feature = rnorm(n)
)

# Chronological split
train_idx <- 1:252
test_idx <- 253:365

# Validate temporal ordering
result <- borg(ts_data, train_idx = train_idx, test_idx = test_idx,
               time = "date")
result
```

### Rolling Origin with rsample

```{r rolling-origin, eval = has_rsample}
# Load rsample here as well: this chunk runs whenever rsample is
# available, even if the recipes-gated chunk above was skipped.
library(rsample)

rolling <- rolling_origin(
  data = ts_data,
  initial = 200,
  assess = 30,
  cumulative = FALSE
)

# Validate the resampling scheme
borg_inspect(rolling, train_idx = NULL, test_idx = NULL)
```

---

## Spatial Data Workflows

For spatial data, nearby points are often correlated.

### Spatial Block Validation

```{r spatial-basic}
set.seed(456)
n <- 200
spatial_data <- data.frame(
  lon = runif(n, -10, 10),
  lat = runif(n, -10, 10),
  response = rnorm(n),
  predictor = rnorm(n)
)

# Geographic split (west vs east)
train_idx <- which(spatial_data$lon < 0)
test_idx <- which(spatial_data$lon >= 0)

# Validate with spatial awareness
result <- borg(spatial_data, train_idx = train_idx, test_idx = test_idx,
               coords = c("lon", "lat"))
result
```

### Automatic Spatial CV Generation

```{r spatial-auto}
# Let BORG generate spatially-blocked folds
result <- borg(spatial_data, coords = c("lon", "lat"),
               target = "response", v = 5)
result$diagnosis@recommended_cv

# Access the folds
length(result$folds)
```

---

## Grouped Data Workflows

For hierarchical data (patients, sites, species):

```{r grouped-workflow}
# Clinical trial data with repeated measures
clinical_data <- data.frame(
  patient_id = rep(1:50, each = 4),
  visit = rep(1:4, times = 50),
  outcome = rnorm(200)
)

# Let BORG create leave-group-out folds
result <- borg(clinical_data, groups = "patient_id",
               target = "outcome", v = 5)
result$diagnosis@recommended_cv

# Verify no patient appears in both train and test
fold1 <- result$folds[[1]]
train_patients <-
unique(clinical_data$patient_id[fold1$train]) test_patients <- unique(clinical_data$patient_id[fold1$test]) length(intersect(train_patients, test_patients)) # Should be 0 ``` --- ## Complete Pipeline Validation Validate an entire workflow at once: ```{r pipeline-validation} # Build a workflow data <- iris set.seed(789) n <- nrow(data) train_idx <- sample(n, 0.7 * n) test_idx <- setdiff(1:n, train_idx) # Validate everything result <- borg_validate(list( data = data, train_idx = train_idx, test_idx = test_idx )) result ``` ### With Problematic Workflow ```{r pipeline-bad} # Workflow with overlap (common mistake) bad_workflow <- list( data = iris, train_idx = 1:100, test_idx = 51:150 # Overlaps! ) result <- borg_validate(bad_workflow) result ``` --- ## Automatic Repair with borg_assimilate() BORG can automatically fix certain types of leakage: ```{r assimilate} # Workflow with fixable issues workflow <- list( data = iris, train_idx = 1:100, test_idx = 51:150 # Overlap ) # Attempt automatic repair fixed <- borg_assimilate(workflow) if (length(fixed$unfixable) > 0) { cat("Partial assimilation:", length(fixed$unfixable), "risk(s) require manual fix:", paste(fixed$unfixable, collapse = ", "), "\n") } else { cat("Assimilation complete:", length(fixed$fixed), "risk(s) corrected\n") } ``` Note: Index overlap requires choosing a new split strategy and cannot be fixed automatically. --- ## Summary: Framework Integration Patterns | Framework | Validation Function | Guarded Alternative | |-----------|--------------------|--------------------| | Base R | `borg()`, `borg_inspect()` | - | | caret | `borg_inspect(preProcess)` | `borg_trainControl()` | | rsample | `borg_inspect(vfold_cv)` | `borg_vfold_cv()`, `borg_initial_split()` | | recipes | `borg_inspect(recipe)` | - | | mlr3 | `borg_inspect(task)` | - | ## See Also - `vignette("quickstart")` - Basic usage and concepts - `vignette("risk-taxonomy")` - Complete catalog of detectable risks