---
title: "Supervised Learning with tidylearn"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Supervised Learning with tidylearn}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
```

```{r setup}
library(tidylearn)
library(dplyr)
```

## Introduction

This vignette demonstrates the supervised learning capabilities of tidylearn. All methods shown here wrap established R packages: the algorithms are unchanged; tidylearn simply provides a consistent interface and tidy output.

**Wrapped packages include:**

- stats (`lm()`, `glm()`) for linear and logistic regression
- rpart for decision trees
- randomForest for random forests
- gbm and xgboost for gradient boosting
- glmnet for regularization (ridge, lasso, elastic net)
- e1071 for support vector machines
- nnet for neural networks

Access raw model objects via `model$fit` for package-specific functionality (a sketch appears at the end of this vignette).

## Classification

### Binary Classification

Let's create a binary classification problem from the iris dataset:

```{r}
# Create binary classification dataset
iris_binary <- iris %>%
  filter(Species %in% c("setosa", "versicolor")) %>%
  mutate(Species = droplevels(Species))

# Split data
split <- tl_split(iris_binary, prop = 0.7, stratify = "Species", seed = 123)
```

#### Logistic Regression

```{r}
# Train logistic regression
model_logistic <- tl_model(split$train, Species ~ ., method = "logistic")
print(model_logistic)
```

```{r}
# Predictions
preds_logistic <- predict(model_logistic, new_data = split$test)
head(preds_logistic)
```

#### Decision Trees

```{r}
# Train decision tree
model_tree <- tl_model(split$train, Species ~ ., method = "tree")
print(model_tree)

# Predictions
preds_tree <- predict(model_tree, new_data = split$test)
```

### Multi-class Classification

```{r}
# Split full iris dataset
split_multi <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)
```

#### Random Forest

```{r}
# Train random forest
model_forest <- tl_model(split_multi$train, Species ~ ., method = "forest")
print(model_forest)
```

```{r}
# Predictions
preds_forest <- predict(model_forest, new_data = split_multi$test)
head(preds_forest)
```

```{r}
# Accuracy on test set
mean(preds_forest$.pred == split_multi$test$Species)
```

#### Support Vector Machines

```{r, eval=FALSE}
# Train SVM
model_svm <- tl_model(split_multi$train, Species ~ ., method = "svm")
print(model_svm)

# Predictions
preds_svm <- predict(model_svm, new_data = split_multi$test)
```

## Regression

### Linear Regression

```{r}
# Split mtcars data
split_reg <- tl_split(mtcars, prop = 0.7, seed = 123)

# Train linear model
model_lm <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
print(model_lm)
```

```{r}
# Predictions
preds_lm <- predict(model_lm, new_data = split_reg$test)
head(preds_lm)
```

```{r}
# Calculate RMSE
rmse <- sqrt(mean((preds_lm$.pred - split_reg$test$mpg)^2))
cat("RMSE:", round(rmse, 2), "\n")
```

### Polynomial Regression

```{r}
# Polynomial regression for non-linear relationships
model_poly <- tl_model(split_reg$train, mpg ~ wt, method = "polynomial", degree = 2)
print(model_poly)
```

```{r}
# Predictions
preds_poly <- predict(model_poly, new_data = split_reg$test)

# RMSE
rmse_poly <- sqrt(mean((preds_poly$.pred - split_reg$test$mpg)^2))
cat("Polynomial RMSE:", round(rmse_poly, 2), "\n")
```
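One way to check whether the quadratic term overfits is to compare training and test error. The sketch below reuses the `predict()` interface shown above; it assumes `new_data` accepts any data frame with the required columns (here, the training set itself):

```{r}
# A large gap (training RMSE much lower than test RMSE) would
# suggest the polynomial model is overfitting
preds_poly_train <- predict(model_poly, new_data = split_reg$train)
rmse_poly_train <- sqrt(mean((preds_poly_train$.pred - split_reg$train$mpg)^2))
cat("Train RMSE:", round(rmse_poly_train, 2),
    "| Test RMSE:", round(rmse_poly, 2), "\n")
```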
### Random Forest Regression

```{r}
# Train random forest for regression
model_rf_reg <- tl_model(split_reg$train, mpg ~ ., method = "forest")
print(model_rf_reg)
```

```{r}
# Predictions
preds_rf <- predict(model_rf_reg, new_data = split_reg$test)

# RMSE
rmse_rf <- sqrt(mean((preds_rf$.pred - split_reg$test$mpg)^2))
cat("Random Forest RMSE:", round(rmse_rf, 2), "\n")
```

## Regularized Regression

Regularization helps prevent overfitting by adding a penalty on coefficient size to the model's objective.

### Ridge Regression

```{r, eval=FALSE}
# Ridge regression (L2 regularization)
model_ridge <- tl_model(split_reg$train, mpg ~ ., method = "ridge")
print(model_ridge)

# Predictions
preds_ridge <- predict(model_ridge, new_data = split_reg$test)
```

### LASSO

```{r, eval=FALSE}
# LASSO (L1 regularization) shrinks some coefficients to exactly zero,
# performing feature selection
model_lasso <- tl_model(split_reg$train, mpg ~ ., method = "lasso")
print(model_lasso)

# Predictions
preds_lasso <- predict(model_lasso, new_data = split_reg$test)
```

### Elastic Net

```{r, eval=FALSE}
# Elastic net combines the L1 and L2 penalties; alpha controls the mix
model_enet <- tl_model(split_reg$train, mpg ~ ., method = "elastic_net", alpha = 0.5)
print(model_enet)

# Predictions
preds_enet <- predict(model_enet, new_data = split_reg$test)
```

## Model Comparison

```{r}
# Compare multiple models
models <- list(
  linear = tl_model(split_reg$train, mpg ~ ., method = "linear"),
  tree = tl_model(split_reg$train, mpg ~ ., method = "tree"),
  forest = tl_model(split_reg$train, mpg ~ ., method = "forest")
)
```

```{r}
# Calculate test RMSE for each model
results <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)

for (model_name in names(models)) {
  preds <- predict(models[[model_name]], new_data = split_reg$test)
  rmse <- sqrt(mean((preds$.pred - split_reg$test$mpg)^2))
  results <- rbind(results, data.frame(
    Model = model_name,
    RMSE = rmse
  ))
}

results <- results %>% arrange(RMSE)
print(results)
```

## Advanced Features

### Using Preprocessed Data

```{r}
# Preprocess data
processed <- tl_prepare_data(
  split_reg$train,
  mpg ~ .,
  scale_method = "standardize",
  remove_correlated = TRUE,
  correlation_cutoff = 0.9
)
```

```{r}
# Train on preprocessed data
model_processed <- tl_model(processed$data, mpg ~ ., method = "linear")
print(model_processed)
```

### Formula Variations

```{r}
# Interaction terms
model_interact <- tl_model(split_reg$train, mpg ~ wt * hp, method = "linear")

# Polynomial terms using I()
model_poly_manual <- tl_model(split_reg$train, mpg ~ wt + I(wt^2), method = "linear")

# Subset of predictors
model_subset <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
```

## Handling Different Data Types

### Categorical Predictors

```{r}
# Create dataset with categorical variables
mtcars_cat <- mtcars %>%
  mutate(
    cyl = as.factor(cyl),
    gear = as.factor(gear),
    am = as.factor(am)
  )

split_cat <- tl_split(mtcars_cat, prop = 0.7, seed = 123)

# Model with categorical predictors
model_cat <- tl_model(split_cat$train, mpg ~ ., method = "forest")
print(model_cat)
```

### Missing Values

```{r}
# Create data with missing values
mtcars_missing <- mtcars
mtcars_missing[sample(1:nrow(mtcars_missing), 5), "hp"] <- NA
mtcars_missing[sample(1:nrow(mtcars_missing), 3), "wt"] <- NA

# Preprocess to handle missing values
processed_missing <- tl_prepare_data(
  mtcars_missing,
  mpg ~ .,
  impute_method = "mean",
  scale_method = "standardize"
)

# Train model
model_imputed <- tl_model(processed_missing$data, mpg ~ ., method = "linear")
```

## Best Practices

1. **Always split your data** before training to properly evaluate performance
2. **Use stratified splitting** for classification to maintain class proportions
3. **Preprocess your data** for better model performance
4. **Compare multiple models** to find the best approach
5. **Consider regularization** when dealing with many predictors
6. **Use appropriate metrics**: accuracy for classification, RMSE/MAE for regression (see the sketch below)
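To illustrate the last point using only base R and objects created earlier in this vignette: MAE is often reported alongside RMSE for regression, and a confusion matrix gives more detail than plain accuracy for classification.

```{r}
# MAE for the random forest regression model
mae_rf <- mean(abs(preds_rf$.pred - split_reg$test$mpg))
cat("Random Forest MAE:", round(mae_rf, 2), "\n")

# Confusion matrix for the multi-class random forest:
# rows are predicted classes, columns are observed classes
table(predicted = preds_forest$.pred, observed = split_multi$test$Species)
```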
## Summary

tidylearn provides a unified interface for supervised learning:

- **Classification**: Logistic regression, decision trees, random forests, SVM, etc.
- **Regression**: Linear, polynomial, random forests, regularized methods
- **Preprocessing**: Integrated data preparation tools
- **Consistent API**: Same function (`tl_model()`) for all methods
- **Tidy Output**: Easy-to-use predictions and model objects

```{r}
# Complete workflow example
final_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
final_prep <- tl_prepare_data(final_split$train, Species ~ ., scale_method = "standardize")
final_model <- tl_model(final_prep$data, Species ~ ., method = "forest")
final_preds <- predict(final_model, new_data = final_split$test)

# Evaluate
accuracy <- mean(final_preds$.pred == final_split$test$Species)
cat("Test Accuracy:", round(accuracy * 100, 1), "%\n")
```
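As noted in the introduction, the raw model object is available via `model$fit`. The sketch below assumes the `forest` method wraps `randomForest::randomForest()`, as the package list suggests; the exact class of the fit may differ, so treat this as illustrative rather than definitive:

```{r, eval=FALSE}
# Drop down to the wrapped model object for package-specific tools
raw_fit <- final_model$fit
class(raw_fit)

# If raw_fit is a randomForest object, variable importance scores
# are available directly from the randomForest package
randomForest::importance(raw_fit)
```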