--- title: "Unsupervised Learning with tidylearn" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Unsupervised Learning with tidylearn} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) ``` ```{r setup} library(tidylearn) library(dplyr) library(ggplot2) ``` ## Introduction This vignette explores unsupervised learning in tidylearn. All methods wrap established R packages - the algorithms are unchanged, tidylearn simply provides a consistent interface and tidy output. **Wrapped packages include:** - stats (`prcomp()`, `kmeans()`, `hclust()`, `cmdscale()`) - cluster (`pam()`, `clara()`) - dbscan for density-based clustering - MASS (`isoMDS()`, `sammon()`) - smacof for MDS algorithms Access raw model objects via `model$fit` for package-specific functionality. ## Dimensionality Reduction Dimensionality reduction techniques help visualize high-dimensional data and extract key patterns. ### Principal Component Analysis (PCA) ```{r} # Perform PCA on iris data (excluding species) model_pca <- tl_model(iris[, 1:4], method = "pca") print(model_pca) ``` ```{r} # Extract variance explained variance_explained <- model_pca$fit$variance_explained print(variance_explained) ``` ```{r} # Cumulative variance explained cumsum(variance_explained$prop_variance) ``` ```{r} # Transform data to principal components pca_scores <- predict(model_pca) head(pca_scores) ``` ```{r} # Visualize first two components pca_plot_data <- pca_scores %>% mutate(Species = iris$Species) ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = Species)) + geom_point(size = 3, alpha = 0.7) + labs( title = "PCA of Iris Dataset", x = paste0("PC1 (", round(variance_explained$prop_variance[1] * 100, 1), "%)"), y = paste0("PC2 (", round(variance_explained$prop_variance[2] * 100, 1), "%)") ) + theme_minimal() ``` ```{r} # Examine loadings (variable contributions) loadings <- model_pca$fit$loadings print(loadings) ``` ### Multidimensional Scaling (MDS) ```{r} # Perform MDS model_mds <- tl_model(iris[, 1:4], method = "mds", k = 2) print(model_mds) ``` ```{r} # Extract MDS coordinates mds_points <- predict(model_mds) head(mds_points) ``` ```{r} # Visualize MDS mds_plot_data <- mds_points %>% mutate(Species = iris$Species) ggplot(mds_plot_data, aes(x = Dim1, y = Dim2, color = Species)) + geom_point(size = 3, alpha = 0.7) + labs(title = "MDS of Iris Dataset") + theme_minimal() ``` ## Clustering Clustering algorithms group similar observations together without using labels. 
### Multidimensional Scaling (MDS)

```{r}
# Perform MDS
model_mds <- tl_model(iris[, 1:4], method = "mds", k = 2)
print(model_mds)
```

```{r}
# Extract MDS coordinates
mds_points <- predict(model_mds)
head(mds_points)
```

```{r}
# Visualize MDS
mds_plot_data <- mds_points %>%
  mutate(Species = iris$Species)

ggplot(mds_plot_data, aes(x = Dim1, y = Dim2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "MDS of Iris Dataset") +
  theme_minimal()
```

## Clustering

Clustering algorithms group similar observations together without using labels.

### K-means Clustering

```{r}
# Perform k-means with k=3
model_kmeans <- tl_model(iris[, 1:4], method = "kmeans", k = 3)
print(model_kmeans)
```

```{r}
# Extract cluster assignments
clusters <- model_kmeans$fit$clusters
head(clusters)
```

```{r}
# Compare clusters with actual species
table(Cluster = clusters$cluster, Species = iris$Species)
```

```{r}
# Visualize clusters using PCA
cluster_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters$cluster),
    Species = iris$Species
  )

ggplot(cluster_viz, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "K-means Clusters vs True Species") +
  theme_minimal()
```

```{r}
# Access cluster centers
centers <- model_kmeans$fit$centers
print(centers)
```

### PAM (K-medoids)

PAM (Partitioning Around Medoids) uses actual observations as cluster centers, which makes it more robust to outliers than k-means:

```{r, eval=FALSE}
# Perform PAM clustering
model_pam <- tl_model(iris[, 1:4], method = "pam", k = 3)
print(model_pam)

# Extract clusters
clusters_pam <- model_pam$fit$clusters
table(Cluster = clusters_pam$cluster, Species = iris$Species)
```

### Hierarchical Clustering

```{r}
# Perform hierarchical clustering
model_hclust <- tl_model(iris[, 1:4], method = "hclust")
print(model_hclust)
```

```{r}
# Plot dendrogram
plot(model_hclust$fit$model, labels = FALSE, main = "Hierarchical Clustering of Iris")
```

```{r}
# Cut tree to get clusters
k <- 3
clusters_hc <- cutree(model_hclust$fit$model, k = k)
table(Cluster = clusters_hc, Species = iris$Species)
```

```{r}
# Visualize hierarchical clusters
hc_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters_hc),
    Species = iris$Species
  )

ggplot(hc_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Hierarchical Clustering Results") +
  theme_minimal()
```

### DBSCAN (Density-Based Clustering)

DBSCAN can find arbitrarily shaped clusters and identifies outliers as noise:

```{r, eval=FALSE}
# Perform DBSCAN
model_dbscan <- tl_model(iris[, 1:4], method = "dbscan", eps = 0.5, minPts = 5)
print(model_dbscan)

# Extract clusters (0 = noise/outliers)
clusters_dbscan <- model_dbscan$fit$clusters
table(clusters_dbscan$cluster)

# Compare with species
table(Cluster = clusters_dbscan$cluster, Species = iris$Species)
```

### CLARA (for Large Datasets)

CLARA (Clustering LARge Applications) applies PAM to subsamples of the data, which makes it practical for large datasets:

```{r, eval=FALSE}
# Create larger dataset
large_data <- iris[rep(1:nrow(iris), 10), 1:4]

# Perform CLARA
model_clara <- tl_model(large_data, method = "clara", k = 3, samples = 5)
print(model_clara)

# Extract clusters
clusters_clara <- model_clara$fit$clusters
```

## Choosing the Number of Clusters

### Elbow Method

```{r}
# Try different values of k
k_values <- 2:8
within_ss <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  within_ss[i] <- model$fit$model$tot.withinss
}

# Plot elbow curve
elbow_data <- data.frame(k = k_values, within_ss = within_ss)

ggplot(elbow_data, aes(x = k, y = within_ss)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal k",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()
```
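### Silhouette Analysis

The elbow is often ambiguous, so the average silhouette width (higher is better) is a useful complementary criterion. The sketch below calls `cluster::silhouette()` directly rather than assuming a tidylearn helper exists, and reuses `k_values` from the elbow chunk and the `$fit$clusters` accessor shown above:

```{r, eval=FALSE}
library(cluster)

# Average silhouette width for each candidate k
dist_iris <- dist(iris[, 1:4])
avg_sil <- sapply(k_values, function(k) {
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  # $fit$clusters$cluster follows the accessor pattern used in this vignette
  sil <- silhouette(model$fit$clusters$cluster, dist_iris)
  mean(sil[, "sil_width"])
})

data.frame(k = k_values, avg_sil = avg_sil)

# k with the largest average silhouette width
k_values[which.max(avg_sil)]
```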
## Predicting on New Data

### Clustering New Observations

```{r}
# Train clustering model
model_train <- tl_model(iris[1:100, 1:4], method = "kmeans", k = 3)

# Predict cluster assignments for new data
new_data <- iris[101:150, 1:4]
new_clusters <- predict(model_train, new_data = new_data)
head(new_clusters)
```

### Transforming New Data with PCA

```{r}
# Train PCA model
pca_train <- tl_model(iris[1:100, 1:4], method = "pca")

# Transform new data
new_pca <- predict(pca_train, new_data = new_data)
head(new_pca)
```

## Combining Multiple Techniques

### PCA followed by Clustering

```{r}
# Reduce dimensions with PCA
pca_model <- tl_model(iris[, 1:4], method = "pca")
pca_data <- predict(pca_model)

# Select first 2 components
pca_reduced <- pca_data %>% select(PC1, PC2)

# Cluster in reduced space
kmeans_pca <- tl_model(pca_reduced, method = "kmeans", k = 3)
clusters_pca <- kmeans_pca$fit$clusters

# Visualize
viz_combined <- pca_data %>%
  mutate(
    Cluster = as.factor(clusters_pca$cluster),
    Species = iris$Species
  )

ggplot(viz_combined, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Clustering in PCA Space") +
  theme_minimal()
```

## Practical Applications

### Customer Segmentation

```{r}
# Simulate customer data
set.seed(42)
customers <- data.frame(
  age = rnorm(200, 40, 15),
  income = rnorm(200, 50000, 20000),
  spending_score = rnorm(200, 50, 25)
)

# Standardize features
customers_scaled <- scale(customers) %>% as.data.frame()

# Cluster customers
customer_segments <- tl_model(customers_scaled, method = "kmeans", k = 4)
customers$segment <- customer_segments$fit$clusters$cluster

# Visualize segments
ggplot(customers, aes(x = income, y = spending_score, color = as.factor(segment))) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Customer Segmentation",
    color = "Segment"
  ) +
  theme_minimal()
```

### Feature Extraction

```{r}
# Use PCA for feature extraction
pca_features <- tl_model(mtcars, method = "pca")

# Keep components explaining 90% of variance
var_exp <- pca_features$fit$variance_explained
cumulative_var <- cumsum(var_exp$prop_variance)
n_components <- which(cumulative_var >= 0.90)[1]

cat("Components needed for 90% variance:", n_components, "\n")
cat("Original features:", ncol(mtcars), "\n")
cat("Dimension reduction:", round((1 - n_components/ncol(mtcars)) * 100, 1), "%\n")
```

## Best Practices

1. **Scale your data** before clustering or PCA for fair feature comparison
2. **Determine optimal k** using elbow method or silhouette analysis
3. **Try multiple methods**: different algorithms work better for different data
4. **Visualize results** to understand cluster structure
5. **Consider domain knowledge** when interpreting clusters
6. **Use PCA for visualization** when data has more than 2-3 dimensions

## Summary

tidylearn provides comprehensive unsupervised learning tools:

- **Dimensionality Reduction**: PCA, MDS for visualization and feature extraction
- **Clustering**: K-means, PAM, CLARA, hierarchical, DBSCAN
- **Unified API**: Same `tl_model()` function for all methods
- **Prediction Support**: Transform new data with learned patterns
- **Flexible**: Works with different data types and sizes

```{r}
# Complete unsupervised workflow
workflow_data <- iris[, 1:4]

# 1. Reduce dimensions
pca_final <- tl_model(workflow_data, method = "pca")

# 2. Cluster in reduced space
pca_coords <- predict(pca_final) %>% select(PC1, PC2)
clusters_final <- tl_model(pca_coords, method = "kmeans", k = 3)

# 3. Visualize
final_viz <- pca_coords %>%
  mutate(
    Cluster = as.factor(clusters_final$fit$clusters$cluster),
    Species = iris$Species
  )

ggplot(final_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Complete Unsupervised Workflow") +
  theme_minimal()
```
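Because iris happens to be labelled, the workflow's clusters can be checked against the species as a final sanity check; with genuinely unlabelled data, internal measures such as the silhouette width above play this role:

```{r}
# Cross-tabulate discovered clusters against the (normally unavailable) labels
table(
  Cluster = clusters_final$fit$clusters$cluster,
  Species = iris$Species
)
```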