---
title: "Unsupervised Learning with tidylearn"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Unsupervised Learning with tidylearn}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
```

```{r setup}
library(tidylearn)
library(dplyr)
library(ggplot2)
```

## Introduction

This vignette explores unsupervised learning in tidylearn. All methods wrap
established R packages - the algorithms are unchanged, tidylearn simply provides
a consistent interface and tidy output.

**Wrapped packages include:**

- stats (`prcomp()`, `kmeans()`, `hclust()`, `cmdscale()`)
- cluster (`pam()`, `clara()`)
- dbscan for density-based clustering
- MASS (`isoMDS()`, `sammon()`)
- smacof for MDS algorithms

Access raw model objects via `model$fit` for package-specific functionality.

## Dimensionality Reduction

Dimensionality reduction techniques help visualize high-dimensional data and extract key patterns.

### Principal Component Analysis (PCA)

```{r}
# Perform PCA on iris data (excluding species)
model_pca <- tl_model(iris[, 1:4], method = "pca")
print(model_pca)
```

```{r}
# Extract variance explained
variance_explained <- model_pca$fit$variance_explained
print(variance_explained)
```

```{r}
# Cumulative variance explained
cumsum(variance_explained$prop_variance)
```

```{r}
# Transform data to principal components
pca_scores <- predict(model_pca)
head(pca_scores)
```

```{r}
# Visualize first two components
pca_plot_data <- pca_scores %>%
  mutate(Species = iris$Species)

ggplot(pca_plot_data, aes(x = PC1, y = PC2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "PCA of Iris Dataset",
    x = paste0("PC1 (", round(variance_explained$prop_variance[1] * 100, 1), "%)"),
    y = paste0("PC2 (", round(variance_explained$prop_variance[2] * 100, 1), "%)")
  ) +
  theme_minimal()
```

```{r}
# Examine loadings (variable contributions)
loadings <- model_pca$fit$loadings
print(loadings)
```

### Multidimensional Scaling (MDS)

```{r}
# Perform MDS
model_mds <- tl_model(iris[, 1:4], method = "mds", k = 2)
print(model_mds)
```

```{r}
# Extract MDS coordinates
mds_points <- predict(model_mds)
head(mds_points)
```

```{r}
# Visualize MDS
mds_plot_data <- mds_points %>%
  mutate(Species = iris$Species)

ggplot(mds_plot_data, aes(x = Dim1, y = Dim2, color = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "MDS of Iris Dataset") +
  theme_minimal()
```

## Clustering

Clustering algorithms group similar observations together without using labels.

### K-means Clustering

```{r}
# Perform k-means with k=3
model_kmeans <- tl_model(iris[, 1:4], method = "kmeans", k = 3)
print(model_kmeans)
```

```{r}
# Extract cluster assignments
clusters <- model_kmeans$fit$clusters
head(clusters)
```

```{r}
# Compare clusters with actual species
table(Cluster = clusters$cluster, Species = iris$Species)
```

```{r}
# Visualize clusters using PCA
cluster_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters$cluster),
    Species = iris$Species
  )

ggplot(cluster_viz, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "K-means Clusters vs True Species") +
  theme_minimal()
```

```{r}
# Access cluster centers
centers <- model_kmeans$fit$centers
print(centers)
```

### PAM (K-medoids)

PAM is more robust to outliers than k-means:

```{r, eval=FALSE}
# Perform PAM clustering
model_pam <- tl_model(iris[, 1:4], method = "pam", k = 3)
print(model_pam)

# Extract clusters
clusters_pam <- model_pam$fit$clusters
table(Cluster = clusters_pam$cluster, Species = iris$Species)
```

### Hierarchical Clustering

```{r}
# Perform hierarchical clustering
model_hclust <- tl_model(iris[, 1:4], method = "hclust")
print(model_hclust)
```

```{r}
# Plot dendrogram
plot(model_hclust$fit$model, labels = FALSE, main = "Hierarchical Clustering of Iris")
```

```{r}
# Cut tree to get clusters
k <- 3
clusters_hc <- cutree(model_hclust$fit$model, k = k)
table(Cluster = clusters_hc, Species = iris$Species)
```

```{r}
# Visualize hierarchical clusters
hc_viz <- pca_scores %>%
  mutate(
    Cluster = as.factor(clusters_hc),
    Species = iris$Species
  )

ggplot(hc_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Hierarchical Clustering Results") +
  theme_minimal()
```

### DBSCAN (Density-Based Clustering)

DBSCAN can find arbitrarily shaped clusters and identify outliers:

```{r, eval=FALSE}
# Perform DBSCAN
model_dbscan <- tl_model(iris[, 1:4], method = "dbscan", eps = 0.5, minPts = 5)
print(model_dbscan)

# Extract clusters (0 = noise/outliers)
clusters_dbscan <- model_dbscan$fit$clusters
table(clusters_dbscan$cluster)

# Compare with species
table(Cluster = clusters_dbscan$cluster, Species = iris$Species)
```

### CLARA (for Large Datasets)

CLARA is efficient for large datasets:

```{r, eval=FALSE}
# Create larger dataset
large_data <- iris[rep(1:nrow(iris), 10), 1:4]

# Perform CLARA
model_clara <- tl_model(large_data, method = "clara", k = 3, samples = 5)
print(model_clara)

# Extract clusters
clusters_clara <- model_clara$fit$clusters
```

## Choosing the Number of Clusters

### Elbow Method

```{r}
# Try different values of k
k_values <- 2:8
within_ss <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  k <- k_values[i]
  model <- tl_model(iris[, 1:4], method = "kmeans", k = k)
  within_ss[i] <- model$fit$model$tot.withinss
}

# Plot elbow curve
elbow_data <- data.frame(k = k_values, within_ss = within_ss)

ggplot(elbow_data, aes(x = k, y = within_ss)) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal k",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal()
```

## Predicting on New Data

### Clustering New Observations

```{r}
# Train clustering model
model_train <- tl_model(iris[1:100, 1:4], method = "kmeans", k = 3)

# Predict cluster assignments for new data
new_data <- iris[101:150, 1:4]
new_clusters <- predict(model_train, new_data = new_data)

head(new_clusters)
```

### Transforming New Data with PCA

```{r}
# Train PCA model
pca_train <- tl_model(iris[1:100, 1:4], method = "pca")

# Transform new data
new_pca <- predict(pca_train, new_data = new_data)
head(new_pca)
```

## Combining Multiple Techniques

### PCA followed by Clustering

```{r}
# Reduce dimensions with PCA
pca_model <- tl_model(iris[, 1:4], method = "pca")
pca_data <- predict(pca_model)

# Select first 2 components
pca_reduced <- pca_data %>% select(PC1, PC2)

# Cluster in reduced space
kmeans_pca <- tl_model(pca_reduced, method = "kmeans", k = 3)
clusters_pca <- kmeans_pca$fit$clusters

# Visualize
viz_combined <- pca_data %>%
  mutate(
    Cluster = as.factor(clusters_pca$cluster),
    Species = iris$Species
  )

ggplot(viz_combined, aes(x = PC1, y = PC2, color = Cluster, shape = Species)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Clustering in PCA Space") +
  theme_minimal()
```

## Practical Applications

### Customer Segmentation

```{r}
# Simulate customer data
set.seed(42)
customers <- data.frame(
  age = rnorm(200, 40, 15),
  income = rnorm(200, 50000, 20000),
  spending_score = rnorm(200, 50, 25)
)

# Standardize features
customers_scaled <- scale(customers) %>% as.data.frame()

# Cluster customers
customer_segments <- tl_model(customers_scaled, method = "kmeans", k = 4)
customers$segment <- customer_segments$fit$clusters$cluster

# Visualize segments
ggplot(customers, aes(x = income, y = spending_score, color = as.factor(segment))) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Customer Segmentation",
    color = "Segment"
  ) +
  theme_minimal()
```

### Feature Extraction

```{r}
# Use PCA for feature extraction
pca_features <- tl_model(mtcars, method = "pca")

# Keep components explaining 90% of variance
var_exp <- pca_features$fit$variance_explained
cumulative_var <- cumsum(var_exp$prop_variance)
n_components <- which(cumulative_var >= 0.90)[1]

cat("Components needed for 90% variance:", n_components, "\n")
cat("Original features:", ncol(mtcars), "\n")
cat("Dimension reduction:", round((1 - n_components/ncol(mtcars)) * 100, 1), "%\n")
```

## Best Practices

1. **Scale your data** before clustering or PCA for fair feature comparison
2. **Determine optimal k** using elbow method or silhouette analysis
3. **Try multiple methods** - different algorithms work better for different data
4. **Visualize results** to understand cluster structure
5. **Consider domain knowledge** when interpreting clusters
6. **Use PCA for visualization** when data has more than 2-3 dimensions

## Summary

tidylearn provides comprehensive unsupervised learning tools:

- **Dimensionality Reduction**: PCA, MDS for visualization and feature extraction
- **Clustering**: K-means, PAM, CLARA, hierarchical, DBSCAN
- **Unified API**: Same `tl_model()` function for all methods
- **Prediction Support**: Transform new data with learned patterns
- **Flexible**: Works with different data types and sizes

```{r}
# Complete unsupervised workflow
workflow_data <- iris[, 1:4]

# 1. Reduce dimensions
pca_final <- tl_model(workflow_data, method = "pca")

# 2. Cluster in reduced space
pca_coords <- predict(pca_final) %>% select(PC1, PC2)
clusters_final <- tl_model(pca_coords, method = "kmeans", k = 3)

# 3. Visualize
final_viz <- pca_coords %>%
  mutate(
    Cluster = as.factor(clusters_final$fit$clusters$cluster),
    Species = iris$Species
  )

ggplot(final_viz, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Complete Unsupervised Workflow") +
  theme_minimal()
```