## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE ) ## ----load-data---------------------------------------------------------------- library(cclustr) library(mice) if (!requireNamespace("mlbench", quietly = TRUE)) { stop("Package 'mlbench' is required for this vignette. ", "Install it with: install.packages('mlbench')") } data("PimaIndiansDiabetes2", package = "mlbench") # Remove diabetes from the clustering dataset df <- PimaIndiansDiabetes2[, -9] cat("Dimensions:", nrow(df), "x", ncol(df), "\n") cat("\nMissing values per variable:\n") print(colSums(is.na(df))) cat("\nTotal missing values:", sum(is.na(df)), "(", round(mean(is.na(df)) * 100, 1), "%)\n") ## ----imputation--------------------------------------------------------------- imp <- mice(df, m = 20, maxit = 20, method = "pmm", printFlag = FALSE, seed = 202604) # Convert to cclustr standard format mi_pima <- as_mild_list(imp) cat("Number of imputed datasets :", length(mi_pima), "\n") cat("Dimensions of each dataset :", nrow(mi_pima[[1]]), "x", ncol(mi_pima[[1]]), "\n") cat("Any missing values remaining:", any(sapply(mi_pima, anyNA)), "\n") ## ----clustering--------------------------------------------------------------- set.seed(202604) partitions <- cluster_imputations( imp_list = mi_pima, method = "kmeans", k = 2:4, scale_data = "global" ) cat("Available k values:", names(partitions), "\n") cat("Number of partitions per k:", length(partitions$k2), "\n") ## ----consensus---------------------------------------------------------------- consensus_results <- consensus_clustering( partitions = partitions, cluster_method = "ward.D2", consensus_method = "classic" ) cat("Consensus solutions available:", names(consensus_results), "\n") ## ----validation--------------------------------------------------------------- val_table <- validate_clustering( partitions = partitions, consensus_results = consensus_results ) print(val_table) ## ----selection---------------------------------------------------------------- best <- choose_best_clustering( validation_table = val_table, consensus_results = consensus_results, prefer_stability = TRUE, tie_breaker = "silhouette" ) cat("Optimal k selected:", best$best_k, "\n") cat("\nConsensus cluster sizes:\n") print(table(best$best_consensus)) cat("\nWeighted rank scores:\n") print(best$scores_table[, c("k", "pac", "silhouette_mean", "ari_mean_between_imputations", "score")]) ## ----validation-plot, fig.cap = "Validation metrics across k = 2, 3, 4. The optimal value per metric is highlighted in red."---- plot_validation_metrics(best) ## ----heatmap, fig.cap = "Co-assignment matrix for the optimal k. A clear block-diagonal structure indicates a stable consensus."---- plot_consensus_matrix( consensus_result = best, reorder = TRUE, viridis_option = "D", show_labels = FALSE ) ## ----dendrogram, fig.cap = "Consensus dendrogram for the optimal k, derived from the dissimilarity matrix 1 - C."---- plot_consensus_dendrogram( consensus_result = best, rect = TRUE, labels = FALSE ) ## ----cluster-summary---------------------------------------------------------- pima_clustered <- mi_pima[[1]] pima_clustered$cluster <- factor(best$best_consensus) # Mean values per cluster num_vars <- c("pregnant", "glucose", "pressure", "triceps", "insulin", "mass", "pedigree", "age") cat("Mean clinical values by cluster:\n") print( aggregate( pima_clustered[, num_vars], by = list(Cluster = pima_clustered$cluster), FUN = function(x) round(mean(x, na.rm = TRUE), 2) ) ) ## ----boxplots, fig.cap = "Distribution of glucose and BMI by consensus cluster."---- par(mfrow = c(1, 3), mar = c(4, 4, 3, 1)) boxplot( glucose ~ cluster, data = pima_clustered, main = "Plasma Glucose by Cluster", xlab = "Cluster", ylab = "Glucose (mg/dl)", col = c("#4E79A7", "#F28E2B", "#E15759") ) boxplot( mass ~ cluster, data = pima_clustered, main = "Body Mass Index by Cluster", xlab = "Cluster", ylab = "BMI (kg/m²)", col = c("#4E79A7", "#F28E2B", "#E15759") ) boxplot( insulin ~ cluster, data = pima_clustered, main = "Insulin", xlab = "Cluster", ylab = "Insulin (mu U/ml)", col = c("#4E79A7", "#F28E2B", "#E15759") ) par(mfrow = c(1, 1)) ## ----full-pipeline------------------------------------------------------------ set.seed(202604) result <- run_mi_clustering( data = imp, method = "kmeans", k = 2:4, scale_data = "global", consensus_method = "classic", prefer_stability = TRUE, tie_breaker = "silhouette" ) cat("Best k selected:", result$best_k, "\n") cat("\nCluster sizes:\n") print(table(result$best_consensus)) ## ----weighted-ari, eval = requireNamespace("mclust", quietly = TRUE)---------- set.seed(202604) result_w <- run_mi_clustering( data = imp, method = "kmeans", k = 2:4, scale_data = "global", consensus_method = "weighted_ari", prefer_stability = TRUE, tie_breaker = "silhouette" ) cat("Best k (ARI-weighted consensus):", result_w$best_k, "\n") print(table(result_w$best_consensus))