## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 6, fig.height = 5, fig.align = "center" ) set.seed(42) ## ----library, warning=FALSE--------------------------------------------------- library(yaap) library(generics) ## ----load-toy----------------------------------------------------------------- toy <- read.csv(system.file("extdata", "toy.csv", package = "yaap")) head(toy, 3) dim(toy) ## ----plot-toy, fig.cap = "The toy dataset: three clusters near the corners of a triangle."---- plot(toy, asp = 1., frame.plot = FALSE, axes = FALSE, main = "Toy Dataset", xlab = "", ylab = "", pch = 19, cex = 0.6, col = "gray50") ## ----fit-toy------------------------------------------------------------------ fit <- run_aa(toy, K = 3, nrep = 3) fit ## ----inspect------------------------------------------------------------------ coordinates(fit) ## ----reconstruction, eval=FALSE----------------------------------------------- # X_hat <- compositions(fit) %*% coordinates(fit) ## ----identity-check, fig.cap="Reconstruction of toy dataset. Based on figure from `archetypes` package vignette."---- X <- as.matrix(toy) X_hat <- fitted(fit) R <- resid(fit) # residuals: X - X_hat (rss <- norm(R, "F")^2) # residual sum of squares 1 - rss / norm(X, "F")^2 # R^2 plot(X, type = "n", asp = 1., frame.plot = FALSE, axes = FALSE, main = "Toy Dataset Reconstruction", xlab = "", ylab = "") ix <- rowSums(R > 0.5) > 0 # highlight large residuals segments(X[, 1], X[, 2], X_hat[, 1], X_hat[, 2], col = "firebrick", lwd = 0.5) points(X , pch = 20, cex = 0.8, col = "gray50") points(X_hat, pch = 1, cex = 0.7, col = "firebrick") legend("topleft", legend = c("Data", "Reconstruction", "Residuals"), bty = "n", col = c("gray50", "firebrick", "firebrick"), pch = c(20, 1, NA), lwd = c(NA, NA, 0.5)) ## ----broom-tidy-coordinates--------------------------------------------------- tidy(fit) ## ----broom-tidy-compositions-------------------------------------------------- fit |> tidy(matrix = "compositions") |> head() ## ----broom-glance------------------------------------------------------------- glance(fit) ## ----broom-augment------------------------------------------------------------ aug <- augment(fit, data = toy) head(aug) comp_cols <- paste0(".", anames(fit)) range(rowSums(as.matrix(aug[, comp_cols]))) # all should be 1 ## ----plot-coordinates, fig.cap = "Data and estimated archetypes. The triangle connects the three archetypes."---- plot(fit, what = "coordinates", main = "Archetype positions in feature space", xlab = "", ylab = "", asp = 1, frame.plot = FALSE, axes = FALSE, pch = 17, cex = 1.5, col = "firebrick", args.data.scatter = list(pch = 19, cex = 0.5, col = "gray50")) ## ----ggplot-example, eval=FALSE----------------------------------------------- # library(ggplot2) # # plot(fit, what = "coordinates", plot = FALSE) |> # ggplot(aes(x, y)) + # geom_point(aes(color = archetype)) ## ----fit-iris----------------------------------------------------------------- # 'Species' (the response) is discarded; the four measurements are used. fit_iris <- run_aa(Species ~ ., data = iris, K = 3, scale = TRUE, tol = 1e-4) fit_iris ## ----iris-coordinates, fig.cap = "PCA projection of the iris data and its three archetypes (triangles)."---- pal_sp <- c(setosa = "#E69F00", versicolor = "#56B4E9", virginica = "#009E73") iris_col <- pal_sp[as.character(iris$Species)] plot(fit_iris, what = "coordinates", projection = "pca", frame.plot = FALSE, main = "PCA projection of iris data and archetypes", col = "firebrick", pch = 17, cex = 1.5, args.data.scatter = list(col = iris_col, pch = 19, cex = 0.6)) ## ----iris-species-archetypes-match-------------------------------------------- B <- coef(fit_iris) # K x N # Sum coefficients within each species (rows = species, cols = archetypes) coef_by_species <- aggregate(t(B), by = iris["Species"], FUN = sum) knitr::kable(coef_by_species, digits = 3) ## ----iris-rename-archetypes--------------------------------------------------- # Concat all species contributing to an archetype coef_mat <- as.matrix(coef_by_species[, -1]) # drop species column rownames(coef_mat) <- coef_by_species$Species species_contrib <- which(coef_mat > 0.1, arr.ind = TRUE) species_contrib <- aggregate( row ~ col, data = species_contrib, FUN = function(ix) { species <- rownames(coef_mat)[ix] species <- substr(species, 1, 3) # abbreviate species names paste(species, collapse = ".") } ) anames(fit_iris) <- species_contrib$row anames(fit_iris) ## ----iris-palette, include = FALSE-------------------------------------------- pal_aa <- c(set = "#E69F00", set.ver = "#E7E300", vir = "#009E73") pal_aa <- pal_aa[anames(fit_iris)] # reorder to match archetype order ## ----iris-profiles, fig.cap = "Archetype feature profiles. Each group of bars is one feature; colors distinguish archetypes."---- plot(fit_iris, what = "profiles", horiz = TRUE, names.arg = sub("\\.", "\n", colnames(iris[, 1:4])), las = 1, col = pal_aa, xlab = "Feature value (original scale)", ylab = "", main = "Archetype feature profiles") ## ----iris-simplex, fig.cap = "Ternary plot of iris compositions, colored by species. Each point is a sample placed by its three archetype weights."---- plot(fit_iris, what = "simplex", col = pal_sp[as.character(iris$Species)], pch = 19, pca = TRUE, col.pca = "black", robust = FALSE) legend("topright", legend = names(pal_sp), col = pal_sp, pch = 19, bty = "n") ## ----iris-compositions, fig.cap = "Composition barplot: each bar is one iris sample. Single-color bars are near-pure specimens; blended bars sit between archetypes."---- plot(fit_iris, what = "compositions", main = "Archetype compositions across iris samples", cluster_rows = "PC1", cluster_cols = FALSE, col = pal_aa ) ## ----predict------------------------------------------------------------------ new_obs <- matrix( c(5.1, 3.5, 1.4, 0.2, # setosa-like 6.7, 3.0, 5.2, 2.3), # virginica-like nrow = 2, byrow = TRUE, dimnames = list(c("obs1", "obs2"), colnames(iris[, 1:4])) ) print(new_obs) predict(fit_iris, newdata = new_obs) predict(fit_iris, newdata = new_obs, type = "compositions") ## ----k-selection-fit, warning=FALSE------------------------------------------- fit_path <- archetypes_path( toy, K = 6, init = "random", max_iter = 100, nrep = 5 ) ## ----k-selection-plot, results = "hold", fig.width = 9, fig.height = 3.5, out.width = "100%", fig.cap = "RSS, unexplained variance, and adapted AIC across candidate values of K."---- op <- par(mfrow = c(1, 3), mar = c(4, 4, 2.5, 0.5), bty = "n") selected_K <- 3 fit_best <- fit_path[[selected_K]] # here equivalent to fit_path[["K3"]] screeplot( fit_path, y = "loss", col = "steelblue4", frame.plot = FALSE, ylab = "Reconstruction loss", main = "RSS" ) abline(v = selected_K, lty = 2, col = "firebrick") screeplot( fit_path, y = function(fit) 1 - utils::tail(fit$loss$r2, 1L), col = "steelblue4", frame.plot = FALSE, ylab = expression(1 - R^2), main = "Unexplained Variance" ) abline(v = selected_K, lty = 2, col = "firebrick") screeplot( fit_path, col = "steelblue4", frame.plot = FALSE, ylab = "Adapted AIC", main = "Adapted AIC" ) abline(v = selected_K, lty = 2, col = "firebrick") par(op) ## ----furthest-sum-experiment-------------------------------------------------- runs <- do.call(rbind, lapply(1:5, function(k) { fit_fs <- run_aa(iris[, 1:4], K = 3, nrep = 1) fit_uni <- run_aa(iris[, 1:4], K = 3, nrep = 1, init = "random") data.frame( run = k, fs_start = fit_fs$loss$loss[1], fs_end = tail(fit_fs$loss$loss, 1), uni_start = fit_uni$loss$loss[1], uni_end = tail(fit_uni$loss$loss, 1) ) })) colnames(runs) <- c("run", "furthest_sum start", "furthest_sum end", "random start", "random end") knitr::kable(runs, digits = 1) ## ----check-convergence, fig.cap = "RSS across iterations for the iris model."---- fit_iris$converged tail(fit_iris$loss, 1) plot(fit_iris, what = "loss", frame.plot = FALSE, type = "b", main = "Convergence of the iris model") ## ----delta-data-fit, warning=FALSE, fig.cap = "Truncated simplex: Larger delta lets archetypes escape the data hull and approach the true positions."---- K_d <- 3 # number of archetypes (simplex vertices) M_d <- 2 # number of features (2D for easy visualization) N_init <- 1000 # initial number of samples before truncation thresh <- 0.8 # truncation threshold on composition weights # True archetypes: equilateral triangle A_true <- matrix( c(cos(seq(0, 2*pi*(K_d-1)/K_d, length.out = K_d)), sin(seq(0, 2*pi*(K_d-1)/K_d, length.out = K_d))), nrow = K_d, ncol = M_d ) # Dirichlet-like mixing weights; remove observations near any vertex S_raw <- -log(matrix(runif(K_d * N_init), nrow = N_init, ncol = K_d)) S_raw <- S_raw / rowSums(S_raw) S_trunc <- S_raw[rowSums(S_raw > thresh) == 0, ] X_delta <- S_trunc %*% A_true print(nrow(X_delta)) # observations after truncation # Fit models with different delta values delta_vals <- c(0, 0.2, 0.35) res_delta <- lapply( delta_vals, function(d) archetypes_pgd(X_delta, K = K_d, delta = d) ) # Plot all models together with the true archetypes ix <- c(1:3, 1) # close the triangle cols <- c("steelblue", "darkorange", "firebrick") plot(A_true[ix, 1], A_true[ix, 2], type = "l", col = "black", lwd = 2, lty = 3, frame.plot = FALSE, xlab = "", ylab = "", xaxt = "n", yaxt = "n", asp = 1, main = "Effect of delta on recovered archetypes") points(X_delta, pch = 19, cex = 0.4, col = "gray50") for (i in seq_along(res_delta)) { A <- coordinates(res_delta[[i]]) lines(A[ix, 1], A[ix, 2], col = cols[i], lwd = 2) } legend("topright", legend = c("True simplex", paste0("\u03b4 = ", delta_vals)), col = c("black", cols), lty = c(3, 1, 1, 1), lwd = 2, bty = "n") ## ----robust-data-------------------------------------------------------------- # Append five outlier observations to iris N_out <- 5 displacement <- c(2, 2, 0, 0) # displace in the first and second features add_outliers <- function(X, N, displacement) { x_mean <- colMeans(X) + displacement # place outliers at a distance from the mean x_sd <- apply(X, 2, sd) * .75 X_out <- rnorm(N * ncol(X), x_mean, x_sd) # recycle x_mean, x_sd rbind(X, matrix(pmax(X_out, .1), nrow = N, byrow = TRUE)) } X <- iris[, 1:4] |> as.matrix() |> add_outliers(N = N_out, displacement = displacement) set.seed(123) # fix starting positions fit_std <- run_aa(X, K = 3, robust = FALSE, tol = 1e-4, init = "random", nrep = 5) set.seed(123) # same starting positions fit_rob <- run_aa(X, K = 3, robust = TRUE, tol = 1e-4, init = "random", nrep = 5) ## ----robust-compare, fig.cap = "PCA projection of iris with five added outliers (crosses). Standard AA (red) is distorted toward the outlier cluster; robust AA (blue) closely recovers the reference triangle fitted on the clean data (dashed black)."---- # Project all archetypes to PC space of data pca <- prcomp(X, scale. = TRUE) A_std <- predict(pca, coordinates(fit_std)) A_rob <- predict(pca, coordinates(fit_rob)) A_ori <- predict(pca, coordinates(fit_iris)) # reference: fitted on clean iris # Plot data points plot(pca$x, main = "PCA projection of iris data with outliers", col = "gray60", pch = 19, cex = 0.5, frame.plot = FALSE, asp = 1) # Add outliers points(pca$x[nrow(iris) + 1:N_out, ], col = "black", pch = 8) # Add Archetypes ix <- c(1:3, 1) lines(A_ori[ix, ], col = "black", lwd = 1.5, lty = 2) lines(A_std[ix, ], col = "firebrick", lwd = 2) points(A_std, col = "firebrick", pch = 17, cex = 1.5) lines(A_rob[ix, ], col = "steelblue", lwd = 2) points(A_rob, col = "steelblue", pch = 17, cex = 1.5) # Add legend legend("topright", legend = c("Data", "Outliers", "Reference AA", "Standard AA", "Robust AA"), col = c("gray60", "black", "black", "firebrick", "steelblue"), pch = c(19, 8, NA, 17, 17), lty = c(NA, NA, 2, 1, 1), lwd = 2, bty = "n") ## ----missing-data, warning=FALSE---------------------------------------------- X <- as.matrix(iris[, 1:4]) # Introduce ~10 % missing at random missing_rate <- 0.1 n_elem <- length(X) # number of elements X_miss <- X idx_miss <- sample(n_elem, size = round(missing_rate * n_elem)) X_miss[idx_miss] <- NA set.seed(123) fit_miss <- run_aa(X_miss, K = 3, nrep = 10, init = "random") # missing = TRUE auto-detected fit_miss ## ----missing-fitted----------------------------------------------------------- X_hat_miss <- fitted(fit_miss) any(is.na(X_hat_miss)) # FALSE = all positions reconstructed # compare reconstructed vs original values at missing positions cor(X_hat_miss[idx_miss], X[idx_miss]) ## ----benchmark, warning=FALSE------------------------------------------------- toy_mat <- as.matrix(toy) nrep_bench <- 10 seeds <- 2046 + seq_len(nrep_bench) run_single <- function(method, seed, ...) { set.seed(seed) fit <- suppressWarnings(run_aa( toy_mat, K = 3, nrep = 1, method = method, init = "random", ... )) c(n_iter = nrow(fit$loss), final_loss = norm(resid(fit), "F")) } # PGD benchmark t_pgd <- system.time( res_pgd <- vapply(seeds, run_single, numeric(2), method = "pgd") )[["elapsed"]] n_pgd <- round(median(res_pgd["n_iter", ])) # NNLS benchmark t_nnls <- system.time( res_nnls <- vapply(seeds, run_single, numeric(2), method = "nnls") )[["elapsed"]] n_nnls <- round(median(res_nnls["n_iter", ])) # Run by run comparison win_pgd <- sum(res_pgd["final_loss", ] < res_nnls["final_loss", ]) loss_diff <- res_pgd["final_loss", ] - res_nnls["final_loss", ] diff_pgd <- median(loss_diff / res_nnls["final_loss", ]) diff_nnls <- median(loss_diff / res_pgd["final_loss", ]) bench <- data.frame( Method = c("PGD", "NNLS"), `Median iters` = c(n_pgd, n_nnls), `ms / iter` = 1000 * c(t_pgd / n_pgd, t_nnls / n_nnls), `loss change (%)`= 100 * c(diff_pgd, -diff_nnls), `Wins` = c(win_pgd, nrep_bench - win_pgd), check.names = FALSE ) knitr::kable(bench, digits = 2) ## ----loss-df------------------------------------------------------------------ head(fit_iris$loss, 6) ## ----session-info------------------------------------------------------------- sessionInfo()