## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse  = TRUE,
  comment   = "#>",
  fig.width = 6,
  fig.height = 5,
  fig.align = "center"
)
set.seed(42)

## ----library, warning=FALSE---------------------------------------------------
library(yaap)
library(generics)

## ----load-toy-----------------------------------------------------------------
toy <- read.csv(system.file("extdata", "toy.csv", package = "yaap"))
head(toy, 3)
dim(toy)

## ----plot-toy, fig.cap = "The toy dataset: three clusters near the corners of a triangle."----
plot(toy, asp = 1., frame.plot = FALSE, axes = FALSE,
     main = "Toy Dataset", xlab = "", ylab = "",
     pch = 19, cex = 0.6, col = "gray50")

## ----fit-toy------------------------------------------------------------------
fit <- run_aa(toy, K = 3, nrep = 3)
fit

## ----inspect------------------------------------------------------------------
coordinates(fit)

## ----reconstruction, eval=FALSE-----------------------------------------------
# X_hat <- compositions(fit) %*% coordinates(fit)

## ----identity-check, fig.cap="Reconstruction of toy dataset. Based on figure from `archetypes` package vignette."----
X <- as.matrix(toy)
X_hat <- fitted(fit)
R <- resid(fit)  # residuals: X - X_hat
(rss <- norm(R, "F")^2)  # residual sum of squares
1 - rss / norm(X, "F")^2  # R^2

plot(X, type = "n", asp = 1., frame.plot = FALSE, axes = FALSE,
     main = "Toy Dataset Reconstruction", xlab = "", ylab = "")
ix <- rowSums(R > 0.5) > 0  # highlight large residuals
segments(X[, 1], X[, 2], X_hat[, 1], X_hat[, 2], col = "firebrick", lwd = 0.5)
points(X    , pch = 20, cex = 0.8, col = "gray50")
points(X_hat, pch = 1,  cex = 0.7, col = "firebrick")
legend("topleft", legend = c("Data", "Reconstruction", "Residuals"), bty = "n",
       col = c("gray50", "firebrick", "firebrick"),
       pch = c(20, 1, NA),
       lwd = c(NA, NA, 0.5))

## ----broom-tidy-coordinates---------------------------------------------------
tidy(fit)

## ----broom-tidy-compositions--------------------------------------------------
fit |> tidy(matrix = "compositions") |> head()

## ----broom-glance-------------------------------------------------------------
glance(fit)

## ----broom-augment------------------------------------------------------------
aug <- augment(fit, data = toy)
head(aug)

comp_cols <- paste0(".", anames(fit))
range(rowSums(as.matrix(aug[, comp_cols])))  # all should be 1

## ----plot-coordinates, fig.cap = "Data and estimated archetypes. The triangle connects the three archetypes."----
plot(fit, what = "coordinates",
     main = "Archetype positions in feature space", xlab = "", ylab = "",
     asp = 1, frame.plot = FALSE, axes = FALSE,
     pch = 17, cex = 1.5, col = "firebrick",
     args.data.scatter = list(pch = 19, cex = 0.5, col = "gray50"))

## ----ggplot-example, eval=FALSE-----------------------------------------------
# library(ggplot2)
# 
# plot(fit, what = "coordinates", plot = FALSE) |>
#     ggplot(aes(x, y)) +
#     geom_point(aes(color = archetype))

## ----fit-iris-----------------------------------------------------------------
# 'Species' (the response) is discarded; the four measurements are used.
fit_iris <- run_aa(Species ~ ., data = iris, K = 3, scale = TRUE, tol = 1e-4)
fit_iris

## ----iris-coordinates, fig.cap = "PCA projection of the iris data and its three archetypes (triangles)."----
pal_sp <- c(setosa = "#E69F00", versicolor = "#56B4E9", virginica = "#009E73")
iris_col <- pal_sp[as.character(iris$Species)]
plot(fit_iris, what = "coordinates", projection = "pca", frame.plot = FALSE,
     main = "PCA projection of iris data and archetypes",
     col = "firebrick", pch = 17, cex = 1.5,
     args.data.scatter = list(col = iris_col, pch = 19, cex = 0.6))

## ----iris-species-archetypes-match--------------------------------------------
B <- coef(fit_iris)  # K x N

# Sum coefficients within each species (rows = species, cols = archetypes)
coef_by_species <- aggregate(t(B), by = iris["Species"], FUN = sum)
knitr::kable(coef_by_species, digits = 3)

## ----iris-rename-archetypes---------------------------------------------------
# Concat all species contributing to an archetype
coef_mat <- as.matrix(coef_by_species[, -1])  # drop species column
rownames(coef_mat) <- coef_by_species$Species
species_contrib <- which(coef_mat > 0.1, arr.ind = TRUE)
species_contrib <- aggregate(
    row ~ col,
    data = species_contrib,
    FUN = function(ix) {
        species <- rownames(coef_mat)[ix]
        species <- substr(species, 1, 3)  # abbreviate species names
        paste(species, collapse = ".")
    }
)

anames(fit_iris) <- species_contrib$row
anames(fit_iris)

## ----iris-palette, include = FALSE--------------------------------------------
pal_aa <- c(set = "#E69F00", set.ver = "#E7E300", vir = "#009E73")
pal_aa <- pal_aa[anames(fit_iris)]  # reorder to match archetype order

## ----iris-profiles, fig.cap = "Archetype feature profiles. Each group of bars is one feature; colors distinguish archetypes."----
plot(fit_iris, what = "profiles",
     horiz = TRUE,
     names.arg = sub("\\.", "\n", colnames(iris[, 1:4])),
     las = 1,
     col = pal_aa,
     xlab = "Feature value (original scale)", ylab = "",
     main = "Archetype feature profiles")

## ----iris-simplex, fig.cap = "Ternary plot of iris compositions, colored by species. Each point is a sample placed by its three archetype weights."----
plot(fit_iris, what = "simplex",
     col = pal_sp[as.character(iris$Species)], pch = 19,
     pca = TRUE, col.pca = "black", robust = FALSE)
legend("topright", legend = names(pal_sp), col = pal_sp, pch = 19, bty = "n")

## ----iris-compositions, fig.cap = "Composition barplot: each bar is one iris sample. Single-color bars are near-pure specimens; blended bars sit between archetypes."----
plot(fit_iris, what = "compositions",
     main = "Archetype compositions across iris samples",
     cluster_rows = "PC1", cluster_cols = FALSE, col = pal_aa
)

## ----predict------------------------------------------------------------------
new_obs <- matrix(
    c(5.1, 3.5, 1.4, 0.2,   # setosa-like
      6.7, 3.0, 5.2, 2.3),  # virginica-like
    nrow = 2, byrow = TRUE,
    dimnames = list(c("obs1", "obs2"), colnames(iris[, 1:4]))
)
print(new_obs)

predict(fit_iris, newdata = new_obs)

predict(fit_iris, newdata = new_obs, type = "compositions")

## ----k-selection-fit, warning=FALSE-------------------------------------------
fit_path <- archetypes_path(
    toy,
    K = 6,
    init = "random",
    max_iter = 100,
    nrep = 5
)

## ----k-selection-plot, results = "hold", fig.width = 9, fig.height = 3.5, out.width = "100%", fig.cap = "RSS, unexplained variance, and adapted AIC across candidate values of K."----
op <- par(mfrow = c(1, 3), mar = c(4, 4, 2.5, 0.5), bty = "n")

selected_K <- 3
fit_best <- fit_path[[selected_K]] # here equivalent to fit_path[["K3"]]

screeplot(
    fit_path,
    y = "loss",
    col = "steelblue4",
    frame.plot = FALSE,
    ylab = "Reconstruction loss",
    main = "RSS"
)
abline(v = selected_K, lty = 2, col = "firebrick")

screeplot(
    fit_path,
    y = function(fit) 1 - utils::tail(fit$loss$r2, 1L),
    col = "steelblue4",
    frame.plot = FALSE,
    ylab = expression(1 - R^2),
    main = "Unexplained Variance"
)
abline(v = selected_K, lty = 2, col = "firebrick")

screeplot(
    fit_path,
    col = "steelblue4",
    frame.plot = FALSE,
    ylab = "Adapted AIC",
    main = "Adapted AIC"
)
abline(v = selected_K, lty = 2, col = "firebrick")

par(op)

## ----furthest-sum-experiment--------------------------------------------------
runs <- do.call(rbind, lapply(1:5, function(k) {
    fit_fs  <- run_aa(iris[, 1:4], K = 3, nrep = 1)
    fit_uni <- run_aa(iris[, 1:4], K = 3, nrep = 1, init = "random")
    data.frame(
        run       = k,
        fs_start  = fit_fs$loss$loss[1],
        fs_end    = tail(fit_fs$loss$loss, 1),
        uni_start = fit_uni$loss$loss[1],
        uni_end   = tail(fit_uni$loss$loss, 1)
    )
}))

colnames(runs) <- c("run",
                    "furthest_sum start", "furthest_sum end",
                    "random start", "random end")
knitr::kable(runs, digits = 1)

## ----check-convergence, fig.cap = "RSS across iterations for the iris model."----
fit_iris$converged
tail(fit_iris$loss, 1)
plot(fit_iris, what = "loss", frame.plot = FALSE, type = "b",
     main = "Convergence of the iris model")

## ----delta-data-fit, warning=FALSE, fig.cap = "Truncated simplex: Larger delta lets archetypes escape the data hull and approach the true positions."----
K_d <- 3  # number of archetypes (simplex vertices)
M_d <- 2  # number of features (2D for easy visualization)
N_init <- 1000  # initial number of samples before truncation
thresh <- 0.8    # truncation threshold on composition weights

# True archetypes: equilateral triangle
A_true <- matrix(
    c(cos(seq(0, 2*pi*(K_d-1)/K_d, length.out = K_d)),
      sin(seq(0, 2*pi*(K_d-1)/K_d, length.out = K_d))),
    nrow = K_d, ncol = M_d
)

# Dirichlet-like mixing weights; remove observations near any vertex
S_raw   <- -log(matrix(runif(K_d * N_init), nrow = N_init, ncol = K_d))
S_raw   <- S_raw / rowSums(S_raw)
S_trunc <- S_raw[rowSums(S_raw > thresh) == 0, ]
X_delta <- S_trunc %*% A_true
print(nrow(X_delta))  # observations after truncation

# Fit models with different delta values
delta_vals <- c(0, 0.2, 0.35)
res_delta  <- lapply(
    delta_vals,
    function(d) archetypes_pgd(X_delta, K = K_d, delta = d)
)

# Plot all models together with the true archetypes
ix   <- c(1:3, 1)   # close the triangle
cols <- c("steelblue", "darkorange", "firebrick")

plot(A_true[ix, 1], A_true[ix, 2],
     type = "l", col = "black", lwd = 2, lty = 3,
     frame.plot = FALSE, xlab = "", ylab = "",
     xaxt = "n", yaxt = "n", asp = 1,
     main = "Effect of delta on recovered archetypes")
points(X_delta, pch = 19, cex = 0.4, col = "gray50")
for (i in seq_along(res_delta)) {
    A <- coordinates(res_delta[[i]])
    lines(A[ix, 1], A[ix, 2], col = cols[i], lwd = 2)
}
legend("topright",
       legend = c("True simplex", paste0("\u03b4 = ", delta_vals)),
       col = c("black", cols), lty = c(3, 1, 1, 1), lwd = 2, bty = "n")

## ----robust-data--------------------------------------------------------------
# Append five outlier observations to iris
N_out <- 5
displacement <- c(2, 2, 0, 0)  # displace in the first and second features

add_outliers <- function(X, N, displacement) {
  x_mean <- colMeans(X) + displacement # place outliers at a distance from the mean
  x_sd <- apply(X, 2, sd) * .75
  X_out <- rnorm(N * ncol(X), x_mean, x_sd)  # recycle x_mean, x_sd
  rbind(X, matrix(pmax(X_out, .1), nrow = N, byrow = TRUE))
}

X <- iris[, 1:4] |>
    as.matrix() |>
    add_outliers(N = N_out, displacement = displacement)

set.seed(123)  # fix starting positions
fit_std <- run_aa(X, K = 3, robust = FALSE, tol = 1e-4, init = "random", nrep = 5)
set.seed(123)  # same starting positions
fit_rob <- run_aa(X, K = 3, robust = TRUE, tol = 1e-4, init = "random", nrep = 5)

## ----robust-compare, fig.cap = "PCA projection of iris with five added outliers (crosses). Standard AA (red) is distorted toward the outlier cluster; robust AA (blue) closely recovers the reference triangle fitted on the clean data (dashed black)."----
# Project all archetypes to PC space of data
pca <- prcomp(X, scale. = TRUE)
A_std <- predict(pca, coordinates(fit_std))
A_rob <- predict(pca, coordinates(fit_rob))
A_ori <- predict(pca, coordinates(fit_iris))   # reference: fitted on clean iris

# Plot data points
plot(pca$x,
     main = "PCA projection of iris data with outliers",
     col = "gray60", pch = 19, cex = 0.5, frame.plot = FALSE, asp = 1)
# Add outliers
points(pca$x[nrow(iris) + 1:N_out, ], col = "black", pch = 8)

# Add Archetypes
ix <- c(1:3, 1)
lines(A_ori[ix, ], col = "black", lwd = 1.5, lty = 2)
lines(A_std[ix, ], col = "firebrick", lwd = 2)
points(A_std, col = "firebrick", pch = 17, cex = 1.5)
lines(A_rob[ix, ], col = "steelblue", lwd = 2)
points(A_rob, col = "steelblue", pch = 17, cex = 1.5)

# Add legend
legend("topright",
       legend = c("Data", "Outliers", "Reference AA", "Standard AA", "Robust AA"),
       col    = c("gray60", "black", "black", "firebrick", "steelblue"),
       pch    = c(19, 8, NA, 17, 17),
       lty    = c(NA, NA, 2, 1, 1),
       lwd    =  2,
       bty    = "n")

## ----missing-data, warning=FALSE----------------------------------------------
X <- as.matrix(iris[, 1:4])

# Introduce ~10 % missing at random
missing_rate <- 0.1
n_elem <- length(X)  # number of elements
X_miss <- X
idx_miss <- sample(n_elem, size = round(missing_rate * n_elem))
X_miss[idx_miss] <- NA

set.seed(123)
fit_miss <- run_aa(X_miss, K = 3, nrep = 10, init = "random")  # missing = TRUE auto-detected
fit_miss

## ----missing-fitted-----------------------------------------------------------
X_hat_miss <- fitted(fit_miss)
any(is.na(X_hat_miss))   # FALSE = all positions reconstructed

# compare reconstructed vs original values at missing positions
cor(X_hat_miss[idx_miss], X[idx_miss])

## ----benchmark, warning=FALSE-------------------------------------------------
toy_mat    <- as.matrix(toy)
nrep_bench <- 10
seeds      <- 2046 + seq_len(nrep_bench)

run_single <- function(method, seed, ...) {
    set.seed(seed)
    fit <- suppressWarnings(run_aa(
        toy_mat, K = 3, nrep = 1, method = method, init = "random", ...
    ))
    c(n_iter = nrow(fit$loss), final_loss = norm(resid(fit), "F"))
}

# PGD benchmark
t_pgd  <- system.time(
  res_pgd <- vapply(seeds, run_single, numeric(2), method = "pgd")
)[["elapsed"]]
n_pgd  <- round(median(res_pgd["n_iter", ]))

# NNLS benchmark
t_nnls <- system.time(
    res_nnls <- vapply(seeds, run_single, numeric(2), method = "nnls")
)[["elapsed"]]
n_nnls <- round(median(res_nnls["n_iter", ]))

# Run by run comparison
win_pgd   <- sum(res_pgd["final_loss", ] < res_nnls["final_loss", ])
loss_diff <- res_pgd["final_loss", ] - res_nnls["final_loss", ]
diff_pgd  <- median(loss_diff / res_nnls["final_loss", ])
diff_nnls <- median(loss_diff / res_pgd["final_loss", ])

bench <- data.frame(
  Method           = c("PGD", "NNLS"),
  `Median iters`   = c(n_pgd, n_nnls),
  `ms / iter`      = 1000 * c(t_pgd / n_pgd, t_nnls / n_nnls),
  `loss change (%)`= 100 * c(diff_pgd, -diff_nnls),
  `Wins`           = c(win_pgd, nrep_bench - win_pgd),
  check.names      = FALSE
)
knitr::kable(bench, digits = 2)

## ----loss-df------------------------------------------------------------------
head(fit_iris$loss, 6)

## ----session-info-------------------------------------------------------------
sessionInfo()