--- title: "toscca-vignette" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{toscca-vignette} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(toscca) library(ggplot2) ``` ## toscca ```{r simulate data for toscca, echo=FALSE} #sample size etc N = 100 p = 2500 q = 500 ################################################# #Simulate noise X0 = sapply(1:p, function(x) rnorm(N)) Y0 = sapply(1:q, function(x) rnorm(N)) colnames(X0) = paste0("x", 1:p) colnames(Y0) = paste0("y", 1:q) ################################### # Manual signal #Signal 1 (columns 1:10) Z1 = rnorm(N,0,1) Z2 = rnorm(N,-3,1) Z3 = rnorm(N,10,1) #Some associations with the true signal alpha1 = (1:10) / 10 beta1 = -(1:20) / 10 alpha2 = ((p/2):(p/2 + 100)) / 1500 beta2 = -(40:120) / 120 alpha3 = (200:220) / 300 beta3 = -(200:250) / 300 loc_alpha1 = 1:length(alpha1) # sample(1:p, length(alpha1), replace = FALSE) # loc_beta1 = 1:length(beta1) # sample(1:q, length(beta1), replace = FALSE) # loc_alpha2 = (p/2):(p/2 + length(alpha2) - 1) # sample((1:p)[!(1:p %in% loc_alpha1)], length(alpha2), replace = FALSE) # loc_beta2 = 40:(40 + length(beta2) -1) # sample((1:q)[!(1:q %in% loc_beta1)], length(beta2), replace = FALSE) # loc_alpha3 = 200:(200+length(alpha3) - 1) # sample((1:p)[!(1:p %in% c(loc_alpha1, loc_alpha2))], length(alpha3), replace = FALSE) # loc_beta3 = 200:(200 + length(beta3) -1) # sample((1:q)[!(1:q %in% c(loc_beta1, loc_beta2))], length(beta3), replace = FALSE) # # ordered signal for(i in 1:length(alpha1)) X0[, loc_alpha1[i]] = alpha1[i] * Z1 + rnorm(N,0,0.3) for(i in 1:length(beta1)) Y0[, loc_beta1[i]] = beta1[i] * Z1 + rnorm(N,0,0.3) for(i in 1:length(alpha2)) X0[, loc_alpha2[i]] = alpha2[i] * Z2 + rnorm(N,0,0.03) for(i in 1:length(beta2)) Y0[, loc_beta2[i]] = beta2[i] * Z2 + rnorm(N,0,0.03) for(i in 1:length(alpha3)) X0[, loc_alpha3[i]] = alpha3[i] * Z3 + rnorm(N,0,0.5) for(i in 1:length(beta3)) Y0[, loc_beta3[i]] = beta3[i] * Z3 + rnorm(N,0,0.5) ``` ## Canonical Correlation Analysis We use the method described in the paper, Thresholded Ordered Sparse Canonical Correlation Analysis (TOSCCA), to uncover the underlying processes linking the data. ```{r run toscca, message=FALSE} X = standardVar(X0) Y = standardVar(Y0) K = 4 # number of components to be estimated nonz_x = rep(100, K) # number of nonzero variables for X nonz_y = rep(100, K) # number of nonzero variables for Y init = "uniform" # type of initialisation cca_toscca = toscca(X, Y, nonz_x, nonz_y, K, init, combination = FALSE, silent = TRUE, toPlot = FALSE) cpev_toscca = sapply(1:K, function(k) cpev.toscca(X, cca_toscca$alpha[,1:k])) # perm_toscca = perm.toscca(X, Y, nonz_x, nonz_y, K = K, init, draws = 100, cancor = cca_toscca$cancor) ``` ## tosccamm ```{r simulate data for tosccamm, include=FALSE} set.seed(1234) n = 100 #This can be varied to simulate different signals # alpha1 = c(runif(10, 0.5, 1), rep(0, 990)) # beta1 = c(runif(5, 0.5, 1), rep(0, 195)) # alpha2 = c(rep(0, 990), runif(10, 0.5, 1)) # beta2 = c(rep(0, 195),runif(5, 0.5, 1)) beta2 = c(rep(0, 195), rep(0.5,5)) alpha2 = c(rep(0, 990), rep(0.5,10)) beta1 = c(rep(0.5,5), rep(0, 195)) alpha1 = c(rep(0.5,10), rep(0, 990)) X1 = list() X2 = list() i = 1 K = 3 sg = matrix(c(1, 0.6, 0.3, rep(0, 7), 0.6, 1, 0.6, 0.3, rep(0, 6), 0.3, 0.6, 1, 0.6, 0.3, rep(0,5), 0, 0.3, 0.6, 1, 0.6, 0.3, rep(0, 4), rep(0, 2), 0.3, 0.6, 1, 0.6, 0.3, rep(0,3), rep(0,3), 0.3, 0.6, 1, 0.6, 0.3, rep(0, 2), rep(0, 4), 0.3, 0.6, 1, 0.6, 0.3, 0, rep(0, 5), 0.3, 0.6, 1, 0.6, 0.3, rep(0,6), 0.3, 0.6, 1, 0.6, rep(0,7), 0.3, 0.6, 1), ncol = 10) for(i in 1:n) { print(i) #Simulate times of measurement times = 1:10 #random effect ui = rnorm(1,0,0.9) #Shared canonical vector (with some time effect) Zi1 = -(1+times/max(times))^3 + times * 0.5 + ui Zi2 = (sin(100*times))^times + times * 0.65 +rnorm(1,0,0.95) Zi = cbind(Zi1, Zi2) alpha = cbind(alpha1, alpha2) beta = cbind(beta1, beta2) #Simulate data and add some noise X1i = sapply(1:nrow(alpha), function(a) MASS::mvrnorm(1, (Zi %*% t(alpha))[,a], Sigma = sg)) X2i = sapply(1:nrow(beta), function(a) MASS::mvrnorm(1, (Zi %*% t(beta))[,a], Sigma = sg)) # X1i = sapply(alpha1, function(a) rnorm(length(times), Zi1 * a, 0.5)) # X2i = sapply(beta1, function(a) rnorm(length(times), Zi1 * a, 0.5)) # X1i = sapply(alpha2, function(a) rnorm(length(times), Zi2 * a, 0.5)) # X2i = sapply(beta2, function(a) rnorm(length(times), Zi2 * a, 0.5)) colnames(X1i) = paste0("X", 1:ncol(X1i)) colnames(X2i) = paste0("Y", 1:ncol(X2i)) #Check the simulated cross correlation #image(cor(X1i, X2i)) #Remove some observations # p_observed = 1 X1i = cbind(id=i, time=times, X1i)#[rbinom(length(times),1,p_observed)==1,] X2i = cbind(id=i, time=times, X2i)#[rbinom(length(times),1,p_observed)==1,] X1[[i]] = X1i X2[[i]] = X2i } X1 = do.call("rbind", X1) X2 = do.call("rbind", X2) dev.new(); image(cor(X1,X2)) verwijder_X = sample(1:nrow(X1), 0.2*nrow(X1), replace=FALSE) verwijder_Y = sample(1:nrow(X2), 0.3*nrow(X2), replace=FALSE) # table(verwijder_X) # table(verwijder_Y) X1=X1[-verwijder_X,] X2=X2[-verwijder_Y,] # XX2 = scale_rm(data.frame(X1), centre = T); YY2 = scale_rm(data.frame(X2), centre = T) XX2 = data.frame(X1); YY2 = data.frame(X2) nonz_a = seq(50, 5, length.out = 10) # seq(100, 1000, 100) nonz_b = round(seq(50, 5, length.out = 5)) # seq(100, 1000, 100) ``` Estimate the canonical weights and latent paths for $K$ components. ```{r run tosccamm} res_k = list() X.temp = XX2 Y.temp = YY2 for (k in 1:1) { if(k > 1) { # residualise for subsequent components X.temp = data.frame(X.temp[,c(1,2)],toscca::residualisation(as.matrix(X.temp[,-c(1,2)]), res_k[[k-1]]$alpha, type = "basic") ) Y.temp = data.frame(Y.temp[,c(1,2)],toscca::residualisation(as.matrix(Y.temp[,-c(1,2)]), res_k[[k-1]]$beta, type = "basic") ) nz_a_gen = as.numeric(table(res_k[[k-1]]$alpha != 0)[2]) nz_b_gen = as.numeric(table(res_k[[k-1]]$beta != 0)[2]) } res_k[[k]] <- tosccamm(X.temp, Y.temp, folds = 2, nonzero_a = nonz_a, nonzero_b = nonz_b, model = "lme", lmeformula = " ~ 0 + poly(time,3) + (1|id)", silent = TRUE) } ``` ### Results #### Latent paths for $k=1$ and $k=2$ Figures 3.a and 3.b in manuscript. ```{r plots latent paths, echo=FALSE, warning=FALSE, message=FALSE} lv1 = data.frame(id = XX2$id, time = XX2$time, K1=as.matrix(XX2[,-c(1,2)])%*% (res_k[[1]]$alpha)) shed_blue_60 <- toscca:::mpalette # Update plot1 (no x-axis label) plot1 <- ggplot(lv1, aes(x = time, y = scale(K1), group = id)) + geom_line(alpha = 0.1, aes(color = "Estimated", linetype = "Estimated", linewidth = "Estimated")) + # Estimated (black) geom_line(data = data.frame(time = sort(unique(lv1$time)), z = scale(Zi2)), aes(x = time, y = scale(z), color = "True", group = 1, , linetype = "True", linewidth = "True")) + # True (red) geom_line(data = data.frame(time = sort(unique(lv1$time)), K1 = aggregate(lv1, by = list(lv1$time), FUN = mean)$K1), aes(x = time, y = scale(K1), group = 1, color = "Smoothed", linetype = "Smoothed", linewidth = "Smoothed")) + # Smoothed (blue) scale_color_manual(values = c("Estimated" = "black", "True" = toscca:::mpalette[3], "Smoothed" = shed_blue_60)) + # Specify color manually scale_linewidth_manual(values =c("Estimated" = 0.8, "True" = 1.5, "Smoothed" = 1.5)) + scale_linetype_manual(values = c("Estimated" = 3, "True" = 1, "Smoothed" = 2)) + # Specify line types scale_x_continuous(breaks = unique(lv1$time)) + # Ensure x-axis is integers labs(x = "Time", y = "Values", color = "Legend" ) + # Remove x label theme_minimal() + theme(legend.position = "none", text = element_text(family = "Latex"), axis.title.y = element_text(size = 20, margin = margin(r = 10)), # Increase Y axis title size axis.title.x = element_text(size = 20, margin = margin(t = 15)), axis.text = element_text(size = 15), # Increase axis text size plot.title = element_text(size = 25), # Increase plot title size axis.text.x =element_text(size = 15) # Increase X axis text size ) print(plot1) ```