---
title: "toscca-vignette"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{toscca-vignette}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup}
library(toscca)
library(ggplot2)
```

## toscca

```{r simulate data for toscca, echo=FALSE}
#sample size etc
N = 100
p = 2500
q = 500
#################################################
#Simulate noise 
X0 = sapply(1:p, function(x) rnorm(N))
Y0 = sapply(1:q, function(x) rnorm(N))

colnames(X0) = paste0("x", 1:p)
colnames(Y0) = paste0("y", 1:q)

###################################
  # Manual signal
  #Signal 1 (columns 1:10)

  Z1 = rnorm(N,0,1)
  Z2 = rnorm(N,-3,1)
  Z3 = rnorm(N,10,1)

  #Some associations with the true signal
  alpha1 = (1:10) / 10
  beta1  = -(1:20) / 10
  alpha2 = ((p/2):(p/2 + 100)) / 1500
  beta2  = -(40:120) / 120
  alpha3 = (200:220) / 300
  beta3  = -(200:250) / 300
  
  loc_alpha1 = 1:length(alpha1) # sample(1:p, length(alpha1), replace = FALSE) # 
  loc_beta1  = 1:length(beta1)  # sample(1:q, length(beta1), replace = FALSE)  # 
  loc_alpha2 = (p/2):(p/2 + length(alpha2) - 1) # sample((1:p)[!(1:p %in% loc_alpha1)], length(alpha2), replace = FALSE) # 
  loc_beta2  = 40:(40 + length(beta2) -1) # sample((1:q)[!(1:q %in% loc_beta1)], length(beta2), replace = FALSE)   # 
  loc_alpha3 = 200:(200+length(alpha3) - 1) # sample((1:p)[!(1:p %in% c(loc_alpha1, loc_alpha2))], length(alpha3), replace = FALSE) #
  loc_beta3  = 200:(200 + length(beta3) -1) # sample((1:q)[!(1:q %in% c(loc_beta1, loc_beta2))], length(beta3), replace = FALSE)    # 
  
# ordered signal

  for(i in 1:length(alpha1))
    X0[, loc_alpha1[i]] =  alpha1[i] * Z1 + rnorm(N,0,0.3)

  for(i in 1:length(beta1))
    Y0[, loc_beta1[i]] =  beta1[i] * Z1 + rnorm(N,0,0.3)

  for(i in 1:length(alpha2))
    X0[, loc_alpha2[i]] =  alpha2[i] * Z2 + rnorm(N,0,0.03)

  for(i in 1:length(beta2))
    Y0[, loc_beta2[i]] =  beta2[i] * Z2 + rnorm(N,0,0.03)

  for(i in 1:length(alpha3))
    X0[, loc_alpha3[i]] =  alpha3[i] * Z3 + rnorm(N,0,0.5)

  for(i in 1:length(beta3))
    Y0[, loc_beta3[i]] =  beta3[i] * Z3 + rnorm(N,0,0.5)
  
```

## Canonical Correlation Analysis
We use the method described in the paper, Thresholded Ordered Sparse Canonical Correlation Analysis (TOSCCA), to uncover the underlying processes linking the data. 

```{r run toscca, message=FALSE}
X = standardVar(X0)
Y = standardVar(Y0)
K = 4                                       # number of components to be estimated
nonz_x = rep(100, K)                        # number of nonzero variables for X
nonz_y = rep(100, K)                        # number of nonzero variables for Y
init   = "uniform"                          # type of initialisation
cca_toscca  = toscca(X, Y, nonz_x, nonz_y, K, init, combination = FALSE, silent = TRUE, toPlot = FALSE)
cpev_toscca = sapply(1:K, function(k) cpev.toscca(X, cca_toscca$alpha[,1:k]))

# perm_toscca = perm.toscca(X, Y, nonz_x, nonz_y, K = K, init, draws = 100, cancor = cca_toscca$cancor)
```


## tosccamm

```{r simulate data for tosccamm, include=FALSE}
set.seed(1234)
n = 100

#This can be varied to simulate different signals
# alpha1 = c(runif(10, 0.5, 1), rep(0, 990))
# beta1 = c(runif(5, 0.5, 1), rep(0, 195))
# alpha2 = c(rep(0, 990), runif(10, 0.5, 1))
# beta2 = c(rep(0, 195),runif(5, 0.5, 1))


beta2 = c(rep(0, 195), rep(0.5,5))
alpha2 = c(rep(0, 990), rep(0.5,10))
beta1 = c(rep(0.5,5), rep(0, 195))
alpha1 = c(rep(0.5,10), rep(0, 990))

X1 = list()
X2 = list()

i = 1
K = 3
sg = matrix(c(1, 0.6, 0.3, rep(0, 7), 0.6, 1, 0.6, 0.3,
rep(0, 6), 0.3, 0.6, 1, 0.6, 0.3,
rep(0,5), 0, 0.3, 0.6, 1, 0.6, 0.3,
rep(0, 4), rep(0, 2), 0.3, 0.6, 1, 0.6, 0.3,
rep(0,3), rep(0,3), 0.3, 0.6, 1, 0.6, 0.3, rep(0, 2),
rep(0, 4), 0.3, 0.6, 1, 0.6, 0.3, 0,
rep(0, 5), 0.3, 0.6, 1, 0.6, 0.3, rep(0,6), 0.3, 0.6, 1, 0.6,
rep(0,7), 0.3, 0.6, 1), ncol = 10)
for(i in 1:n)
{
print(i)
#Simulate  times of measurement
times = 1:10
#random effect
ui = rnorm(1,0,0.9)
#Shared canonical vector (with some time effect)
Zi1 = -(1+times/max(times))^3 + times * 0.5 + ui
Zi2 = (sin(100*times))^times +   times * 0.65 +rnorm(1,0,0.95)
Zi = cbind(Zi1, Zi2)
alpha = cbind(alpha1, alpha2)
beta = cbind(beta1, beta2)
#Simulate data and add some noise
X1i = sapply(1:nrow(alpha), function(a) MASS::mvrnorm(1, (Zi %*% t(alpha))[,a], Sigma = sg))
X2i = sapply(1:nrow(beta), function(a) MASS::mvrnorm(1, (Zi %*% t(beta))[,a], Sigma = sg))
# X1i = sapply(alpha1, function(a) rnorm(length(times), Zi1 * a, 0.5))
# X2i = sapply(beta1, function(a) rnorm(length(times), Zi1 * a, 0.5))
# X1i = sapply(alpha2, function(a) rnorm(length(times), Zi2 * a, 0.5))
# X2i = sapply(beta2, function(a) rnorm(length(times), Zi2 * a, 0.5))
colnames(X1i) = paste0("X", 1:ncol(X1i))
colnames(X2i) = paste0("Y", 1:ncol(X2i))
#Check the simulated cross correlation
  #image(cor(X1i, X2i))

  #Remove some observations
  # p_observed = 1
  X1i = cbind(id=i, time=times, X1i)#[rbinom(length(times),1,p_observed)==1,]
  X2i = cbind(id=i, time=times, X2i)#[rbinom(length(times),1,p_observed)==1,]


  X1[[i]] = X1i
  X2[[i]] = X2i
}

X1 = do.call("rbind", X1)
X2 = do.call("rbind", X2)
dev.new(); image(cor(X1,X2))


verwijder_X = sample(1:nrow(X1), 0.2*nrow(X1), replace=FALSE)
verwijder_Y = sample(1:nrow(X2), 0.3*nrow(X2), replace=FALSE)
# table(verwijder_X)
# table(verwijder_Y)
X1=X1[-verwijder_X,]
X2=X2[-verwijder_Y,]
# XX2 = scale_rm(data.frame(X1), centre = T); YY2 = scale_rm(data.frame(X2), centre = T)
XX2 = data.frame(X1); YY2 = data.frame(X2)
nonz_a = seq(50, 5, length.out = 10) # seq(100, 1000, 100)
nonz_b = round(seq(50, 5, length.out = 5)) # seq(100, 1000, 100)
```


Estimate the canonical weights and latent paths for $K$ components.
```{r run  tosccamm}
res_k = list()

X.temp = XX2
Y.temp = YY2
for (k in 1:1) {
  if(k > 1) {
    # residualise for subsequent components
    X.temp = data.frame(X.temp[,c(1,2)],toscca::residualisation(as.matrix(X.temp[,-c(1,2)]), res_k[[k-1]]$alpha, type = "basic") )
    Y.temp = data.frame(Y.temp[,c(1,2)],toscca::residualisation(as.matrix(Y.temp[,-c(1,2)]), res_k[[k-1]]$beta, type = "basic") )

    nz_a_gen = as.numeric(table(res_k[[k-1]]$alpha != 0)[2])
    nz_b_gen = as.numeric(table(res_k[[k-1]]$beta != 0)[2])
  }

  res_k[[k]] <- tosccamm(X.temp, Y.temp, folds = 2,
                                            nonzero_a = nonz_a, nonzero_b = nonz_b,
                                            model = "lme", lmeformula = " ~ 0 + poly(time,3) + (1|id)", silent = TRUE)

}
```


### Results
#### Latent paths for $k=1$ and $k=2$
Figures 3.a and 3.b in manuscript.
```{r plots latent paths, echo=FALSE, warning=FALSE, message=FALSE}
lv1 = data.frame(id = XX2$id, time = XX2$time, K1=as.matrix(XX2[,-c(1,2)])%*% (res_k[[1]]$alpha))
shed_blue_60 <- toscca:::mpalette

# Update plot1 (no x-axis label)
plot1 <- ggplot(lv1, aes(x = time, y = scale(K1), group = id)) +
  geom_line(alpha = 0.1, aes(color = "Estimated", linetype = "Estimated", linewidth = "Estimated")) +  # Estimated (black)
  geom_line(data = data.frame(time = sort(unique(lv1$time)), z = scale(Zi2)), aes(x = time, y = scale(z), color = "True", group = 1, , linetype = "True", linewidth = "True")) +  # True (red)
  geom_line(data = data.frame(time = sort(unique(lv1$time)), K1 = aggregate(lv1, by = list(lv1$time), FUN = mean)$K1), aes(x = time, y = scale(K1), group = 1, color = "Smoothed", linetype = "Smoothed", linewidth = "Smoothed")) +  # Smoothed (blue)
  scale_color_manual(values = c("Estimated" = "black", "True" =  toscca:::mpalette[3], "Smoothed" = shed_blue_60)) +  # Specify color manually
  scale_linewidth_manual(values =c("Estimated" = 0.8, "True" = 1.5, "Smoothed" = 1.5)) +
  scale_linetype_manual(values = c("Estimated" = 3, "True" = 1, "Smoothed" = 2)) +  # Specify line types
  scale_x_continuous(breaks = unique(lv1$time)) +  # Ensure x-axis is integers
  labs(x = "Time", y = "Values", color = "Legend" ) +  # Remove x label
  theme_minimal() +
  theme(legend.position = "none",
        text = element_text(family = "Latex"),
        axis.title.y = element_text(size = 20, margin = margin(r = 10)),  # Increase Y axis title size
        axis.title.x = element_text(size = 20, margin = margin(t = 15)),
        axis.text = element_text(size = 15),     # Increase axis text size
        plot.title = element_text(size = 25),    # Increase plot title size
        axis.text.x =element_text(size = 15)    # Increase X axis text size
  )
print(plot1)
```