--- title: "Splitknockoff" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Splitknockoff} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(SplitKnockoff) ``` # Vignette for Splitknockoff **Author : Yuxuan Chen, Haoxue Wang, Yang Cao, Xinwei Sun, Yuan Yao** #### Introduction Split Knockoff is a data adaptive variable selection framework for controlling the (directional) false discovery rate (FDR) in structural sparsity, where variable selection on linear transformation of parameters is of concern. This proposed scheme relaxes the linear subspace constraint to its neighborhood, often known as variable splitting in optimization, which leads to better incoherence and possible performance improvement in both power and FDR. **This vignette illustrates the usage of SplitKnockoff with simulation experiments in Cao et al. (2023) and will help you apply the split knockoff method in a light way.** ```R install.packages("SplitKnockoff") # just one line code to install our package ``` This is a R implement on the Matlab version of Split Knockoffs. This R package is more convenient as **glmnet** can be used directly by ```R install.packages("glmnet") # just one line code to install the glmnet tool ``` **Please update your glmnet package to the latest version for smooth usage of this package**, the examples illustrated here are tested with *glmnet 4.1-8*. For more information, please see the manual inside this package. #### Key function **sk.filter(X, D, y, option)** : the main function, Split Knockoff filter, for variable selection in structural sparsity problem. #### Function involved frequently **sk.create(X, y, D, nu, option)**: generate the split knockoff copy for structural sparsity problem. **sk.W_path(X, D, y, nu, option)**: generate the $W$ statistics for split knockoff on a split LASSO path. **sk.W_fixed(X, D, y, nu, option)**: generate the $W$ statistics for split knockoff based on a fixed $\beta(\lambda) = \hat{\beta}$. **select(W, q, method)**: this function is for variable selection based on $W$ statistics. ### **For reproduction of the simulation, you can see the code as the following.** #### Simulation details ##### Install all the packages and library them. ```R install.packages("SplitKnockoff") # install our package library(latex2exp) library(ggplot2) library(Matrix) library(glmnet) library(SplitKnockoff) ``` ##### Set the parameter for the simulation ```R k <- 20 # sparsity level A <- 1 # magnitude n <- 500 # sample size p <- 100 # dimension of variables c <- 0.5 # feature correlation sigma <-1 # noise level option <- list() # the target (directional) FDR control option$q <- 0.2 # whether to normalize the dataset option$normalize <- 'true' # fraction of data used to estimate \beta(\lambda) option$frac = 2/5 # choice on the set of regularization parameters for split LASSO path option$lambda <- 10.^seq(0, -6, by=-0.01) option$lambda_cv <- 10.^seq(0, -6, by=-0.01) # choice of nu for split knockoffs expo = seq(0, 2, by = 0.2) option$nu <- 10.^expo option$nu_cv <- 10.^expo num_nu <- length(option$nu) # set random seed option$seed = 1 # the number of simulation instances option$tests = 200 option$W = 's' ``` ##### Generate 3 types of transformation ```R D_G = matrix(0, p-1, p) for (i in 1:(p-1)){ D_G[i, i] = 1 D_G[i, i+1] = -1 } # generate D1, D2, and D3 D_1 = diag(p) D_2 = D_G D_3 = rbind(diag(p), D_G) D_s = list(D_1 = D_1, D_2 = D_2, D_3 = D_3) ``` ##### Generate $X$ ```R # generate Sigma Sigma = matrix(0, p, p) for( i in 1: p){ for(j in 1: p){ Sigma[i, j] <- c^(abs(i - j)) } } ``` ##### Package **mvtnorm** needed for this generation, please install it in advance ```R library(mvtnorm) # package mvtnorm needed for this generation set.seed(100) X <- rmvnorm(n,matrix(0, p, 1), Sigma) # generate X ``` ##### Generate $\beta$ ```R beta_true <- matrix(0, p, 1) for( i in 1: k){ beta_true[i, 1] = A if ( i%%3 == 1){ beta_true[i, 1] = 0 } } ``` #### **Key step** **Split knockoff for all $\nu$** ```R # choice on the way generating the W statistics option$beta = 'path' # save Z t_Z r t_Z for making plots rawvalue = list() # save y Y = list() tests = option$tests # the number of experiments # create matrices to store results fdr_split = array(0, dim = c(3, tests, num_nu, 2)) power_split = array(0, dim = c(3, tests, num_nu, 2)) for (test in 1:tests){ # generate varepsilon set.seed(test) # generate noise and y varepsilon = matrix(rnorm(n), ncol = 1) * sqrt(sigma) y = X %*% beta_true + varepsilon Y[[test]] = y raw = list() for (j in 1:3){ # choose the respective D for each example D = D_s[[j]] gamma_true <- D %*% beta_true sk_results = sk.filter(X, D, y, option) results = sk_results$results raw[[j]] = sk_results$stats for (i in 1:num_nu){ r_sign = sk_results$stats[[i]]$r result = results[[i]]$sk fdr_split[j, test, i, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_split[j, test, i, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) result = results[[i]]$sk_plus fdr_split[j, test, i, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_split[j, test, i, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) } } rawvalue[[test]] = raw } ``` **Split knockoff for $\nu$ chosen by cross validation** ```R # choice on the way generating the W statistics option$beta = 'cv_all' # create matrices to store results fdr_cv = array(0, dim = c(3, tests, 2)) power_cv = array(0, dim = c(3, tests, 2)) for (test in 1:tests){ y <- Y[[test]] for (j in 1:3){ # choose the respective D for each example D = D_s[[j]] gamma_true <- D %*% beta_true sk_results = sk.filter(X, D, y, option) results = sk_results$results r_sign = sk_results$stats$r result = results$sk fdr_cv[j, test, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_cv[j, test, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) result = results$sk_plus fdr_cv[j, test, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_cv[j, test, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) } } mean_fdr_cv <- apply(fdr_cv, c(1, 3), mean) sd_fdr_cv <- apply(fdr_cv, c(1, 3), sd) mean_power_cv <- apply(power_cv, c(1, 3), mean) sd_power_cv <- apply(power_cv, c(1, 3), sd) ``` **Knockoff** ```R library(knockoff) # package knockoff needed # create matrices to store results fdr_knockoff = array(0, dim = c(2, tests, 2)) power_knockoff = array(0, dim = c(2, tests, 2)) D = D_s[[2]] # Calculate X_bar, y_bar Z <- t(D) %*% solve(D %*% t(D)) XF <- X %*% rep(1, p) U_X <- XF / norm(XF, "F") UU_X <- cbind(U_X, matrix(0, n, n-1)) qrresult <- qr(UU_X) Qreslt <- qr.Q(qrresult) UX_perp <- Qreslt[,2:n] y_bar <- t(UX_perp) %*% y X_bar <- t(UX_perp) %*% X %*% Z for (test in 1:tests){ y <- Y[[test]] # choose D_1 for each example D = D_s[[1]] gamma_true <- D %*% beta_true k_results = knockoff.filter(X, y, knockoffs = {function(x) create.fixed(x, y=y, method='equi')}, statistic = stat.lasso_lambdasmax) W_k = k_results$statistic result = select(W_k, option$q, 'knockoff') fdr_knockoff[1, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[1, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) result = select(W_k, option$q, 'knockoff+') fdr_knockoff[1, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[1, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) # choose the D_2 for each example D = D_s[[2]] gamma_true <- D %*% beta_true k_results = knockoff.filter(X_bar, y_bar, knockoffs = {function(x) create.fixed(x, y=y_bar, method='equi')}, statistic = stat.lasso_lambdasmax) W_k = k_results$statistic result = select(W_k, option$q, 'knockoff') fdr_knockoff[2, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[2, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) result = select(W_k, option$q, 'knockoff+') fdr_knockoff[2, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[2, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) } ``` **Plot figure 3** ```R x <- expo t_value = qt(c(0.1, 0.9), tests - 1) lower_bound = t_value[1] upper_bound = t_value[2] fdr_knockoff_mean <- apply(fdr_knockoff, c(1, 3), mean) power_knockoff_mean <- apply(power_knockoff, c(1, 3), mean) fdr_knockoff_sd <- apply(fdr_knockoff, c(1, 3), sd) power_knockoff_sd <- apply(power_knockoff, c(1, 3), sd) fdr_split_mean <- apply(fdr_split, c(1, 3, 4), mean) fdr_split_sd <- apply(fdr_split, c(1, 3, 4), sd) power_split_mean <- apply(power_split, c(1, 3, 4), mean) power_split_sd <- apply(power_split, c(1, 3, 4), sd) fdr_split_top <- fdr_split_mean + fdr_split_sd * upper_bound fdr_split_bot <- fdr_split_mean + fdr_split_sd * lower_bound power_split_top <- power_split_mean + power_split_sd * upper_bound power_split_bot <- power_split_mean + power_split_sd * lower_bound ## for D_1 ## plot for FDR png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_fdr.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(fdr_split_mean[1, , 1], fdr_split_mean[1, , 2], rep(fdr_knockoff_mean[1, 1], num_nu), rep(fdr_knockoff_mean[1, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 1], ymax = fdr_split_top[1, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 2], ymax = fdr_split_top[1, , 2]), fill = "blue", alpha = 0.05) + geom_abline(intercept = option$q, slope=0, linetype="dashed") + labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) + ggtitle(expression('D'[1])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## plot for Power png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_power.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(power_split_mean[1, , 1], power_split_mean[1, , 2], rep(power_knockoff_mean[1, 1], num_nu), rep(power_knockoff_mean[1, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 1], ymax = power_split_top[1, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 2], ymax = power_split_top[1, , 2]), fill = "blue", alpha = 0.05) + labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") + ggtitle(expression('D'[1])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## for D_2 ## plot for FDR png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_fdr.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(fdr_split_mean[2, , 1], fdr_split_mean[2, , 2], rep(fdr_knockoff_mean[2, 1], num_nu), rep(fdr_knockoff_mean[2, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 1], ymax = fdr_split_top[2, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 2], ymax = fdr_split_top[2, , 2]), fill = "blue", alpha = 0.05) + geom_abline(intercept = option$q, slope=0, linetype="dashed") + labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) + ggtitle(expression('D'[2])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## plot for Power png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_power.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(power_split_mean[2, , 1], power_split_mean[2, , 2], rep(power_knockoff_mean[2, 1], num_nu), rep(power_knockoff_mean[2, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 1], ymax = power_split_top[2, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 2], ymax = power_split_top[2, , 2]), fill = "blue", alpha = 0.05) + labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") + ggtitle(expression('D'[2])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## for D_3 ## plot for FDR png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_fdr.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 2), y = c(fdr_split_mean[3, , 1], fdr_split_mean[3, , 2]), type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 1], ymax = fdr_split_top[3, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 2], ymax = fdr_split_top[3, , 2]), fill = "blue", alpha = 0.05) + geom_abline(intercept = option$q, slope=0, linetype="dashed") + labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) + ggtitle(expression('D'[3])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## plot for Power png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_power.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 2), y = c(power_split_mean[3, , 1], power_split_mean[3, , 2]), type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 1], ymax = power_split_top[3, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 2], ymax = power_split_top[3, , 2]), fill = "blue", alpha = 0.05) + labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") + ggtitle(expression('D'[3])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ``` ### Save results for table 1 ```R saveRDS(list(mean_fdr_D1_knockoff = fdr_knockoff_mean[1, 1], sd_fdr_D1_knockoff = fdr_knockoff_sd[1, 1], mean_power_D1_knockoff = power_knockoff_mean[1, 1], sd_power_D1_knockoff = power_knockoff_sd[1, 1], mean_fdr_D2_knockoff = fdr_knockoff_mean[2, 1], sd_fdr_D2_knockoff = fdr_knockoff_sd[2, 1], mean_power_D2_knockoff = power_knockoff_mean[2, 1], sd_power_D2_knockoff = power_knockoff_sd[2, 1], mean_fdr_D1_sk = mean_fdr_cv[1, 1], sd_fdr_D1_sk = sd_fdr_cv[1, 1], mean_power_D1_sk = mean_power_cv[1, 1], sd_power_D1_sk = sd_power_cv[1, 1], mean_fdr_D2_sk = mean_fdr_cv[2, 1], sd_fdr_D2_sk = sd_fdr_cv[2, 1], mean_power_D2_sk = mean_power_cv[2, 1], sd_power_D2_sk = sd_power_cv[2, 1], mean_fdr_D3_sk = mean_fdr_cv[3, 1], sd_fdr_D3_sk = sd_fdr_cv[3, 1], mean_power_D3_sk = mean_power_cv[3, 1], sd_power_D3_sk = sd_power_cv[3, 1], mean_fdr_D1_knockoff_pl = fdr_knockoff_mean[1, 2], sd_fdr_D1_knockoff_pl = fdr_knockoff_sd[1, 2], mean_power_D1_knockoff_pl = power_knockoff_mean[1, 2], sd_power_D1_knockoff_pl = power_knockoff_sd[1, 2], mean_fdr_D2_knockoff_pl = fdr_knockoff_mean[2, 2], sd_fdr_D2_knockoff_pl = fdr_knockoff_sd[2, 2], mean_power_D2_knockoff_pl = power_knockoff_mean[2, 2], sd_power_D2_knockoff_pl = power_knockoff_sd[2, 2], mean_fdr_D1_sk_pl = mean_fdr_cv[1, 2], sd_fdr_D1_sk_pl = sd_fdr_cv[1, 2], mean_power_D1_sk_pl = mean_power_cv[1, 2], sd_power_D1_sk_pl = sd_power_cv[1, 2], mean_fdr_D2_sk_pl = mean_fdr_cv[2, 2], sd_fdr_D2_sk_pl = sd_fdr_cv[2, 2], mean_power_D2_sk_pl = mean_power_cv[2, 2], sd_power_D2_sk_pl = sd_power_cv[2, 2], mean_fdr_D3_sk_pl = mean_fdr_cv[3, 2], sd_fdr_D3_sk_pl = sd_fdr_cv[3, 2], mean_power_D3_sk_pl = mean_power_cv[3, 2], sd_power_D3_sk_pl = sd_power_cv[3, 2]), file = 'D:/SplitKnockoff_results/simu_experiments/Table/table1.rds') ```