## ----eval=FALSE, echo=TRUE----------------------------------------------- # install.packages("pacman") # pacman::p_install_gh("sahirbhatnagar/eclust") ## ----setup, message=FALSE, echo=FALSE------------------------------------ library(knitr) library(eclust) library(data.table) options(scipen = 1, digits = 5) ## ---- echo=FALSE--------------------------------------------------------- pander::pander(data.frame(`function name` = grep("^r_",pacman::p_functions("eclust"), value = T))) ## ---- echo=FALSE--------------------------------------------------------- pander::pander(data.frame(`function name` = grep("^s_",pacman::p_functions("eclust"), value = T))) ## ---- echo=FALSE--------------------------------------------------------- pander::pander(data.frame(`function name` = grep("^u_",pacman::p_functions("eclust"), value = T))) ## ---- eval = TRUE-------------------------------------------------------- # load the data data("tcgaov") tcgaov[1:5,1:6, with = FALSE] # use log survival as the response Y <- log(tcgaov[["OS"]]) # specify the environment variable E <- tcgaov[["E"]] # specify the matrix of genes only genes <- as.matrix(tcgaov[,-c("OS","rn","subtype","E","status"),with = FALSE]) # for this example the training set will be all subjects. # change `p` argument to create a train and test set. trainIndex <- drop(caret::createDataPartition(Y, p = 1, list = FALSE, times = 1)) testIndex <- trainIndex ## ------------------------------------------------------------------------ cluster_res <- r_cluster_data(data = genes, response = Y, exposure = E, train_index = trainIndex, test_index = testIndex, cluster_distance = "corr", eclust_distance = "diffcorr", measure_distance = "euclidean", clustMethod = "hclust", cutMethod = "dynamic", method = "average", nPC = 1, minimum_cluster_size = 30) # the number of clusters determined by the similarity matrices specified # in the cluster_distance and eclust_distance arguments. This will always be larger # than cluster_res$clustersAll$nclusters which is based on the similarity matrix # specified in the cluster_distance argument cluster_res$clustersAddon$nclusters # the number of clusters determined by the similarity matrices specified # in the cluster_distance argument only cluster_res$clustersAll$nclusters # what's in the cluster_res object names(cluster_res) ## ------------------------------------------------------------------------ # prepare data for use with earth function avg_eclust_interaction <- r_prepare_data(data = cbind(cluster_res$clustersAddon$averageExpr, Y = Y[trainIndex], E = E[trainIndex]), response = "Y", exposure = "E") head(avg_eclust_interaction[["X"]]) ## ------------------------------------------------------------------------ # install and load earth package pacman::p_load(char = "earth") fit_earth <- earth::earth(x = avg_eclust_interaction[["X"]], y = avg_eclust_interaction[["Y"]], pmethod = "backward", keepxy = TRUE, degree = 2, trace = 1, nk = 1000) coef(fit_earth) ## ------------------------------------------------------------------------ u_extract_selected_earth(fit_earth) ## ------------------------------------------------------------------------ # Genes in cluster 5 cluster_res$clustersAddonMembership[cluster %in% 5] # variable importance earth::evimp(fit_earth) ## ------------------------------------------------------------------------ pacman::p_load(eclust) d0 <- s_modules(n = 100, p = 1000, rho = 0, exposed = FALSE, modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25), minCor = 0.01, maxCor = 1, corPower = 1, propNegativeCor = 0.3, backgroundNoise = 0.5, signed = FALSE, leaveOut = 1:4) d1 <- s_modules(n = 100, p = 1000, rho = 0.9, exposed = TRUE, modProportions = c(0.15,0.15,0.15,0.15,0.15,0.25), minCor = 0.4, maxCor = 1, corPower = 0.3, propNegativeCor = 0.3, backgroundNoise = 0.5, signed = FALSE) # get the true cluster labels truemodule1 <- d1$setLabels table(truemodule1) ## ------------------------------------------------------------------------ pacman::p_load(magrittr) X <- rbind(d0$datExpr, d1$datExpr) %>% magrittr::set_colnames(paste0("Gene", 1:1000)) %>% magrittr::set_rownames(paste0("Subject",1:200)) ## ---- fig.show='hold', tidy=FALSE---------------------------------------- pacman::p_load(pheatmap) pacman::p_load(viridis) pheatmap::pheatmap(cor(X[1:100,]), show_rownames = F, show_colnames = F, color = viridis(100)) pheatmap::pheatmap(cor(X[101:200,]), show_rownames = F, show_colnames = F, color = viridis(100)) ## ---- eval=T------------------------------------------------------------- betaMainEffect <- vector("double", length = 1000) betaMainInteractions <- vector("double", length = 1000) # the first 25 in the 3rd block are active betaMainEffect[which(truemodule1 %in% 3)[1:50]] <- runif(50, 0.9, 1.1) # the first 25 in the 4th block are active betaMainEffect[which(truemodule1 %in% 4)[1:50]] <- runif(50, 0.9, 1.1) # the interaction effects betaMainInteractions[which(betaMainEffect!=0)] <- runif(50, 0.4, 0.6) # the environment effect betaE <- 2 # the total beta vector beta <- c(betaMainEffect, betaE, betaMainInteractions) ## ------------------------------------------------------------------------ result <- s_generate_data(p = 1000, X = X, beta = beta, include_interaction = TRUE, cluster_distance = "tom", n = 200, n0 = 100, eclust_distance = "difftom", signal_to_noise_ratio = 1, distance_method = "euclidean", cluster_method = "hclust", cut_method = "dynamic", agglomeration_method = "average", nPC = 1) names(result) ## ---- echo=FALSE--------------------------------------------------------- pander::pander(data.frame(`function name` = c("s_pen_separate", "s_pen_clust", "s_mars_separate", "s_mars_clust"), `General Approach` = rep(c("SEPARATE","CLUST, ECLUST"), 2), `model` = c(rep(c("lasso, elasticnet, mcp, scad"), 2), rep("MARS",2)))) ## ------------------------------------------------------------------------ # Provide ECLUST clustering results to the gene_groups argument pen_res <- s_pen_clust(x_train = result[["X_train"]], x_test = result[["X_test"]], y_train = result[["Y_train"]], y_test = result[["Y_test"]], s0 = result[["S0"]], gene_groups = result[["clustersAddon"]], summary = "pc", model = "lasso", exp_family = "gaussian", clust_type = "ECLUST", include_interaction = TRUE) unlist(pen_res) ## ---- echo=FALSE--------------------------------------------------------- pander::pander(data.frame(`object name` = c("tom_train_all","tom_train_diff","tom_train_e1","tom_train_e0","corr_train_all","corr_train_diff","corr_train_e1","corr_train_e0","fisherScore","corScor"))) ## ------------------------------------------------------------------------ # check that the object is of class similarity class(result$tom_train_e1) # get clustering tree hc <- hclust(as.dist(1 - result$tom_train_e1), method = "average") plot(result$tom_train_e1, truemodule = truemodule1, cluster_rows = hc, cluster_cols = hc, active = as.numeric(betaMainEffect!=0)) ## ------------------------------------------------------------------------ class(cluster_res) ## ------------------------------------------------------------------------ plot(cluster_res, show_column_names = FALSE)