## ----include=FALSE------------------------------------------------------------ knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- # load the package library("CNVreg") ## ----echo=FALSE, warning=FALSE, message=FALSE--------------------------------- # load packages required for Rmd library("kableExtra") library("tidyverse") library("ggplot2") library("patchwork") ## ----------------------------------------------------------------------------- # load the example dataset data("CNVCOVY", package = "CNVreg") ## ----------------------------------------------------------------------------- # view the dataset summary(CNV) ## ----------------------------------------------------------------------------- # view the dataset summary(Cov) ## ----------------------------------------------------------------------------- # view the dataset summary(Y_QT) ## ----------------------------------------------------------------------------- # view the dataset summary(Y_BT) ## ----warning=FALSE------------------------------------------------------------ # data preprocessing for a quantitative(continuous) outcome Y_QT frag_data_QT <- prep(CNV = CNV, Y = Y_QT, Z = Cov, rare.out = 0.05) ## ----------------------------------------------------------------------------- # Format of `prep()` funtion output str(frag_data_QT) ## ----warning=FALSE------------------------------------------------------------ # data preprocessing with a binary trait frag_data_BT <- prep(CNV = CNV, Y = Y_BT, Z = Cov, rare.out = 0.05) ## ----eval = FALSE------------------------------------------------------------- # ## copy frag_data_QT # frag_data_BT <- frag_data_QT # # ### replace Y with Y_BT in the correct format: ordered named vector # ### order the sample in Y_BT as in frag_data_QT$Y # rownames(Y_BT) <- Y_BT$ID # # frag_data_BT$Y <- Y_BT[names(frag_data_QT$Y), "Y"] |> drop() # names(frag_data_QT$Y) <- rownames(frag_data_QT$Y) ## ----------------------------------------------------------------------------- QT_TUNE <- withr::with_seed( 12345, cvfit_WTSMTH(data = frag_data_QT, lambda1 = seq(-8, -3, 1), lambda2 = seq(12, 25, 2), weight = "eql", family = "gaussian", cv.control = list(n.fold = 5L, n.core = 1L, stratified = FALSE), verbose = FALSE)) ## ----echo=FALSE--------------------------------------------------------------- # loss matrix of candidate tuning parameters QT_TUNE$Loss %>% format( digits = 6) %>% mutate( across(2:ncol(QT_TUNE$Loss), ~ cell_spec(.x, color = ifelse(.x > min(QT_TUNE$Loss[,2:ncol(QT_TUNE$Loss)])+0.0001, "black", "white"), background = ifelse(.x <= min(QT_TUNE$Loss[, 2:ncol(QT_TUNE$Loss)])+0.0001, "red", "white") ) ) )%>% kable(booktabs = FALSE, linesep = "", align = "c", format = "html", escape = F, caption = "Average loss for each pair of candidate tuning parameters") %>%add_header_above(c(" " = 1, "Lambda 1" = ncol(QT_TUNE$Loss)-1)) ## ----------------------------------------------------------------------------- # selected optimal tuning parameters with minimum loss QT_TUNE$selected.lambda ## ----------------------------------------------------------------------------- ##coefficients of intercept and covariates QT_TUNE$coef[c(1, 21:22), c("Vnames", "coef") ] ## ----------------------------------------------------------------------------- # estimated coefficents for CNV QT_TUNE$coef[2:20, ] # non-zero coefficients # QT_TUNE$coef[which(abs(QT_TUNE$coef$coef)>0), ] ## ----warning=FALSE, echo=FALSE------------------------------------------------ # plot the coefficients # keep CNV fragments and exclude intercept and covariates(Age, Sex) in ploting, row(2:20) CNVR <- frag_data_QT$CNVR.info %>% group_by(CNV.id, deldup)%>% summarise(CNV.start = min(CNV.start), CNV.end = max(CNV.end), nfrag = length(CNV.id), .groups = 'drop') CNVR_adj <- CNVR[CNVR$nfrag > 1, ] CNV_coef <- QT_TUNE$coef[2:20,] P <- ggplot(CNV_coef, aes(x= CNV_coef$CNV.start , y=CNV_coef$coef*1000))+theme_bw()+ geom_point(aes(color = ifelse(abs(CNV_coef$coef*1000) > 1, "red", ifelse(abs(CNV_coef$coef*1000) < 10^(-5), "white", "black")))) + scale_color_identity()+ theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+ scale_x_continuous("Genomic position", labels = as.character(CNVR_adj$CNV.start), breaks = CNVR_adj$CNV.start)+ ylab("CNV coefficients (×10^(-3))")+ geom_segment(x=CNV_coef$CNV.start, xend=CNV_coef$CNV.end, y=CNV_coef$coef*1000, yend=CNV_coef$coef*1000)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-1000000, xmax=CNVR_adj$CNV.end[1]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x = CNVR_adj$CNV.start[1], y = max(CNV_coef$coef*1000) +0.2, label = "A", color="red")+ geom_rect(aes(xmin=CNVR_adj$CNV.start[2]-1000000, xmax=CNVR_adj$CNV.end[2]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-1000000, xmax=CNVR_adj$CNV.end[3]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x = CNVR_adj$CNV.start[3], y = max(CNV_coef$coef*1000)+0.2, label = "B", color="red")+ geom_rect(aes(xmin=CNVR_adj$CNV.start[4]-1000000, xmax=CNVR_adj$CNV.end[4]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02) PA <- ggplot(CNV_coef[2:5,], aes(x= 1/2*(CNV_coef$CNV.start[2:5]+CNV_coef$CNV.end[2:5]) , y=CNV_coef$coef[2:5]*1000))+theme_bw()+ ylab("")+ geom_point(aes(color = ifelse(abs(CNV_coef$coef[2:5]*1000) > 1, "red", ifelse(abs(CNV_coef$coef[2:5]) < 10^(-5), "white", "black")))) + scale_color_identity()+ theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+ scale_x_continuous("", labels = as.character(CNV_coef$CNV.start[2:5]), breaks = CNV_coef$CNV.start[2:5])+ geom_segment(x=CNV_coef$CNV.start[2:5], xend=CNV_coef$CNV.end[2:5], y=CNV_coef$coef[2:5]*1000, yend=CNV_coef$coef[2:5]*1000)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-10, xmax=CNVR_adj$CNV.end[1]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x =1/2*( CNVR_adj$CNV.start[1]+CNVR_adj$CNV.end[1]), y = max(CNV_coef$coef*1000) +0.0002, label = "Zoom in on region A", color="red") PB <- ggplot(CNV_coef[11:14,], aes(x= 1/2*(CNV_coef$CNV.start[11:14]+CNV_coef$CNV.end[11:14]) , y=CNV_coef$coef[11:14]*1000))+theme_bw()+ ylab("")+ geom_point(aes(color = ifelse(abs(CNV_coef$coef[11:14]*1000) > 1, "red", ifelse(abs(CNV_coef$coef[11:14]*1000) < 10^(-5), "white", "black")))) + scale_color_identity()+ theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+ scale_x_continuous("Genomic position", labels = as.character(CNV_coef$CNV.start[11:14]), breaks = CNV_coef$CNV.start[11:14])+ geom_segment(x=CNV_coef$CNV.start[11:14], xend=CNV_coef$CNV.end[11:14], y=CNV_coef$coef[11:14]*1000, yend=CNV_coef$coef[11:14]*1000)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-10, xmax=CNVR_adj$CNV.end[3]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x =1/2*( CNVR_adj$CNV.start[3]+CNVR_adj$CNV.end[3]), y = max(CNV_coef$coef*1000) + 0.2, label = "Zoom in on region B", color="red") P + (PA/PB) + plot_annotation(title = "CNV coefficient estiamte across the genomic region - A fine-tuned model")+ plot_layout(axes = "collect", widths = c(2,1)) + plot_layout( guide = "collect") & theme(legend.position="Top", legend.text = element_text(size=12), legend.title = element_text(size=12)) ## ----------------------------------------------------------------------------- BT_TUNE <- withr::with_seed( 12345, cvfit_WTSMTH(frag_data_BT, lambda1 = seq(-5.25, -4.75, 0.25), lambda2 = seq(2, 8, 2), weight = "eql", family = "binomial", cv.control = list(n.fold = 5L, n.core = 1L, stratified = FALSE), iter.control = list(max.iter = 8L, tol.beta = 10^(-3), tol.loss = 10^(-6)), verbose = FALSE)) ## ----echo=FALSE--------------------------------------------------------------- # loss matrix of candidate tuning parameters BT_TUNE$Loss %>% format( digits = 6) %>% mutate( across(2:ncol(BT_TUNE$Loss), ~ cell_spec(.x, color = ifelse(.x > min(BT_TUNE$Loss[,2:ncol(BT_TUNE$Loss)])+0.000001, "black", "white"), background = ifelse(.x <= min(BT_TUNE$Loss[, 2:ncol(BT_TUNE$Loss)])+0.000001, "red", "white") ) ) )%>% kable(booktabs = FALSE, linesep = "", align = "c", format = "html", escape = F, caption = "Average loss for each pair of candidate tuning parameters") %>%add_header_above(c(" " = 1, "Lambda 1" = ncol(BT_TUNE$Loss)-1)) ## ----------------------------------------------------------------------------- # selected optimal tuning parameters with minimum loss BT_TUNE$selected.lambda ## ----------------------------------------------------------------------------- BT_TUNE$coef[c(1, 21:22), c("Vnames", "coef") ] ## ----------------------------------------------------------------------------- BT_TUNE$coef[2:20, ] ## ----warning=FALSE, echo=FALSE------------------------------------------------ CNV_coef <- BT_TUNE$coef[2:20,] P <- ggplot(CNV_coef, aes(x= CNV_coef$CNV.start , y=CNV_coef$coef*1000))+theme_bw()+ geom_point(aes(color = ifelse(abs(CNV_coef$coef*1000) > 1, "red", ifelse(abs(CNV_coef$coef*1000) < 10^(-5), "white", "black")))) + scale_color_identity()+ theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+ scale_x_continuous("Genomic position", labels = as.character(CNVR_adj$CNV.start), breaks = CNVR_adj$CNV.start)+ ylab("CNV coefficients (×10^(-3))")+ geom_segment(x=CNV_coef$CNV.start, xend=CNV_coef$CNV.end, y=CNV_coef$coef*1000, yend=CNV_coef$coef*1000)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-1000000, xmax=CNVR_adj$CNV.end[1]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x = CNVR_adj$CNV.start[1], y = max(CNV_coef$coef*1000)+0.2, label = "A", color="red")+ geom_rect(aes(xmin=CNVR_adj$CNV.start[2]-1000000, xmax=CNVR_adj$CNV.end[2]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-1000000, xmax=CNVR_adj$CNV.end[3]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x = CNVR_adj$CNV.start[3], y = max(CNV_coef$coef*1000)+0.2, label = "B", color="red")+ geom_rect(aes(xmin=CNVR_adj$CNV.start[4]-1000000, xmax=CNVR_adj$CNV.end[4]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02) PA <- ggplot(CNV_coef[2:5,], aes(x= 1/2*(CNV_coef$CNV.start[2:5]+CNV_coef$CNV.end[2:5]) , y=CNV_coef$coef[2:5]*1000))+theme_bw()+ ylab("")+ geom_point(aes(color = ifelse(abs(CNV_coef$coef[2:5]*1000) > 0.001, "red", ifelse(abs(CNV_coef$coef[2:5]*1000) < 10^(-8), "white", "black")))) + scale_color_identity()+ theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+ scale_x_continuous("", labels = as.character(CNV_coef$CNV.start[2:5]), breaks = CNV_coef$CNV.start[2:5])+ geom_segment(x=CNV_coef$CNV.start[2:5], xend=CNV_coef$CNV.end[2:5], y=CNV_coef$coef[2:5]*1000, yend=CNV_coef$coef[2:5]*1000)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-10, xmax=CNVR_adj$CNV.end[1]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x =1/2*( CNVR_adj$CNV.start[1]+CNVR_adj$CNV.end[1]), y = max(CNV_coef$coef*1000) + 0.4, label = "Zoom in on region A", color="red") PB <- ggplot(CNV_coef[11:14,], aes(x= 1/2*(CNV_coef$CNV.start[11:14]+CNV_coef$CNV.end[11:14]) , y=CNV_coef$coef[11:14]*1000))+theme_bw()+ ylab("")+ geom_point(aes(color = ifelse(abs(CNV_coef$coef[11:14]*1000) > 1, "red", ifelse(abs(CNV_coef$coef[11:14]*1000) < 10^(-5), "white", "black")))) + scale_color_identity()+ theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+ scale_x_continuous("Genomic position", labels = as.character(CNV_coef$CNV.start[11:14]), breaks = CNV_coef$CNV.start[11:14])+ geom_segment(x=CNV_coef$CNV.start[11:14], xend=CNV_coef$CNV.end[11:14], y=CNV_coef$coef[11:14]*1000, yend=CNV_coef$coef[11:14]*1000)+ geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-10, xmax=CNVR_adj$CNV.end[3]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+ annotate("text", x =1/2*( CNVR_adj$CNV.start[3]+CNVR_adj$CNV.end[3]), y = max(CNV_coef$coef*1000) +0.4, label = "Zoom in on region B", color="red") P + (PA/PB) +plot_annotation(title = "CNV coefficient estiamte across the genomic region - A fine-tuned model")+ plot_layout(axes = "collect", widths = c(2,1)) + plot_layout( guide = "collect") & theme(legend.position="Top", legend.text = element_text(size=12), legend.title = element_text(size=12)) ## ----------------------------------------------------------------------------- # we know the optimal tuning parameters and directly apply it to reproduce the analysis for a continuous outcome. QT_fit <- fit_WTSMTH(frag_data_QT, lambda1 = -5, lambda2 = 20, weight="eql", family="gaussian") ## ----------------------------------------------------------------------------- QT_fit[c(1, 21:22), c("Vnames", "coef") ] ## ----------------------------------------------------------------------------- QT_fit[2:20, ]