## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE, # fig.path = "man/figures/README-", out.width = "100%" ) ## ----eval = FALSE------------------------------------------------------------- # install.packages('dsb') # library(dsb) # # adt_norm = DSBNormalizeProtein( # cell_protein_matrix = cells_citeseq_mtx, # empty_drop_matrix = empty_drop_citeseq_mtx, # denoise.counts = TRUE, # use.isotype.control = TRUE, # isotype.control.name.vec = rownames(cells_citeseq_mtx)[67:70] # ) ## ----eval = FALSE------------------------------------------------------------- # library(dsb) # # # read raw data using the Seurat function "Read10X" # raw = Seurat::Read10X("data/raw_feature_bc_matrix/") # cells = Seurat::Read10X("data/filtered_feature_bc_matrix/") # # # define cell-containing barcodes and separate cells and empty drops # stained_cells = colnames(cells$`Gene Expression`) # background = setdiff(colnames(raw$`Gene Expression`), stained_cells) # # # split the data into separate matrices for RNA and ADT # prot = raw$`Antibody Capture` # rna = raw$`Gene Expression` ## ----eval = FALSE------------------------------------------------------------- # # create metadata of droplet QC stats used in standard scRNAseq processing # mtgene = grep(pattern = "^MT-", rownames(rna), value = TRUE) # used below # # md = data.frame( # rna.size = log10(Matrix::colSums(rna)), # prot.size = log10(Matrix::colSums(prot)), # n.gene = Matrix::colSums(rna > 0), # mt.prop = Matrix::colSums(rna[mtgene, ]) / Matrix::colSums(rna) # ) # # add indicator for barcodes Cell Ranger called as cells # md$drop.class = ifelse(rownames(md) %in% stained_cells, 'cell', 'background') # # # remove barcodes with no evidence of capture in the experiment # md = md[md$rna.size > 0 & md$prot.size > 0, ] # ## ----eval = FALSE------------------------------------------------------------- # ggplot(md, aes(x = log10(n.gene), y = prot.size )) + # theme_bw() + # geom_bin2d(bins = 300) + # scale_fill_viridis_c(option = "C") + # facet_wrap(~drop.class) ## ----eval = FALSE------------------------------------------------------------- # background_drops = rownames( # md[ md$prot.size > 1.5 & # md$prot.size < 3 & # md$rna.size < 2.5, ] # ) # background.adt.mtx = as.matrix(prot[ , background_drops]) ## ----eval = FALSE------------------------------------------------------------- # # # calculate statistical thresholds for droplet filtering. # cellmd = md[md$drop.class == 'cell', ] # # # filter drops with + / - 3 median absolute deviations from the median library size # rna.mult = (3*mad(cellmd$rna.size)) # prot.mult = (3*mad(cellmd$prot.size)) # rna.lower = median(cellmd$rna.size) - rna.mult # rna.upper = median(cellmd$rna.size) + rna.mult # prot.lower = median(cellmd$prot.size) - prot.mult # prot.upper = median(cellmd$prot.size) + prot.mult # # # filter rows based on droplet qualty control metrics # qc_cells = rownames( # cellmd[cellmd$prot.size > prot.lower & # cellmd$prot.size < prot.upper & # cellmd$rna.size > rna.lower & # cellmd$rna.size < rna.upper & # cellmd$mt.prop < 0.14, ] # ) ## ----eval = FALSE------------------------------------------------------------- # length(qc_cells) ## ----eval = FALSE------------------------------------------------------------- # cell.adt.raw = as.matrix(prot[ , qc_cells]) # cell.rna.raw = rna[ ,qc_cells] # cellmd = cellmd[qc_cells, ] ## ----eval = FALSE------------------------------------------------------------- # # flter # pm = sort(apply(cell.adt.raw, 1, max)) # pm2 = apply(background.adt.mtx, 1, max) # head(pm) ## ----eval = FALSE------------------------------------------------------------- # # remove the non staining protein # cell.adt.raw = cell.adt.raw[!rownames(cell.adt.raw) == 'CD34_TotalSeqB', ] # background.adt.mtx = background.adt.mtx[!rownames(background.adt.mtx) == 'CD34_TotalSeqB', ] # ## ----eval = FALSE------------------------------------------------------------- # # define isotype controls # isotype.controls = c("IgG1_control_TotalSeqB", "IgG2a_control_TotalSeqB", # "IgG2b_control_TotalSeqB") # # # normalize and denoise with dsb with # cells.dsb.norm = DSBNormalizeProtein( # cell_protein_matrix = cell.adt.raw, # empty_drop_matrix = background.adt.mtx, # denoise.counts = TRUE, # use.isotype.control = TRUE, # isotype.control.name.vec = isotype.controls # ) # # note: normalization takes ~ 20 seconds # # system.time() # # user system elapsed # # 20.799 0.209 21.783 ## ----eval = FALSE------------------------------------------------------------- # # dsb with non-standard options # cell.adt.dsb.2 = DSBNormalizeProtein( # cell_protein_matrix = cell.adt.raw, # empty_drop_matrix = background.adt.mtx, # denoise.counts = TRUE, # use.isotype.control = TRUE, # isotype.control.name.vec = rownames(cell.adt.raw)[29:31], # define.pseudocount = TRUE, # pseudocount.use = 1, # scale.factor = 'mean.subtract', # quantile.clipping = TRUE, # quantile.clip = c(0.01, 0.99), # return.stats = TRUE # ) # # ## ----eval = FALSE------------------------------------------------------------- # # Seurat workflow # library(Seurat) # # # integrating with Seurat # stopifnot(isTRUE(all.equal(rownames(cellmd), colnames(cell.adt.raw)))) # stopifnot(isTRUE(all.equal(rownames(cellmd), colnames(cell.rna.raw)))) # # # create Seurat object note: min.cells is a gene filter, not a cell filter # s = Seurat::CreateSeuratObject(counts = cell.rna.raw, # meta.data = cellmd, # assay = "RNA", # min.cells = 20) # # # add dsb normalized matrix "cell.adt.dsb" to the "CITE" data (not counts!) slot # s[["CITE"]] = Seurat::CreateAssayObject(data = cells.dsb.norm) # ## ----eval = FALSE------------------------------------------------------------- # # define proteins to use in clustering (non-isptype controls) # prots = rownames(s@assays$CITE@data)[1:28] # # # cluster and run umap # s = Seurat::FindNeighbors(object = s, dims = NULL,assay = 'CITE', # features = prots, k.param = 30, # verbose = FALSE) # # # direct graph clustering # s = Seurat::FindClusters(object = s, resolution = 1, # algorithm = 3, # graph.name = 'CITE_snn', # verbose = FALSE) # # umap (optional) # # s = Seurat::RunUMAP(object = s, assay = "CITE", features = prots, # # seed.use = 1990, min.dist = 0.2, n.neighbors = 30, # # verbose = FALSE) # # # make results dataframe # d = cbind(s@meta.data, # as.data.frame(t(s@assays$CITE@data)) # # s@reductions$umap@cell.embeddings) # ) ## ----eval = FALSE------------------------------------------------------------- # library(magrittr) # # calculate the median protein expression separately for each cluster # adt_plot = d %>% # dplyr::group_by(CITE_snn_res.1) %>% # dplyr::summarize_at(.vars = prots, .funs = median) %>% # tibble::remove_rownames() %>% # tibble::column_to_rownames("CITE_snn_res.1") # # plot a heatmap of the average dsb normalized values for each cluster # pheatmap::pheatmap(t(adt_plot), # color = viridis::viridis(25, option = "B"), # fontsize_row = 8, border_color = NA) # ## ----------------------------------------------------------------------------- # # use pearson residuals as normalized values for pca # DefaultAssay(s) = "RNA" # s = NormalizeData(s, verbose = FALSE) %>% # FindVariableFeatures(selection.method = 'vst', verbose = FALSE) %>% # ScaleData(verbose = FALSE) %>% # RunPCA(verbose = FALSE) # # # set up dsb values to use in WNN analysis (do not normalize with CLR, use dsb normalized values) # DefaultAssay(s) = "CITE" # VariableFeatures(s) = prots # s = s %>% # ScaleData() %>% # RunPCA(reduction.name = 'apca', verbose = FALSE) # # # run WNN # s = FindMultiModalNeighbors( # s, reduction.list = list("pca", "apca"), # dims.list = list(1:30, 1:18), # modality.weight.name = "RNA.weight", # verbose = FALSE # ) # # # cluster # s <- FindClusters(s, graph.name = "wsnn", # algorithm = 3, # resolution = 1.5, # verbose = FALSE, # random.seed = 1990) ## ----------------------------------------------------------------------------- # ## WNN with dsb values # # use RNA pearson residuals as normalized values for RNA pca # DefaultAssay(s) = "RNA" # s = NormalizeData(s, verbose = FALSE) %>% # FindVariableFeatures(selection.method = 'vst', verbose = FALSE) %>% # ScaleData(verbose = FALSE) %>% # RunPCA(verbose = FALSE) # # # # set up dsb values to use in WNN analysis # DefaultAssay(s) = "CITE" # VariableFeatures(s) = prots # # # run true pca to initialize dr pca slot for WNN # ## Not used { # s = ScaleData(s, assay = 'CITE', verbose = FALSE) # s = RunPCA(s, reduction.name = 'pdsb',features = VariableFeatures(s), verbose = FALSE) # # } # # # make matrix of normalized protein values to add as dr embeddings # pseudo = t(GetAssayData(s,slot = 'data',assay = 'CITE'))[,1:29] # colnames(pseudo) = paste('pseudo', 1:29, sep = "_") # s@reductions$pdsb@cell.embeddings = pseudo # # # run WNN directly using dsb normalized values. # s = FindMultiModalNeighbors( # object = s, # reduction.list = list("pca", "pdsb"), # weighted.nn.name = "dsb_wnn", # knn.graph.name = "dsb_knn", # modality.weight.name = "dsb_weight", # snn.graph.name = "dsb_snn", # dims.list = list(1:30, 1:29), # verbose = FALSE # ) # # # cluster based on the join RNA and protein graph # s = FindClusters(s, graph.name = "dsb_knn", # algorithm = 3, # resolution = 1.5, # random.seed = 1990, # verbose = FALSE) ## ----------------------------------------------------------------------------- # # create multimodal heatmap # vf = VariableFeatures(s,assay = "RNA") # # # find marker genes for the joint clusters # Idents(s) = "dsb_knn_res.1.5" # DefaultAssay(s) = "RNA" # rnade = FindAllMarkers(s, features = vf, only.pos = TRUE, verbose = FALSE) # gene_plot = rnade %>% # dplyr::filter(avg_log2FC > 1 ) %>% # dplyr::group_by(cluster) %>% # dplyr::top_n(3) %$% gene %>% unique # # # cite_data = GetAssayData(s,slot = 'data',assay = 'CITE') %>% t() # rna_subset = GetAssayData(s,assay = 'RNA',slot = 'data')[gene_plot, ] %>% # as.data.frame() %>% # t() %>% # as.matrix() # # # combine into dataframe # d = cbind(s@meta.data, cite_data, rna_subset) # # # calculate the median protein expression per cluster # dat_plot = d %>% # dplyr::group_by(dsb_knn_res.1.5) %>% # dplyr::summarize_at(.vars = c(prots, gene_plot), .funs = median) %>% # tibble::remove_rownames() %>% # tibble::column_to_rownames("dsb_knn_res.1.5") # ## ----fig.height=4, fig.width=3------------------------------------------------ # # make a combined plot # suppressMessages(library(ComplexHeatmap)); ht_opt$message = FALSE # # # protein heatmap # # protein heatmap # prot_col = circlize::colorRamp2(breaks = seq(-1,25, by = 1), # colors = viridis::viridis(n = 27, option = "B")) # p1 = Heatmap(t(dat_plot)[prots, ], # name = "protein", # col = prot_col, # use_raster = T, # row_names_gp = gpar(color = "black", fontsize = 5) # ) # # # # mRNA heatmap # mrna = t(dat_plot)[gene_plot, ] # rna_col = circlize::colorRamp2(breaks = c(-2,-1,0,1,2), # colors = colorspace::diverge_hsv(n = 5)) # p2 = Heatmap(t(scale(t(mrna))), # name = "mRNA", # col = rna_col, # use_raster = T, # clustering_method_columns = 'average', # column_names_gp = gpar(color = "black", fontsize = 7), # row_names_gp = gpar(color = "black", fontsize = 5)) # # # # combine heatmaps # ht_list = p1 %v% p2 # draw(ht_list) # ## ----eval = FALSE------------------------------------------------------------- # library(Seurat) # umi = Read10X(data.dir = 'data/raw_feature_bc_matrix/') # k = 3 # barcode.whitelist = # rownames( # CreateSeuratObject(counts = umi, # min.features = k, # retain all barcodes with at least k raw mRNA # min.cells = 800, # this just speeds up the function by removing genes. # )@meta.data # ) # # write.table(barcode.whitelist, # file =paste0(your_save_path,"barcode.whitelist.tsv"), # sep = '\t', quote = FALSE, col.names = FALSE, row.names = FALSE) #