## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, echo = FALSE------------------------------------------------------ library(BinaryDosage) ## ----vcf_files---------------------------------------------------------------- vcf1afile <- system.file("extdata", "set1a.vcf", package = "BinaryDosage") vcf1ainfo <- system.file("extdata", "set1a.info", package = "BinaryDosage") vcf1bfile <- system.file("extdata", "set1b.vcf", package = "BinaryDosage") vcf1binfo <- system.file("extdata", "set1b.info", package = "BinaryDosage") ## ----vcf_convert, message = FALSE, warning = FALSE---------------------------- bdfile1a <- tempfile() bdfile1b <- tempfile() # Without information file vcftobdlegacy(vcffiles = vcf1afile, bdfiles = bdfile1a) # With information file — embeds aaf, maf, avgcall, rsq in the header vcftobdlegacy(vcffiles = c(vcf1bfile, vcf1binfo), bdfiles = bdfile1b) ## ----vcf_gz, message = FALSE, warning = FALSE--------------------------------- vcf1agzfile <- system.file("extdata", "set1a.vcf.gz", package = "BinaryDosage") bdfile1a_gz <- tempfile() vcftobdlegacy(vcffiles = vcf1agzfile, bdfiles = bdfile1a_gz, gz = TRUE) ## ----vcf_dosageonly, message = FALSE, warning = FALSE------------------------- vcf2afile <- system.file("extdata", "set2a.vcf", package = "BinaryDosage") bdfile2a <- tempfile() vcftobdlegacy(vcffiles = vcf2afile, bdfiles = bdfile2a) ## ----snpidformat, message = FALSE, warning = FALSE---------------------------- vcf1brsfile <- system.file("extdata", "set1b_rssnp.vcf", package = "BinaryDosage") bdfile_id0 <- tempfile() bdfile_id1 <- tempfile() bdfile_id2 <- tempfile() bdfile_idm1 <- tempfile() vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_id0) vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_id1, snpidformat = 1L) vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_id2, snpidformat = 2L) vcftobdlegacy(vcffiles = vcf1brsfile, bdfiles = bdfile_idm1, snpidformat = -1L) snpnames <- data.frame( format0 = getbdinfo(bdfile_id0)$snps$snpid, format1 = getbdinfo(bdfile_id1)$snps$snpid, format2 = getbdinfo(bdfile_id2)$snps$snpid, formatm1 = getbdinfo(bdfile_idm1)$snps$snpid ) knitr::kable(snpnames, caption = "SNP IDs by snpidformat") ## ----bdoptions, message = FALSE, warning = FALSE------------------------------ bdfile1a_calc <- tempfile() vcftobdlegacy(vcffiles = vcf1afile, bdfiles = bdfile1a_calc, bdoptions = c("aaf", "maf", "rsq")) bdcalcinfo <- getbdinfo(bdfile1a_calc) knitr::kable(data.frame(bdcalcinfo$snpinfo), caption = "Calculated per-SNP statistics", digits = 3) ## ----gen_files---------------------------------------------------------------- gen3afile <- system.file("extdata", "set3a.imp", package = "BinaryDosage") gen3asample <- system.file("extdata", "set3a.sample", package = "BinaryDosage") gen3bfile <- system.file("extdata", "set3b.imp", package = "BinaryDosage") gen3bsample <- system.file("extdata", "set3b.sample", package = "BinaryDosage") ## ----gen_convert, message = FALSE, warning = FALSE---------------------------- bdfile3a <- tempfile() bdfile3b <- tempfile() gentobd(genfiles = c(gen3afile, gen3asample), snpcolumns = c(0L, 2L:5L), bdfiles = bdfile3a) gentobd(genfiles = c(gen3bfile, gen3bsample), snpcolumns = c(0L, 2L:5L), bdfiles = bdfile3b) ## ----gen_snpcolumns, message = FALSE, warning = FALSE------------------------- gen3bchrfile <- system.file("extdata", "set3b.chr.imp", package = "BinaryDosage") bdfile3b_chr <- tempfile() # chromosome is in column 1, SNP ID in column 2, location in column 3, etc. gentobd(genfiles = c(gen3bchrfile, gen3bsample), bdfiles = bdfile3b_chr) ## ----gen_impformat, message = FALSE, warning = FALSE-------------------------- gen2bfile <- system.file("extdata", "set2b.imp", package = "BinaryDosage") sample2bfile <- system.file("extdata", "set2b.sample", package = "BinaryDosage") bdfile2b <- tempfile() gentobd(genfiles = c(gen2bfile, sample2bfile), snpcolumns = c(1L, 3L, 2L, 4L, 5L), impformat = 1L, bdfiles = bdfile2b) ## ----gen_gz, message = FALSE, warning = FALSE--------------------------------- gen4bgzfile <- system.file("extdata", "set4b.imp.gz", package = "BinaryDosage") sample4bfile <- system.file("extdata", "set4b.sample", package = "BinaryDosage") bdfile4b_gz <- tempfile() gentobd(genfiles = c(gen4bgzfile, sample4bfile), snpcolumns = c(1L, 2L, 4L, 5L, 6L), startcolumn = 7L, impformat = 2L, gz = TRUE, bdfiles = bdfile4b_gz) ## ----merge, message = FALSE, warning = FALSE---------------------------------- bd1afile <- system.file("extdata", "vcf1a.bdose", package = "BinaryDosage") bd1bfile <- system.file("extdata", "vcf1b.bdose", package = "BinaryDosage") bd1merged <- tempfile() bdmerge(mergefiles = bd1merged, bdfiles = c(bd1afile, bd1bfile)) bd1ainfo <- getbdinfo(bd1afile) bd1binfo <- getbdinfo(bd1bfile) bd1merinfo <- getbdinfo(bd1merged) cat("Set 1a subjects:", nrow(bd1ainfo$samples), "\n") cat("Set 1b subjects:", nrow(bd1binfo$samples), "\n") cat("Merged subjects:", nrow(bd1merinfo$samples), "\n") ## ----getbdinfo, message = FALSE, warning = FALSE------------------------------ bd1ainfo <- getbdinfo(bdfiles = bd1afile) ## ----bdapply, message = FALSE, warning = FALSE-------------------------------- calculateaaf <- function(dosage, p0, p1, p2) { mean(dosage, na.rm = TRUE) / 2 } bd1ainfo <- getbdinfo(bd1afile) aaf_vals <- unlist(bdapply(bdinfo = bd1ainfo, func = calculateaaf)) aaf_table <- data.frame(snpid = bd1ainfo$snps$snpid, aaf = aaf_vals) knitr::kable(aaf_table, caption = "Alternate allele frequencies", digits = 4) ## ----bdapply_getaaf, message = FALSE, warning = FALSE------------------------- aaf_vals2 <- unlist(bdapply(bdinfo = bd1ainfo, func = getaaf)) ## ----getsnp, message = FALSE, warning = FALSE--------------------------------- snp_data <- data.frame(getsnp(bdinfo = bd1ainfo, snp = "1:12000:T:C", dosageonly = FALSE)) snp_table <- cbind(sid = bd1ainfo$samples$sid, snp_data) knitr::kable(head(snp_table, 10), caption = "SNP 1:12000:T:C (first 10 subjects)", digits = 4) ## ----vcfinfo, message = FALSE, warning = FALSE-------------------------------- vcfinfo <- getvcfinfo(vcffiles = vcf1afile, index = TRUE) aaf_vcf <- unlist(vcfapply(vcfinfo = vcfinfo, func = getaaf)) aaf_compare <- data.frame(snpid = vcfinfo$snps$snpid, aaf = aaf_vcf) knitr::kable(aaf_compare, caption = "AAF from VCF via vcfapply", digits = 4) ## ----geninfo, message = FALSE, warning = FALSE-------------------------------- geninfo <- getgeninfo(genfiles = c(gen3bchrfile, gen3bsample), index = TRUE) aaf_gen <- unlist(genapply(geninfo = geninfo, func = getaaf)) aaf_gen_table <- data.frame(snpid = geninfo$snps$snpid, aaf = aaf_gen) knitr::kable(aaf_gen_table, caption = "AAF from GEN via genapply", digits = 4) ## ----full_workflow, message = FALSE, warning = FALSE-------------------------- library(BinaryDosage) # --- Input files --- vcf1afile <- system.file("extdata", "set1a.vcf", package = "BinaryDosage") vcf1ainfo <- system.file("extdata", "set1a.info", package = "BinaryDosage") vcf1bfile <- system.file("extdata", "set1b.vcf", package = "BinaryDosage") vcf1binfo <- system.file("extdata", "set1b.info", package = "BinaryDosage") gen3afile <- system.file("extdata", "set3a.imp", package = "BinaryDosage") gen3asample <- system.file("extdata", "set3a.sample", package = "BinaryDosage") gen3bfile <- system.file("extdata", "set3b.imp", package = "BinaryDosage") gen3bsample <- system.file("extdata", "set3b.sample", package = "BinaryDosage") # --- Convert to binary dosage --- bdfile1a <- tempfile(); bdfile1b <- tempfile() bdfile3a <- tempfile(); bdfile3b <- tempfile() vcftobdlegacy(vcffiles = c(vcf1afile, vcf1ainfo), bdfiles = bdfile1a) vcftobdlegacy(vcffiles = c(vcf1bfile, vcf1binfo), bdfiles = bdfile1b) gentobd(genfiles = c(gen3afile, gen3asample), snpcolumns = c(0L, 2L:5L), bdfiles = bdfile3a) gentobd(genfiles = c(gen3bfile, gen3bsample), snpcolumns = c(0L, 2L:5L), bdfiles = bdfile3b) # --- Merge --- mergebd1 <- tempfile(); mergebd3 <- tempfile() bdmerge(mergefiles = mergebd1, bdfiles = c(bdfile1a, bdfile1b)) bdmerge(mergefiles = mergebd3, bdfiles = c(bdfile3a, bdfile3b)) # --- Apply function --- mergebd1info <- getbdinfo(mergebd1) mergebd3info <- getbdinfo(mergebd3) aaf1 <- unlist(bdapply(mergebd1info, getaaf)) aaf3 <- unlist(bdapply(mergebd3info, getaaf)) aaf <- cbind(mergebd1info$snps[, c("chromosome", "location", "snpid")], aaf_vcf = aaf1, aaf_gen = aaf3) knitr::kable(aaf, caption = "AAF: VCF vs GEN sources", digits = 4, row.names = FALSE) ## ----full_workflow_snp, message = FALSE, warning = FALSE---------------------- # --- Extract SNP 6 --- set1snp6 <- getsnp(mergebd1info, 6L) set3snp6 <- getsnp(mergebd3info, 6L) snpdf <- data.frame( subjectid = mergebd1info$samples$sid, dosage_vcf = unlist(set1snp6), dosage_gen = unlist(set3snp6) ) knitr::kable(head(snpdf, 10), caption = "SNP 6 dosage: VCF vs GEN (first 10 subjects)", digits = 4, row.names = FALSE)