## ----setup, include=FALSE--------------------------------------------------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, tidy.opts=list(width.cutoff=100), tidy=TRUE, comment=NA) options(width=120, max.print=1000) ## ----getstarted, echo=TRUE, eval=FALSE-------------------------------------------------------------------------------- # # load the library, load and preview at demo dataset # library(haplo.stats) # ls(name="package:haplo.stats") # help(haplo.em) # example(haplo.em) ## ----rsettingsecho=FALSE, eval=TRUE--------------------------------------------------------------- options(width=100) rm(list=ls()) require(haplo.stats) ## ----hladat, echo=TRUE---------------------------------------------------------------------------- # load and preview demo dataset stored in ~/haplo.stats/data/hla.demo.tab data(hla.demo) names(hla.demo) # attach hla.demo to make columns available in the session #attach(hla.demo) ## ----label=makegeno, echo=TRUE-------------------------------------------------------------------- geno <- hla.demo[,c(17,18,21:24)] genolabel <-c("DQB","DRB","B") ## ----summarygeno, echo=TRUE----------------------------------------------------------------------- geno.desc <- summaryGeno(geno, miss.val=c(0,NA)) print(geno.desc[c(1:10,80:85,135:140),]) ## ----tabledesc, echo=TRUE------------------------------------------------------------------------- # how many samples missing 2 alleles at the 3 markers table(geno.desc[,3]) ## ----rmmiss, echo=TRUE, eval=FALSE---------------------------------------------------------------- # ## create an index of people missing all alleles # miss.all <- which(geno.desc[,3]==3) # # # use index to subset hla.demo # hla.demo.updated <- hla.demo[-miss.all,] ## ----setseed, echo=TRUE--------------------------------------------------------------------------- # this is how to set the seed for reproducing results where haplo.em is # involved, and also if simulations are run. In practice, don't reset seed. seed <- c(17, 53, 1, 40, 37, 0, 62, 56, 5, 52, 12, 1) set.seed(seed) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----label=runEM, echo=TRUE----------------------------------------------------------------------- save.em <- haplo.em(geno=geno, locus.label=genolabel, miss.val=c(0,NA)) names(save.em) print(save.em, nlines=10) ## ----summaryEM, echo=TRUE------------------------------------------------------------------------- summary(save.em, nlines=7) ## ----summaryEMshow, echo=TRUE--------------------------------------------------------------------- # show full haplotypes, instead of codes summary(save.em, show.haplo=TRUE, nlines=7) ## ----emcontrol, echo=TRUE, eval=FALSE------------------------------------------------------------- # # demonstrate only the syntax of control parameters # save.em.try20 <- haplo.em(geno=geno, locus.label=genolabel, miss.val=c(0, NA), # control = haplo.em.control(n.try = 20, insert.batch.size=2)) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----groupBin, echo=TRUE-------------------------------------------------------------------------- ## run haplo.em on sub-groups ## create ordinal and binary variables y.bin <- 1*(hla.demo$resp.cat=="low") group.bin <- haplo.group(y.bin, geno, locus.label=genolabel, miss.val=0) print(group.bin, nlines=15) ## ----hapPowerQT, echo=TRUE------------------------------------------------------------------------ # load a set of haplotypes data(hapPower.demo) #### an example using save.em hla markers may go like this. # keep <- which(save.em$hap.prob > .004) # get an index of non-rare haps # hfreq <- save.em$hap.prob[keep] # hmat <- save.em$haplotype[keep,] # hrisk <- which(hmat[,1]==31 & hmat[,2]==11) # contains 3 haps with freq=.01 # hbase <- 4 # 4th hap has mas freq of .103 #### ## separate the haplotype matrix and the frequencies hmat <- hapPower.demo[,-6] hfreq <- hapPower.demo[,6] ## Define risk haplotypes as those with "1" allele at loc2 and loc3 hrisk <- which(hmat$loc.2==1 & hmat$loc.3==1) # define index for baseline haplotype hbase <- 1 hbeta.list <- find.haplo.beta.qt(haplo=hmat, haplo.freq=hfreq, base.index=hbase, haplo.risk=hrisk, r2=.01, y.mu=0, y.var=1) hbeta.list ss.qt <- haplo.power.qt(hmat, hfreq, hbase, hbeta.list$beta, y.mu=0, y.var=1, alpha=.05, power=.80) ss.qt power.qt <- haplo.power.qt(hmat, hfreq, hbase, hbeta.list$beta, y.mu=0, y.var=1, alpha=.05, sample.size=2826) power.qt ## ----hapPowerCC, echo=TRUE------------------------------------------------------------------------ ## get power and sample size for quantitative response ## get beta vector based on odds ratios cc.OR <- 1.5 # determine beta regression coefficients for risk haplotypes hbeta.cc <- numeric(length(hfreq)) hbeta.cc[hrisk] <- log(cc.OR) # Compute sample size for stated power ss.cc <- haplo.power.cc(hmat, hfreq, hbase, hbeta.cc, case.frac=.5, prevalence=.1, alpha=.05, power=.8) ss.cc # Compute power for given sample size power.cc <- haplo.power.cc(hmat, hfreq, hbase, hbeta.cc, case.frac=.5, prevalence=.1, alpha=.05, sample.size=4566) power.cc ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----scoregauss, echo=TRUE------------------------------------------------------------------------ # score statistics w/ Gaussian trait score.gaus.add <- haplo.score(hla.demo$resp, geno, trait.type="gaussian", min.count=5, locus.label=genolabel, simulate=FALSE) print(score.gaus.add, nlines=10) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----scorebinom, echo=TRUE------------------------------------------------------------------------ # scores, binary trait y.bin <- 1*(hla.demo$resp.cat=="low") score.bin <- haplo.score(y.bin, geno, trait.type="binomial", x.adj = NA, min.count=5, haplo.effect="additive", locus.label=genolabel, miss.val=0, simulate=FALSE) print(score.bin, nlines=10) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----scoreOrd, echo=TRUE-------------------------------------------------------------------------- # scores w/ ordinal trait y.ord <- as.numeric(hla.demo$resp.cat) score.ord <- haplo.score(y.ord, geno, trait.type="ordinal", x.adj = NA, min.count=5, locus.label=genolabel, miss.val=0, simulate=FALSE) print(score.ord, nlines=7) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----scoreAdj, echo=TRUE-------------------------------------------------------------------------- # score w/gaussian, adjusted by covariates x.ma <- with(hla.demo, cbind(male, age)) score.gaus.adj <- haplo.score(hla.demo$resp, geno, trait.type="gaussian", x.adj = x.ma, min.count=5, locus.label=genolabel, simulate=FALSE) print(score.gaus.adj, nlines=10) ## ----scorePlot, fig=TRUE, echo=TRUE--------------------------------------------------------------- ## plot score vs. frequency, gaussian response plot(score.gaus.add, pch="o") ## locate and label pts with their haplotypes ## works similar to locator() function #> pts.haplo <- locator.haplo(score.gaus) pts.haplo <- list(x.coord=c(0.05098, 0.03018, .100), y.coord=c(2.1582, 0.45725, -2.1566), hap.txt=c("62:2:7", "51:1:35", "21:3:8")) text(x=pts.haplo$x.coord, y=pts.haplo$y.coord, labels=pts.haplo$hap.txt) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----scoremin10, echo=TRUE------------------------------------------------------------------------ # increase skip.haplo, expected hap counts = 10 score.gaus.min10 <- haplo.score(hla.demo$resp, geno, trait.type="gaussian", x.adj = NA, min.count=10, locus.label=genolabel, simulate=FALSE) print(score.gaus.min10) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----scoreDom, echo=TRUE-------------------------------------------------------------------------- # score w/gaussian, dominant effect score.gaus.dom <- haplo.score(hla.demo$resp, geno, trait.type="gaussian", x.adj=NA, min.count=5, haplo.effect="dominant", locus.label=genolabel, simulate=FALSE) print(score.gaus.dom, nlines=10) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----scorePerm, echo=TRUE------------------------------------------------------------------------- # simulations when binary response score.bin.sim <- haplo.score(y.bin, geno, trait.type="binomial", x.adj = NA, locus.label=genolabel, min.count=5, simulate=TRUE, sim.control = score.sim.control() ) print(score.bin.sim) ## ----glmGeno, echo=TRUE--------------------------------------------------------------------------- # set up data for haplo.glm, include geno.glm, # covariates age and male, and responses resp and y.bin geno <- hla.demo[,c(17,18,21:24)] geno.glm <- setupGeno(geno, miss.val=c(0,NA), locus.label=genolabel) attributes(geno.glm) y.bin <- 1*(hla.demo$resp.cat=="low") glm.data <- data.frame(geno.glm, age=hla.demo$age, male=hla.demo$male, y=hla.demo$resp, y.bin=y.bin) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----glmGauss, echo=TRUE-------------------------------------------------------------------------- # glm fit with haplotypes, additive gender covariate on gaussian response fit.gaus <- haplo.glm(y ~ male + geno.glm, family=gaussian, data=glm.data, na.action="na.geno.keep", locus.label = genolabel, x=TRUE, control=haplo.glm.control(haplo.freq.min=.02)) summary(fit.gaus) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----glmInter, echo=TRUE-------------------------------------------------------------------------- # glm fit haplotypes with covariate interaction fit.inter <- haplo.glm(formula = y ~ male * geno.glm, family = gaussian, data=glm.data, na.action="na.geno.keep", locus.label = genolabel, control = haplo.glm.control(haplo.min.count = 10)) summary(fit.inter) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----glmBinom, echo=TRUE-------------------------------------------------------------------------- # gender and haplotypes fit on binary response, # return model matrix fit.bin <- haplo.glm(y.bin ~ male + geno.glm, family = binomial, data=glm.data, na.action = "na.geno.keep", locus.label=genolabel, control = haplo.glm.control(haplo.min.count=10)) summary(fit.bin) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----glmDom, echo=TRUE---------------------------------------------------------------------------- # control dominant effect of haplotypes (haplo.effect) # by using haplo.glm.control fit.dom <- haplo.glm(y ~ male + geno.glm, family = gaussian, data = glm.data, na.action = "na.geno.keep", locus.label = genolabel, control = haplo.glm.control(haplo.effect='dominant', haplo.min.count=8)) summary(fit.dom) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----glmBaseline, echo=TRUE----------------------------------------------------------------------- # control baseline selection, perform the same exact run as fit.bin, # but different baseline by using haplo.base chosen from haplo.common fit.bin$haplo.common fit.bin$haplo.freq.init[fit.bin$haplo.common] fit.bin.base77 <- haplo.glm(y.bin ~ male + geno.glm, family = binomial, data = glm.data, na.action = "na.geno.keep", locus.label = genolabel, control = haplo.glm.control(haplo.base=77, haplo.min.count=8)) summary(fit.bin.base77) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----glmEPS1, eval=TRUE--------------------------------------------------------------------------- glm.data$ybig <- glm.data$y*50 fit.gausbig <- haplo.glm(formula = ybig ~ male + geno.glm, family = gaussian, data = glm.data, na.action = "na.geno.keep", locus.label = genolabel, control = haplo.glm.control(haplo.freq.min = 0.02), x = TRUE) summary(fit.gausbig) fit.gausbig$rank.info fit.gaus$rank.info ## ----echo=FALSE, eval=TRUE------------------------------------------------------------------------ set.seed(seed) ## ----glmEPS2-------------------------------------------------------------------------------------- fit.gausbig.eps <- haplo.glm(formula = ybig ~ male + geno.glm, family = gaussian, data = glm.data, na.action = "na.geno.keep", locus.label = genolabel, control = haplo.glm.control(eps.svd=1e-10, haplo.freq.min = 0.02), x = TRUE) summary(fit.gausbig.eps) fit.gausbig.eps$rank.info ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----glmRare2, echo=TRUE-------------------------------------------------------------------------- ## set haplo.freq.min and haplo.min.info to same value to show how the ## rare coefficient may be modeled but standard error estimate is not ## calculated because all haps are below haplo.min.info ## warning expected fit.bin.rare02 <- haplo.glm(y.bin ~ geno.glm, family = binomial, data = glm.data, na.action = "na.geno.keep", locus.label = genolabel, control = haplo.glm.control(haplo.freq.min=.02, haplo.min.info=.02)) summary(fit.bin.rare02) ## ----vcovGLM-------------------------------------------------------------------------------------- varmat <- vcov(fit.gaus) dim(varmat) varmat <- vcov(fit.gaus, freq=FALSE) dim(varmat) print(varmat, digits=2) ## ----anovaGLM------------------------------------------------------------------------------------- fit.gaus$lrt glmfit.gaus <- glm(y~male, family="gaussian", data=glm.data) anova.haplo.glm(glmfit.gaus, fit.gaus) anova.haplo.glm(fit.gaus, fit.inter) anova.haplo.glm(glmfit.gaus, fit.gaus, fit.inter) ## ----scoreMerge,echo=TRUE------------------------------------------------------------------------- # merge haplo.score and haplo.group results merge.bin <- haplo.score.merge(score.bin, group.bin) print(merge.bin, nlines=10) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----hapCC, echo=TRUE----------------------------------------------------------------------------- # demo haplo.cc where haplo.min.count is specified # use geno, and this function prepares it for haplo.glm y.bin <- 1*(hla.demo$resp.cat=="low") cc.hla <- haplo.cc(y=y.bin, geno=geno, locus.label = genolabel, control=haplo.glm.control(haplo.freq.min=.02)) print(cc.hla, nlines=25, digits=2) names(cc.hla) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----label=scoreSlide, echo=TRUE, eval=TRUE------------------------------------------------------- # haplo.score on 11 loci, slide on 3 consecutive loci at a time geno.11 <- hla.demo[,-c(1:4)] label.11 <- c("DPB","DPA","DMA","DMB","TAP1","TAP2","DQB","DQA","DRB","B","A") score.slide.gaus <- haplo.score.slide(hla.demo$resp, geno.11, trait.type = "gaussian", n.slide=3, min.count=5, locus.label=label.11) print(score.slide.gaus) ## ----label=plotSlide, fig=TRUE, echo=TRUE--------------------------------------------------------- # plot global p-values for sub-haplotypes from haplo.score.slide plot.haplo.score.slide(score.slide.gaus) ## ----eval=TRUE, echo=FALSE------------------------------------------------------------------------ set.seed(seed) ## ----hapScan, echo=TRUE, eval=TRUE---------------------------------------------------------------- geno.11 <- hla.demo[,-c(1:4)] y.bin <- 1*(hla.demo$resp.cat=="low") hla.summary <- summaryGeno(geno.11, miss.val=c(0,NA)) # track those subjects with too many possible haplotype pairs ( > 50,000) many.haps <- (1:length(y.bin))[hla.summary[,4] > 50000] # For speed, or even just so it will finish, make y.bin and geno.scan # for genotypes that don't have too many ambigous haplotypes geno.scan <- geno.11[-many.haps,] y.scan <- y.bin[-many.haps] # scan haplotypes for regions within width of 3 for each locus. # test statistic measures difference in haplotype counts in cases and controls # p-values are simulated for each locus and the maximum statistic, # we do 100 simuations here, should use default settings for analysis scan.hla <- haplo.scan(y.scan, geno.scan, width=3, sim.control=score.sim.control(min.sim=100, max.sim=100), em.control=haplo.em.control()) print(scan.hla) ## ----label=hapDesign1, echo=TRUE------------------------------------------------------------------ # create a matrix of haplotype effect columns from haplo.em result hap.effect.frame <- haplo.design(save.em) names(hap.effect.frame) hap.effect.frame[1:10,1:8] ## ----label=hapDesign2,echo=TRUE------------------------------------------------------------------- # create haplotype effect cols for haps 4 and 138 hap4.hap138.frame <- haplo.design(save.em, hapcodes=c(4,138), haplo.effect="dominant") hap4.hap138.frame[1:10,] dat.glm <- data.frame(resp=hla.demo$resp, male=hla.demo$male, age=hla.demo$age, hap.4=hap4.hap138.frame$hap.4, hap.138=hap4.hap138.frame$hap.138) glm.hap4.hap138 <- glm(resp ~ male + age + hap.4 + hap.138, family="gaussian", data=dat.glm) summary(glm.hap4.hap138)