## ----eval = FALSE------------------------------------------------------------- # if (!require("BiocManager")) # install.packages("BiocManager") # BiocManager::install("BiocHail") ## ----getlib,message=FALSE----------------------------------------------------- library(BiocHail) library(ggplot2) ## ----get1, eval=FALSE--------------------------------------------------------- # hl <- hail_init() # mt <- get_1kg(hl) # mt # print(mt$rows()$select()$show(5L)) # limited info ## ----getdim, eval=FALSE------------------------------------------------------- # mt$count() ## ----dodim, eval=FALSE-------------------------------------------------------- # dim.hail.matrixtable.MatrixTable <- function(x) { # tmp <- x$count() # c(tmp[[1]], tmp[[2]]) # } # dim(mt) # ncol.hail.matrixtable.MatrixTable <- function(x) { # dim(x)[2] # } # nrow.hail.matrixtable.MatrixTable <- function(x) { # dim(x)[1] # } # nrow(mt) ## ----domo, eval=FALSE--------------------------------------------------------- # annopath <- path_1kg_annotations() # tab <- hl$import_table(annopath, impute=TRUE)$key_by("Sample") ## ----getsup, eval=FALSE------------------------------------------------------- # mt$aggregate_cols(hl$agg$counter(mt$pheno$SuperPopulation)) ## ----lkcaf, eval=FALSE-------------------------------------------------------- # uu <- mt$aggregate_cols(hl$agg$stats(mt$pheno$CaffeineConsumption)) # names(uu) # uu$mean # uu$stdev ## ----dohist, eval=FALSE------------------------------------------------------- # p_hist <- mt$aggregate_entries( # hl$expr$aggregators$hist(mt$DP, 0L, 30L, 30L)) # names(p_hist) # length(p_hist$bin_edges) # length(p_hist$bin_freq) # midpts <- function(x) diff(x)/2+x[-length(x)] # dpdf <- data.frame(x=midpts(p_hist$bin_edges), y=p_hist$bin_freq) # ggplot(dpdf, aes(x=x,y=y)) + geom_col() + ggtitle("DP") + ylab("Frequency") ## ----lkagg, eval=FALSE-------------------------------------------------------- # names(hl$expr$aggregators) ## ----update, eval=FALSE------------------------------------------------------- # mt <- hl$sample_qc(mt) ## ----lkcr, eval=FALSE--------------------------------------------------------- # callrate <- mt$sample_qc$call_rate$collect() # graphics::hist(callrate) ## ----after, eval=FALSE-------------------------------------------------------- # dim(mt) ## ----addvarqc, eval=FALSE----------------------------------------------------- # mt = hl$variant_qc(mt) ## ----dolam, eval=FALSE-------------------------------------------------------- # pv = gwas$p_value$collect() # x2 = stats::qchisq(1-pv,1) # lam = stats::median(x2, na.rm=TRUE)/stats::qchisq(.5,1) # lam ## ----doqq, eval=FALSE--------------------------------------------------------- # qqplot(-log10(ppoints(length(pv))), -log10(pv), xlim=c(0,8), ylim=c(0,8), # ylab="-log10 p", xlab="expected") # abline(0,1) ## ----eval=FALSE--------------------------------------------------------------- # locs <- gwas$locus$collect() # conts <- sapply(locs, function(x) x$contig) # pos <- sapply(locs, function(x) x$position) # library(igvR) # mytab <- data.frame(chr=as.character(conts), pos=pos, pval=pv) # gt <- GWASTrack("simp", mytab, chrom.col=1, pos.col=2, pval.col=3) # igv <- igvR() # setGenome(igv, "hg19") # displayTrack(igv, gt) ## ----lkpcs, eval=FALSE-------------------------------------------------------- # sc <- pcastuff[[2]]$scores$collect() # pc1 = sapply(sc, "[", 1) # pc2 = sapply(sc, "[", 2) # fac = mt$pheno$SuperPopulation$collect() # myd = data.frame(pc1, pc2, pop=fac) # library(ggplot2) # ggplot(myd, aes(x=pc1, y=pc2, colour=factor(pop))) + geom_point() ## ----dolam2, eval=FALSE------------------------------------------------------- # pv = gwas2$p_value$collect() # x2 = stats::qchisq(1-pv,1) # lam = stats::median(x2, na.rm=TRUE)/stats::qchisq(.5,1) # lam ## ----doman, eval=FALSE-------------------------------------------------------- # locs <- gwas2$locus$collect() # conts <- sapply(locs, function(x) x$contig) # pos <- sapply(locs, function(x) x$position) # mytab <- data.frame(chr=as.character(conts), pos=pos, pval=pv) # ggplot(mytab[mytab$chr=="8",], aes(x=pos, y=-log10(pval))) + geom_point() ## ----sessionInfo-------------------------------------------------------------- sessionInfo()