library(CSAMA2014RangesAnnotationLab) vcf.file <- NA12878_pg.chr20.vcf.bgz header <- scanVcfHeader(vcf.file) header samples(header) geno(header) info(header)["END",] vcf <- readVcf(vcf.file, genome="hg19") print(object.size(vcf), unit="auto") print(object.size(vcf) * 40L, unit="auto") library(BSgenome.Hsapiens.UCSC.hg19) seqinfo(Hsapiens) ranges.chr20 <- as(seqinfo(Hsapiens)["chr20"], "GRanges") ranges.chr20 param <- ScanVcfParam(which=ranges.chr20) vcf.chr20 <- readVcf(vcf.file, genome="hg19", param=param) vcf rowData(vcf) head(geno(vcf)$GT) variants <- vcf[geno(vcf)$GT[,1] != "0/0",] alt(variants) snvs <- variants[isSNV(variants),] snvs <- expand(snvs) head(alt(snvs)) prefilters <- FilterRules(list(restrictToVariants=function(text) { !grepl("0/0", text, fixed=TRUE) })) filters <- FilterRules(list(restrictToSNVs=function(vcf) isSNV(vcf))) filterVcf(vcf.file, genome="hg19", "snvs.vcf", index=TRUE, prefilters=prefilters, filters=filters, param=ScanVcfParam(info=NA)) cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") color.scheme <- cbPalette my.theme <- lattice::trellis.par.get() my.theme <- within(my.theme, { box.rectangle$col <- "black" box.rectangle$fill <- "gray90" box.umbrella$col <- "black" box.umbrella$lty <- 1 strip.background$col <- rep("gray90", 7) plot.symbol$col <- "black" plot.symbol$pch <- 19 box.dot$cex <- 1 box.dot$pch <- "|" add.line$col <- "red" plot.polygon$col <- "gray90" superpose.symbol$fill <- color.scheme superpose.symbol$col <- color.scheme superpose.line$fill <- color.scheme superpose.line$col <- color.scheme superpose.polygon$fill <- color.scheme superpose.polygon$col <- color.scheme }) lattice.options(default.theme = my.theme) pdf(file="fig/cov.pdf",height=4,width=4) library(lattice) densityplot(~ geno(snvs)$DP, xlim=c(0, 2*median(geno(snvs)$DP, na.rm=TRUE)), plot.points=FALSE) dev.off() pdf(file="fig/freq-cov-bin.pdf",height=4,width=8) rowData(snvs)$coverage.bin <- cut(geno(snvs)$DP, c(0, 20, 80, Inf)) rowData(snvs)$variant.freq <- geno(snvs)$AD[,,2]/geno(snvs)$DP[,1] densityplot(~ variant.freq | coverage.bin, as.data.frame(rowData(snvs)), plot.points=FALSE, layout=c(3,1), xlab="variant frequency by coverage bin") dev.off() fixed(header(snvs))$FILTER table(unlist(strsplit(filt(snvs), ";", fixed=TRUE))) snvs <- snvs[grep("PASS", filt(snvs), fixed=TRUE),] table(rowData(snvs)$coverage.bin) pdf(file="fig/freq-cov-bin-2.pdf",height=4,width=4) densityplot(~ variant.freq, as.data.frame(rowData(snvs)), plot.points=FALSE, xlab="variant frequency") dev.off() indels <- variants[isIndel(variants)] indel.windows <- rowData(indels) + 10 rowData(snvs)$near.indel <- snvs %over% indel.windows xtabs(~ near.indel, mcols(rowData(snvs))) chr20.sequence <- getSeq(Hsapiens, "chr20") chr20.hp <- ranges(Rle(as.raw(chr20.sequence))) chr20.hp <- chr20.hp[width(chr20.hp) > 6L] rowData(snvs)$over.hp <- ranges(snvs) %over% chr20.hp xtabs(~ over.hp, mcols(rowData(snvs))) data(selfChains) gene.models <- TxDb.Hsapiens.UCSC.hg19.knownGene locations <- locateVariants(snvs, gene.models, CodingVariants()) colnames(mcols(locations)) rowData(snvs)$coding.tx <- NA_integer_ rowData(snvs)$coding.tx[locations$QUERYID] <- locations$TXID syms <- unlist(mget(locations$GENEID[!is.na(locations$GENEID)], org.Hs.egSYMBOL, ifnotfound=NA)) locations$SYMBOL[!is.na(locations$GENEID)] <- syms coding <- predictCoding(snvs, gene.models, Hsapiens) colnames(mcols(coding)) table(coding$CONSEQUENCE) CEUvcf <- file.path(system.file("extdata", package="VariantFiltering"), "CEUtrio.vcf.bgz") CEUped <- file.path(system.file("extdata", package="VariantFiltering"), "CEUtrio.ped") param <- VariantFilteringParam(vcfFilenames=CEUvcf, pedFilename=CEUped) reHo <- autosomalRecessiveHomozygous(param) reHo aim <- reportVariants(reHo) maxMAF(reHo) <- 0.05 MAFmask <- MAFpop(reHo) MAFmask MAFpop(reHo) <- !MAFmask MAFpop(reHo, "ASN_AFKG") <- TRUE MAFpop(reHo) minCRYP5ss(reHo) <- 0 reHo filteredVariants(reHo) ranges.20 <- ranges.chr20 seqlevelsStyle(ranges.20) <- "NCBI" broad.vcf <- readVcf("~/projects/papers/ScalableGenomics/work/CEUTrio.HiSeq.WGS.b37.bestPractices.phased.b37.vcf.gz", param=ScanVcfParam(which=ranges.20), genome="hg19") broad.snvs <- broad.vcf[isSNV(broad.vcf) & geno(broad.vcf)$GT[,1] != "0/0",1] seqlevelsStyle(broad.snvs) <- "UCSC" snvs <- dropSeqlevels(snvs, "chrM") broad.vr <- as(broad.snvs, "VRanges") illumina.vr <- as(snvs, "VRanges") illumina.vr$in.broad <- illumina.vr %in% broad.vr mean(illumina.vr$in.broad) illumina.vr$broad.freq <- altFraction(broad.vr)[match(illumina.vr, broad.vr)] pdf(file="fig/scatter.pdf",height=4,width=4) xyplot(broad.freq ~ altFraction(illumina.vr), as.data.frame(illumina.vr), panel=panel.smoothScatter, xlab="Illumina frequency", ylab="Broad frequency") dev.off() runs <- vcf[!is.na(info(vcf)$END),] end(rowData(runs)) <- info(runs)$END sum(width(runs)) / seqlengths(runs)["chr20"]