library(CSAMA2014ScalableComputingLab) fastq.sampler <- FastqSampler(NA12878.20.fastq, n = 1e6, readerBlockSize = 1e6) set.seed(1975) fastq <- yield(fastq.sampler) length(fastq) qa.result <- qa(fastq, "NA12878") qa.result[["baseCalls"]] bam.file <- ScalableGenomics::NA12878.20.bam param <- ScanBamParam(what=c("pos", "qwidth"), flag=scanBamFlag(isUnmappedQuery=FALSE)) bam <- scanBam(bam.file, param=param)[[1]] aln.ranges <- IRanges(bam$pos, width=bam$qwidth) cov <- coverage(aln.ranges) cov <- coverage(bam.file, param=param) cov[[20]] compression <- length(cov) / (length(runValue(cov))*2) compression cov.cutoff <- 150L high.cov <- cov > cov.cutoff length(high.cov) / (length(runValue(high.cov))*2) high.cov.vector <- as.vector(high.cov) microbenchmark(sum(high.cov), sum(high.cov.vector)) summary(bm)[,c("expr", "median")] high.cov.views <- slice(cov[[20]], cov.cutoff) head(high.cov.views) mean.high.cov <- viewMeans(high.cov.views) head(mean.high.cov) genome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19 object.size(genome) chr20.views <- Views(genome$chr20, ranges(high.cov.views)) unmasked(genome$chr20)@shared subject(chr20.views)@shared high.cov.freq <- alphabetFrequency(chr20.views, as.prob=TRUE, collapse=TRUE) chr20.freq <- alphabetFrequency(genome$chr20, as.prob=TRUE) rbind(high.cov.freq, chr20.freq)[,DNA_BASES] cov20 <- cov[[20]] session <- browserSession() chr20.range <- GRangesForUCSCGenome("hg19", "chr20") repeats.table <- getTable(session, "rmsk", chr20.range) simple.classes <- c("Simple_repeat", "Low_complexity") repeats.simple <- subset(repeats.table, repClass %in% simple.classes) repeats <- with(repeats.simple, IRanges(genoStart, genoEnd)) hits <- findOverlaps(repeats, high.cov.views) f <- factor(subjectHits(hits), seq_len(subjectLength(hits))) repeats.olap <- pintersect(repeats[queryHits(hits)], ranges(high.cov.views)[subjectHits(hits)]) repeats.split <- split(repeats.olap, f) class(repeats.split) repeat.cov <- sum(width(repeats.split)) length(repeat.cov) sum(repeat.cov) / sum(width(high.cov.views)) sum(width(reduce(repeats))) / width(chr20.range) chr20.set <- as(chr20.views, "DNAStringSet") first.seq <- chr20.set[[1]] first.seq@shared unmasked(genome$chr20)@shared repeats.tree <- GIntervalTree(GRanges("20", repeats)) bam.stream <- BamFile(path(bam.file), yieldSize = 1e6) maskRule <- function(reads) { strand <- Rle(strand("*"), nrow(reads)) ga <- with(reads, GAlignments(rname, pos, cigar, strand)) ga %outside% repeats.tree } rules <- FilterRules(list(mask = maskRule)) dest <- paste0("masked-", basename(path(bam.file))) param <- ScanBamParam(what=c("rname", "pos", "cigar"), flag=scanBamFlag(isUnmappedQuery=FALSE)) filtered.bam <- filterBam(bam.file, dest, param=param, filter=rules) bam.stream <- BamFile(path(filtered.bam), yieldSize = 1e6) cov2 <- RleList(lapply(seqlengths(bam.stream), Rle, values=0L), compress=FALSE) open(bam.stream) while(isIncomplete(bam.stream)) { reads <- readGAlignments(bam.stream) cov2 <- cov2 + coverage(reads) } close(bam.stream) sum(gc(reset=TRUE)[,6]) bam.stream <- Rsamtools::BamFile(path(filtered.bam), yieldSize = 1e6) coverage(GenomicAlignments::readGAlignments(bam.stream)) sum(gc()[,6]) tiles <- tileGenome(seqinfo(filtered.bam)["20"], tilewidth=1e6) cov3 <- Reduce(`+`, lapply(tiles, function(tile) { param <- ScanBamParam(which=tile) reads <- readGAlignments(filtered.bam, param=param) intersection <- restrict(unlist(grglist(reads)), start(tile), end(tile)) coverage(intersection) })) identical(cov2, cov3) export(cov3, "NA12878.20.coverage.bw") register(SerialParam()) pileup.bams <- BamFileList(NA12878 = filtered.bam) bpparam <- MulticoreParam(workers=2) library(VariantTools) # will not work on Windows or older Macs pileup <- do.call(c, bplapply(tiles, function(tile) { pileupVariants(pileup.bams, genome, PileupParam(which=tile)) }, BPPARAM=bpparam)) cov800 <- summary(BigWigFile("NA12878.20.coverage.bw"), size = 3846) bigwig.track <- ggplot() + geom_bar(cov800[[20]]) + ylab("coverage") filtered.bam <- BamFile("masked-NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam") cov.estimate <- biovizBase::estimateCoverage(filtered.bam) cov.estimate <- keepSeqlevels(cov.estimate, "20") estimate.track <- ggplot() + geom_bar(cov.estimate) + ylab("coverage") p <- tracks(bigwig = bigwig.track, estimate = estimate.track) cov800 <- summary(BigWigFile("NA12878.20.coverage.bw"), size = 800) p high.cov.gr <- GRanges("20", ranges(high.cov.views)) genome(high.cov.gr) <- "hg19" high.cov.gr <- high.cov.gr[width(high.cov.gr) > 50L] roi <- high.cov.gr[200] * 2L reference.track <- autoplot(genome, which = roi) tally.track <- autoplot(filtered.bam, stat = "mismatch", bsgenome = genome, which = roi, geom = "bar") vcf <- ScalableGenomics::NA12878.20.vcf variant.track <- autoplot(vcf, type = "fixed", ref.show = FALSE, which = roi) p <- tracks(tally = tally.track, variant = variant.track, reference = reference.track, heights = c(4, 1, 1)) + theme_bw() pdf(file="../pkg/vignettes/fig/tracks.pdf",height=5,width=9) p dev.off() png(filename="fig/tracks-zoom.png",height=400,width=800) p + xlim(roi + 10) dev.off()