## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"------------------ BiocStyle::latex2() ## ----load-purecn, echo=FALSE, message=FALSE------------------------------ library(PureCN) set.seed(1234) ## ----examplecoverage----------------------------------------------------- bam.file <- system.file("extdata", "ex1.bam", package="PureCN", mustWork = TRUE) interval.file <- system.file("extdata", "ex1_intervals.txt", package="PureCN", mustWork = TRUE) calculateBamCoverageByInterval(bam.file=bam.file, interval.file=interval.file, output.file="ex1_coverage.txt") ## ----examplegc----------------------------------------------------------- interval.file <- system.file("extdata", "ex2_intervals.txt", package = "PureCN", mustWork = TRUE) reference.file <- system.file("extdata", "ex2_reference.fa", package = "PureCN", mustWork = TRUE) calculateGCContentByInterval(interval.file, reference.file, output.file = "ex2_gc_file.txt") ## ----example_files, message=FALSE, warning=FALSE, results='hide'--------- normal.coverage.file <- system.file("extdata", "example_normal.txt", package="PureCN") normal2.coverage.file <- system.file("extdata", "example_normal2.txt", package="PureCN") normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) tumor.coverage.file <- system.file("extdata", "example_tumor.txt", package="PureCN") seg.file <- system.file("extdata", "example_seg.txt", package = "PureCN") vcf.file <- system.file("extdata", "example_vcf.vcf", package="PureCN") gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", package="PureCN") ## ----gccorrect----------------------------------------------------------- correctCoverageBias(normal.coverage.file, gc.gene.file, "example_normal_loess.txt") ## ----normaldb------------------------------------------------------------ normalDB <- createNormalDatabase(normal.coverage.files) # serialize, so that we need to do this only once for each assay saveRDS(normalDB, file="normalDB.rds") ## ----normaldbpca--------------------------------------------------------- normalDB <- readRDS("normalDB.rds") # get the best normal best.normal.coverage.file <- findBestNormal(tumor.coverage.file, normalDB) ## ----normaldbpcapool----------------------------------------------------- # get the best 2 normals and average them best.normal.coverage.files <- findBestNormal(tumor.coverage.file, normalDB, num.normals=2) pool <- poolCoverage(lapply(best.normal.coverage.files, readCoverageGatk), remove.chrs=c('chrX', 'chrY')) ## ----targetweightfile1--------------------------------------------------- target.weight.file <- "target_weights.txt" createTargetWeights(tumor.coverage.file, normal.coverage.files, target.weight.file) ## ----snpblacklist-------------------------------------------------------- snp.blacklist <- c("SNP_blacklist.csv", "SNP_blacklist_segmented.csv") recreateBlacklists <- FALSE if (recreateBlacklists) { mutect.normal.files <- dir("poolofnormals", pattern="vcf$", full.names=TRUE) snp.bl <- createSNPBlacklist(mutect.normal.files) write.csv(snp.bl[[1]], file=snp.blacklist[1]) write.csv(snp.bl[[2]], file=snp.blacklist[2], row.names=FALSE, quote=FALSE) } ## ----ucsc_segmental------------------------------------------------------ # Instead of using a pool of normals to find low quality regions, # we use suitable BED files, for example from the UCSC genome browser. # We do not download these in this vignette to avoid build failures # due to internet connectivity problems. downloadFromUCSC <- FALSE if (downloadFromUCSC) { library(rtracklayer) mySession <- browserSession("UCSC") genome(mySession) <- "hg19" tbl.segmentalDups <- getTable( ucscTableQuery(mySession, track="Segmental Dups", table="genomicSuperDups")) tbl.simpleRepeats <- getTable( ucscTableQuery(mySession, track="Simple Repeats", table="simpleRepeat")) write.table(tbl.segmentalDups[,-1], file="hg19_segmentalDuplications.txt", sep="\t", row.names=FALSE) write.table(tbl.simpleRepeats[,-1], file="hg19_simpleRepeats.txt", sep="\t", row.names=FALSE) } snp.blacklist <- c('hg19_segmentalDuplications.txt', 'hg19_simpleRepeats.txt') ## ----runpurecn----------------------------------------------------------- ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, genome="hg19", sampleid='Sample1', gc.gene.file=gc.gene.file, # args.filterVcf=list(snp.blacklist=snp.blacklist, # stats.file=mutect.stats.file), args.filterTargets=list(normalDB=normalDB), args.segmentation=list(target.weight.file=target.weight.file), post.optimize=FALSE, plot.cnv=FALSE, verbose=FALSE) ## ----createoutput-------------------------------------------------------- file.rds <- 'Sample1_PureCN.rds' saveRDS(ret, file=file.rds) pdf('Sample1_PureCN.pdf', width=10, height=12) plotAbs(ret, type='all') dev.off() ## ----figureexample1, fig.show='hide', fig.width=6, fig.height=6---------- plotAbs(ret, type="overview") ## ----figureexample2, fig.show='hide', fig.width=6, fig.height=6---------- plotAbs(ret, 1, type="hist") ## ----figureexample3, fig.show='hide', fig.width=8, fig.height=8---------- plotAbs(ret, 1, type="BAF") ## ----figureexample4, fig.show='hide', fig.width=8, fig.height=8---------- plotAbs(ret, 1, type="AF") ## ----curationfile-------------------------------------------------------- createCurationFile(file.rds) ## ----readcurationfile---------------------------------------------------- ret <- readCurationFile(file.rds) ## ----curationfileshow---------------------------------------------------- read.csv('Sample1_PureCN.csv') ## ----figureexample1b, fig.show='hide', fig.width=6, fig.height=6--------- ret <- autoCurateResults(ret) plotAbs(ret, type="overview") ## ----curationauto-------------------------------------------------------- file.curated.rds <- 'Sample1_PureCN_autocurated.rds' saveRDS(ret, file=file.curated.rds) pdf('Sample1_PureCN_autocurated.pdf', width=10, height=12) plotAbs(ret, type='all') dev.off() createCurationFile(file.curated.rds) ## ----customseg----------------------------------------------------------- retSegmented <- runAbsoluteCN(seg.file=seg.file, gc.gene.file=gc.gene.file, vcf.file=vcf.file, max.candidate.solutions=1, genome="hg19", test.purity=seq(0.3,0.7,by=0.05), verbose=FALSE, plot.cnv=FALSE) ## ----figurecustombaf, fig.show='hide', fig.width=8, fig.height=8--------- plotAbs(retSegmented, 1, type="BAF") ## ----customlogratio------------------------------------------------------ # We still use the log-ratio exactly as normalized by PureCN for this # example log.ratio <- calculateLogRatio(readCoverageGatk(normal.coverage.file), readCoverageGatk(tumor.coverage.file), verbose=FALSE) retLogRatio <- runAbsoluteCN(log.ratio=log.ratio, gc.gene.file=gc.gene.file, vcf.file=vcf.file, max.candidate.solutions=1, genome="hg19", test.purity=seq(0.3,0.7,by=0.05), verbose=FALSE, args.filterTargets=list(normalDB=normalDB), plot.cnv=FALSE) ## ----output1------------------------------------------------------------- names(ret) ## ----output3------------------------------------------------------------- head(predictSomatic(ret), 3) ## ----output4------------------------------------------------------------- vcf <- predictSomatic(ret, return.vcf=TRUE) writeVcf(vcf, file="Sample1_PureCN.vcf") ## ----calling2------------------------------------------------------------ gene.calls <- callAlterations(ret) head(gene.calls) ## ----loh----------------------------------------------------------------- loh <- callLOH(ret) head(loh) ## ----power1, fig.show='hide', fig.width=6, fig.height=6------------------ purity <- c(0.1,0.15,0.2,0.25,0.4,0.6,1) coverage <- seq(5,35,1) power <- lapply(purity, function(p) sapply(coverage, function(cv) calculatePowerDetectSomatic(coverage=cv, purity=p, ploidy=2, verbose=FALSE)$power)) # Figure S7b in Carter et al. plot(coverage, power[[1]], col=1, xlab="Sequence coverage", ylab="Detection power", ylim=c(0,1), type="l") for (i in 2:length(power)) lines(coverage, power[[i]], col=i) abline(h=0.8, lty=2, col="grey") legend("bottomright", legend=paste("Purity", purity), fill=seq_along(purity)) ## ----power2, fig.show='hide', fig.width=6, fig.height=6------------------ coverage <- seq(5,350,1) power <- lapply(purity, function(p) sapply(coverage, function(cv) calculatePowerDetectSomatic(coverage=cv, purity=p, ploidy=2, cell.fraction=0.2, verbose=FALSE)$power)) plot(coverage, power[[1]], col=1, xlab="Sequence coverage", ylab="Detection power", ylim=c(0,1), type="l") for (i in 2:length(power)) lines(coverage, power[[i]], col=i) abline(h=0.8, lty=2, col="grey") legend("bottomright", legend=paste("Purity", purity), fill=seq_along(purity)) ## ----annotatesymbols----------------------------------------------------- tblGenes <- NULL if (downloadFromUCSC) { library(rtracklayer) mySession <- browserSession("UCSC") genome(mySession) <- "hg19" tblGenes <- getTable( ucscTableQuery(mySession, track="RefSeq Genes", table="refGene")) } .annotateIntervals <- function(gc.gene.file, tblGenes, output.file = NULL) { grGenes <- GRanges(seqnames=tblGenes$chrom, IRanges(start=tblGenes$cdsStart, end=tblGenes$cdsEnd)) gc <- read.delim(gc.gene.file, as.is=TRUE) # misuse this function to convert interval string into data.frame gc.data <- readCoverageGatk(gc.gene.file) grGC <- GRanges(seqnames=gc.data$chr, IRanges(start=gc.data$probe_start, end=gc.data$probe_end)) ov <- findOverlaps(grGC, grGenes, select="first") gc$Gene <- tblGenes$name2[ov] if (!is.null(output.file)) { write.table(gc, file = output.file, row.names = FALSE, quote = FALSE, sep = "\t") } invisible(gc) } if (!is.null(tblGenes)) .annotateIntervals(gc.gene.file, tblGenes) ## ----sessioninfo, results='asis', echo=FALSE----------------------------- toLatex(sessionInfo())