\documentclass{article} \usepackage{amsmath} \usepackage{float} \usepackage{amscd} \usepackage[tableposition=top]{caption} \usepackage{ifthen} \usepackage[utf8]{inputenc} \usepackage{hyperref} \usepackage[margin=1in,footskip=0.25in]{geometry} %\VignetteIndexEntry{Using CODEX} \begin{document} \SweaveOpts{concordance=TRUE} \title{CODEX vignette} \author{Yuchao Jiang \\ \href{mailto:yuchaoj@wharton.upenn.edu}{yuchaoj@wharton.upenn.edu}} \maketitle This is a demo for using the \verb@CODEX@ package in R. \verb@CODEX@ is a normalization and copy number variation calling procedure for whole exome DNA sequencing data. \verb@CODEX@ relies on the availability of multiple samples processed using the same sequencing pipeline for normalization, and does not require matched controls. The normalization model in \verb@CODEX@ includes terms that specifically remove biases due to GC content, exon capture and amplification efficiency, and latent systemic artifacts. \verb@CODEX@ also includes a Poisson likelihood-based recursive segmentation procedure that explicitly models the count-based exome sequencing data. Below is an example on calling copy number variation using whole-exome sequencing data of 46 HapMap samples sequenced at the Washington University Genome Sequencing Center. Only the 401-500 exons from chromosome 22 are analysed for illustration purposes. R packages are available at Bioconductor for \href{http://www.bioconductor.org/packages/devel/bioc/html/CODEX.html}{CODEX} and the toy dataset \href{http://www.bioconductor.org/packages/devel/data/experiment/html/WES.1KG.WUGSC.html}{WES.1KG.WUGSC}. \section*{1. Website and online forum} CODEX's website with usage and installation information: {\href{https://github.com/yuchaojiang/CODEX}{https://github.com/yuchaojiang/CODEX}.\\ Online Q\&A forum: \href{https://groups.google.com/d/forum/codex_wes_cnv}{https://groups.google.com/d/forum/codex\_wes\_cnv}.\\ If you've any questions regarding the software, please don't hesitate emailing us at \href{mailto:codex_wes_cnv@googlegroups.com}{codex\_wes\_cnv@googlegroups.com}. \section*{2. CODEX workflow:} \subsection*{2.1 Install CODEX.} Install the current release from Bioconductor: <<install1, eval=FALSE>>= ## try http:// if https:// URLs are not supported if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager") BiocManager::install("CODEX") @ Install the devel version from GitHub: <<install2, eval=FALSE>>= install.packages("devtools") library(devtools) install_github("yuchaojiang/CODEX/package") @ \subsection*{2.2 Get directories of .bam files, read in exon target positions from .bed files, and get sample names.} The direct input of \verb@CODEX@ include: \verb@bamdir@, which is a vector indicating the directories of all .bam files; \verb@sampname@, which is a column vector with row entries of sample names; \verb@bedFile@, which indicates the directory of the .bed file (WES bait file, no header, sorted by start and end positions); and \verb@chr@, which specifies the chromosome. \verb@CODEX@ processes the entire genome chromosome by chromosome; make sure the chromosome formats are consistent between the .bed and the .bam files. <<bambedObj1>>= library(CODEX) library(WES.1KG.WUGSC) # Load Toy data from the 1000 Genomes Project. dirPath <- system.file("extdata", package = "WES.1KG.WUGSC") bamFile <- list.files(dirPath, pattern = '*.bam$') bamdir <- file.path(dirPath, bamFile) sampname <- as.matrix(read.table(file.path(dirPath, "sampname"))) bedFile <- file.path(dirPath, "chr22_400_to_500.bed") chr <- 22 bambedObj <- getbambed(bamdir = bamdir, bedFile = bedFile, sampname = sampname, projectname = "CODEX_demo", chr) bamdir <- bambedObj$bamdir; sampname <- bambedObj$sampname ref <- bambedObj$ref; projectname <- bambedObj$projectname; chr <- bambedObj$chr @ \subsection*{2.3 Get raw read depth from the .bam files. Read lengths across all samples are also returned.} <<coverageObj1>>= coverageObj <- getcoverage(bambedObj, mapqthres = 20) Y <- coverageObj$Y; readlength <- coverageObj$readlength @ \subsection*{2.4 Compute GC content and mappability for each exon target.} <<gcmapp1>>= gc <- getgc(chr, ref) mapp <- getmapp(chr, ref) @ \subsection*{2.5 Take a sample-wise and exon-wise quality control procedure on the depth of coverage matrix.} <<qcObj1>>= qcObj <- qc(Y, sampname, chr, ref, mapp, gc, cov_thresh = c(20, 4000), length_thresh = c(20, 2000), mapp_thresh = 0.9, gc_thresh = c(20, 80)) Y_qc <- qcObj$Y_qc; sampname_qc <- qcObj$sampname_qc; gc_qc <- qcObj$gc_qc mapp_qc <- qcObj$mapp_qc; ref_qc <- qcObj$ref_qc; qcmat <- qcObj$qcmat write.table(qcmat, file = paste(projectname, '_', chr, '_qcmat', '.txt', sep=''), sep='\t', quote=FALSE, row.names=FALSE) @ \subsection*{2.6 Fit Poisson latent factor model for normalization of the read depth data.} <<normObj1>>= normObj <- normalize(Y_qc, gc_qc, K = 1:9) Yhat <- normObj$Yhat; AIC <- normObj$AIC; BIC <- normObj$BIC RSS <- normObj$RSS; K <- normObj$K @ If the WES is designed under case-control setting, CODEX estimates the exon-wise Poisson latent factor using only the read depths in the control cohort, and then computes the sample-wise latent factor terms for the case samples by regression. \verb@normal_index@ specifies the indices of normal samples and the normalization function to use under this setting is \verb$normalize2$. <<normObj2, eval=FALSE>>= normObj <- normalize2(Y_qc, gc_qc, K = 1:9, normal_index=seq(1,45,2)) Yhat <- normObj$Yhat; AIC <- normObj$AIC; BIC <- normObj$BIC RSS <- normObj$RSS; K <- normObj$K @ \subsection*{2.7 Determine the number of latent factors. AIC, BIC, and deviance reduction plots are generated in a .pdf file.} CODEX reports all three statistical metrics (AIC, BIC, percent of Variance explained) and uses BIC as the default method to determine the number of Poisson factors. Since false positives can be screened out through a closer examination of the post-segmentation data, whereas CNV signals removed in the normalization step cannot be recovered, CODEX opts for a more conservative normalization that, when in doubt, uses a smaller value of K. <<choiceofK, eval=FALSE>>= choiceofK(AIC, BIC, RSS, K, filename = paste(projectname, "_", chr, "_choiceofK", ".pdf", sep = "")) @ \begin{figure}[H] \begin{center} <<label=fig1,fig=TRUE,echo=FALSE,height=2.5, width=7.5 >>= filename <- paste(projectname, "_", chr, "_choiceofK", ".pdf", sep = "") Kmax <- length(AIC) par(mfrow = c(1, 3)) plot(K, RSS, type = "b", xlab = "Number of latent variables") plot(K, AIC, type = "b", xlab = "Number of latent variables") plot(K, BIC, type = "b", xlab = "Number of latent variables") @ \end{center} \caption{Determination of K via AIC, BIC, and deviance reduction. Optimal K is set at 2.} \label{fig:one} \end{figure} \subsection*{2.8 Fit the Poisson log-likelihood ratio based segmentation procedure to determine CNV regions across all samples.} For germline CNV detection, \verb@CODEX@ uses the "integer" mode; for CNV detection involving large recurrent chromosomal aberrations in mixture populations (e.g. somatic CNV detection in cancer), \verb@CODEX@ opts to use the "fraction" mode. \\ The output file is tab delimited and has 13 columns with rows corresponding to CNV events. The columns include sample\_name (sample names), chr (chromosome), cnv (deletion or duplication), st\_bp (cnv start position in base pair, the start position of the first exon in the cnv), ed\_bp (cnv end position in base pair, the end position of the last exon in the cnv), length\_kb (CNV length in kb), st\_exon (the first exon after QC in the cnv, integer value numbered in qcObj\$ref\_qc), ed\_exon (the last exon after QC in the cnv, integer value numbered in qcObj\$ref\_qc), raw\_cov (raw coverage), norm\_cov (normalized coverage), copy\_no (copy number estimate), lratio (likelihood ratio of CNV event versus copy neutral event), mBIC (modified BIC value, used to determine the stop point of segmentation), pvalue (p-values computed by the Wilk's theorem from the likelihood ratio test). \\ For the "fraction" mode post segmentation thresholding is necessary to filter out long CNV events with factional copy numbers close to 2. <<segment1>>= optK = K[which.max(BIC)] finalcall <- segment(Y_qc, Yhat, optK = optK, K = K, sampname_qc, ref_qc, chr, lmax = 200, mode = "integer") finalcall <<segment2, eval=FALSE>>= write.table(finalcall, file = paste(projectname, '_', chr, '_', optK, '_CODEX_frac.txt', sep=''), sep='\t', quote=FALSE, row.names=FALSE) save.image(file = paste(projectname, '_', chr, '_image', '.rda', sep=''), compress='xz') @ \section*{3. Citation} CODEX: a normalization and copy number variation detection method for whole exome sequencing Yuchao Jiang; Derek A. Oldridge; Sharon J. Diskin; Nancy R. Zhang Nucleic Acids Research 2015; doi: 10.1093/nar/gku1363 (\href{http://nar.oxfordjournals.org/content/early/2015/01/23/nar.gku1363.full?keytype=ref&ijkey=8L3rzDiYw9OWSb4}{html}, \href{http://nar.oxfordjournals.org/content/early/2015/01/23/nar.gku1363.full.pdf?keytype=ref&ijkey=8L3rzDiYw9OWSb4}{pdf}). \section*{4. Session information:} Output of sessionInfo on the system on which this document was compiled: <<sessionInfo, results=tex, echo=FALSE>>= toLatex(sessionInfo()) @ \end{document}