% -*- mode: noweb; noweb-default-code-mode: R-mode; -*-
%\VignetteIndexEntry{panelcn.mops: Manual for the R package}
%\VignetteDepends{panelcn.mops}
%\VignettePackage{panelcn.mops}
%\VignetteEngine{knitr::knitr}
%\VignetteKeywords{copy number analysis, mixture distribution, latent variables,
% Poisson distribution, EM algorithm, NGS, CNV, copy number variant}


\documentclass[article]{bioinf}


\usepackage{amsmath,amssymb}
\usepackage{hyperref}
\usepackage{float}
\usepackage[authoryear]{natbib}

\hypersetup{colorlinks=false,
    pdfborder=0 0 0,
    pdftitle={panelcn.MOPS - CNV detection tool for targeted NGS panel 
    data},
    pdfauthor={Gundula Povysil}}

\title{panelcn.MOPS - CNV detection tool for targeted NGS panel data}
\author{Verena Haunschmid and Gundula Povysil}
\affiliation{Institute of Bioinformatics, Johannes Kepler University
Linz\\Altenberger Str. 69, 4040 Linz, Austria\\
\email{povysil@bioinf.jku.at}}

\newcommand{\panelcnmops}{\texttt{panelcn.mops}}
\newcommand{\cnmops}{\texttt{cn.mops}}

\newcommand{\method}[1]{{\fontfamily{phv}\fontshape{rm}\selectfont #1}}
\newcommand{\R}{R}
\newcommand{\Real}{\mathbb{R}}

\renewcommand{\vec}[1]{\mathbf{#1}}

\setkeys{Gin}{width=0.55\textwidth}


<<include=FALSE>>=
library(knitr)
opts_chunk$set(

)
@


\begin{document}

<<include=FALSE>>=
library(knitr)
opts_chunk$set(
concordance=TRUE
)
@

<<echo=FALSE, message=FALSE>>=
options(width=65)
set.seed(0)
library(panelcn.mops)
panelcn.mopsVersion <- packageDescription("panelcn.mops")$Version
@
\newcommand{\cnmopsVersion}{\Sexpr{panelcn.mopsVersion}}
\manualtitlepage[Version \cnmopsVersion, \today]

%\section*{Scope and Purpose of this Document}
%
%This document is a user manual for the \R\ package \panelcn.mops.
%It is only meant as a gentle introduction into how to use the basic
%functions implemented in this package. Not all features of the \R\
%package are described in full detail. Such details can be obtained
%from the documentation enclosed in the \R\ package. Further note
%the following: (1) this is neither an introduction to CNV detection from NGS
%data; (2) this is not an introduction to \R.
%If you lack the background for understanding this manual, you first
%have to read introductory literature on these subjects.
%


\vspace{1cm}

\newlength{\auxparskip}
\setlength{\auxparskip}{\parskip}
\setlength{\parskip}{0pt}
\tableofcontents
\clearpage
\setlength{\parskip}{\auxparskip}

\newlength{\Nboxwidth}
\setlength{\Nboxwidth}{\textwidth}
\addtolength{\Nboxwidth}{-2\fboxrule}
\addtolength{\Nboxwidth}{-2\fboxsep}

\newcommand{\notebox}[1]{%
\begin{center}
\fbox{\begin{minipage}{\Nboxwidth}
\noindent{\sffamily\bfseries Note:} #1
\end{minipage}}
\end{center}}

\section{Introduction}
The \panelcnmops\ package is based on the
\cnmops\ package and allows to detect copy number variations (CNVs) from
targeted NGS panel data. Please visit
\url{http://www.bioinf.jku.at/software/panelcnmops/index.html} for additional
information.\par


\section{Getting started and quick start}

To load the package, enter the following in your \R\ session:
<<echo=TRUE>>=
library(panelcn.mops)
data(panelcn.mops)
@

The whole pipeline will only take a few steps, if BAM files are available
(for read count matrices directly go to step 2):

\begin{enumerate}
\item Getting count windows from the BED file (also see Section \ref{s:input}).
<<eval=FALSE>>=
bed <- "Genes_part.bed"
countWindows <- getWindows(bed)
@

\item Getting read counts (RCs) from BAM file (also see Section \ref{s:input}).
Note that the BAM file is not included so do not try to run this code. However, 
the resulting test object is included as part of the data.
<<eval=FALSE>>=
testbam <- "SAMPLE1.bam"
test <- countBamListInGRanges(countWindows = countWindows,
                                bam.files = testbam, read.width = 150)
@

\item Running the algorithm (also see Section \ref{s:panelcn.mops}).
<<eval=FALSE>>=

selectedGenes <- c("ATM")

XandCB <- test
elementMetadata(XandCB) <- cbind(elementMetadata(XandCB), 
                                elementMetadata(control))

resultlist <- runPanelcnMops(XandCB, 
                            testiv = 1:ncol(elementMetadata(test)), 
                            countWindows = countWindows, 
                            selectedGenes = selectedGenes)
@

\item Visualization of the detected CNV regions. For more information about
the result objects and visualization see Section \ref{s:results} and 
Section \ref{s:plot}.


<<echo=FALSE,results='hide'>>=

XandCB <- test
elementMetadata(XandCB) <- cbind(elementMetadata(XandCB), 
                                elementMetadata(control))
selectedGenes <- "ATM"
@


<<>>=
sampleNames <- colnames(elementMetadata(test))
resulttable <- createResultTable(resultlist = resultlist, XandCB = XandCB, 
                                    countWindows = countWindows, 
                                    selectedGenes = selectedGenes, 
                                    sampleNames = sampleNames)

(tail(resulttable[[1]]))
@


<<eval=FALSE>>=
plotBoxplot(result = resultlist[[1]], sampleName = sampleNames[1], 
            countWindows = countWindows,
            selectedGenes = selectedGenes, showGene = 1)
@


<<fig.keep='none',echo=FALSE,results='hide'>>=

sampleNames <- colnames(elementMetadata(test))
selectedGenes <- "ATM"

pdf("001.pdf", width = 10)
plotBoxplot(result = resultlist[[1]], sampleName = sampleNames[1], 
            countWindows = countWindows, selectedGenes = selectedGenes, 
            showGene = 1)
dev.off()
@

\begin{figure}[H]
\begin{center}
\includegraphics[angle=0,width= 0.9\columnwidth]{001.pdf}
\end{center}
\end{figure}
\end{enumerate}

\section{Input}
\label{s:input}
% \subsection{Read count matrices as input}
% Like \cnmops\ \panelcnmops\ does not require the data samples to be of any
% specific kind or structure. \cnmops\ only requires a {\em read count matrix},
% i.e., given $N$ data samples and $m$ genomic segments, this is an $m\times N$
% real- or integer-valued matrix $\mathbf{X}$,
% in which an entry $x_{ij}$ corresponds to the read count of sample $j$ in the
% $i$-th segment. E.g. in the following read count matrix sample three has
% $17$ reads in the second segment: $x_{23}=71$.
% 
% 
% \newlength{\mylen}
% \setlength{\mylen}{0.43cm}
% 
% \[\mathbf{X}= \begin{array}{c} \\ \mathrm{Segment\ 1} \\ \mathrm{Segment\ 2} 
% \\ \mathrm{Segment\ 3} \\ \mathrm{Segment\ 4}\\ \mathrm{Segment\ 5} \\
% \mathrm{Segment\ 6} \\ \hspace{0.2cm} \end{array}
% \begin{array}{c}
% \begin{array}{cccc}
% \mathrm{Sample\ 1} & \mathrm{Sample\ 2} & \mathrm{Sample\ 3} &
% \mathrm{Sample\ 4}\end{array}\\
% \left(\begin{array}{cccc}
% \hspace{\mylen}88\hspace{\mylen} & \hspace{\mylen}82\hspace{\mylen} &
% \hspace{\mylen}79\hspace{\mylen} & \hspace{\mylen}101\hspace{\mylen}\\
% 83 & 78 & 71 & 99\\
% 43 & 50 & 55 & 37\\
% 47 & 58 & 48 & 42 \\
% 73 & 86 & 95 & 91\\
% 92 & 90 & 80 & 71
% \end{array}\right) \\ \hspace{0.2cm}  \end{array}
% \]
% 
% 
% \panelcnmops\ can handle numeric and integer matrices or \verb+GRanges+ 
% objects, in which the read counts are stored as \verb+values+ of the object.
% 
% 
% 
% \subsection{BAM files as input}
% \label{s:bam}
The most widely used file format for aligned short reads is the Sequence
Alignment Map (SAM) format or in the compressed form the Binary Alignment Map
(BAM). \panelcnmops\ modifies the read count function \verb+countBamInGRanges+
from the \R\ package \texttt{exomeCopy} to extract read counts for a list of
BAM files.
The result object of the function can directly be used as input for
\panelcnmops.

The first step is to extract all regions of interest (ROIs) that define the 
count windows from a BED file with the function \verb+getWindows+. The BED 
file that is provided is a subset of the TruSight Cancer Panel BED file.
<<>>=
bed <- system.file("extdata/Genes_part.bed", package = "panelcn.mops")
countWindows <- getWindows(bed)
@

The BED file should have the following structure:
<<echo=FALSE>>=
bed <- system.file("extdata/Genes_part.bed", package = "panelcn.mops")
write.table(head(read.table(bed)), row.names = FALSE, col.names = FALSE, 
            quote = FALSE)
@

While the first 3 columns list chromosome name, start and end position, the 
fourth column needs to start with the gene name. Additional information in the 
fourth column needs to be separated with a dot and may include the exon number 
and further information. By default the "chr" prefix of the chromosome name is 
removed if present. This can be changed by setting the {\tt chr} parameter to 
TRUE. If a mismatch of chromosome naming between the \verb+countWindows+ object 
and the BAM files is detected, the naming convention of the BAM file is chosen.


In the second step RCs are generated from the BAM files. The read.width 
parameter reflects the typical length of the reads that should be counted.
Note that the BAM file is not included so do not try to run this code. However, 
the resulting test object is included as part of the data.

<<eval=FALSE>>=
testbam <- "SAMPLE1.bam"
test <- countBamListInGRanges(countWindows = countWindows,
                                bam.files = testbam, read.width = 150)
@

In \verb+test+ you have now stored the genomic segments (left of the
$\mid$'s) and the read counts (right of the $\mid$'s):
<<>>=
(test)
@

If the BED file contains very large ROIs, a higher resolution of the CNV 
detection algorithm can be achieved by splitting up larger ROIs into smaller 
overlapping bins. This can be achieved with the funciton \verb+splitROIs+:

<<eval=FALSE>>=
splitROIs(bed, "newBed.bed")
@

By default all ROIs are split into bins of 100 bp with an overlap of 50 bp. 
The parameter {\tt limit} controls the minimum size of the ROIs that should be 
split (default = 0). The parameters {\tt bin} and {\tt shift} control the size 
of the bins and the no. of bp between start positions of adjacent bins. 

\section{runPanelcnMops}
\label{s:panelcn.mops}
The actual copy number analysis is done with the function \verb+runPanelcnMops+.
The function requires a \verb+GRanges+ object of the RCs of test and control 
samples as well as the \verb+countWindows+ object used to extract these RCs.
Optional parameters include a vector that indicates which samples to regard as 
test samples (default = c(1)), a vector of the names of the genes of interest 
(by default all genes are of interest), parameters for normalizing the RCs, 
a vector of expected fold changes for the copy number classes and a minimal 
median RC over all samples to exclude low coverage ROIs.

<<eval=FALSE>>=

selectedGenes <- "ATM"

XandCB <- test
elementMetadata(XandCB) <- cbind(elementMetadata(XandCB), 
                                elementMetadata(control))
resultlist <- runPanelcnMops(XandCB, countWindows = countWindows, 
                             selectedGenes = selectedGenes)
@


\section{Results}
\label{s:results}
The function \verb+runPanelcnMops+ returns a list of objects of the S4 class 
CNVDetectionResult, one CNVDetectionResult object per test sample. 
The structure of the CNVDetectionResult object can be listed by calling
<<eval=FALSE>>=
(str(resultlist[[1]]))
@

To get
detailed information on which data are stored in such objects, enter
<<eval=FALSE>>=
help(CNVDetectionResult)
@


The CNVs per individual are stored in the slot \verb+integerCopyNumber+:
<<>>=
integerCopyNumber(resultlist[[1]])[1:5]
@


The function \verb+createResultTable+ summarizes all relevant information for 
user selected genes of interest in a list of tables with one table per test 
sample:


<<>>=
sampleNames <- colnames(elementMetadata(test))
resulttable <- createResultTable(resultlist = resultlist, XandCB = XandCB, 
                                    countWindows = countWindows, 
                                    selectedGenes = selectedGenes, 
                                    sampleNames = sampleNames)

(tail(resulttable[[1]]))
@

The table contains one line per Region Of Interest (ROI) with information about 
the RCs of the test sample ("RC"), 
the median RCs of all control samples ("medRC"), 
the normalized RCs of the test sample ("RC.norm"), 
the median of the normalized RCs of all control samples ("medRC.norm"), 
as well as the estimated CN ("CN"). 
Additionally, in the column "lowQual" low quality ROIs are flagged.


\section{Visualization of results}
\label{s:plot}
\panelcnmops\ contains a plotting function that visualizes the normalized RCs 
of the samples analyzed as boxplots:

\begin{center}
<<eval=FALSE>>=
plotBoxplot(result = resultlist[[1]], sampleName = sampleNames[1], 
            countWindows = countWindows,
            selectedGenes = selectedGenes, showGene = 1)
@


\begin{figure}[H]
\begin{center}
\includegraphics[angle=0,width= 0.9\columnwidth]{001.pdf}
\end{center}
\end{figure}
\end{center}

The function expects a single CNVDetectionResult object as input together with 
the name of the test sample, the countWindows used, as well as a vector with 
the names of the genes of interest and an integer specifying which of the 
genes of interest to plot.


\section{Analysis of chromosome X}
\label{s:chrX}
The analysis of ROIs on chromosome X is only possible if all samples have the 
same sex and the parameter sex of the function \verb+runPanelcnMops+ is set 
accordingly. The default "mixed" results in the removal of all X-chromosomal 
ROIs. Note, that if all samples are males CN2 in the results really 
corresponds to CN1.


\section{Quality control}

The panelcn.MOPS algorithm includes different quality control metrics. 1) ROIs 
are excluded if their median read count (RC) across all samples does not exceed 
a user defined threshold (default: 30), additionally a warning message is 
displayed. 2) ROIs are marked as "low quality" in the result table if their RCs 
show a high variation across all samples. 3) Samples with a median RC across 
all ROIs lower than 0.55 times the median of all samples are considered as low 
quality. 4) For each ROI the ratio between the normalized RCs of each sample 
compared to the median across all samples is calculated. Samples that show a 
high variation in these RC ratios are also flagged as low quality. Low quality 
samples are excluded if they are control samples which leads to a warning 
message. If a test sample is of low quality, only a warning message is 
displayed. 

\section{Adjusting sensitivity and specificity}
The default parameters of the \panelcnmops\ algorithm were optimized on a data 
set of targeted NGS panel data with the aim of detecting CNVs ranging in size 
from part of a ROI to whole genes. However, you might want to adjust 
sensitivity and specificity to your specific needs.

The parameter that influences sensitivity and specificity the most is {\tt I}, 
the vector of expected fold changes of the copy number classes. The default for 
\panelcnmops\, c(0.025, 0.57, 1, 1.46, 2), leads to a higher sensitivity 
compared to the default of \cnmops\ which is c(0.025, 0.5, 1, 1.5, 2). 
Increasing the values for CN0 and CN1 further and decreasing the values for 
CN3 and CN4 may help to improve the sensitivity, a change in the other 
direction may increase the specificity.

Additional parameters that can be tuned to improve the results are the 
different normalization parameters: {\tt normType}, {\tt sizeFactor}, 
{\tt qu}, {\tt quSizeFactor}, and {\tt norm}.


% \clearpage


\section{How to cite this package}

If you use this package for research that is published later, you are kindly
asked to cite it as follows:
\citep{Povysil:17}.

To obtain Bib\TeX\ entries of the reference, you can enter the following
into your R session:
<<eval=FALSE>>=
toBibtex(citation("panelcn.mops"))
@


\bibliographystyle{natbib}
\bibliography{cnv}


\end{document}