\documentclass[article, shortnames, nojss]{jss}  
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{enumerate}
\DeclareMathOperator*{\argmax}{arg\,max}
% \VignetteIndexEntry{PermutationBased Confidence for Molecular Class}   
% \VignetteKeyword{PAM50}   
% \VignetteKeyword{single subject classifier} 
% \VignetteKeyword{clinical outcome}  
% \VignetteKeyword{breast cancer subtype} 
% \VignetteDepends{breastCancerNKI}   
% \VignetteDepends{BiocParallel}   

\author{Crist\'{o}bal Fresno\\CONICET\\Universidad Cat\'{o}lica de C\'{o}rdoba
\And Germ\'{a}n A Gonz\'{a}lez\\CONICET\\Universidad Cat\'{o}lica
de C\'{o}rdoba  \AND Andrea S Llera\\CONICET\\Fundaci\'{o}n Instituto 
Leloir \And Elmer A Fern\'{a}ndez\\CONICET\\Universidad Cat\'{o}lica de  
C\'{o}rdoba}    

\title{\pkg{pbcmc}: Permutation-Based Confidence for Molecular
Classification}

%% for pretty printing and a nice hypersummary also set:   
%% comma-separated   
\Plainauthor{Crist\'{o}bal Fresno, Germ\'{a}n A Gonz\'{a}lez, Andrea S Llera, 
    Elmer A Fern\'{a}ndez}  
%% without formatting  
\Plaintitle{pbcmc: Permutation-Based Confidence for Molecular Classification}   
%% a short title (if necessary)   
\Shorttitle{\pkg{pbcmc}: Permutation-Based Confidence for Molecular
Classification}    

%% an abstract and keywords
\Abstract{
The \pkg{pbcmc} package characterizes uncertainty assessment on gene expression
classifiers, a. k. a. molecular signatures, based on a permutation test. In 
order to achieve this goal, synthetic simulated subjects are obtained by  
permutations of gene labels. Then, each synthetic subject is tested against  
the corresponding subtype classifier to build the null distribution. Thus,   
classification confidence measurement reports can be provided for each
subject, to assist physician therapy choice. At present, it is only available 
for PAM50 implementation in \pkg{genefu} package but, it can easily be extend 
to other molecular signatures.     
}
\Keywords{PAM50, single subject classifier, clinical outcome, breast cancer 
subtype}
%% without formatting  
\Plainkeywords{PAM50, single subject classifier, clinical outcome,   
    breast cancer subtype}  
%% at least one keyword must be supplied 

%% The address of (at least) one author should be given  
%% in the following format:
\Address{
Crist\'{o}bal Fresno, Germ\'{a}n A Gonz\'{a}lez \& Elmer A Fern\'{a}ndez\\
Bioscience Data Mining Group\\ 
Facultad de Ingenier\'{i}a\\  
Universidad Cat\'{o}lica de C\'{o}rdoba - CONICET\\   
X5016DHK Córdoba, Argentina\\  
E-mail: \email{cfresno@bdmg.com.ar, ggonzalez@bdmg.com.ar, \\ 
efernandez@bdmg.com.ar}\\
URL: \url{http://www.bdmg.com.ar/}\\   
$~$\\
Andrea S Llera\\  
Laboratorio de Terapia Molecular y Celular\\   
Fundaci\'{o}n Instituto Leloir - CONICET\\
C1405BWE Ciudad Autónoma de Buenos Aires, Argentina\\  
E-mail: \email{allera@leloir.org.ar}\\   
URL: \url{http://www.leloir.org.ar/podhajcer/}   
}

\begin{document}
\SweaveOpts{concordance=TRUE}
% \SweaveOpts{concordance=FALSE}   

\section{Introduction}
Gene expression-based classifiers, known as molecular signatures (MS), are
gaining increasing attention in oncology and market. The MS can be defined as
a set of coordinately expressed genes and an algorithm that use these data to   
predict disease subtypes, response to therapy, disease risk and clinical   
outcome \citep{andre2006}. Particularly in breast cancer market there exists
many MS such as PAM50 \citep{perou2000, perou2010},  
Prosigna\textregistered ~\citep{nielsen2014},   
Oncotype DX\textregistered~\citep{paik2004} and MammaPrint\textregistered~ 
\citep{van2002}. In essence, most MS try to provide patient subtype   
classification or risk prediction which has been associated with distant   
metastasis free survival (DMFS) or relapse free survival (RFS). Consequently,   
they are intended to be used to support therapy choice. However, several 
authors have shown that data processing steps, technology, as well as  
population variability have an effect on measured gene expression and could  
bias subtype/risk subject assignment 
\citep{ioannidis2007, lusa2007, sorlie2001, sorlie2010, wu2012, ebbert2011}.   
These effects suggest that, from a statistical point of    
view, MS are not robust for subject classification. In particular, there is no
control over type I and  II like classification error and subjects could    
potentially be assigned to a wrong subtype/risk class. Indeed, the lack of 
certainty in class assignation could lead to a misleading therapy affecting  
subject outcome. Hence, the development of methods for significance or   
certainty on MS class assignation is crucial in order to assist physicians’ 
decision making \citep{wu2012, ebbert2011}. 

\subsection{PAM50 Molecular Siganture}  
The well-known breast cancer (BC) MS, PAM50 \citep{perou2000, perou2010},    
is based on the comparison between the patient gene profiles (PGP) of 50
expressed genes, against five intrinsic genes profiles (IGP) representing:
Basal, Her2-enriched, Luminal A, Luminal B and Normal-like subtypes using   
Spearman's $\rho$ correlation. Then, the subject will be assigned to the  
$i-th$ subtype according to (\ref{eq:PAM50}):

\begin{equation}
\label{eq:PAM50}
\argmax_{i \in IGP} \rho(PGP, IGP_{i})
\end{equation}

Particularly, patients are assigned to the $i-th$ subtype which maximize   
$\rho(PGP, IGP_{i})$, even if correlation is weak or there are similar/tight  
IGP correlations. The latter case has been addressed by \citet{cheang2009},   
where they have excluded subjects with a correlation difference between   
Luminal A and B  of less than 0.1, considering them ambiguous pattern, as a  
way to control a kind of type II error. However, type I error control is still 
a debt.   

\subsection{genefu library}   
\label{sec:genefu}
At present, the freely available PAM50 \pkg{genefu} algorithm implementation    
\citep{genefu}, offers a kind of subtype probability, $P(IGP_{i})$, calculated
as (\ref{eq:genefu}):   

\begin{equation}
\label{eq:genefu}
P(IGP_{i})=\frac{\rho(PGP, IGP_{i})}{\sum_i \rho(PGP, IGP_{i})} 
~~~~\forall \rho(PGP, IGP_{i})>0  
\end{equation}

However, this probability does not take part in the classification rule. Even 
worse, a very low $\rho$ (weak relationship) could reach a high subtype 
probability, for instance, if all the others $\rho$'s are close to zero 
or are even negative.     

\subsection{Alternative proposal}   

In order to overcome these drawbacks, here a simple and reliable single 
subject classifier to control type I and II errors is proposed. Morevover, it
provides a statistical significance on subtype assignation based on a gene  
label permutation test. Briefly, it evaluates if the observed  $\rho$ of each
IGP can be achieved by chance, regarding subject observed MS expressed gene 
values. In addition, we propose a user-friendly subtype assignation panel to  
support physicians’ decision making, enhancing PAM50 or commercial reports
currently available in the market. The method is presented for PAM50 but can
easily be extended to other MS algorithms.  

\section{Methods}

The Permutation Based Confidence for Molecular Classification (\pkg{pbcmc}) 
package, estimates the statistical significance of $\rho$ for each IGP. In  
other words, we want to see whether the observed $\rho$ can be obtained by   
chance. In order to perform this task, the $\rho$ null   
distribution for each IGP ($\rho_{H0_{IGP}}$) is obtained by 
evaluating $\beta$ permutations of the SGP gene expression. Then, the observed  
(un-permuted) IGP correlations ($\rho_{u_{IGP}}$) are compared against  
their own  $\rho_{H0_{IGP}}$ in order to evaluate whether    
$H_{0}:\rho_{u_{IGP}} \in \rho_{H0_{IGP}}$ versus 
$H_{1}:\rho_{u_{IGP}} \notin \rho_{H0_{IGP}}$ according to the p-values  
($p_{IGP}$) calculated as in (\ref{eq:picp}):

\begin{equation}
    \label{eq:picp}
    p_{IGP}=\frac{\sum_{i}^{\beta}I(\rho_{i}, \rho_{u})}{\beta}; ~~~  
    I(a, b)=\left \{  
    \begin{array}{ll}
        1 & if ~~a > b \\  
        0 & if ~~a \leq b   
    \end{array}
    \right .   
\end{equation}

where IGP stands for Basal, Her2-Enriched, Luminal A, Luminal B and  
Normal-like. The resulting five $p_{IGP}$'s are adjusted to control multiple   
comparisons using False Discovery Rate, FDR \citep{fdr}. Then, assuming an   
acceptable type I error, $\alpha$, the hypothesis test for all IGPs could 
result in:   
\begin{enumerate}[i.]
    \item No significant $\rho_u$ for any IGP, i. e., all adjusted $p_{IGP} >
        \alpha$.    
    \item A unique significant $\rho_u$.
    \item Multiple significant $\rho_u$. 
\end{enumerate}    

For the first case, the subject cannot reliably be assigned to any IGP (not   
assigned - \textbf{NA}). In the second case, it is assigned (\textbf{A}) to the
trustworthy current PAM50 subtype. In order to overcome the ambiguity of case 
iii., a correlation difference threshold of 0.1 between the top ones  
($\rho_{u_{IGP_1}} >  \rho_{u_{IGP_2}} > \ldots > 0$) was established,   
similarly as in \citet{cheang2009} for Luminal subtypes. Then, if
$(\rho_{u_{IGP_1}}-\rho_{u_{IGP_2}})>0.1$ the subject is assigned as in ii., 
otherwise it is considered as an ambiguous (\textbf{Amb}) subject.

\section{Implementation}
\label{sec:implementation}
The S4 class hierarchy of the \pkg{pbcmc} package is based on the 
implementation of an abstract \texttt{MolecularPermutationClassifier} class,   
which can  potentially be used for any MS as depicted in Figure
\ref{fig:hierarchy}. Basically, it has been developed as an organized data   
processing framework for its heirs. The formers are supposed to implement the 
respective responsibilities. Once a heir object is implemented the user can:  

\begin{figure}
    \centering
    \includegraphics[width=0.9\textwidth]{pbcmc.pdf}    
    \caption{Package hierarchy. \texttt{MolecularPermutationClassifier} is 
    the main S4 abstract class and \texttt{PAM50} implements it for the  
    Molecular Signature (MS) of \citet{perou2000, perou2010}. The other 
    classes represent user-defined implementations of other MS. Note that    
    complete operation signature have been omitted for simplicity.}     
    \label{fig:hierarchy}
\end{figure}

\begin{description}
    \item[loadBCDataset:] Load one of the example breast cancer dataset
        available at Bioconductor. At present, it is possible to load   
        breastCancerXXX where XXX can be ``\texttt{upp}''   
        \citep{breastCancerUPP},    
        ``\texttt{nki}'' \citep{breastCancerNKI},   
        ``\texttt{vdx}'' \citep{breastCancerVDX},   
        ``\texttt{mainz}'' \citep{breastCancerMAINZ},   
        ``\texttt{transbig}'' \citep{breastCancerTRANSBIG} or  
        ``\texttt{unt}'' \citep{breastCancerUNT}.   
    \item[filtrate:] Remove, from the \texttt{exprs} matrix, subjects not 
        required by the classification algorithm.
    \item[classify:] Generate subject classification according to the heir's 
        implementation (PAM50, etc.).  
    \item[permute:] Obtains subject classification based on the null 
        $\rho_{H0}$ distribution by means of $\beta$ permutation simulations. 
    \item[subtype:] Obtain the new classification using the permutation 
        results.
    \item[subjectReport:] Create a friendly report to assist physician 
        treatment decision making.  
    \item[databaseReport:] Create a \texttt{pdf} with all   
        \texttt{subjectReport}s, if a database is available.   
\end{description}

At present, the only available heir is PAM50 based on \pkg{genefu} library 
\citep{genefu}. But, it can easily be extended to other MS such as 
Prosigna\textregistered ~\citep{nielsen2014} or others, just implementing   
\texttt{filtrate}, \texttt{classify}, \texttt{permutate}, \texttt{subtype}, 
\texttt{subjectReport} and \texttt{databaseReport} functions. 

\subsection{Example}

In order to work with \texttt{PAM50} MS, the user must load a Bioconductor's    
breast cancer dataset. For example we can load NKI database   
\citep{breastCancerNKI}, provided that the require library is installed, 
using the following code: 
<<General options for R, echo=false, results=hide>>=   
options(prompt="R> ", continue="+  ", width=70, useFancyQuotes=FALSE, digits=4) 
suppressMessages(library("pbcmc"))
@

<<Loading datasets, echo=TRUE, eval=TRUE>>= 
library("pbcmc")
library("BiocParallel")
object<-loadBCDataset(Class=PAM50, libname="nki", verbose=TRUE)  
object
@

The \texttt{object} is a PAM50 instance, which contains the \texttt{exprs}   
matrix with gene expression values, associated \texttt{annotation} and 
clinical data in \texttt{targets} data.frame. On the other hand, the user can 
use PAM50's constructor to create an object with his/her own data  
or convert a \pkg{limma} \texttt{MAList} object into PAM50 using    
\texttt{as.PAM50(MAList\_object)} function. In the first case, the user will
only need:   
\begin{enumerate}[a)]
    \item The \textbf{M} gene expression object, i. e., genes in rows and 
        samples in columns.  
    \item The \textbf{annotation} \texttt{data.frame} which must include the 
        compulsory fields: ``\texttt{probe}'', \\ ``\texttt{NCBI.gene.symbol}''
        and ``\texttt{EntrezGene.ID}''.   
\end{enumerate}

For example, we could use \pkg{genefu}'s PAM50 centroids to check if our 
implementation solves the proof of concept, where we \emph{a priori} know the 
true class of each subject:

<<PAM50 with own data>>= 
M<-pam50$centroids
genes<-pam50$centroids.map
names(genes)<-c("probe", "NCBI.gene.symbol", "EntrezGene.ID")  
object<-PAM50(exprs=M, annotation=genes)   
object
@

Note that for the above output, the \texttt{targets} slot is empty, i. e.,    
\texttt{nrow=0} and \texttt{ncol=0}. In addition, only the expression of the   
fifty genes and five IGP is available, with its corresponding   
\texttt{annotation} over the three compulsory fields. It is always a good idea 
to explore the slots content, to see whether they have been correctly loaded:

<<Exploring the slots>>=  
head(exprs(object))      ##The gene expression values for each subject
head(annotation(object)) ##The compulsory annotation fields
head(targets(object))    ##The clinical data, if available.
@

Just as we expected, the five centroids are loaded in \texttt{exprs} slot, 
with their corresponding ``probe'', ``NCBI.gene.symbol'' and ``EntrezGene.ID''  
number in the \texttt{annotation} slot and no available data for the  
\texttt{targets}. Now, the user is ready to work with the data following the
workflow suggested in section \ref{sec:implementation} (filtrate, classify and 
permutate):

<<Workflow>>=
object<-filtrate(object, verbose=TRUE)   
object<-classify(object, std="none", verbose=TRUE)  
object<-permutate(object, nPerm=10000, pCutoff=0.01, where="fdr", 
    corCutoff=0.1, keep=TRUE, seed=1234567890, verbose=TRUE, 
    BPPARAM=bpparam())
@

The intention of \texttt{filtrate} function is to keep only the genes that 
will take place in the classification. In this example, it will not produce
any change on the original \texttt{exprs} slot, given the fact that only the
required fifty genes are present. But, if a complete microarray would have 
been present, then, probes that do not code for IGP will be removed. In   
addition, probes that code for the same gene (repeated or with similar 
annotation) will be treated as described in standardization (\texttt{std})
parameter.    

Once genes are \texttt{filtrate}d, the user can \texttt{classify} them using   
the original PAM50 algorithm. However, here we propose to obtain subtype  
assignment confidence using at least $\beta=10.000$ permutations over 
the SGP, using a type I error $\alpha=0.01$ on the adjusted  
p-values (``\texttt{fdr}'') and a correlation difference threshold of 
\texttt{corCutoff=0.1}. As a matter of fact, this process is
computationally intensive, so we can take advantage of all the available  
computes cores using \pkg{BioParallel} package    
(\texttt{BPPARAM=bpparam()}) as we just did \citep{BiocParallel}. In addition, 
the user can track the permutation progress bar by including   
\texttt{verbose=TRUE} option. If we now take a look at the \texttt{object}:  

<<Permutation results>>=   
object
@

we can see that it has been updated. First, the \texttt{classification} slot 
contains two datasets: one with the subtype \emph{probability}, $P(IGP_{i})$,
as described in section \ref{sec:genefu} and the \emph{correlation} of each   
subject with the five IGPs. The \emph{\$subtype} item shows a frequency table 
of the possible IGPs with the used subjects. In addition, the used 
\texttt{permutation} \texttt{parameters} are shown with the dimension of 
\emph{pvalues}, \emph{fdr} and \emph{subtype}s. Note that in this case    
\texttt{keep=TRUE} option was used so, the simulated correlation null    
distribution data points ($\rho_{H0_{IGPs}}$) are available.   

In this example we have used \pkg{genefu}'s PAM50 centroids, thus, only one 
subject is present (1) for each IGP cell in the \texttt{object} output. This    
result is also confirmed by the ones in the diagonal of  
\texttt{summary(object)} matrix between the original \emph{Subtype} and the 
\emph{Classes} found by the \pkg{pbcmc} package. Moreover, this toy example   
only shows assigned subjects (\textbf{A}) to the original PAM50 subtypes,   
whereas not assigned (\textbf{NA}) marginal row/column contains only zeros
(0). If ambiguous (\textbf{AMB}) subjects would have been found,
\emph{Classes} column will have included additional rows with 
the classes in dispute (e. g., ``LumA, Normal'' or ``Her2, LumB'', etc.). 

<<Summary>>=
summary(object)
@

Finally, we can inspect the report of a single subject to see how the MS  
classification went (Figure \ref{fig:report}), in order to suggest an
appropriate therapy for the physician:

<<SubjectReport, fig=TRUE, echo=TRUE, width=7, height=7, include=FALSE>>=   
subjectReport(object, subject=1)   
@

\begin{figure}
    \centering    
    \includegraphics{pbcmc-vignette-SubjectReport}
    \caption{PAM50 permutation subject report for \textbf{genefu}'s ``Basal''  
        intrinsic gene profile (IGP). The top table summarize the results for  
        the i-th subject, i. e., the correlation, p-value and false discovery  
        rate (fdr) obtained for each IGP. In addition, scatter plots of the 
        observed subject gene profiles against the IGP with the linear   
        regression line (in blue). Red color indicates the assigned subtype.   
        Finally, a boxplot for each IGP null permuted correlation
        distribution and big dots to represent the un-permuted observed
        correlations.}
    \label{fig:report}
\end{figure}

The report of Figure \ref{fig:report} is a \texttt{grid.arrange} 
object which basically consists of three main parts: 
\begin{description}
    \item{\textbf{tableGrob}:}{~A summary table which contains the following  
        fields,
        \begin{description}
            \item{\textbf{\$Summary}:}{~\texttt{Subject} name and subtype 
                obtained by \texttt{PAM50.Subtype} or the proposed   
                methodology (\texttt{Permuted.Subtype}).}   
            \item{\textbf{\$Fields}:}{~For the five PAM50 subtypes,    
                \begin{itemize}
                    \item Correlation: The correlation of the $i-th$ PAM50 
                        centroid with the observed subject exprs,   
                        $\rho(PGP, IGP_{i})$.   
                    \item p-value: Permutation p-value obtained using the  
                        simulation data, $p_{IGP}$.  
                    \item FDR: Adjusted p-value using False Discovery Rate 
                        \citep{fdr}.
            \end{itemize}
            }
        \end{description}
    }
    \item{\textbf{facet\_wrap}:}{~Two rows to display \pkg{ggplot2}
        \citep{ggplot2} scatter plots of subject exprs versus PAM50 centroids
        \citep{perou2000, perou2010} and a linear regression fit (in blue).
        If the subject has an unique subtype, then the graph is colored in
        red. In addition, if simulated permutations were run with
        \texttt{keep=TRUE} option, then the null distribution   
        boxplots are plotted with the corresponding observed un-permuted 
        correlations as big round dots.}
    \item{\textbf{textGrob}:}{The permutation \texttt{parameter} slot used in   
        the simulation.}   
\end{description}

The \pkg{pbcmc} also includes the ability to get a \texttt{pdf} report for the
complete database calling \texttt{databaseReport} function. In this context, 
the first page is a global summary of the database, i.e., a \texttt{summary}
contingency table of the permuted test classes against the original PAM50  
subtypes results. The following pages are the respective 
\texttt{subjectReport} outputs as the one shown in Figure \ref{fig:report}.    

\section{Conclusion}
The \pkg{pbcmc} package characterizes uncertainty assessment on gene 
expression classifiers, a. k. a. molecular signatures, based on a permutation  
test. In order to achieve this goal, synthetic simulated subjects are obtained 
bypermutations of gene labels. Then, each synthetic subject is tested against  
the corresponding subtype classifier to build the null distribution. Thus,   
classification confidence measurement report can be provided for each
subject, to assist physician therapy choice. At present, it is only  
available for PAM50 implementation in \pkg{genefu} package but, it can   
easily be extend to other molecular signatures.      

\section*{Acknowledgements}
\emph{Funding}: This work was supported by Universidad Cat\'{o}lica de    
C\'{o}rdoba (PIP 800-201304-00047-CC to E.A.F.), Argentina and National Council
of Scientific and Technical Research (CONICET), Argentina.  

\section*{Session Info}   
<<Session Info, echo=true>>=  
sessionInfo()
@

\bibliography{pbcmc-vignette}

\end{document}