\documentclass[article, shortnames, nojss]{jss} \usepackage[utf8]{inputenc} \usepackage{amsmath} \usepackage{enumerate} \DeclareMathOperator*{\argmax}{arg\,max} % \VignetteIndexEntry{PermutationBased Confidence for Molecular Class} % \VignetteKeyword{PAM50} % \VignetteKeyword{single subject classifier} % \VignetteKeyword{clinical outcome} % \VignetteKeyword{breast cancer subtype} % \VignetteDepends{breastCancerNKI} % \VignetteDepends{BiocParallel} \author{Crist\'{o}bal Fresno\\CONICET\\Universidad Cat\'{o}lica de C\'{o}rdoba \And Germ\'{a}n A Gonz\'{a}lez\\CONICET\\Universidad Cat\'{o}lica de C\'{o}rdoba \AND Andrea S Llera\\CONICET\\Fundaci\'{o}n Instituto Leloir \And Elmer A Fern\'{a}ndez\\CONICET\\Universidad Cat\'{o}lica de C\'{o}rdoba} \title{\pkg{pbcmc}: Permutation-Based Confidence for Molecular Classification} %% for pretty printing and a nice hypersummary also set: %% comma-separated \Plainauthor{Crist\'{o}bal Fresno, Germ\'{a}n A Gonz\'{a}lez, Andrea S Llera, Elmer A Fern\'{a}ndez} %% without formatting \Plaintitle{pbcmc: Permutation-Based Confidence for Molecular Classification} %% a short title (if necessary) \Shorttitle{\pkg{pbcmc}: Permutation-Based Confidence for Molecular Classification} %% an abstract and keywords \Abstract{ The \pkg{pbcmc} package characterizes uncertainty assessment on gene expression classifiers, a. k. a. molecular signatures, based on a permutation test. In order to achieve this goal, synthetic simulated subjects are obtained by permutations of gene labels. Then, each synthetic subject is tested against the corresponding subtype classifier to build the null distribution. Thus, classification confidence measurement reports can be provided for each subject, to assist physician therapy choice. At present, it is only available for PAM50 implementation in \pkg{genefu} package but, it can easily be extend to other molecular signatures. } \Keywords{PAM50, single subject classifier, clinical outcome, breast cancer subtype} %% without formatting \Plainkeywords{PAM50, single subject classifier, clinical outcome, breast cancer subtype} %% at least one keyword must be supplied %% The address of (at least) one author should be given %% in the following format: \Address{ Crist\'{o}bal Fresno, Germ\'{a}n A Gonz\'{a}lez \& Elmer A Fern\'{a}ndez\\ Bioscience Data Mining Group\\ Facultad de Ingenier\'{i}a\\ Universidad Cat\'{o}lica de C\'{o}rdoba - CONICET\\ X5016DHK Córdoba, Argentina\\ E-mail: \email{cfresno@bdmg.com.ar, ggonzalez@bdmg.com.ar, \\ efernandez@bdmg.com.ar}\\ URL: \url{http://www.bdmg.com.ar/}\\ $~$\\ Andrea S Llera\\ Laboratorio de Terapia Molecular y Celular\\ Fundaci\'{o}n Instituto Leloir - CONICET\\ C1405BWE Ciudad Autónoma de Buenos Aires, Argentina\\ E-mail: \email{allera@leloir.org.ar}\\ URL: \url{http://www.leloir.org.ar/podhajcer/} } \begin{document} \SweaveOpts{concordance=TRUE} % \SweaveOpts{concordance=FALSE} \section{Introduction} Gene expression-based classifiers, known as molecular signatures (MS), are gaining increasing attention in oncology and market. The MS can be defined as a set of coordinately expressed genes and an algorithm that use these data to predict disease subtypes, response to therapy, disease risk and clinical outcome \citep{andre2006}. Particularly in breast cancer market there exists many MS such as PAM50 \citep{perou2000, perou2010}, Prosigna\textregistered ~\citep{nielsen2014}, Oncotype DX\textregistered~\citep{paik2004} and MammaPrint\textregistered~ \citep{van2002}. In essence, most MS try to provide patient subtype classification or risk prediction which has been associated with distant metastasis free survival (DMFS) or relapse free survival (RFS). Consequently, they are intended to be used to support therapy choice. However, several authors have shown that data processing steps, technology, as well as population variability have an effect on measured gene expression and could bias subtype/risk subject assignment \citep{ioannidis2007, lusa2007, sorlie2001, sorlie2010, wu2012, ebbert2011}. These effects suggest that, from a statistical point of view, MS are not robust for subject classification. In particular, there is no control over type I and II like classification error and subjects could potentially be assigned to a wrong subtype/risk class. Indeed, the lack of certainty in class assignation could lead to a misleading therapy affecting subject outcome. Hence, the development of methods for significance or certainty on MS class assignation is crucial in order to assist physicians’ decision making \citep{wu2012, ebbert2011}. \subsection{PAM50 Molecular Siganture} The well-known breast cancer (BC) MS, PAM50 \citep{perou2000, perou2010}, is based on the comparison between the patient gene profiles (PGP) of 50 expressed genes, against five intrinsic genes profiles (IGP) representing: Basal, Her2-enriched, Luminal A, Luminal B and Normal-like subtypes using Spearman's $\rho$ correlation. Then, the subject will be assigned to the $i-th$ subtype according to (\ref{eq:PAM50}): \begin{equation} \label{eq:PAM50} \argmax_{i \in IGP} \rho(PGP, IGP_{i}) \end{equation} Particularly, patients are assigned to the $i-th$ subtype which maximize $\rho(PGP, IGP_{i})$, even if correlation is weak or there are similar/tight IGP correlations. The latter case has been addressed by \citet{cheang2009}, where they have excluded subjects with a correlation difference between Luminal A and B of less than 0.1, considering them ambiguous pattern, as a way to control a kind of type II error. However, type I error control is still a debt. \subsection{genefu library} \label{sec:genefu} At present, the freely available PAM50 \pkg{genefu} algorithm implementation \citep{genefu}, offers a kind of subtype probability, $P(IGP_{i})$, calculated as (\ref{eq:genefu}): \begin{equation} \label{eq:genefu} P(IGP_{i})=\frac{\rho(PGP, IGP_{i})}{\sum_i \rho(PGP, IGP_{i})} ~~~~\forall \rho(PGP, IGP_{i})>0 \end{equation} However, this probability does not take part in the classification rule. Even worse, a very low $\rho$ (weak relationship) could reach a high subtype probability, for instance, if all the others $\rho$'s are close to zero or are even negative. \subsection{Alternative proposal} In order to overcome these drawbacks, here a simple and reliable single subject classifier to control type I and II errors is proposed. Morevover, it provides a statistical significance on subtype assignation based on a gene label permutation test. Briefly, it evaluates if the observed $\rho$ of each IGP can be achieved by chance, regarding subject observed MS expressed gene values. In addition, we propose a user-friendly subtype assignation panel to support physicians’ decision making, enhancing PAM50 or commercial reports currently available in the market. The method is presented for PAM50 but can easily be extended to other MS algorithms. \section{Methods} The Permutation Based Confidence for Molecular Classification (\pkg{pbcmc}) package, estimates the statistical significance of $\rho$ for each IGP. In other words, we want to see whether the observed $\rho$ can be obtained by chance. In order to perform this task, the $\rho$ null distribution for each IGP ($\rho_{H0_{IGP}}$) is obtained by evaluating $\beta$ permutations of the SGP gene expression. Then, the observed (un-permuted) IGP correlations ($\rho_{u_{IGP}}$) are compared against their own $\rho_{H0_{IGP}}$ in order to evaluate whether $H_{0}:\rho_{u_{IGP}} \in \rho_{H0_{IGP}}$ versus $H_{1}:\rho_{u_{IGP}} \notin \rho_{H0_{IGP}}$ according to the p-values ($p_{IGP}$) calculated as in (\ref{eq:picp}): \begin{equation} \label{eq:picp} p_{IGP}=\frac{\sum_{i}^{\beta}I(\rho_{i}, \rho_{u})}{\beta}; ~~~ I(a, b)=\left \{ \begin{array}{ll} 1 & if ~~a > b \\ 0 & if ~~a \leq b \end{array} \right . \end{equation} where IGP stands for Basal, Her2-Enriched, Luminal A, Luminal B and Normal-like. The resulting five $p_{IGP}$'s are adjusted to control multiple comparisons using False Discovery Rate, FDR \citep{fdr}. Then, assuming an acceptable type I error, $\alpha$, the hypothesis test for all IGPs could result in: \begin{enumerate}[i.] \item No significant $\rho_u$ for any IGP, i. e., all adjusted $p_{IGP} > \alpha$. \item A unique significant $\rho_u$. \item Multiple significant $\rho_u$. \end{enumerate} For the first case, the subject cannot reliably be assigned to any IGP (not assigned - \textbf{NA}). In the second case, it is assigned (\textbf{A}) to the trustworthy current PAM50 subtype. In order to overcome the ambiguity of case iii., a correlation difference threshold of 0.1 between the top ones ($\rho_{u_{IGP_1}} > \rho_{u_{IGP_2}} > \ldots > 0$) was established, similarly as in \citet{cheang2009} for Luminal subtypes. Then, if $(\rho_{u_{IGP_1}}-\rho_{u_{IGP_2}})>0.1$ the subject is assigned as in ii., otherwise it is considered as an ambiguous (\textbf{Amb}) subject. \section{Implementation} \label{sec:implementation} The S4 class hierarchy of the \pkg{pbcmc} package is based on the implementation of an abstract \texttt{MolecularPermutationClassifier} class, which can potentially be used for any MS as depicted in Figure \ref{fig:hierarchy}. Basically, it has been developed as an organized data processing framework for its heirs. The formers are supposed to implement the respective responsibilities. Once a heir object is implemented the user can: \begin{figure} \centering \includegraphics[width=0.9\textwidth]{pbcmc.pdf} \caption{Package hierarchy. \texttt{MolecularPermutationClassifier} is the main S4 abstract class and \texttt{PAM50} implements it for the Molecular Signature (MS) of \citet{perou2000, perou2010}. The other classes represent user-defined implementations of other MS. Note that complete operation signature have been omitted for simplicity.} \label{fig:hierarchy} \end{figure} \begin{description} \item[loadBCDataset:] Load one of the example breast cancer dataset available at Bioconductor. At present, it is possible to load breastCancerXXX where XXX can be ``\texttt{upp}'' \citep{breastCancerUPP}, ``\texttt{nki}'' \citep{breastCancerNKI}, ``\texttt{vdx}'' \citep{breastCancerVDX}, ``\texttt{mainz}'' \citep{breastCancerMAINZ}, ``\texttt{transbig}'' \citep{breastCancerTRANSBIG} or ``\texttt{unt}'' \citep{breastCancerUNT}. \item[filtrate:] Remove, from the \texttt{exprs} matrix, subjects not required by the classification algorithm. \item[classify:] Generate subject classification according to the heir's implementation (PAM50, etc.). \item[permute:] Obtains subject classification based on the null $\rho_{H0}$ distribution by means of $\beta$ permutation simulations. \item[subtype:] Obtain the new classification using the permutation results. \item[subjectReport:] Create a friendly report to assist physician treatment decision making. \item[databaseReport:] Create a \texttt{pdf} with all \texttt{subjectReport}s, if a database is available. \end{description} At present, the only available heir is PAM50 based on \pkg{genefu} library \citep{genefu}. But, it can easily be extended to other MS such as Prosigna\textregistered ~\citep{nielsen2014} or others, just implementing \texttt{filtrate}, \texttt{classify}, \texttt{permutate}, \texttt{subtype}, \texttt{subjectReport} and \texttt{databaseReport} functions. \subsection{Example} In order to work with \texttt{PAM50} MS, the user must load a Bioconductor's breast cancer dataset. For example we can load NKI database \citep{breastCancerNKI}, provided that the require library is installed, using the following code: <>= options(prompt="R> ", continue="+ ", width=70, useFancyQuotes=FALSE, digits=4) suppressMessages(library("pbcmc")) @ <>= library("pbcmc") library("BiocParallel") object<-loadBCDataset(Class=PAM50, libname="nki", verbose=TRUE) object @ The \texttt{object} is a PAM50 instance, which contains the \texttt{exprs} matrix with gene expression values, associated \texttt{annotation} and clinical data in \texttt{targets} data.frame. On the other hand, the user can use PAM50's constructor to create an object with his/her own data or convert a \pkg{limma} \texttt{MAList} object into PAM50 using \texttt{as.PAM50(MAList\_object)} function. In the first case, the user will only need: \begin{enumerate}[a)] \item The \textbf{M} gene expression object, i. e., genes in rows and samples in columns. \item The \textbf{annotation} \texttt{data.frame} which must include the compulsory fields: ``\texttt{probe}'', \\ ``\texttt{NCBI.gene.symbol}'' and ``\texttt{EntrezGene.ID}''. \end{enumerate} For example, we could use \pkg{genefu}'s PAM50 centroids to check if our implementation solves the proof of concept, where we \emph{a priori} know the true class of each subject: <>= M<-pam50$centroids genes<-pam50$centroids.map names(genes)<-c("probe", "NCBI.gene.symbol", "EntrezGene.ID") object<-PAM50(exprs=M, annotation=genes) object @ Note that for the above output, the \texttt{targets} slot is empty, i. e., \texttt{nrow=0} and \texttt{ncol=0}. In addition, only the expression of the fifty genes and five IGP is available, with its corresponding \texttt{annotation} over the three compulsory fields. It is always a good idea to explore the slots content, to see whether they have been correctly loaded: <>= head(exprs(object)) ##The gene expression values for each subject head(annotation(object)) ##The compulsory annotation fields head(targets(object)) ##The clinical data, if available. @ Just as we expected, the five centroids are loaded in \texttt{exprs} slot, with their corresponding ``probe'', ``NCBI.gene.symbol'' and ``EntrezGene.ID'' number in the \texttt{annotation} slot and no available data for the \texttt{targets}. Now, the user is ready to work with the data following the workflow suggested in section \ref{sec:implementation} (filtrate, classify and permutate): <>= object<-filtrate(object, verbose=TRUE) object<-classify(object, std="none", verbose=TRUE) object<-permutate(object, nPerm=10000, pCutoff=0.01, where="fdr", corCutoff=0.1, keep=TRUE, seed=1234567890, verbose=TRUE, BPPARAM=bpparam()) @ The intention of \texttt{filtrate} function is to keep only the genes that will take place in the classification. In this example, it will not produce any change on the original \texttt{exprs} slot, given the fact that only the required fifty genes are present. But, if a complete microarray would have been present, then, probes that do not code for IGP will be removed. In addition, probes that code for the same gene (repeated or with similar annotation) will be treated as described in standardization (\texttt{std}) parameter. Once genes are \texttt{filtrate}d, the user can \texttt{classify} them using the original PAM50 algorithm. However, here we propose to obtain subtype assignment confidence using at least $\beta=10.000$ permutations over the SGP, using a type I error $\alpha=0.01$ on the adjusted p-values (``\texttt{fdr}'') and a correlation difference threshold of \texttt{corCutoff=0.1}. As a matter of fact, this process is computationally intensive, so we can take advantage of all the available computes cores using \pkg{BioParallel} package (\texttt{BPPARAM=bpparam()}) as we just did \citep{BiocParallel}. In addition, the user can track the permutation progress bar by including \texttt{verbose=TRUE} option. If we now take a look at the \texttt{object}: <>= object @ we can see that it has been updated. First, the \texttt{classification} slot contains two datasets: one with the subtype \emph{probability}, $P(IGP_{i})$, as described in section \ref{sec:genefu} and the \emph{correlation} of each subject with the five IGPs. The \emph{\$subtype} item shows a frequency table of the possible IGPs with the used subjects. In addition, the used \texttt{permutation} \texttt{parameters} are shown with the dimension of \emph{pvalues}, \emph{fdr} and \emph{subtype}s. Note that in this case \texttt{keep=TRUE} option was used so, the simulated correlation null distribution data points ($\rho_{H0_{IGPs}}$) are available. In this example we have used \pkg{genefu}'s PAM50 centroids, thus, only one subject is present (1) for each IGP cell in the \texttt{object} output. This result is also confirmed by the ones in the diagonal of \texttt{summary(object)} matrix between the original \emph{Subtype} and the \emph{Classes} found by the \pkg{pbcmc} package. Moreover, this toy example only shows assigned subjects (\textbf{A}) to the original PAM50 subtypes, whereas not assigned (\textbf{NA}) marginal row/column contains only zeros (0). If ambiguous (\textbf{AMB}) subjects would have been found, \emph{Classes} column will have included additional rows with the classes in dispute (e. g., ``LumA, Normal'' or ``Her2, LumB'', etc.). <>= summary(object) @ Finally, we can inspect the report of a single subject to see how the MS classification went (Figure \ref{fig:report}), in order to suggest an appropriate therapy for the physician: <>= subjectReport(object, subject=1) @ \begin{figure} \centering \includegraphics{pbcmc-vignette-SubjectReport} \caption{PAM50 permutation subject report for \textbf{genefu}'s ``Basal'' intrinsic gene profile (IGP). The top table summarize the results for the i-th subject, i. e., the correlation, p-value and false discovery rate (fdr) obtained for each IGP. In addition, scatter plots of the observed subject gene profiles against the IGP with the linear regression line (in blue). Red color indicates the assigned subtype. Finally, a boxplot for each IGP null permuted correlation distribution and big dots to represent the un-permuted observed correlations.} \label{fig:report} \end{figure} The report of Figure \ref{fig:report} is a \texttt{grid.arrange} object which basically consists of three main parts: \begin{description} \item{\textbf{tableGrob}:}{~A summary table which contains the following fields, \begin{description} \item{\textbf{\$Summary}:}{~\texttt{Subject} name and subtype obtained by \texttt{PAM50.Subtype} or the proposed methodology (\texttt{Permuted.Subtype}).} \item{\textbf{\$Fields}:}{~For the five PAM50 subtypes, \begin{itemize} \item Correlation: The correlation of the $i-th$ PAM50 centroid with the observed subject exprs, $\rho(PGP, IGP_{i})$. \item p-value: Permutation p-value obtained using the simulation data, $p_{IGP}$. \item FDR: Adjusted p-value using False Discovery Rate \citep{fdr}. \end{itemize} } \end{description} } \item{\textbf{facet\_wrap}:}{~Two rows to display \pkg{ggplot2} \citep{ggplot2} scatter plots of subject exprs versus PAM50 centroids \citep{perou2000, perou2010} and a linear regression fit (in blue). If the subject has an unique subtype, then the graph is colored in red. In addition, if simulated permutations were run with \texttt{keep=TRUE} option, then the null distribution boxplots are plotted with the corresponding observed un-permuted correlations as big round dots.} \item{\textbf{textGrob}:}{The permutation \texttt{parameter} slot used in the simulation.} \end{description} The \pkg{pbcmc} also includes the ability to get a \texttt{pdf} report for the complete database calling \texttt{databaseReport} function. In this context, the first page is a global summary of the database, i.e., a \texttt{summary} contingency table of the permuted test classes against the original PAM50 subtypes results. The following pages are the respective \texttt{subjectReport} outputs as the one shown in Figure \ref{fig:report}. \section{Conclusion} The \pkg{pbcmc} package characterizes uncertainty assessment on gene expression classifiers, a. k. a. molecular signatures, based on a permutation test. In order to achieve this goal, synthetic simulated subjects are obtained bypermutations of gene labels. Then, each synthetic subject is tested against the corresponding subtype classifier to build the null distribution. Thus, classification confidence measurement report can be provided for each subject, to assist physician therapy choice. At present, it is only available for PAM50 implementation in \pkg{genefu} package but, it can easily be extend to other molecular signatures. \section*{Acknowledgements} \emph{Funding}: This work was supported by Universidad Cat\'{o}lica de C\'{o}rdoba (PIP 800-201304-00047-CC to E.A.F.), Argentina and National Council of Scientific and Technical Research (CONICET), Argentina. \section*{Session Info} <>= sessionInfo() @ \bibliography{pbcmc-vignette} \end{document}