% -*- mode: noweb; noweb-default-code-mode: R-mode; -*- %\VignetteIndexEntry{dexus: Manual for the R package} %\VignetteDepends{dexus} %\VignettePackage{dexus} %\VignetteKeywords{RNASeq, RNA-Seq, RNA, Sequencing, NGS, differential expression, % transcripts, transcriptomics, negative binomial, dispersion, % overdispersion, maximum-a-posteriori, I/NI, INI, informative, % non-informative, replicates, unsupervised, clustering, unknown, conditions, % ENCODE, sample groups, case-control, treatment study, randomized controlled % study, RCS, non-randomized, cross-sectional, cross sectional, cohort} \documentclass[article]{bioinf} \usepackage[noae]{Sweave} \usepackage{amsmath,amssymb} \usepackage{hyperref} \usepackage{float} \usepackage[authoryear]{natbib} \usepackage{bm} \hypersetup{colorlinks=false, pdfborder=0 0 0, pdftitle={DEXUS -- Identifying Differential Expression in RNA-Seq Studies with Unknown Conditions}, pdfauthor={G\"unter Klambauer}, pdfsubject={RNA-Seq and Detection of Differential Expression}, pdfkeywords={RNASeq, RNA-Seq, RNA, Sequencing, NGS, differential expression, transcripts, transcriptomics, negative binomial, dispersion, overdispersion, maximum-a-posteriori, I/NI, INI, informative, non-informative, replicates, unsupervised, clustering, unknown, conditions, ENCODE, sample groups, case-control, treatment study, randomized controlled study, RCS, non-randomized, cross-sectional, cross sectional, cohort}} \title{DEXUS -- Identifying Differential Expression in RNA-Seq Studies with Unknown Conditions} \author{G\"unter Klambauer and Thomas Unterthiner} \affiliation{Institute of Bioinformatics, Johannes Kepler University Linz\\Altenberger Str. 69, 4040 Linz, Austria\\ \email{klambauer@bioinf.jku.at}} \newcommand{\DEXUS}{\texttt{DEXUS}} \newcommand{\R}{R} \newcommand{\Real}{\mathbb{R}} \renewcommand{\vec}[1]{\mathbf{#1}} \def\ga{\bm{\gamma}} \def\ep{\bm{\epsilon}} \def\al{\bm{\alpha}} \def\Lambda{\lambda} \def\pp{\bm{\Psi}} \def\pps{\bm{\psi}} \def\Si{\bm{\Sigma}} \def\La{\bm{\Lambda}} \def\m{\bm{m}} \def\x{\bm{x}} \def\r{\bm{r}} \def\p{\bm{p}} \def\y{\bm{y}} \def\z{\bm{z}} \def\vb{\bm{v}} \def\u{\bm{u}} \def\1{\bm{1}} \def\0{\bm{0}} \def\le{{\bf \large 1}} \def\lnull{{\bf \large 0}} \def\I{\bm{I}} \def\C{\bm{C}} \def\c{\bm{c}} \def\X{\bm{X}} \def\Z{\bm{Z}} \def\L{\bm{L}} \def\muu{\bm{\mu}} \def\RR{\mathbb{R}} \def\NN{\mathbb{N}} \def\sign{\mathrm{sign}} \def\Xcal{\mathcal{X}} \def\Fcal{\mathcal{F}} \def\E{\mathrm{E}} \def\D{\mathrm{D}} \def\P{\mathrm{P}} \def\ML{\mathrm{ML}} \def\sgn{\mathop{\mathrm{sgn}\,}} \def\AUCROC{$\mathrm{AUC}_{\mathrm{ROC}}$\ } \def\AUCPR{$\mathrm{AUC}_{\mathrm{PR}}$\ } \definecolor{grey}{rgb}{0.8,0.8,0.8} \newcommand{\Rfunction}[1]{{\texttt{#1}}} \newcommand{\Robject}[1]{{\texttt{#1}}} \newcommand{\Rpackage}[1]{{\texttt{#1}}\ } \newcommand{\method}[1]{{\fontfamily{phv}\fontshape{rm}\selectfont #1}} \newcommand{\pvalue}{$p$-value\ } \newcommand{\pvalues}{$p$-values\ } \newcommand{\pe}{$p$\hspace{2pt}=\hspace{2pt}} \setkeys{Gin}{width=0.55\textwidth} \SweaveOpts{eps=FALSE} \begin{document} <>= options(width=75) set.seed(0) library(dexus) dexusVersion <- packageDescription("dexus")$Version @ \newcommand{\dexusVersion}{\Sexpr{dexusVersion}} \manualtitlepage[Version \dexusVersion, \today] \vspace{1cm} \newlength{\auxparskip} \setlength{\auxparskip}{\parskip} \setlength{\parskip}{0pt} \tableofcontents \clearpage \setlength{\parskip}{\auxparskip} \newlength{\Nboxwidth} \setlength{\Nboxwidth}{\textwidth} \addtolength{\Nboxwidth}{-2\fboxrule} \addtolength{\Nboxwidth}{-2\fboxsep} \newcommand{\notebox}[1]{% \begin{center} \fbox{\begin{minipage}{\Nboxwidth} \noindent{\sffamily\bfseries Note:} #1 \end{minipage}} \end{center}} \section{Introduction} \method{DEXUS} identifies differentially expressed transcripts in RNA-Seq data under all possible study designs such as studies without replicates, without sample groups, and with unknown conditions. \method{DEXUS} works also for known conditions, for example for RNA-Seq data with two or multiple conditions. RNA-Seq read count data can be provided both by the S4 class {\tt CountDataSet} and by read count matrices. Differentially expressed transcripts can be visualized by heatmaps, in which unknown conditions, replicates, and samples groups are also indicated. This software is fast as the core algorithm is written in C. For very large data sets, a parallel version of \method{DEXUS} is provided in this \Rpackage. \method{DEXUS} is a statistical model that is selected in a Bayesian framework by an EM algorithm. \method{DEXUS} does not need replicates to detect differentially expressed transcript, since the replicates (or conditions) are estimated by the EM method for each transcript. This is an unsupervised machine learning approach that does not require labeled data. The method provides an informative/non-informative (I/NI) value to extract differentially expressed transcripts at a desired significance level or power. Detection of differential expression in RNA-Seq data is currently limited to studies in which two or more sample conditions are known {\em a priori}. However, these biological conditions are typically unknown in cohort, cross-sectional, and non-randomized controlled studies such as the HapMap, the ENCODE, or the 1000 Genomes project. \method{DEXUS} models read counts as a finite mixture of negative binomial distributions. See \url{http://www.bioinf.jku.at/software/dexus} for additional information, data sets, and \R\ scripts. \section{Getting Started and Quick Guide} To load the package, enter the following in the \R\ session: <<>>= library(dexus) @ With the package {\tt dexus} we provide the ``Mice strains'' \citep{Bottomly2011}, ``Primate Liver'' \citep{Blekhman2010}, ``Maize leaves'' \citep{Li2010a}, ``European HapMap'' \citep{Montgomery2010}, and the ``Nigerian HapMap'' \citep{Pickrell2010} data sets. The read counts are stored in the objects {\tt countsBottomly}, {\tt countsGilad}, {\tt countsLi}, {\tt countsMontgomery}, and {\tt countsPickrell}, respectively. <<>>= data(dexus) ls() @ \subsection{Unknown Conditions} One can simply run \method{DEXUS} by applying the function {\tt dexus} to the count matrices. This is the mode in which the conditions are unknown, i.e. no labels that indicate the replicate groups have to be provided. <>= result <- dexus(countsBottomly[1:1000, ]) plot(result) @ <>= result <- dexus(countsBottomly[1:1000, ]) pdf("001.pdf") plot(result,cexSamples=1) dev.off() @ \begin{figure}[H] \begin{center} \includegraphics[angle=0,width= 0.6\columnwidth]{001.pdf} \caption[Result of \method{DEXUS} for data with {\em unknown} conditions.]{Result of \method{DEXUS} for data with {\em unknown} conditions. A heatmap of the log read counts of the top ranked transcripts of the ``Mice strains'' \citep{Bottomly2011} data set is shown. Rows represent transcripts sorted by their I/NI values. The trancript on the top has the highest I/NI value. Columns represent different samples. The labels ``D2'' and ``B6'' represent the two different strains. Red crossed indicate the samples belonging to the second condition that \method{DEXUS} has identified. } \end{center} \end{figure} \subsection{Known Conditions} To test between two or more replicate groups, \method{DEXUS} needs to be provided with the group labels: <>= resultSupervised <- dexus(countsBottomly[1:1000, ], labels=substr(colnames(countsBottomly),1,2)) plot(resultSupervised) @ <>= resultSupervised <- dexus(countsBottomly[1:1000, ], labels=substr(colnames(countsBottomly),1,2)) pdf("002.pdf") plot(resultSupervised,cexSamples=1) dev.off() @ \begin{figure}[H] \begin{center} \includegraphics[angle=0,width= 0.6\columnwidth]{002.pdf} \caption[Result of \method{DEXUS} for data with {\em known} conditions.]{Result of \method{DEXUS} for data with {\em known} conditions. A heatmap of the log read counts of the top ranked transcripts of the ``Mice strains'' \citep{Bottomly2011} data set is shown. Rows represent transcripts sorted by their $p$-values. The trancript on the top has the lowest $p$-value. Columns represent different samples. The labels ``D2'' and ``B6'' represent the two different strains. Red crossed indicate the samples belonging to the second condition, that was given by the labels} \end{center} \end{figure} \section{Input Data: Read Count Matrices or {\tt CountDataSets}} \method{DEXUS} expects a table of counts per transcript, transcript, exon, or any other region of interest as input in analogy to other RNA-Seq analysis methods \citep{Anders2010, Robinson2010a, Hardcastle2010, Li2012, Wang2010, Li2011, Tarazona2011, Wu2012}. The table should have the transcripts as rows and samples as columns. An entry should correspond to the number of reads of the sample mapping to the transcript. Technical replicates of one sample should be summed up so that each column corresponds to one sample. There are various ways how to produce count matrices from BAM files: \begin{itemize} \item A full guide on processing RNA-Seq data including the calculation of read count matrices is provided at \url{http://en.wikibooks.org/wiki/Next_Generation_Sequencing_(NGS)/RNA}. \item The function {\tt HTSeq-count} of {\tt HTSeq} Python package \url{http://www-huber.embl.de/users/anders/HTSeq/doc/count.html}. \item Ready-made count tables for a lot of studies are available at \url{http://bowtie-bio.sourceforge.net/recount/} \citep{Frazee2011}. \item The function {\tt countOverlaps} of the Bioconductor package {\tt GenomicRanges} can also be utilized. \item The function {\tt getSegmentReadCountsFromBAM} of the Bioconductor package {\tt cn.mops} \citep{Klambauer2012} can also be utilized to extract read counts from BAM files efficiently. \end{itemize} \subsection{Read Count Matrices or Count Tables as Input for DEXUS} A read count matrix or count table should look like the following: <<>>= data(dexus) countsBottomly[1:10,1:5] @ A numeric matrix of read counts can directly be used with \method{DEXUS}. <>= result <- dexus(countsBottomly) @ \subsection{{\tt CountDataSets} as input for DEXUS} A {\tt CountDataSet}, such as the ones used in the package \method{DESeq} \citep{Anders2010}, can also directly be used it with \method{DEXUS}: <>= library(DESeq) cds <- newCountDataSet(countData=countsBottomly, conditions=substr(colnames(countsBottomly),1,2) ) result <- dexus(cds) @ \section{General Study Designs: No Replicates, Unknown Sample Groups or Conditions} Examples of studies in which the groups are unknown, are the studies of \citet{Montgomery2010} and \citet{Pickrell2010}. They sequenced the RNA of HapMap individuals to investigate eQTLs. \method{DEXUS} is able to identify differential expression in these data sets. The method estimates the conditions for each transcript individually. To run the method simply apply the function {\tt dexus} to the count table. In the following example we run the algorithm only one the first 1000 transcripts. <<>>= resultMontgomery <- dexus(countsMontgomery[1:1000, ]) @ To show a summary of the result object, simply type the following. <<>>= resultMontgomery @ The transcripts are in their original order; the displayed columns give the whether a transcript is differentially expressed ({\tt INICall}), the evidence for differential expression measured by the I/NI values ({\tt INIValues}), and the means for each condition. It is possible to sort the result object such that the transcripts with the highest I/NI values are ranked highest. <>= sort(resultMontgomery) @ Transcripts with an I/NI value above $0.1$ are classfied as differentially expressed. The function {\tt INI} filters the result object for informative transcripts and sorts them by their I/NI calls. <<>>= informativeTranscripts <- INI(resultMontgomery,threshold=0.2) @ The result can be visualized by a heatmap using the {\tt plot} function. <>= plot(informativeTranscripts) @ <>= pdf("004.pdf",width=12,height=6) plot(informativeTranscripts) dev.off() @ \begin{figure}[H] \begin{center} \includegraphics[angle=0,width= \textwidth]{004.pdf} \caption[Result of \method{DEXUS} in unsupervised mode with {\em unknown conditions}.]{Result of \method{DEXUS} in unsupervised mode with {\em unknown conditions}. A heatmap of the log read counts of the top ranked transcripts of the ``European HapMap'' \citep{Montgomery2010} data set is shown. Rows represent transcripts sorted by their I/NI values. The trancript on the top has the highest I/NI. Columns represent different samples. Red symbols indicate samples that belong to the minor condition that was identified by \method{DEXUS}. } \end{center} \end{figure} Information about a specific transcript can also be accessed from the result object by subsetting it with the transcript name. <<>>= resultMontgomery["ENSG00000007038"] @ Even more information can be obtained by using the {\tt as.data.frame} function. <>= as.data.frame(resultMontgomery["ENSG00000007038"]) @ <>= as.data.frame(resultMontgomery["ENSG00000007038"])[,1:12] @ To convert the full result object to a data frame that can be exported the function {\tt as.data.frame} can be used. <>= as.data.frame(sort(resultMontgomery)) @ For more information on the result object, see Section \ref{s:ResultObject}. \section{Case-Control Like Study Designs: Replicates, Known Sample Groups or Conditions} \subsection{Two Known Groups or Conditions} In the study of \citet{Bottomly2011}, two strains of mice, C57BL/6J (B6) and DBA/2J (D2), were compared using both RNA-Seq and microarrays. The data set consists of 21 lanes from male mice (10 of the B6 strain and 11 of D2 strain), produced using an Illumina GAIIx sequencing machine. The data set was provided by the \method{ReCount} repository \citep{Frazee2011} that is based on Ensembl 61 transcript definitions. In this case of {\em two known conditions} we provide \method{DEXUS} with the group labels, in order to detect transcripts that are differentially expressed between the two mice strains. We apply the function {\tt dexus} to the count table of the first 1000 transcripts and provide the labels of the samples, and set the normalization to ``Upper Quartile'' normalization. <<>>= resultSupervised <- dexus(countsBottomly[1:1000, ], labels=substr(colnames(countsBottomly),1,2), normalization="upperquartile") @ To show a list of differentially expressed transcripts, simply type the name of the result object. <<>>= resultSupervised @ To sort the transcripts in the result object by $p$-values use the {\tt sort} method. <<>>= resultSupervised <- sort(resultSupervised) @ To obtain a heatmap of the differentially expressed transcripts, type {\tt plot}. <>= plot(resultSupervised) @ To get the full list of transcripts together with additional information, such as the I/NI values, conditions, dispersions, and means the function {\tt as.data.frame} can be used. <>= as.data.frame(resultSupervised) @ For more information on the result object, see Section \ref{s:ResultObject}. \subsection{Multiple Known Groups or Conditions} \citet{Blekhman2010} investigated the differences in alternative splicing in liver tissue between humans, chimpanzees and rhesus macaques. For this purpose they performed RNA-Seq on three male and three female liver samples from each species. They focused on the expression values of exons that had reliably determined orthologs in all species. Read counts for exons were provided by \citet{Blekhman2010}, who used transcript models from Ensemble (Release 50). In this case the three species are three distinct groups. The aim is to find transcripts, that show large differences between these groups. We run \method{DEXUS} on this data set and provide the method with the group labels, i.e. the species. <<>>= resultMultipleGroups <- dexus(countsGilad[1:1000, ], labels=substr(colnames(countsGilad),1,2)) @ To show a list of differentially expressed transcripts, simply type the name of the result object. <<>>= resultMultipleGroups @ To sort the transcripts in the result object by $p$-values use the {\tt sort} method. <<>>= resultMultipleGroups <- sort(resultMultipleGroups) @ To obtain a heatmap of the top-ranked transcripts use the {\tt plot} function. <>= plot(resultMultipleGroups) @ <>= pdf("003.pdf") plot(resultMultipleGroups,cexSamples=1) dev.off() @ \begin{figure}[H] \begin{center} \includegraphics[angle=0,width= 0.6\columnwidth]{003.pdf} \caption[Result of \method{DEXUS} in supervised mode with {\em multiple known groups}.]{Result of \method{DEXUS} in supervised mode with {\em multiple known groups}. A heatmap of the log read counts of the top ranked transcripts of the ``Primate Liver'' \citep{Blekhman2010} data set is shown. Rows represent transcripts sorted by their $p$-values. The trancript on the top has the lowest $p$-value. Columns represent different samples. The labels ``HS'',``PT'' and ``MM'' represent the three different species. Red symbols indicate the different species. } \end{center} \end{figure} To get the full list of transcripts together with their $p$-values the function {\tt getResult} can be used. <>= as.data.frame(resultMultipleGroups) @ For more information on the result object, see Section \ref{s:ResultObject}. \section{Calling Differential Expression, Visualization and the Result Object} \label{s:ResultObject} \subsection{Calling Differential Expression by the Informative/Non-Informative Call} In a setting in which the conditions or sample groups are unknown, or in which there are no replicates, the I/NI value measures the evidence for differential expression. At different thresholds \method{DEXUS} has different detection powers (sensitivity) and significance levels (specificity). On 2,400 simulated data sets, I/NI value thresholds of 0.025, 0.05, and 0.1 yielded average specificities of 92\%, 97\%, and 99\% at sensitivities of 76\%, 61\%, and 38\% respectively. The threshold for the I/NI values is set by the function {\tt INIThreshold}. The function {\tt INI} filters out non-informative transcripts. <>= informativeTranscripts2 <- INI(resultMontgomery,threshold=0.25) @ The object {\tt informativeTranscripts2} contains only the informative, i.e. the differentially expressed transcripts. \subsection{Visualization} There is a generic plotting function that can be applied to the result object of \method{DEXUS}. The log read counts are visualized as a heatmap, in which we also indicate the identified sample condition. We can select which transcripts we want to plot by using the parameter {\tt idx}. <>= #plots the top 8 transcripts plot(sort(informativeTranscripts2), idx=1:8) @ <>= pdf("005.pdf",width=12,height=6) plot(sort(informativeTranscripts2),idx=1:8) dev.off() @ \begin{figure}[H] \begin{center} \includegraphics[angle=0,width= \textwidth]{005.pdf} \caption[Result of \method{DEXUS} in unsupervised mode with {\em unknown conditions}.]{Result of \method{DEXUS} in unsupervised mode with {\em unknown conditions}. A heatmap of the log read counts of the top ranked transcripts of the ``European HapMap'' \citep{Montgomery2010} data set is shown. Rows represent transcripts sorted by their I/NI values. The trancript on the top has the highest I/NI. Columns represent different samples. Red symbols indicate samples that belong to the minor condition that was identified by \method{DEXUS}. } \end{center} \end{figure} \subsection{The Structure of the Result Object} \method{DEXUS} returns an instance of ``DEXUSResult'' that contains the following slots: \begin{itemize} \item {\tt transcriptNames}: The names of the transcripts, genes, exons, or regions of interest. \item {\tt sampleNames}: The sample names, as they were given in the input matrix. \item {\tt inputData}: The original read count matrix. \item {\tt normalizedData}: The normalized read count matrix. \item {\tt sizeFactors}: The size factors that were calculated for the normalization. This is that factor that scales each column or sample. \item {\tt INIValues}: An informative/non-informative (I/NI) value for each sample that measures the evidence for differential expression. \item {\tt INIThreshold}: The threshold for the I/NI values. Transcript with I/NI values above the threshold will be considered as differentially expressed. \item {\tt INICalls}: A binary value for each transcript indicating whether it is differentially expressed. \item {\tt pvals:} In case of {\em two known conditions} or {\em multiple known conditions} it is possible to calculate a $p$-value for each transcript. This value is given in this slot. \item {\tt responsibilites:} A matrix of the size of the input matrix. It indicates the condition for each sample and transcript. The condition named ``1'' is the major condition. All other conditions are minor conditions. In case of supervised ({\em two known conditions} or {\em multiple known conditions}) analyses this clustering matrix will be the same for all transcripts. \item {\tt posteriorProbs}: An array of the dimension of transcripts times samples times conditions. It gives the probability that a certain read count $x$ was generated under a condition. \item {\tt logFC}: The log foldchanges between the conditions. The reference is always condition ``1''. \item {\tt conditionSizes}: The ratio of samples belonging to that condition. These are the $\alpha_i$ values of the model. \item {\tt sizeParameters}: The size parameter estimates for each condition. These are the $r_i$ values of the model. \item {\tt means}: The mean of each condition. The $\mu _i$ values of the model. \item {\tt dispersions}: The dispersion estimates for each condition. The inverse size parameters. \item {\tt params}: The input parameters of the \method{DEXUS} algorithm. \end{itemize} \section{Parameter Settings of DEXUS} The input parameters of the \method{DEXUS} algorithm are the following: \begin{itemize} \item {\tt X}: The read count matrix. If the reads are already normalized, then set {\tt normalization} to ``{\tt none}''. \item {\tt nclasses}: The number of conditions that \method{DEXUS} should model. The number should be much smaller than the number of samples. In supervised mode ({\em two known conditions} or {\em multiple known conditions}) the algorithm uses the number of different labels as {\tt nclasses}. Needs not be specified in supervised mode. \item {\tt alphaInit}: The initialization of the $\alpha_i$ values of the model. A vector with length of the number of conditions. The algorithm internally scales this vector to sum 1. Needs not be specified in supervised mode. \item {\tt G}: The weight of the Dirichlet prior. An important parameter that guides the EM algorithm. The higher this value the more transcripts will be explained by one condition and will therefore be classified as not differentially expressed. The lower the value of {\tt G} the more transcripts will be found to be differentially expressed. Needs not be specified in supervised mode. \item {\tt cyc}: The number of cycles of the EM algorithm per transcript. Needs not be specified in supervised mode. \item {\tt labels}: If the conditions, groups, or classes are known, then they can be passed to the algorithm through this parameter. \item {\tt normalization}: The normalization method to be used. Choices are ``RLE'', ``upperquartile'', and ``none''. \item {\tt kmeansIter}: For the initialization of the algorithm a k-means clustering is run. This is the number of iterations of the clustering. \item {\tt ignoreIfAllCountsSmaller}: A transcript is considered as ``not expressed'', if counts of all samples are below this value. The algorithm is not applied to these transcripts. \item {\tt theta}: The weight of the exponential prior on the size parameter of the negative binomial distributions. The higher this parameter, the lower the estimates of the size parameters, and consequently the higher the estimates of the overdispersions. \item {\tt minMu}: The minimal value for the mean parameter of the negative binomial distribution. \item {\tt rmax}: An upper bound for the size parameter and thereby a lower bound for the overdispersion. The value is set to the value that \method{DESeq} uses for this purpose. \item {\tt initializiation}: How the initial estimates of the conditions are determined. Possible choices are ``kmeans'' and ``quantiles''. Needs not be specified in supervised mode. \item {\tt multiClassPhiPoolingFunction}: In case of multiple known conditions it is possible to calculate one overdispersion value per transcript. This can be calculated over all conditions or as mean, maximum or minimum over the specified conditions. Usually the option ``NULL'' (calculation of the overdispersion across all conditions) performs best. \end{itemize} \section{The Method} \method{DEXUS} models read counts as a finite mixture of negative binomial distributions in which each mixture component corresponds to a condition. \method{DEXUS} classifies a transcript as differentially expressed if modeling of its read counts requires more than one condition. To account for the high overdispersion observed in RNA-Seq data, \method{DEXUS} assumes that under each condition the read counts are drawn from a negative binomial distribution. Read count $x$ is explained by a mixture of $n$ negative binomial distributions: \begin{align} \label{eq:DEXUS} p(x) \ = \ \sum_{i=1}^{n}\alpha_{i} \ \mathrm{NB}(x \ ; \ \mu_{i},r_{i}) \ , \end{align} where $\alpha_{i}$ is the probability of being in condition $i$ out of $n$ possible conditions. In condition $i$, read counts are drawn from a negative binomial distribution with mean $\mu_i$ and size $r_i$, where the size parameter $r_{i}$ is the inverse of the overdispersion $\phi_i$. An expectation maximization (EM) algorithm is used to estimate mean and overdispersion parameters of the negative binomials as well as the condition under which a particular read count was generated. \method{DEXUS} decomposes read count variation into variation due to noise and variation due to differential expression. The evidence for differential expression is measured by an informative/non-informative (I/NI) value. \method{DEXUS} applies a threshold to the I/NI value to extract differentially expressed transcripts with a desired specificity (significance level) or sensitivity (power). \method{DEXUS} performs excellently in identifying differentially expressed transcripts on data with unknown conditions. \method{DEXUS} was tested on 2,400 simulated data sets. For I/NI value thresholds of 0.025, 0.05, and 0.1, it yielded average specificities of 92\%, 97\%, and 99\% at sensitivities of 76\%, 61\%, and 38\%, respectively. Subsequently, \method{DEXUS} was tested on real-world data sets, in which it identified differentially expressed transcripts between subgroups defined by sex, species, or tissue although information about these subgroups was withheld. On HapMap individuals, \method{DEXUS} detected several differentially expressed transcripts, the vast majority of which are related to sex, eQTLs, or copy number variable regions. However, we were unable to interpret the conditions for some differentially expressed transcripts which hints at the existence of another cause of differential expression. \section{A MAP Estimate for the Size Parameter and the Overdispersion of a Negative Binomial} We provide the function {\tt getSizeNB} that gives an estimate for the size parameter of a negative binomial distribution from given data. In this function the maximum-likelihood estimate is used, if the argument {\tt eta} is set to 0, and if {\tt eta} is set to a value greater than 0, a maximum-a-posteriori estimator for the size parameter is calculated. In that case an exponential prior is used. The argument {\tt eta} determines the weight of this prior. The maximum-likelihood estimator overestimates the size parameter and, thus, underestimates the overdispersion parameter \cite{Piegorsch1990}. The maximum-a-posteriori estimator can correct for this bias and decreases the variance of the estimator, as we show in the following example. Another problem is that, if the mean of the given data exceeds the variance, the maximum-likelihood-estimator tends to infinity \cite{Anscombe1950}. By setting the argument {\tt rmax} to a positive value, one can infer an upper bound on the size parameter and, thereby, a lower bound on the overdispersion. <>= trueSizeParameter <- 2 x <- rnbinom(n=5, size=trueSizeParameter, mu=40) (sizeML <- getSizeNB(x,eta=0)) @ <>= trueSizeParameter <- 2 x <- c(38,40,30,55,36) (sizeML <- getSizeNB(x,eta=0)) @ <<>>= (sizeMAP <- getSizeNB(x,eta=1)) @ <<>>= (trueDispersion <- 1/trueSizeParameter) @ <<>>= (dispersionML <- 1/getSizeNB(x,eta=0)) @ <<>>= (dispersionMAP <- 1/getSizeNB(x,eta=1)) @ \bibliographystyle{natbib} %\bibliography{literature} \begin{thebibliography}{} \bibitem[Anders and Huber(2010)Anders and Huber]{Anders2010} Anders, S. and Huber, W. (2010). \newblock Differential expression analysis for sequence count data. \newblock {\em Genome Biology\/}, {\bf 11}(10), R106. \bibitem[Anscombe(1950)Anscombe]{Anscombe1950} Anscombe, F. J. (1950). \newblock Sampling theory of the negative binomial and logarithmic series distributions. \newblock {\em Biometrika\/}, {\bf 37}(3/4), 358--382. \bibitem[Blekhman {\em et al.}(2010)Blekhman, Marioni, Zumbo, Stephens, and Gilad]{Blekhman2010} Blekhman, R., Marioni, J. C., Zumbo, P., Stephens, M., and Gilad, Y. (2010). \newblock Sex-specific and lineage-specific alternative splicing in primates. \newblock {\em Genome Research\/}, {\bf 20}(2), 180--189. \bibitem[Bottomly {\em et al.}(2011)Bottomly, Walter, Hunter, Darakjian, Kawane, Buck, Searles, Mooney, McWeeney, and Hitzemann]{Bottomly2011} Bottomly, D., Walter, N. A. R., Hunter, J. E., Darakjian, P., Kawane, S., Buck, K. J., Searles, R. P., Mooney, M., McWeeney, S. K., and Hitzemann, R. (2011). \newblock {Evaluating gene expression in C57BL/6J and DBA/2J mouse striatum using RNA-Seq and microarrays.} \newblock {\em PLoS One\/}, {\bf 6}(3), e17820. \bibitem[Frazee {\em et al.}(2011)Frazee, Langmead, and Leek]{Frazee2011} Frazee, A. C., Langmead, B., and Leek, J. T. (2011). \newblock {ReCount: a multi-experiment resource of analysis-ready RNA-seq gene count datasets.} \newblock {\em BMC Bioinformatics\/}, {\bf 12}, 449. \bibitem[Hardcastle and Kelly(2010)Hardcastle and Kelly]{Hardcastle2010} Hardcastle, T. J. and Kelly, K. A. (2010). \newblock {baySeq: empirical Bayesian methods for identifying differential expression in sequence count data.} \newblock {\em BMC Bioinformatics\/}, {\bf 11}, 422. \bibitem[Klambauer {\em et al.}(2012)Klambauer, Schwarzbauer, Mayr, Clevert, Mitterecker, Bodenhofer, and Hochreiter]{Klambauer2012} Klambauer, G., Schwarzbauer, K., Mayr, A., Clevert, D.-A., Mitterecker, A., Bodenhofer, U., and Hochreiter, S. (2012). \newblock {cn.MOPS: mixture of Poissons for discovering copy number variations in next-generation sequencing data with a low false discovery rate}. \newblock {\em Nucleic Acids Research\/}, {\bf 40}(9), e69. \bibitem[Li and Tibshirani(2011)Li and Tibshirani]{Li2011} Li, J. and Tibshirani, R. (2011). \newblock Finding consistent patterns: A nonparametric approach for identifying differential expression in {RNA}-seq data. \newblock {\em Statistical Methods in Medical Research\/}, {\bf Published online}. \bibitem[Li {\em et al.}(2012)Li, Witten, Johnstone, and Tibshirani]{Li2012} Li, J., Witten, D. M., Johnstone, I. M., and Tibshirani, R. (2012). \newblock {Normalization, testing, and false discovery rate estimation for RNA-sequencing data}. \newblock {\em Biostatistics\/}, {\bf 13}(3), 523--538. \bibitem[Li {\em et al.}(2010)Li, Ponnala, Gandotra, Wang, Si, Tausta, Kebrom, Provart, Patel, Myers, Reidel, Turgeon, Liu, Sun, Nelson, and Brutnell]{Li2010a} Li, P., Ponnala, L., Gandotra, N., Wang, L., Si, Y., Tausta, S. L., Kebrom, T. H., Provart, N., Patel, R., Myers, C. R., Reidel, E. J., Turgeon, R., Liu, P., Sun, Q., Nelson, T., and Brutnell, T. P. (2010). \newblock The developmental dynamics of the maize leaf transcriptome. \newblock {\em Nature Genetics\/}, {\bf 42}(12), 1060--1067. \bibitem[Montgomery {\em et al.}(2010)Montgomery, Sammeth, Gutierrez-Arcelus, Lach, Ingle, Nisbett, Guigo, and Dermitzakis]{Montgomery2010} Montgomery, S. B., Sammeth, M., Gutierrez-Arcelus, M., Lach, R. P., Ingle, C., Nisbett, J., Guigo, R., and Dermitzakis, E. T. (2010). \newblock Transcriptome genetics using second generation sequencing in a caucasian population. \newblock {\em Nature\/}, {\bf 464}(7289), 773--777. \bibitem[Pickrell {\em et al.}(2010)Pickrell, Marioni, Pai, Degner, Engelhardt, Nkadori, Veyrieras, Stephens, Gilad, and Pritchard]{Pickrell2010} Pickrell, J. K., Marioni, J. C., Pai, A. A., Degner, J. F., Engelhardt, B. E., Nkadori, E., Veyrieras, J.-B., Stephens, M., Gilad, Y., and Pritchard, J. K. (2010). \newblock Understanding mechanisms underlying human gene expression variation with rna sequencing. \newblock {\em Nature\/}, {\bf 464}(7289), 768--772. \bibitem[Piegorsch(1990)Piegorsch]{Piegorsch1990} Piegorsch, W. W. (1990). \newblock Maximum likelihood estimation for the negative binomial dispersion parameter. \newblock {\em Biometrics\/}, {\bf 46}(3), 863--867. \bibitem[Robinson {\em et al.}(2010)Robinson, McCarthy, and Smyth]{Robinson2010a} Robinson, M. D., McCarthy, D. J., and Smyth, G. K. (2010). \newblock {edgeR}: a bioconductor package for differential expression analysis of digital gene expression data. \newblock {\em Bioinformatics\/}, {\bf 26}(1), 139--140. \bibitem[Tarazona {\em et al.}(2011)Tarazona, Garc\'{i}a-Alcalde, Dopazo, Ferrer, and Conesa]{Tarazona2011} Tarazona, S., Garc\'{i}a-Alcalde, F., Dopazo, J., Ferrer, A., and Conesa, A. (2011). \newblock Differential expression in {RNA-seq}: A matter of depth. \newblock {\em Genome Research\/}, {\bf 8}. \bibitem[Wang {\em et al.}(2010)Wang, Feng, Wang, Wang, and Zhang]{Wang2010} Wang, L., Feng, Z., Wang, X., Wang, X., and Zhang, X. (2010). \newblock {DEGseq: an R package for identifying differentially expressed genes from RNA-seq data.} \newblock {\em Bioinformatics\/}, {\bf 26}(1), 136--138. \bibitem[Wu {\em et al.}(2012)Wu, Wang, and Wu]{Wu2012} Wu, H., Wang, C., and Wu, Z. (2012). \newblock A new shrinkage estimator for dispersion improves differential expression detection in rna-seq data. \newblock {\em Biostatistics\/}. \end{thebibliography} \end{document}