\name{gsva} \alias{gsva} \alias{gsva,ExpressionSet,list,missing-method} \alias{gsva,ExpressionSet,GeneSetCollection,missing-method} \alias{gsva,matrix,GeneSetCollection,character-method} \alias{gsva,matrix,list,missing-method} \title{ Gene Set Variation Analysis } \description{ Estimates GSVA enrichment scores. } \usage{ \S4method{gsva}{ExpressionSet,list,missing}(expr, gset.idx.list, annotation=NULL, method=c("gsva", "ssgsea", "zscore"), rnaseq=FALSE, abs.ranking=FALSE, min.sz=1, max.sz=Inf, no.bootstraps=0, bootstrap.percent = .632, parallel.sz=0, parallel.type="SOCK", mx.diff=TRUE, tau=switch(method, gsva=1, ssgsea=0.25, NA), kernel=TRUE, verbose=TRUE) \S4method{gsva}{ExpressionSet,GeneSetCollection,missing}(expr, gset.idx.list, annotation=NULL, method=c("gsva", "ssgsea", "zscore"), rnaseq=FALSE, abs.ranking=FALSE, min.sz=1, max.sz=Inf, no.bootstraps=0, bootstrap.percent = .632, parallel.sz=0, parallel.type="SOCK", mx.diff=TRUE, tau=switch(method, gsva=1, ssgsea=0.25, NA), kernel=TRUE, verbose=TRUE) \S4method{gsva}{matrix,GeneSetCollection,character}(expr, gset.idx.list, annotation=NA, method=c("gsva", "ssgsea", "zscore"), rnaseq=FALSE, abs.ranking=FALSE, min.sz=1, max.sz=Inf, no.bootstraps=0, bootstrap.percent = .632, parallel.sz=0, parallel.type="SOCK", mx.diff=TRUE, tau=switch(method, gsva=1, ssgsea=0.25, NA), kernel=TRUE, verbose=TRUE) \S4method{gsva}{matrix,list,missing}(expr, gset.idx.list, annotation=NULL, method=c("gsva", "ssgsea", "zscore"), rnaseq=FALSE, abs.ranking=FALSE, min.sz=1, max.sz=Inf, no.bootstraps=0, bootstrap.percent = .632, parallel.sz=0, parallel.type="SOCK", mx.diff=TRUE, tau=switch(method, gsva=1, ssgsea=0.25, NA), kernel=TRUE, verbose=TRUE) } \arguments{ \item{expr}{Gene expression data which can be given either as an \code{ExpressionSet} object or as a matrix of expression values where rows correspond to genes and columns correspond to samples.} \item{gset.idx.list}{Gene sets provided either as a \code{list} object or as a \code{GeneSetCollection} object.} \item{annotation}{In the case of calling \code{gsva()} with expression data in a \code{matrix} and gene sets as a \code{GeneSetCollection} object, the \code{annotation} argument can be used to supply the name of the Bioconductor package that contains annotations for the class of gene identifiers occurring in the row names of the expression data matrix. By default \code{annotation=NULL} which implies that \code{gsva()} will try to match the identifiers in \code{expr} to the identifiers in \code{gset.idx.list} just as they are.} \item{method}{Method to employ in the estimation of gene-set enrichment scores per sample. By default this is set to \code{gsva} (H\"anzelmann et al, submitted) and other options are \code{ssgsea} (Barbie et al, 2009) and \code{zscore}. The latter standardizes expression profiles into z-scores and combines them together (Stouffer et al, 1949). This has been used in this context, for instance, by Lee et al. (2008).} \item{rnaseq}{Flag to inform whether the input gene expression data comes from microarray (\code{rnaseq=FALSE}, default) or RNA-Seq (\code{rnaseq=TRUE}) experiments.} \item{abs.ranking}{Flag to determine whether genes should be ranked according to their sign (\code{abs.ranking=FALSE}) or by absolute value (\code{abs.ranking=TRUE}). In the latter, pathways with genes enriched on either extreme (high or low) will be regarded as 'highly' activated. } \item{min.sz}{Minimum size of the resulting gene sets.} \item{max.sz}{Maximum size of the resulting gene sets.} \item{no.bootstraps}{Number of bootstrap iterations to perform.} \item{bootstrap.percent}{.632 is the ideal percent samples bootstrapped.} \item{parallel.sz}{Number of processors to use when doing the calculations in parallel. This requires to previously load either the \code{parallel} or the \code{snow} library. If \code{parallel} is loaded and this argument is left with its default value (\code{parallel.sz=0}) then it will use all available core processors unless we set this argument with a smaller number. If \code{snow} is loaded then we must set this argument to a positive integer number that specifies the number of processors to employ in the parallel calculation.} \item{parallel.type}{Type of cluster architecture when using \code{snow}.} \item{mx.diff}{Offers two approaches to calculate the enrichment statistic (ES) from the KS random walk statistic. \code{mx.diff=FALSE}: ES is calculated as the maximum distance of the random walk from 0. \code{mx.diff=TRUE} (default): ES is calculated as the magnitude difference between the largest positive and negative random walk deviations.} \item{tau}{Exponent defining the weight of the tail in the random walk performed by both the \code{gsva} (H\"anzelmann et al., submitted) and the \code{ssgsea} (Barbie et al., 2009) methods. By default, this \code{tau=1} when \code{method="gsva"} and \code{tau=0.25} when \code{method="ssgsea"} just as specified by Barbie et al. (2009) where this parameter is called \code{alpha}.} \item{kernel}{Logical, set to \code{TRUE} when the GSVA method employes a kernel non-parametric estimation of the empirical cumulative distribution function (default) and \code{FALSE} when this function is directly estimated from the observed data. This last option is justified in the limit of the size of the sample by the so-called Glivenko-Cantelli theorem.} \item{verbose}{Gives information about each calculation step. Default: \code{FALSE}.} } \details{ GSVA assesses the relative enrichment of gene sets across samples using a non-parametric approach. Conceptually, GSVA transforms a p-gene by n-sample gene expression matrix into a g-geneset by n-sample pathway enrichment matrix. This facilitates many forms of statistical analysis in the 'space' of pathways rather than genes, providing a higher level of interpretability. The \code{gsva()} function first maps the identifiers in the gene sets to the identifiers in the input expression data leading to a filtered collection of gene sets. This collection can be further filtered to require a minimun and/or maximum size of the gene sets for which we want to calculate GSVA enrichment scores, by using the arguments \code{min.sz} and \code{max.sz}. } \value{ A gene-set by sample matrix of GSVA enrichment scores. } \references{ Barbie, D.A. et al. Systematic RNA interference reveals that oncogenic KRAS-driven cancers require TBK1, \emph{Nature}, 462(5):108-112, 2009. H\"anzelmann, S., Castelo, R. and Guinney, J. GSVA: Gene Set Variation Analysis, \emph{submitted} Lee, E. et al. Inferring pathway activity toward precise disease classification, \emph{PLoS Comp Biol}, 4(11):e1000217, 2008. Stouffer, S.A. et al. The American Soldier, Vol.1: Adjustment during Army Life. Princeton University press, Princeton, 1949. } \author{J. Guinney and R. Castelo} \seealso{ \code{\link{filterGeneSets}} \code{\link{computeGeneSetsOverlap}} } \examples{ library(limma) p <- 10 ## number of genes n <- 30 ## number of samples nGrp1 <- 15 ## number of samples in group 1 nGrp2 <- n - nGrp1 ## number of samples in group 2 ## consider three disjoint gene sets geneSets <- list(set1=paste("g", 1:3, sep=""), set2=paste("g", 4:6, sep=""), set3=paste("g", 7:10, sep="")) ## sample data from a normal distribution with mean 0 and st.dev. 1 y <- matrix(rnorm(n*p), nrow=p, ncol=n, dimnames=list(paste("g", 1:p, sep="") , paste("s", 1:n, sep=""))) ## genes in set1 are expressed at higher levels in the last 10 samples y[geneSets$set1, (nGrp1+1):n] <- y[geneSets$set1, (nGrp1+1):n] + 2 ## build design matrix design <- cbind(sampleGroup1=1, sampleGroup2vs1=c(rep(0, nGrp1), rep(1, nGrp2))) ## fit linear model fit <- lmFit(y, design) ## estimate moderated t-statistics fit <- eBayes(fit) ## genes in set1 are differentially expressed topTable(fit, coef="sampleGroup2vs1") ## estimate GSVA enrichment scores for the three sets gsva_es <- gsva(y, geneSets, mx.diff=1)$es.obs ## fit the same linear model now to the GSVA enrichment scores fit <- lmFit(gsva_es, design) ## estimate moderated t-statistics fit <- eBayes(fit) ## set1 is differentially expressed topTable(fit, coef="sampleGroup2vs1") } \keyword{Pathway variation}