\name{gsva}
\alias{gsva}
\alias{gsva,ExpressionSet,list-method}
\alias{gsva,ExpressionSet,GeneSetCollection-method}
\alias{gsva,matrix,list-method}

\title{
Gene Set Variation Analysis
}
\description{
Estimates GSVA enrichment scores.
}
\usage{
\S4method{gsva}{ExpressionSet,list}(expr, gset.idx.list,
    abs.ranking=FALSE,
    min.sz=1,
    max.sz=Inf,
    no.bootstraps=0,
    bootstrap.percent = .632,
    parallel.sz=0,
    parallel.type="SOCK",
    verbose=TRUE,
    mx.diff=TRUE)
\S4method{gsva}{ExpressionSet,GeneSetCollection}(expr, gset.idx.list,
    abs.ranking=FALSE,
    min.sz=1,
    max.sz=Inf,
    no.bootstraps=0,
    bootstrap.percent = .632,
    parallel.sz=0,
    parallel.type="SOCK",
    verbose=TRUE,
    mx.diff=TRUE)
\S4method{gsva}{matrix,list}(expr, gset.idx.list,
    abs.ranking=FALSE,
    min.sz=1,
    max.sz=Inf,
    no.bootstraps=0,
    bootstrap.percent = .632,
    parallel.sz=0,
    parallel.type="SOCK",
    verbose=TRUE,
    mx.diff=TRUE)

}
\arguments{
  \item{expr}{Gene expression data which can be given either as an \code{ExpressionSet}
              object or as a matrix of expression values where rows correspond
              to genes and columns correspond to samples.}
  \item{gset.idx.list}{Gene sets provided either as a \code{list} object or as a
                       \code{GeneSetCollection} object.}
  \item{abs.ranking}{Flag to determine whether genes should be ranked according to
                     their sign (\code{abs.ranking=FALSE}) or by absolute value
                     (\code{abs.ranking=TRUE}). In the latter case, pathways with genes
                     enriched on either extreme (high or low) will be regarded as
                     'highly' activated.}
  \item{min.sz}{Minimum size of the resulting gene sets.}
  \item{max.sz}{Maximum size of the resulting gene sets.}
  \item{no.bootstraps}{Number of bootstrap iterations to perform.}
  \item{bootstrap.percent}{Percent of samples bootstrapped in each bootstrap iteration;
                           .632 is the ideal percent of samples bootstrapped.}
  \item{parallel.sz}{Number of processors to use when doing the calculations in parallel.
                     This requires previously loading either the \code{multicore} or the
                     \code{snow} library. If \code{multicore} is loaded and this argument
                     is left with its default value (\code{parallel.sz=0}) then it will use
                     all available core processors unless we set this argument with a
                     smaller number. If \code{snow} is loaded then we must set this argument
                     to a positive integer number that specifies the number of processors to
                     employ in the parallel calculation.}
  \item{parallel.type}{Type of cluster architecture when using \code{snow}.}
  \item{verbose}{Gives information about each calculation step. Default: \code{TRUE}.}
  \item{mx.diff}{Offers two approaches to calculate the enrichment statistic (ES)
                 from the KS random walk statistic. \code{mx.diff=FALSE}: ES is calculated as
                 the maximum distance of the random walk from 0. \code{mx.diff=TRUE} (default): ES
                 is calculated as the magnitude difference between the largest positive
                 and negative random walk deviations.
}
}

\details{
GSVA assesses the relative enrichment of gene sets across samples using
a non-parametric approach.  Conceptually, GSVA transforms a p-gene by n-sample
gene expression matrix into a g-geneset by n-sample pathway enrichment matrix.
This facilitates many forms of statistical analysis in the 'space' of pathways
rather than genes, providing a higher level of interpretability.

The \code{gsva()} function first maps the identifiers in the gene sets to the
identifiers in the input expression data leading to a filtered collection of
gene sets. This collection can be further filtered to require a minimum and/or
maximum size of the gene sets for which we want to calculate GSVA enrichment
scores, by using the arguments \code{min.sz} and \code{max.sz}.
}
\value{
A gene-set by sample matrix of GSVA enrichment scores.
}
\references{
H\"anzelmann, S., Castelo, R. and Guinney, J.
GSVA: Gene Set Variation Analysis, \emph{submitted}
}
\author{J. Guinney}
\seealso{
  \code{\link{filterGeneSets}}
  \code{\link{computeGeneSetsOverlap}}
}
\examples{

## Worked example: simulate a small expression matrix in which one gene set
## is shifted upwards in the second sample group, then test for differential
## expression first at the gene level and afterwards at the gene-set level
## using GSVA enrichment scores.

library(limma)

p <- 10 ## number of genes
n <- 30 ## number of samples
nGrp1 <- 15 ## number of samples in group 1
nGrp2 <- n - nGrp1 ## number of samples in group 2

## consider three disjoint gene sets
geneSets <- list(set1=paste("g", 1:3, sep=""),
                 set2=paste("g", 4:6, sep=""),
                 set3=paste("g", 7:10, sep=""))

## sample data from a standard normal distribution (mean 0, st.dev. 1);
## rows are genes g1..gp and columns are samples s1..sn
y <- matrix(rnorm(n*p), nrow=p, ncol=n,
            dimnames=list(paste("g", 1:p, sep="") , paste("s", 1:n, sep="")))

## genes in set1 are expressed at higher levels in the samples of group 2,
## i.e., the last nGrp2 (= 15) samples
y[geneSets$set1, (nGrp1+1):n] <- y[geneSets$set1, (nGrp1+1):n] + 2

## build design matrix: an intercept column and a group 2 vs group 1 indicator
design <- cbind(sampleGroup1=1, sampleGroup2vs1=c(rep(0, nGrp1), rep(1, nGrp2)))

## fit linear model
fit <- lmFit(y, design)

## estimate moderated t-statistics
fit <- eBayes(fit)

## genes in set1 are differentially expressed
topTable(fit, coef="sampleGroup2vs1")

## estimate GSVA enrichment scores for the three sets and keep the
## 'es.obs' component of the result; mx.diff is a logical flag, so it is
## given as TRUE rather than as a number
gsva_es <- gsva(y, geneSets, mx.diff=TRUE)$es.obs

## fit the same linear model now to the GSVA enrichment scores
fit <- lmFit(gsva_es, design)

## estimate moderated t-statistics
fit <- eBayes(fit)

## set1 is differentially expressed
topTable(fit, coef="sampleGroup2vs1")

}
\keyword{Pathway variation}