\pdfminorversion=4 % tell pdflatex to generate PDF in version 1.4
\documentclass[article, shortnames, nojss]{jss}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{enumerate}
\usepackage{multirow}
\DeclareMathOperator*{\argmax}{arg\,max}
% \VignetteIndexEntry{Massive and Integrative Gene Set Analysis}
% \VignetteKeyword{singular enrichment analysis}
% \VignetteKeyword{over representation analysis}
% \VignetteKeyword{gene set enrichment analysis}
% \VignetteKeyword{functional class scoring}
% \VignetteKeyword{big omics data}

% \VignetteDepends{BiocParallel}
% \VignetteDepends{edgeR}
% \VignetteDepends{limma}
% \VignetteDepends{mGSZ}
% \VignetteDepends{MIGSA}
% \VignetteDepends{MIGSAdata}

\author{Juan C Rodriguez\\CONICET\\Universidad Cat\'{o}lica de C\'{o}rdoba\\
Universidad Nacional de C\'{o}rdoba
\And Crist\'{o}bal Fresno\\Instituto Nacional de Medicina Gen\'{o}mica
\AND Andrea S Llera\\CONICET\\Fundaci\'{o}n Instituto
Leloir \And Elmer A Fern\'{a}ndez\\CONICET\\Universidad Cat\'{o}lica de
C\'{o}rdoba\\Universidad Nacional de C\'{o}rdoba}

\title{\pkg{MIGSA}: Massive and Integrative Gene Set Analysis}

%% for pretty printing and a nice hypersummary also set:
%% comma-separated
\Plainauthor{Juan C Rodriguez, Crist\'{o}bal Fresno, Andrea S Llera, Elmer A
    Fern\'{a}ndez}
%% without formatting
\Plaintitle{MIGSA: Massive and Integrative Gene Set Analysis}
%% a short title (if necessary)
\Shorttitle{\pkg{MIGSA}: Massive and Integrative Gene Set Analysis}

%% an abstract and keywords
\Abstract{
The \pkg{MIGSA} package allows to perform a massive and integrative gene set
analysis over several experiments and gene sets simultaneously. It provides a
common gene expression analytic framework that grants a comprehensive and
coherent analysis. Only a minimal user parameter setting is required to
perform both singular and gene set enrichment analyses in an integrative
manner by means of enhanced versions of the best available methods, i.e.
\pkg{dEnricher} and \pkg{mGSZ} respectively.
\\ One of the greatest strengths of this big omics data tool is the
availability of several functions to explore, analyze and visualize its results
in order to facilitate the data mining task over huge information sources.
\\ The MIGSA package also  allows to easily load the most updated gene sets
collections from several repositories.
}
\Keywords{singular enrichment analysis, over representation analysis, gene set
enrichment analysis, functional class scoring, big omics data, r package,
bioconductor}

%% without formatting
\Plainkeywords{singular enrichment analysis, over representation analysis,
gene set enrichment analysis, functional class scoring, big omics data}
%% at least one keyword must be supplied

%% The address of (at least) one author should be given
%% in the following format:
\Address{
Juan C Rodriguez \& Elmer A Fern\'{a}ndez\\
Bioscience Data Mining Group\\
Facultad de Ingenier\'{i}a\\
Universidad Cat\'{o}lica de C\'{o}rdoba - CONICET\\
X5016DHK C\'{o}rdoba, Argentina\\
E-mail: \email{jcrodriguez@bdmg.com.ar, efernandez@bdmg.com.ar}\\
URL: \url{http://www.bdmg.com.ar/}\\
}

\begin{document}
\SweaveOpts{concordance=TRUE}
% \SweaveOpts{concordance=FALSE}

\section{Introduction}
The functional analysis methodology allows researchers to gain biological
insight from a list of deregulated gene sets between experimental conditions
of interest. As suggested by \citep{rodriguez2016improving} both singular
enrichment analysis (SEA) and gene set enrichment analysis (GSEA) must be
performed over the same dataset in order to gain as much biological insight
as possible. This strategy is known as Integrative Functional Analysis (IFA)
and integrates into the same analysis with enhanced versions of the dEnricher
\citep{dEnricher} and mGSZ \citep{mGSZ} methods.
\par At present, there are several freely available datasets which provide
data over the same disease, characteristic of interest (e.g. survival), or
subjects studied over several different platforms. The Cancer Genome Atlas
(TCGA) among other projects makes possible the study and comparison in a
massive way of these datasets, not only among them but, also against our own
population of interest. This unprecedented opportunity allows researchers to
search for common functional patterns between these studies, or, more
interestingly, particular patterns of our experiment in question. However,
this type of approach has not been implemented in any existing tool yet,
leaving aside valuable biological information that might assist research
hypotheses.
\par Here, we present a Massive and Integrative Gene Set Analysis tool called
\pkg{MIGSA}. It allows to evaluate and compare, massively and transparently, a
large collection of datasets coming from diverse sources, maintaining the gene
set enrichment ideas of IFA and minimizing parameter settings. In addition, it
includes a gene ranking score alternative for RNAseq data by integrating the
\textit{Voom+Limma} methodological approach. It provides an enhanced version
of mGSZ (\texttt{MIGSAmGSZ}) faster than the default implementation, in order
to speed up even more its execution, \pkg{MIGSA} can be run using multicore
architectures. In this sense it can be applied over a large collection of
datasets on many gene sets in a fast way. Finally, \pkg{MIGSA} provides
several user-friendly methods to easily explore and visualize results at gene
set, dataset and individual gene level to aid researchers in their biological
hypothesis understanding.

\section{Preliminaries}

\subsection{Citing MIGSA}
\pkg{MIGSA} implements a body of methodological research by the authors and
co-workers. Citations are the main means by which the authors receive
professional credit for their work. The \pkg{MIGSA} package can be cited as:
\\
\\ Rodriguez JC, Merino GA, Llera AS, Fern\'{a}ndez EA (2019).
\\“Massive integrative gene set analysis enables functional characterization of
breast cancer subtypes.” \textit{Journal of Biomedical Informatics},
\textbf{93}, 103157.
\\
\\ Rodriguez JC, Gonz\'{a}lez GA, Fresno C, Llera AS, Fern\'{a}ndez EA (2016).
\\“Improving information retrieval in functional analysis.”
\textit{Computers in Biology and Medicine}, \textbf{79}, 10–20.

\subsection{Installation}
\pkg{MIGSA} is a package for the R computing environment and it is assumed that
you have already installed R. See the R project at http://www.r-project.org.
To install the latest version of \pkg{MIGSA}, you will need to be using the
latest version of R.
\\ \pkg{MIGSA} is part of the Bioconductor project at
http://www.bioconductor.org. (Prior to R 3.4).
\\To get \pkg{MIGSA} package you can type in an R session:
<<eval=FALSE>>=
## try http:// if https:// URLs are not supported
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("MIGSA")
@

\section{Gene sets}
\pkg{MIGSA} allows to perform the functional analysis of any type of gene
sets provided by the user. Such gene sets should be present as
GeneSetCollection objects from the \pkg{GSEABase} R library, in this section we
will give a brief introduction on how to construct such an object from our
own gene sets. In addition, the tools provided by \pkg{MIGSA} to automatically
load various collections of known gene sets will be presented.

\subsection{Sample GeneSetCollection creation}
Here we present a simple way to create a GeneSetCollection object from own
gene sets, for more detailed information please refer to the \pkg{GSEABase}
documentation.
\\For this example we are going to manually create the GeneSetCollection
object for the gene sets hsa00232, hsa00130 and hsa00785 from KEGG.
\\First, we will have to create each gene set separately, and then the
GeneSetCollection object.
<<>>=
library(GSEABase)
gs1 <- GeneSet(c("10", "1544", "1548", "1549", "1553", "7498", "9"),
  setName = "hsa00232",
  setIdentifier = "Caffeine metabolism"
)
gs1
gs2 <- GeneSet(c("10229", "27235", "3242", "51004", "51805", "6898", "84274"),
  setName = "hsa00130",
  setIdentifier = "Ubiquinone and other terpenoid-quinone biosynthesis"
)
gs3 <- GeneSet(c("11019", "387787", "51601"),
  setName = "hsa00785",
  setIdentifier = "Lipoic acid metabolism"
)
## And now construct the GeneSetCollection object.
gsetsColl <- GeneSetCollection(list(gs1, gs2, gs3))
gsetsColl
@

\subsection{MIGSA gene sets loading}
As mentioned above, \pkg{MIGSA} provides functions for automatically loading
known collections of gene sets. These functions are \texttt{loadGo} and
\texttt{downloadEnrichrGeneSets}, the first constructs the GeneSetCollection
object using the \pkg{org.Hs.eg.db} R package. Meanwhile,
\texttt{downloadEnrichrGeneSets} constructs the object by downloading the
gene sets from the Enrichr database
(http://amp.pharm.mssm.edu/Enrichr/\#stats). Enrichr gene set names can be
listed with the \texttt{enrichrGeneSets} function.

<<eval=FALSE>>=
## Not run:

## Load cellular component gene sets (another possibility would be "MF" or "BP")
ccGsets <- loadGo("CC")
# It is a GeneSetCollection object

## Load KEGG and Reactome gene sets
keggReact <- downloadEnrichrGeneSets(c("KEGG_2015", "Reactome_2015"))
## It is a list object containing two GeneSetCollection objects

## End(Not run)
@

\section{MIGSAmGSZ}

\subsection{mGSZ speedup}
As stated below, \pkg{MIGSA} provides the \texttt{MIGSAmGSZ} function, which
implements \textit{mGSZ} but running much faster. In order to test
\texttt{MIGSAmGSZ}'s correctness and speed up over \textit{mGSZ}, it was
evaluated using the TCGA's microarray breast cancer dataset. Basal vs. Luminal
A contrast was tested (16,207 genes x 237 subjects) over the Gene Ontology and
KEGG gene sets (20,425 gene sets).
\\This analysis was carried out using an Intel(R) Xeon(R) E5-2620 v3 @ 2.40GHz
(24 cores), 128 GB RAM. Different number of cores were used to analyze the
speed up.
\\Let's test it!
\\Note that we are using MulticoreParam as I am testing under Linux.

<<Speedup1, eval=TRUE>>=
library(BiocParallel)
library(mGSZ)
library(MIGSA)
library(MIGSAdata)
data(tcgaMAdata)
subtypes <- tcgaMAdata$subtypes
geneExpr <- tcgaMAdata$geneExpr
## MA data: filter genes with less than 30% of genes read per condition
dim(geneExpr)
geneExpr <- geneExpr[
  rowSums(is.na(geneExpr[, subtypes == "Basal" ])) <
    .3 * sum(subtypes == "Basal") &
    rowSums(is.na(geneExpr[, subtypes == "LumA" ])) <
      .3 * sum(subtypes == "LumA"),
]
dim(geneExpr)
@

<<Speedup2, eval=FALSE>>=
## Not run:

## Download GO and KEGG gene sets using MIGSA
gSets <- list(
  KEGG = downloadEnrichrGeneSets("KEGG_2015")[[1]],
  BP = loadGo("BP"),
  CC = loadGo("CC"),
  MF = loadGo("MF")
)
gSetsList <- do.call(c, lapply(gSets, MIGSA:::asList))
rm(gSets)
nCores <- c(1, 2, 4, 8, 10, 12, 14)
allRes <- lapply(nCores, function(actCores) {
  # setting in how many cores to run
  register(MulticoreParam(
    workers = actCores, threshold = "DEBUG",
    progressbar = TRUE
  ))

  set.seed(8818)
  newtimeSpent <- Sys.time()
  MIGSAmGSZres <- MIGSAmGSZ(geneExpr, gSetsList, subtypes)
  newtimeSpent <- Sys.time() - newtimeSpent

  res <- list(timeSpent = newtimeSpent, res = MIGSAmGSZres)

  return(res)
})

set.seed(8818)
timeSpent <- Sys.time()
mGSZres <- mGSZ(geneExpr, gSetsList, subtypes)
timeSpent <- Sys.time() - timeSpent
mGSZres <- mGSZres$mGSZ
## this tests that the returned values are equal, must give all TRUE
lapply(allRes, function(actRes) {
  actRes <- actRes$res
  actRes <- actRes[, 1:4]
  mergedRes <- merge(mGSZres, actRes,
    by = "gene.sets",
    suffixes = c("mGSZ", "MIGSAmGSZ")
  )

  all(unlist(lapply(2:4, function(x) {
    all.equal(mergedRes[, x], mergedRes[, x + 3])
  })))
})
## End(Not run)
@

<<Speedup3, eval=TRUE>>=
## As last chunk of code was not executed, we load that data:
library(MIGSAdata)
data(mGSZspeedup)
nCores <- mGSZspeedup$nCores
allRes <- mGSZspeedup$allRes
timeSpent <- mGSZspeedup$timeSpent
## End(Loading data)

newtimeSpent <- lapply(allRes, function(actRes) {
  actRes$timeSpent
})
names(newtimeSpent) <- nCores
speeduptable <- c(timeSpent, unlist(newtimeSpent))
names(speeduptable) <- c(1, nCores)
## Let's put all times in the same unit in order to measure speedup
newtimeSpent <- lapply(newtimeSpent, function(acttime) {
  units(acttime) <- "secs"
  return(acttime)
})
units(timeSpent) <- "secs"
speedup <- do.call(c, lapply(newtimeSpent, function(acttime) {
  as.numeric(timeSpent) / as.numeric(acttime)
}))
speeduptable <- rbind(speeduptable, c(1, speedup))
## calculate efficiency
speeduptable <- rbind(
  speeduptable,
  speeduptable[2, ] / as.numeric(colnames(speeduptable))
)
rownames(speeduptable) <- c("Runtime", "Speedup", "Efficiency")
round(speeduptable, 2)
@

As it can be seen in Table \ref{speedUpTable}, no matter the number of cores in
which \texttt{MIGSAmGSZ} was tested, it outperformed \textit{mGSZ}. Running in
one core, it has shown a speedup of 1.6X, reaching for a top of 10.8X speedup
with ten cores, giving the same results in 14 minutes in contrast to
\textit{mGSZ}'s 2.46 hours execution.

\begin{table}[!htpb]
    \caption{MIGSAmGSZ speedup}
    \centering
    \begin{tabular}{lcccccccc}
\hline
            & mGSZ & \multicolumn{7}{c}{MIGSAmGSZ} \\
\#cores     & 1 & 1 & 2 & 4 & 8 & 10 & 12 & 14 \\
Runtime    & 2.46h & 1.55h & 46.5m & 24.98m & 15.63m & 13.67m & 14.79m
& 28.43m \\
Speedup    & 1 & 1.58 & 3.18 & 5.91 & 9.45 & 10.81 & 9.98 & 5.19 \\
Efficiency & 1 & 1.58 & 1.59 & 1.48 & 1.18 & 1.08 & 0.83 & 0.37 \\ \hline
    \end{tabular}
    \label{speedUpTable}
\end{table}

\subsection{MIGSAmGSZ simple example}
Following, we show how to simply execute one \texttt{MIGSAmGSZ} analysis.
\\ In this example we will generate an expression matrix with 200 genes (ten
differentially expressed) and eight subjects (four of condition ``C1'' and four
of ``C2''), and 50 gene sets of ten genes each one.

<<MIGSAmGSZ example>>=
library(MIGSA)
## Let's create our gene expression matrix with 200 genes and 8 subjects
nSamples <- 8
# 8 subjects
nGenes <- 200
# 200 genes
geneNames <- paste("g", 1:nGenes, sep = "")
# with names g1 ... g200

## Create random gene expression data matrix.
set.seed(8818)
exprMatrix <- matrix(rnorm(nGenes * nSamples), ncol = nSamples)
## It must have rownames, as they will be treated as the gene names!
rownames(exprMatrix) <- geneNames
## There will be 10 differentially expressed genes.
nDeGenes <- 10
## Let's generate the offsets to sum to the differentially expressed genes.
deOffsets <- matrix(2 * abs(rnorm(nDeGenes * nSamples / 2)), ncol = nSamples / 2)
## Randomly select which are the DE genes.
deIndexes <- sample(1:nGenes, nDeGenes, replace = FALSE)
exprMatrix[deIndexes, 1:(nSamples / 2)] <-
  exprMatrix[deIndexes, 1:(nSamples / 2)] + deOffsets
## 4 subjects with condition C1 and 4 with C2.
conditions <- rep(c("C1", "C2"), c(nSamples / 2, nSamples / 2))
nGSets <- 50
# 50 gene sets
## Let's create randomly 50 gene sets, of 10 genes each
gSets <- lapply(1:nGSets, function(i) sample(geneNames, size = 10))
names(gSets) <- paste("set", as.character(1:nGSets), sep = "")
## with names set1 ... set50

## And simply execute MIGSAmGSZ
MIGSAmGSZres <- MIGSAmGSZ(exprMatrix, gSets, conditions)
## It is just a simple data.frame
head(MIGSAmGSZres)
@

\section{MIGSA simple example}
Following, we show how to simply execute one \textit{MIGSA} analysis.
\\ In this example we will generate two expression matrices with 300 genes (30
differentially expressed) and 16 subjects (8 of condition ``C1'' and 8 of
``C2''), and two sets of 30 gene sets of ten genes each one.

<<MIGSA example>>=
library(MIGSA)
## Let's simulate two expression matrices of 300 genes and 16 subjects.
nGenes <- 300
# 300 genes
nSamples <- 16
# 16 subjects
geneNames <- paste("g", 1:nGenes, sep = "")
# with names g1 ... g300

## Create the random gene expression data matrices.
set.seed(8818)
exprData1 <- matrix(rnorm(nGenes * nSamples), ncol = nSamples)
rownames(exprData1) <- geneNames
exprData2 <- matrix(rnorm(nGenes * nSamples), ncol = nSamples)
rownames(exprData2) <- geneNames
## There will be 30 differentially expressed genes.
nDeGenes <- nGenes / 10
## Let's generate the offsets to sum to the differentially expressed genes.
deOffsets <- matrix(2 * abs(rnorm(nDeGenes * nSamples / 2)), ncol = nSamples / 2)
## Randomly select which are the DE genes.
deIndexes1 <- sample(1:nGenes, nDeGenes, replace = FALSE)
exprData1[deIndexes1, 1:(nSamples / 2)] <-
  exprData1[deIndexes1, 1:(nSamples / 2)] + deOffsets
deIndexes2 <- sample(1:nGenes, nDeGenes, replace = FALSE)
exprData2[deIndexes2, 1:(nSamples / 2)] <-
  exprData2[deIndexes2, 1:(nSamples / 2)] + deOffsets
exprData1 <- new("MAList", list(M = exprData1))
exprData2 <- new("MAList", list(M = exprData2))
## 8 subjects with condition C1 and 8 with C2.
conditions <- rep(c("C1", "C2"), c(nSamples / 2, nSamples / 2))
fitOpts <- FitOptions(conditions)
nGSets <- 30
# 30 gene sets
## Let's create randomly 30 gene sets, of 10 genes each

gSets1 <- lapply(1:nGSets, function(i) sample(geneNames, size = 10))
names(gSets1) <- paste("set", as.character(1:nGSets), sep = "")
myGSs1 <- as.Genesets(gSets1)
gSets2 <- lapply(1:nGSets, function(i) sample(geneNames, size = 10))
names(gSets2) <- paste("set", as.character((nGSets + 1):(2 * nGSets)), sep = "")
myGSs2 <- as.Genesets(gSets2)
igsaInput1 <- IGSAinput(
  name = "igsaInput1", expr_data = exprData1,
  fit_options = fitOpts
)
igsaInput2 <- IGSAinput(
  name = "igsaInput2", expr_data = exprData2,
  fit_options = fitOpts
)
experiments <- list(igsaInput1, igsaInput2)
## As we did not set gene sets for each IGSAinput, then we will have to
## provide them in MIGSA function

## another way of generating the same MIGSA input would be setting the
## gene sets individually to each IGSAinput:
igsaInput1 <- IGSAinput(
  name = "igsaInput1", expr_data = exprData1,
  fit_options = fitOpts,
  gene_sets_list = list(myGeneSets1 = myGSs1, myGeneSets2 = myGSs2)
)
igsaInput2 <- IGSAinput(
  name = "igsaInput2", expr_data = exprData2,
  fit_options = fitOpts,
  gene_sets_list = list(myGeneSets1 = myGSs1, myGeneSets2 = myGSs2)
)
experiments <- list(igsaInput1, igsaInput2)
## And then simply run MIGSA
migsaRes <- MIGSA(experiments)
## migsaRes contains the p-values obtained in each experiment for each gene set
head(migsaRes)
@

<<>>=
## Other possible analyses:
## If we want some gene sets to be evaluated in just one IGSAinput we
## can do this:

## If we want to test myGSs1 in exprData1 and myGSs2 in exprData2:
igsaInput1 <- IGSAinput(
  name = "igsaInput1", expr_data = exprData1,
  fit_options = fitOpts, gene_sets_list = list(myGeneSets1 = myGSs1)
)
igsaInput2 <- IGSAinput(
  name = "igsaInput2", expr_data = exprData2,
  fit_options = fitOpts, gene_sets_list = list(myGeneSets2 = myGSs2)
)
experiments <- list(igsaInput1, igsaInput2)
## If we want to test myGSs1 in exprData1 and both in exprData2:
igsaInput1 <- IGSAinput(
  name = "igsaInput1", expr_data = exprData1,
  fit_options = fitOpts, gene_sets_list = list(myGeneSets1 = myGSs1)
)
igsaInput2 <- IGSAinput(
  name = "igsaInput2", expr_data = exprData2,
  fit_options = fitOpts,
  gene_sets_list = list(myGeneSets1 = myGSs1, myGeneSets2 = myGSs2)
)
experiments <- list(igsaInput1, igsaInput2)
## And this way, all possible combinations.
@

\section{MIGSA's utility}
In this section we are going to demonstrate \pkg{MIGSA}'s utility by analyzing
several well known breast cancer datasets. For each dataset, subjects were
classified into breast cancer intrinsic subtypes (Basal-Like, Her2-Enriched,
Luminal B, Luminal A and Normal-Like) using the PAM50 algorithm \citep{PAM50}
by means of the \pkg{pbcmc} R library \citep{pbcmc} and processed as suggested
by Sorlie et al. \citep{sorlie}. Only those subjects classified as Basal-Like
or Luminal A were included.
\\ Enrichment was tested over 20,245 Gene Ontology gene sets (14,291 biological
processes, 1,692 cellular components and 4,263 molecular functions), and 179
from KEGG.

\subsection{Used datasets}
A total of eight datasets were tested, six of them were loaded by means of the
\pkg{pbcmc} R library, i.e., Mainz, Nki, Transbig, Unt, Upp and Vdx); and two
were downloaded from the TCGA repository, i.e., microarray and RNAseq data
matrices.
For each dataset, genes reliably detected in less than 30\% of the samples per
condition were removed from the analysis. In addition, in RNAseq data, genes
with a mean less than 15 counts per condition were also removed.
Detailed datasets information can be seen in Table \ref{datasetsTable}.

\begin{table}[!htpb]
    \caption{Datasets details}
    \centering
    \begin{tabular}{lcccc}
\hline
Dataset  & Platform   & \multicolumn{2}{c}{Subjects} & Genes \\
        &            & Basal & Luminal A &        \\ \hline
Mainz    & Microarray & 18    & 117       & 13,091 \\
Nki      & Microarray & 66    & 100       & 12,975 \\
TCGA     & Microarray & 95    & 142       & 16,207 \\
TCGA     & RNAseq     & 95    & 142       & 16,741 \\
Transbig & Microarray & 37    & 89        & 13,091 \\
Unt      & Microarray & 22    & 42        & 18,528 \\
Upp      & Microarray & 19    & 150       & 18,528 \\
Vdx      & Microarray & 80    & 134       & 13,091 \\ \hline
Total    &      -     & 432   & 916       & -      \\ \hline
    \end{tabular}
    \label{datasetsTable}
\end{table}

\subsection{MIGSA on TCGA data}
\label{migsatcga}
Let's run MIGSA over the TCGA RNAseq and microarray datasets.
We are going to load both datasets using the \pkg{MIGSAdata} package, please
refer to the gettingTcgaData vignette for details about these matrices.
\\ \textbf{NOTE:} This chunk of code took 29.83m to execute on 10 cores.

<<tcga MIGSA1, eval=TRUE>>=
library(edgeR)
library(limma)
library(MIGSA)
library(MIGSAdata)
data(tcgaMAdata)
data(tcgaRNAseqData)
geneExpr <- tcgaMAdata$geneExpr
rnaSeq <- tcgaRNAseqData$rnaSeq
subtypes <- tcgaMAdata$subtypes
# or tcgaRNAseqData$subtypes; are the same
fitOpts <- FitOptions(subtypes)
## MA data: filter genes with less than 30% of genes read per condition
dim(geneExpr)
geneExpr <- geneExpr[
  rowSums(is.na(geneExpr[, subtypes == "Basal" ])) <
    .3 * sum(subtypes == "Basal") &
    rowSums(is.na(geneExpr[, subtypes == "LumA" ])) <
      .3 * sum(subtypes == "LumA"),
]
dim(geneExpr)
## create our IGSAinput object
geneExpr <- new("MAList", list(M = geneExpr))
geneExprIgsaInput <- IGSAinput(
  name = "tcgaMA",
  expr_data = geneExpr,
  fit_options = fitOpts,
  # with this treat we will get around 5% differentially expressed genes
  sea_params = SEAparams(treat_lfc = 1.05)
)
summary(geneExprIgsaInput)
## RNAseq data: filter genes with less than 30% of genes read per
## condition and (below)
dim(rnaSeq)
rnaSeq <- rnaSeq[
  rowSums(is.na(rnaSeq[, subtypes == "Basal" ])) <
    .3 * sum(subtypes == "Basal") &
    rowSums(is.na(rnaSeq[, subtypes == "LumA" ])) <
      .3 * sum(subtypes == "LumA"),
]
dim(rnaSeq)
## a mean less than 15 counts per condition.
rnaSeq <- rnaSeq[
  rowMeans(rnaSeq[, subtypes == "Basal" ], na.rm = TRUE) >= 15 &
    rowMeans(rnaSeq[, subtypes == "LumA"  ], na.rm = TRUE) >= 15,
]
dim(rnaSeq)
## create our IGSAinput object
rnaSeq <- DGEList(counts = rnaSeq)
rnaSeqIgsaInput <- IGSAinput(
  name = "tcgaRNA",
  expr_data = rnaSeq,
  fit_options = fitOpts,
  # with this treat we will get around 5% differentially expressed genes
  sea_params = SEAparams(treat_lfc = 1.45)
)
summary(rnaSeqIgsaInput)
experiments <- list(geneExprIgsaInput, rnaSeqIgsaInput)
@

<<tcga MIGSA2, eval=FALSE>>=
## Not run:

gSets <- list(
  KEGG = downloadEnrichrGeneSets("KEGG_2015")[[1]],
  BP = loadGo("BP"),
  CC = loadGo("CC"),
  MF = loadGo("MF")
)
set.seed(8818)
tcgaMigsaRes <- MIGSA(experiments, geneSets = gSets)
## Time difference of 29.83318 mins in 10 cores
## End(Not run)
@

\subsection{MIGSA on pbcmc datasets}
\label{migsapbcmc}
Let's run \textit{MIGSA} over the pbcmc microarray datasets.
We are going to load six datasets using the \pkg{MIGSAdata} package, please
refer to the gettingPbcmcData vignette for details on how we got this matrices.
\\ \textbf{NOTE:} This chunk of code took 1.27 hours to execute on 10 cores.

<<pbcmc MIGSA1, eval=TRUE>>=
library(limma)
library(MIGSA)
library(MIGSAdata)
data(pbcmcData)
## with these treat log fold change values we will get around 5% of
## differentially expressed genes for each experiment
treatLfcs <- c(0.7, 0.2, 0.6, 0.25, 0.4, 0.75)
names(treatLfcs) <- c("mainz", "nki", "transbig", "unt", "upp", "vdx")
experiments <- lapply(names(treatLfcs), function(actName) {
  actData <- pbcmcData[[actName]]
  actExprs <- actData$geneExpr
  actSubtypes <- actData$subtypes

  # filtrate genes with less than 30% per condition
  actExprs <- actExprs[
    rowSums(is.na(actExprs[, actSubtypes == "Basal" ])) <
      .3 * sum(actSubtypes == "Basal") &
      rowSums(is.na(actExprs[, actSubtypes == "LumA" ])) <
        .3 * sum(actSubtypes == "LumA"),
  ]

  # create our IGSAinput object
  actExprData <- new("MAList", list(M = actExprs))
  actFitOpts <- FitOptions(actSubtypes)
  actIgsaInput <- IGSAinput(
    name = actName,
    expr_data = actExprData,
    fit_options = actFitOpts,
    sea_params = SEAparams(treat_lfc = treatLfcs[[actName]])
  )
  return(actIgsaInput)
})
@

<<pbcmc MIGSA2, eval=FALSE>>=
## Not run:

gSets <- list(
  KEGG = downloadEnrichrGeneSets("KEGG_2015")[[1]],
  BP = loadGo("BP"),
  CC = loadGo("CC"),
  MF = loadGo("MF")
)
set.seed(8818)
pbcmcMigsaRes <- MIGSA(experiments, geneSets = gSets)
## Time difference of 1.26684 hours in 10 cores
## End(Not run)
@

\subsection{MIGSA exploring breast cancer enrichment results}
Let's start with the exploratory task. First, merge both MIGSAres objects
into one with all the datasets results.
\\ \textbf{NOTE:} In order to follow this code, sections \ref{migsatcga} and
\ref{migsapbcmc} must have been executed. If not, jump to the next ``End(Not
run)'' tag.

<<MIGSAres merge1, eval=FALSE>>=
## Not run:

dim(pbcmcMigsaRes)
# [1] 20425     9
dim(tcgaMigsaRes)
# [1] 20425     5

## Let's merge both results in one big MIGSAres object
bcMigsaRes <- merge(pbcmcMigsaRes, tcgaMigsaRes)
dim(bcMigsaRes)
# [1] 20425     11
## End(Not run)
@

<<>>=
## As last chunk of code was not executed, we load that data:
library(MIGSA)
library(MIGSAdata)
data(bcMigsaResAsList)
bcMigsaRes <- MIGSA:::MIGSAres.data.table(
  bcMigsaResAsList$dframe,
  bcMigsaResAsList$genesRank
)
rm(bcMigsaResAsList)
## End(Loading data)

## Let's see a summary of enriched gene sets at different cutoff values
summary(bcMigsaRes)
## We will set a cutoff of 0.01 (recommended)
## A gene set will be considered enriched if its p-value is < 0.01 on
## SEA or GSEA.
bcMigsaRes <- setEnrCutoff(bcMigsaRes, 0.01)
## The bcMigsaRes data object that is included in MIGSA package is the
## following:
# bcMigsaRes <- bcMigsaRes[1:200,];
@

Let's start exploring this MIGSA results object.

<<MIGSAres exploring1, eval=TRUE>>=
colnames(bcMigsaRes)
dim(bcMigsaRes)
summary(bcMigsaRes)
## We can see that 18,191 gene sets were not enriched, while 242 were
## enriched in every dataset.
## Moreover, there is a high consensus between datasets, with a maximum of 679
## enriched gene sets in common between upp and unt.
##
## Let's keep only gene sets enriched in at least one data set
bcMigsaRes <- bcMigsaRes[ rowSums(bcMigsaRes[, -(1:3)], na.rm = TRUE) > 0, ]
dim(bcMigsaRes)
@

<<echo=FALSE, results=hide>>=
# it is the same code as below, but saving the pdf to correctly load it in tex
pdf("bcMigsaResMigsaHeatmap.pdf")
aux <- migsaHeatmap(bcMigsaRes)
dev.off()
@

<<MIGSAres exploring2, eval=FALSE>>=
## Let's see enrichment heat map
## i.e. a heat map of binary data (enriched/not enriched)
aux <- migsaHeatmap(bcMigsaRes)
@

\includegraphics{bcMigsaResMigsaHeatmap.pdf}

<<MIGSAres exploring3, eval=TRUE>>=
## In this heat map we can see a high number of gene sets that are being
## enriched in consensus by most of the datasets. Let's explore them.
## We can obtain them (enriched in at least 80% of datasets) by doing
consensusGsets <- bcMigsaRes[ rowSums(bcMigsaRes[, -(1:3)], na.rm = TRUE)
> 6.4, ]
dim(consensusGsets)
## And let's see from which sets are them
table(consensusGsets$GS_Name)
@

<<echo=FALSE, results=hide>>=
# it is the same code as below, but saving the pdf to correctly load it in tex
pdf("bcMigsaResGenesHeatmap.pdf")
aux <- genesHeatmap(bcMigsaRes,
  enrFilter = 6.4, gsFilter = 70,
  dendrogram = "col"
)
dev.off()
@

<<MIGSAres exploring4, eval=FALSE>>=
## Moreover, let's see which are the genes that are mostly contributing
## to gene set enrichment (genes contributing in at least 70 gene sets)
## i.e. a heat map showing the number of datasets in which each gene (columns)
## contributed to enrich each gene set (rows).
aux <- genesHeatmap(bcMigsaRes,
  enrFilter = 6.4, gsFilter = 70,
  dendrogram = "col"
)
@

\includegraphics{bcMigsaResGenesHeatmap.pdf}

<<MIGSAres exploring5, eval=TRUE>>=
## Well, we could continue exploring them, however, at the first heat map we
## can see that TCGA datasets are defining a separate cluster, this is caused
## by a big group of gene sets that seem to be enriched mainly by TCGA.
## Let's explore them:
## (gene sets enriched by both TCGA datasets and in less than 20% of the other)
tcgaExclusive <- bcMigsaRes[
  rowSums(bcMigsaRes[, c("tcgaMA", "tcgaRNA")], na.rm = TRUE) == 2 &
    rowSums(bcMigsaRes[, c("mainz", "nki", "transbig", "unt", "upp", "vdx")],
      na.rm = TRUE
    ) < 1.2,
]
dim(tcgaExclusive)
table(tcgaExclusive$GS_Name)
## Let's see which is this KEGG enriched gene set
tcgaExclusive[ tcgaExclusive$GS_Name == "KEGG_2015", "id" ]
## Let's see in which depths of the GO tree are these gene sets
table(getHeights(
  tcgaExclusive[ tcgaExclusive$GS_Name != "KEGG_2015", "id", drop = TRUE]
))
## We can see that the most of the gene sets are between depths three and five
@

<<echo=FALSE,results=hide>>=
# it is the same code as below, but saving the pdf to correctly load it in tex
pdf("tcgaExclusiveMigsaGoTreeMF.pdf")
aux <- migsaGoTree(tcgaExclusive, ont = "MF")
dev.off()
@

<<MIGSAres exploring6_1, eval=FALSE>>=
## And plot the GO tree of the other gene sets (except of CC, as it
## has only three gene sets, and it will look bad)
aux <- migsaGoTree(tcgaExclusive, ont = "MF")
@

\includegraphics{tcgaExclusiveMigsaGoTreeMF.pdf}

<<echo=FALSE,results=hide>>=
# it is the same code as below, but saving the pdf to correctly load it in tex
pdf("tcgaExclusiveMigsaGoTreeBP.pdf")
aux <- migsaGoTree(tcgaExclusive, ont = "BP")
dev.off()
@

<<MIGSAres exploring6_2, eval=FALSE>>=
aux <- migsaGoTree(tcgaExclusive, ont = "BP")
@

\includegraphics{tcgaExclusiveMigsaGoTreeBP.pdf}

<<echo=FALSE,results=hide>>=
# it is the same code as below, but saving the pdf to correctly load it in tex
pdf("tcgaExclusiveGenesBarplot.pdf")
mostEnrichedGenes <- genesBarplot(tcgaExclusive, gsFilter = 12.45)
dev.off()
@

<<MIGSAres exploring6_3, eval=FALSE>>=
## Let's explore which are the genes that repeat the most in these
## gene sets (that are present in at least 15% of the gene sets)
## i.e. a bar plot of the number of gene sets in which each gene contributed to
## enrich.
mostEnrichedGenes <- genesBarplot(tcgaExclusive, gsFilter = 12.45)
@

\includegraphics{tcgaExclusiveGenesBarplot.pdf}

<<MIGSAres exploring7, eval=TRUE>>=
mostEnrichedGenes$data
## Gene 652 is contributing to enrichment in 15 gene sets. And in total
## there are 6 genes that are being really active in TCGA enriched
## gene sets
tcgaImportantGenes <- as.character(mostEnrichedGenes$data$id)
@

<<echo=FALSE,results=hide>>=
# it is the same code as below, but saving the pdf to correctly load it in tex
pdf("consensusGsetsGenesBarplot.pdf")
consMostEnrichedGenes <- genesBarplot(consensusGsets, gsFilter = 53.25)
dev.off()
@

<<MIGSAres exploring8, eval=FALSE>>=
## Let's do the same analysis for the rest of the datasets, so we can filtrate
## which genes are acting exclusively in TCGA datasets
consMostEnrichedGenes <- genesBarplot(consensusGsets, gsFilter = 53.25)
@

\includegraphics{consensusGsetsGenesBarplot.pdf}

<<MIGSAres exploring9, eval=TRUE>>=
consImportantGenes <- as.character(consMostEnrichedGenes$data$id)
## Let's see which genes they share
intersect(tcgaImportantGenes, consImportantGenes)
## And get the really tcga exclusive genes (5 genes)
tcgaExclGenes <- setdiff(tcgaImportantGenes, consImportantGenes)
@

Another way of exploring the data is for example, suppose we have a list of
genes of interest, we can filter our results having the gene sets that were
enriched by our interest genes as follows:

<<MIGSAres exploring10>>=
## Let's sample 4 genes from consImportantGenes (as if they are our interest
## genes)
set.seed(8818)
myInterestGenes <- sample(consImportantGenes, 4)
## So we can get the filtered MIGSAres object by doing:
intGenesMigsa <- filterByGenes(bcMigsaRes, myInterestGenes)
dim(intGenesMigsa)
head(intGenesMigsa)
@

And with this new MIGSAres object reproduce the same analysis done below.

\section*{Session Info}
<<Session Info, echo=true>>=
sessionInfo()
@

\bibliography{MIGSA}

\end{document}