GSEAlens derives its name from Lens, symbolizing how this package acts as a magnifying glass to help researchers deeply explore key pathways in GSEA enrichment analysis.
GSEAlens provides a web-based interactive platform for displaying pathway introductions and descriptions, integrating AI-assisted pathway enrichment result export functionality. By encapsulating workflows and standardizing input formats, this R package simplifies the process of viewing and exploring GSEA enrichment analysis results.
Using the airway package as an example, this R package contains two groups with 4 replicates each.
##
# First check if airway is already installed
if (!requireNamespace("airway", quietly = TRUE)) {
# If not installed, check and install BiocManager first
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
# Install airway through BiocManager
BiocManager::install("airway")
}
library("airway")## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
## as.data.frame, basename, cbind, colnames, dirname, do.call,
## duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## I, expand.grid, unname
## Loading required package: IRanges
## Loading required package: Seqinfo
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
First, refer to the standard workflow: use the edgeR package to assemble a DGEList object, design a no-intercept matrix and rename row names, then create contrasts for fitting, filter low-expression genes, retain protein-coding RNAs, and use symbols as row names.
Note: GSEAlens is based on a no-intercept design limma-voom workflow (~0+group) to ensure that column names directly correspond to group names.
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(limma)
group_level <- airway@colData@listData[["dex"]]
design <- model.matrix(~0+group_level)
colnames(design) <- levels(group_level)
compare_end <- combn(levels(group_level), 2, simplify = FALSE)
contrast_strings <- sapply(compare_end, function(x) paste(x[2], x[1], sep = " - "))
contrast_matrix <- makeContrasts(contrasts = contrast_strings, levels = design)
genes_df <- data.frame(
gene_id = airway@rowRanges@elementMetadata@listData$gene_id,
symbol = airway@rowRanges@elementMetadata@listData$symbol,
gene_biotype = airway@rowRanges@elementMetadata@listData$gene_biotype
)
genes_df$Length <- airway@rowRanges@elementMetadata@listData$gene_seq_end -
airway@rowRanges@elementMetadata@listData$gene_seq_start + 1
gsea_limma_voom_data <- edgeR::DGEList(
counts = assay(airway, "counts"),
genes = genes_df,
norm.factors = NULL,
group = group_level,
remove.zeros = T
)## Removing 30208 rows with all zero counts
total_counts <- edgeR::cpm(gsea_limma_voom_data) |> rowSums()
dup_symbols <- gsea_limma_voom_data$genes$symbol[duplicated(gsea_limma_voom_data$genes$symbol)]
keep <- rep(TRUE, nrow(gsea_limma_voom_data))
for (gene in dup_symbols) {
idx <- which(gsea_limma_voom_data$genes$symbol == gene)
best_idx <- idx[which.max(total_counts[idx])]
remove_idx <- idx[idx != best_idx]
keep[remove_idx] <- FALSE
}
gsea_limma_voom_data <- gsea_limma_voom_data[keep, ]
rownames(gsea_limma_voom_data) <- gsea_limma_voom_data$genes$symbol
keep_biotype <- gsea_limma_voom_data$genes$gene_biotype == "protein_coding"
gsea_limma_voom_data <- gsea_limma_voom_data[keep_biotype,]
gsea_limma_voom_data <- edgeR::normLibSizes(gsea_limma_voom_data, method = "TMM")
isexpr <- rowSums(edgeR::cpm(gsea_limma_voom_data) > 1) >= 3
gsea_limma_voom_data <- gsea_limma_voom_data[isexpr,]Then perform limma-voom fitting to obtain the fit object.
Refer to the DESeq2 package documentation for data cleaning and processing. First, remove non-coding RNAs.
library("DESeq2")
dds_se <- DESeqDataSet(expression_data, design = ~ cell + dex)
gene_biotypes <- rowData(dds_se)$gene_biotype
keep_protein_coding <- gene_biotypes == "protein_coding"
dds_se <- dds_se[keep_protein_coding, ]
dds_se## class: DESeqDataSet
## dim: 22810 8
## metadata(2): '' version
## assays(1): counts
## rownames(22810): ENSG00000000003 ENSG00000000005 ... ENSG00000273482
## ENSG00000273490
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
Then remove low-expression genes and use symbols as row names.
smallestGroupSize <- 3
keep <- rowSums(counts(dds_se) >= 10) >= smallestGroupSize
dds_se <- dds_se[keep,]
dds_se## class: DESeqDataSet
## dim: 13256 8
## metadata(2): '' version
## assays(1): counts
## rownames(13256): ENSG00000000003 ENSG00000000419 ... ENSG00000272047
## ENSG00000272325
## rowData names(10): gene_id gene_name ... seq_coord_system symbol
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
rownames(dds_se) <- dds_se@rowRanges@elementMetadata@listData[["gene_name"]]
total_counts <- rowSums(SummarizedExperiment::assay(dds_se))
keep <- rep(TRUE, nrow(dds_se))
dup_genes <- rownames(dds_se)[duplicated(rownames(dds_se))]
for (gene in dup_genes) {
idx <- which(rownames(dds_se) == gene)
best_idx <- idx[which.max(total_counts[idx])]
remove_idx <- idx[idx != best_idx]
keep[remove_idx] <- FALSE
}
dds_se <- dds_se[keep, ]Run DESeq2 processing.
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Filter low-expression genes, retain only protein-coding RNAs, and rename matrix row names to symbols.
DDS_rawdata <- expression_data
gene_biotypes_epd <- rowData(DDS_rawdata)$gene_biotype
keep_protein_coding_epd <- gene_biotypes == "protein_coding"
DDS_rawdata <- DDS_rawdata[keep_protein_coding, ]
smallestGroupSize <- 3
keep_epd <- rowSums(assay(DDS_rawdata, "counts") >= 10) >= smallestGroupSize
DDS_rawdata <- DDS_rawdata[keep_epd,]
rownames(DDS_rawdata) <- DDS_rawdata@rowRanges@elementMetadata@listData[["gene_name"]]
total_counts <- rowSums(SummarizedExperiment::assay(DDS_rawdata))
keep_name <- rep(TRUE, nrow(DDS_rawdata))
dup_genes <- rownames(DDS_rawdata)[duplicated(rownames(DDS_rawdata))]
for (gene in dup_genes) {
idx <- which(rownames(DDS_rawdata) == gene)
best_idx <- idx[which.max(total_counts[idx])]
remove_idx <- idx[idx != best_idx]
keep_name[remove_idx] <- FALSE
}
DDS_rawdata <- DDS_rawdata[keep_name, ]Matrix information extraction.
cts <- assay(DDS_rawdata,"counts")
coldata <- DDS_rawdata@colData %>% as.data.frame.array()
coldata <- coldata[,c("cell","dex")]
coldata$cell <- factor(coldata$cell)
coldata$dex <- factor(coldata$dex)Assemble into DDS object and process.
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Use the build_gsea_pathways function to construct a pathway object for GSEA enrichment analysis.
## [build_gsea_pathways] Species: Homo sapiens (Human)
##
## Selected 5 collection(s). Batch Tag: [Mix5_HS_C5_GO_BP]
##
## [build_gsea_pathways] Done! Built 16333 pathways x 19676 genes for Homo sapiens (Human)
Through the setup_gsea_env function, assemble a GSEAEnv object for computational analysis. Different workflows use the same function with different data inputs. For the limma-voom workflow, since the fit object does not contain the original gene read counts, the filtered DGEList used to generate the fit object must be additionally provided. In this example, it is the multi-step filtered gsea_limma_voom_data.
Assemble the limma-voom workflow object for analysis.
gseadata_limmavoom <- GSEAlens::setup_gsea_env(fit = fit,pathway_obj = gsea_pathwaysets,expr_data = gsea_limma_voom_data)##
## Starting GSEAlens engine...
## Detected input type: [Limma-Voom]
## GseaEnv object built successfully!
## Contains 1 contrast groups
## Contains 16333 pathways
Assemble the DESeq2 SummarizedExperiment object for analysis.
##
## Starting GSEAlens engine...
## Detected input type: [DESeq2]
## [DESeq2] target_factor not specified, automatically inferred as: 'dex'
## GseaEnv object built successfully!
## Contains 1 contrast groups
## Contains 16333 pathways
Assemble the DESeq2 Count matrix workflow object for analysis.
##
## Starting GSEAlens engine...
## Detected input type: [DESeq2]
## [DESeq2] target_factor not specified, automatically inferred as: 'dex'
## GseaEnv object built successfully!
## Contains 1 contrast groups
## Contains 16333 pathways
All objects are processed using the batch_calc_gsea function with no differences.
Parallel computing note: Adjust the workers option based on your computer’s performance to set the number of cores for computation. More contrasts recommend higher core settings for better computational efficiency.
# limma-voom workflow
gsea_res_limmavoom <- GSEAlens::batch_calc_gsea(gseadata_limmavoom,
custom_series_name = "limmavoom_data",
workers = 4, # Adjust based on number of contrasts and computer performance
force = TRUE)
# DESeq2 SummarizedExperiment workflow
gsea_res_se <- GSEAlens::batch_calc_gsea(gseadata_se,
custom_series_name = "dds_se_data",
workers = 4,
force = TRUE)
# DESeq2 Count matrix workflow
gsea_res_dds <- GSEAlens::batch_calc_gsea(gseadata_dds,
custom_series_name = "dds_data",
workers = 4,
force = TRUE)After running batch_calc_gsea, an RDS file will be generated in the corresponding directory. Load this file or use import_gsea_capsule to read it. The import_gsea_capsule function can automatically organize relevant files into the folder where your RMD file or R script is located, and provide data inspection results.
gsea_res <- import_gsea_capsule("/path/to/your/files/")
# Or directly read the RDS file
# gsea_res <- readRDS("/path/of/your/file/")After loading the data, you can use launch_gsea_app for interactive viewing.
The GseaEnv object returned by the setup_gsea_env function contains the following components:
| Component | Description |
|---|---|
backend_info |
Backend type information (limma-voom or DESeq2) |
contrast_registry |
Contrast registry containing all pairwise comparison information |
de_store |
Differential expression analysis results storage |
expr_bundle |
Expression data bundle (raw counts, normalized matrix, sample metadata) |
geneset |
Geneset information (TERM2GENE, metadata dictionary, species) |
The GseaRes object returned by the batch_calc_gsea function contains the following components:
| Component | Description |
|---|---|
metadata |
Computation metadata (runtime, cores used, parameter settings) |
backend_info |
Backend type information |
contrast_registry |
Contrast registry |
de_store |
Differential expression analysis results storage |
expr_bundle |
Expression data bundle |
geneset_info |
Geneset information |
results |
GSEA results list, one entry per contrast |
The GseaTask object returned by the extract_gsea_task function is used for single-contrast analysis:
| Component | Description |
|---|---|
gsea_res |
GSEA result object |
meta |
Metadata (contrast information, geneset name, expression data) |
## R version 4.6.0 RC (2026-04-17 r89917)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] DESeq2_1.52.0 edgeR_4.10.0
## [3] limma_3.68.2 airway_1.32.0
## [5] SummarizedExperiment_1.42.0 Biobase_2.72.0
## [7] GenomicRanges_1.64.0 Seqinfo_1.2.0
## [9] IRanges_2.46.0 S4Vectors_0.50.1
## [11] BiocGenerics_0.58.1 generics_0.1.4
## [13] MatrixGenerics_1.24.0 matrixStats_1.5.0
## [15] GSEAlens_0.99.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.6.0 later_1.4.8 ggplotify_0.1.3
## [4] tibble_3.3.1 polyclip_1.10-7 enrichit_0.1.4
## [7] lifecycle_1.0.5 httr2_1.2.2 doParallel_1.0.17
## [10] globals_0.19.1 processx_3.9.0 lattice_0.22-9
## [13] MASS_7.3-65 magrittr_2.0.5 plotly_4.12.0
## [16] sass_0.4.10 rmarkdown_2.31 jquerylib_0.1.4
## [19] yaml_2.3.12 httpuv_1.6.17 otel_0.2.0
## [22] ggtangle_0.1.2 DBI_1.3.0 RColorBrewer_1.1-3
## [25] abind_1.4-8 purrr_1.2.2 msigdbr_26.1.0
## [28] yulab.utils_0.2.4 tweenr_2.0.3 rappdirs_0.3.4
## [31] aisdk_1.1.0 gdtools_0.5.0 circlize_0.4.18
## [34] enrichplot_1.32.0 ggrepel_0.9.8 listenv_0.10.1
## [37] tidytree_0.4.7 parallelly_1.47.0 codetools_0.2-20
## [40] DelayedArray_0.38.1 DOSE_4.6.0 DT_0.34.0
## [43] ggforce_0.5.0 tidyselect_1.2.1 shape_1.4.6.1
## [46] aplot_0.2.9 farver_2.1.2 jsonlite_2.0.0
## [49] GetoptLong_1.1.1 iterators_1.0.14 systemfonts_1.3.2
## [52] foreach_1.5.2 tools_4.6.0 ggnewscale_0.5.2
## [55] treeio_1.36.1 Rcpp_1.1.1-1.1 glue_1.8.1
## [58] SparseArray_1.12.2 xfun_0.57 qvalue_2.44.0
## [61] dplyr_1.2.1 withr_3.0.2 fastmap_1.2.0
## [64] callr_3.7.6 digest_0.6.39 R6_2.6.1
## [67] mime_0.13 gridGraphics_0.5-1 colorspace_2.1-2
## [70] GO.db_3.23.1 dichromat_2.0-0.1 RSQLite_3.52.0
## [73] tidyr_1.3.2 fontLiberation_0.1.0 data.table_1.18.4
## [76] httr_1.4.8 htmlwidgets_1.6.4 S4Arrays_1.12.0
## [79] scatterpie_0.2.6 pkgconfig_2.0.3 gtable_0.3.6
## [82] blob_1.3.0 ComplexHeatmap_2.28.0 S7_0.2.2
## [85] XVector_0.52.0 clusterProfiler_4.20.0 htmltools_0.5.9
## [88] fontBitstreamVera_0.1.1 clue_0.3-68 scales_1.4.0
## [91] png_0.1-9 ggfun_0.2.0 knitr_1.51
## [94] reshape2_1.4.5 rjson_0.2.23 nlme_3.1-169
## [97] curl_7.1.0 cachem_1.1.0 GlobalOptions_0.1.4
## [100] stringr_1.6.0 shinycssloaders_1.1.0 parallel_4.6.0
## [103] AnnotationDbi_1.74.0 pillar_1.11.1 grid_4.6.0
## [106] vctrs_0.7.3 promises_1.5.0 tidydr_0.0.6
## [109] xtable_1.8-8 cluster_2.1.8.2 evaluate_1.0.5
## [112] cli_3.6.6 locfit_1.5-9.12 compiler_4.6.0
## [115] rlang_1.2.0 crayon_1.5.3 future.apply_1.20.2
## [118] ps_1.9.3 plyr_1.8.9 fs_2.1.0
## [121] ggiraph_0.9.6 stringi_1.8.7 viridisLite_0.4.3
## [124] BiocParallel_1.46.0 assertthat_0.2.1 babelgene_22.9
## [127] Biostrings_2.80.0 lazyeval_0.2.3 GOSemSim_2.38.0
## [130] fontquiver_0.2.1 Matrix_1.7-5 patchwork_1.3.2
## [133] bit64_4.8.0 future_1.70.0 ggplot2_4.0.3
## [136] KEGGREST_1.52.0 statmod_1.5.2 shiny_1.13.0
## [139] clipr_0.8.0 igraph_2.3.1 memoise_2.0.1
## [142] bslib_0.11.0 ggtree_4.2.0 bit_4.6.0
## [145] ape_5.8-1 gson_0.1.0