COTAN 2.11.4
library(COTAN)
library(zeallot)
# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")
# enable multi-processing (not on Windows)
prevOptState <- options(parallelly.fork.enable = TRUE)
This tutorial shows the available tools in COTAN to help with genes’ clustering
Define a directory where the data and the output will be stored
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)
print(dataDir)
#> [1] "/tmp/RtmpRABiog/COTAN_vignette_data"
outDir <- dataDir
# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)
#> Setting new log level to 2
# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_genes_clustering.log"))
#> Setting log file to be: /tmp/RtmpRABiog/COTAN_vignette_data/vignette_genes_clustering.log
Download the data-set for mouse cortex E17.5 from Yuzwa et al. (2017)
GEO <- "GSM2861514"
dir.create(file.path(dataDir, GEO), showWarnings = FALSE)
datasetFileName <-
file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")
# retries up to 5 times to get the dataset
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE
while (attempts < maxAttempts && !ok) {
attempts <- attempts + 1L
if (!file.exists(datasetFileName)) {
res <- try(
GEOquery::getGEOSuppFiles(
GEO = GEO,
makeDirectory = TRUE,
baseDir = dataDir,
filter_regex = base::basename(datasetFileName),
fetch_files = TRUE
),
silent = TRUE
)
}
ok <- file.exists(datasetFileName)
if (!ok && attempts < maxAttempts) {
Sys.sleep(1)
}
}
assertthat::assert_that(
ok,
msg = paste0(
"Failed to retrieve file '", datasetFileName,
"' after ", maxAttempts, " attempts."
)
)
#> [1] TRUE
rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)
print(dim(rawDataset))
#> [1] 17085 2000
Initialize the COTAN object with the row count table and
the metadata from the experiment.
cond <- "mouse_cortex_E17.5"
obj <- COTAN(raw = rawDataset)
obj <-
initializeMetaDataset(
obj,
GEO = GEO,
sequencingMethod = "Drop_seq",
sampleCondition = cond
)
#> Initializing `COTAN` meta-data
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
logLevel = 1L)
#> Condition mouse_cortex_E17.5
Assign to each cell its origin
# mark cells origin
data(vignette.cells.origin)
head(vignette.cells.origin, 18)
#> GCGAATTGTGAA ACCGATAACTGA CCCCGGGTGCGA CTGTAGATGTTA TCATCGAAGCGC TTCTACCGAGTC
#> Other Other Other Other Other Other
#> CTGTTCCCGGCG GCGTGTTAGTTC CTCGCGCGTTTA GATGTATAACTT GCGCTATGATTT CGTTTAGTTTAC
#> Other Other Other Other Other Cortical
#> GTGGAGGCCCAT TGTCACTACATC TCTAGAACAACG ACCTTTGTTCGT TTGTCTTCTTCG TAAAATATCGCC
#> Other Other Cortical Cortical Cortical Cortical
#> Levels: Cortical Other
obj <-
addCondition(obj, condName = "origin", conditions = vignette.cells.origin)
rm(vignette.cells.origin)
# use previously established results to determine
# which cells were dropped in the cleaning stage
data(vignette.split.clusters)
cellsToDrop <-
getCells(obj)[!(getCells(obj) %in% names(vignette.split.clusters))]
obj <- dropGenesCells(obj, cells = cellsToDrop)
# Log the remaining number of cells
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
#> n cells 1783
table(getCondition(obj, condName = "origin"))
#>
#> Cortical Other
#> 844 939
rm(vignette.split.clusters)
In this part, the COTAN model is calibrated:
all the contingency tables are computed and used to get
the statistics necessary to COEX evaluation and storing
obj <-
proceedToCoex(
obj,
calcCoex = TRUE,
optimizeForSpeed = TRUE,
cores = 3L,
deviceStr = "cuda"
)
#> COTAN dataset analysis: START
#> Genes/cells selection done: dropped [4330] genes and [0] cells
#> Working on [12755] genes and [1783] cells
#> Estimate `dispersion`: START
#> Total calculations elapsed time: 9.03410530090332
#> Estimate `dispersion`: DONE
#> Estimate `dispersion`: DONE
#> `dispersion` | min: -0.0349260648055935 | max: 760.574661197634 | % negative: 12.5362602900823
#> COTAN genes' COEX estimation: START
#> While trying to load the `torch` library Error in doTryCatch(return(expr), name, parentenv, handler): The `torch` library is installed but the required additional libraries are not available yet
#> Warning in value[[3L]](cond): The `torch` library is installed, but might
#> require further initialization
#> Warning in value[[3L]](cond): Please look at the `torch` package installation
#> guide to complete the installation
#> Warning in canUseTorch(optimizeForSpeed, deviceStr): Falling back to legacy
#> [non-torch] code.
#> Calculate genes' COEX (legacy): START
#> Warning in asMethod(object): sparse->dense coercion: allocating vector of size
#> 1.2 GiB
#> Total calculations elapsed time: 317.353862047195
#> Calculate genes' COEX (legacy): DONE
#> COTAN dataset analysis: DONE
COEXSometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes.
cat("Contingency Tables:")
#> Contingency Tables:
print(contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b"))
#> $observed
#> Satb2.yes Satb2.no
#> Bcl11b.yes 62 565
#> Bcl11b.no 308 848
#>
#> $expected
#> Satb2.yes Satb2.no
#> Bcl11b.yes 138.583 488.417
#> Bcl11b.no 231.417 924.583
cat(paste("Corresponding COEX value:",
getGenesCoex(obj, zeroDiagonal = FALSE)["Satb2", "Bcl11b"]))
#> Corresponding COEX value: -0.219641253268044
Another useful function is getGenesCoex(). This can be used to extract
the whole or a partial COEX matrix from a COTAN object.
# Extract the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1000L:1005L, 1000L:1005L]
#> 6 x 6 Matrix of class "dspMatrix"
#> Ankrd6 Ankrd61 Anks1 Anks1b Anks3
#> Ankrd6 0.919828628 0.006022968 0.039727285 0.019217926 0.022676686
#> Ankrd61 0.006022968 0.304643397 0.012120271 0.012826424 0.002818821
#> Anks1 0.039727285 0.012120271 0.907300176 -0.011094145 -0.006198863
#> Anks1b 0.019217926 0.012826424 -0.011094145 0.919554726 -0.017552746
#> Anks3 0.022676686 0.002818821 -0.006198863 -0.017552746 0.905659149
#> Anks6 0.014713461 -0.004394969 -0.021761229 -0.007003361 0.029217368
#> Anks6
#> Ankrd6 0.014713461
#> Ankrd61 -0.004394969
#> Anks1 -0.021761229
#> Anks1b -0.007003361
#> Anks3 0.029217368
#> Anks6 0.513438755
# For a partial matrix
coex_sm <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2"),
zeroDiagonal = FALSE)
coex_sm[2001:2010, ]
#> 10 x 3 Matrix of class "dgeMatrix"
#> Bcl11b Fezf2 Satb2
#> Ccdc91 -0.007340495 -0.015017365 0.0081678484
#> Ccdc92 0.006863524 -0.002183516 0.0121977478
#> Ccdc93 -0.009075446 0.032998076 -0.0185992406
#> Ccdc94 -0.025712653 -0.013185117 0.0153650072
#> Ccdc96 -0.028979603 -0.015681037 0.0484463876
#> Ccdc97 -0.029885047 -0.013649152 -0.0390277387
#> Cchcr1 -0.015829490 0.019683483 0.0005360145
#> Cck -0.040912312 0.049164448 0.0376753146
#> Ccl12 -0.030998730 -0.011044525 -0.0124730051
#> Ccl25 -0.037731486 -0.006071913 0.0028277194
GDITo calculate and store the Global Differentiation Index (GDI)
use the following:
gdiDF <- calculateGDI(obj, cores = 3L)
#> Calculate GDI dataframe: START
#> Calculate `GDI`: START
#> Total calculations elapsed time: 7.52655005455017
#> Calculate `GDI`: DONE
#> Total calculations elapsed time: 17.3810369968414
#> Calculate GDI dataframe: DONE
# Show the most *differentially expressed* genes
perm <- order(gdiDF[["GDI"]], decreasing = TRUE)
head(gdiDF[perm, ], n = 10)
#> sum.raw.norm GDI exp.cells
#> Mki67 2.400195 2.810177 6.393718
#> Hmgb2 2.669711 2.789769 10.095345
#> 2810417H13Rik 2.373125 2.773485 6.505889
#> Pbk 2.189592 2.716527 5.496354
#> Prc1 2.357143 2.699208 5.384184
#> Dbi 3.145560 2.687651 23.331464
#> Ccnd2 3.220094 2.687415 21.817162
#> Vim 2.711821 2.670697 14.638250
#> Mfge8 2.370640 2.664075 5.384184
#> Cdca8 2.260013 2.658388 6.169377
# This will store only the $GDI column
obj <- storeGDI(obj, genesGDI = gdiDF)
The other two columns return the sum of the normalized data rows and the percentage of cells expressing the gene respectively.
genesLists <- list(
# Neural Progenitor Genes
"NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
# Pan Neural Genes
"PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
# House Keeping
"hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
"Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
"Tars", "Amacr")
)
GDI PlotThe next function can either plot the GDI from the dataset directly or
use the already stored data.
It marks the given threshold 1.40 (in red) and
the 50% and 75% quantiles (in blue).
We can also specify some gene sets (three in this case) that
we want to label explicitly in the GDI plot.
GDIPlot(obj, genes = genesLists, GDIThreshold = 1.40)
#> GDI plot
#> Removed 0 low GDI genes (such as the fully-expressed) in GDI plot
In the picture above it is possible to see how, as expected,
the house-keeping genes (hk) have generally a low GDI value.
# saving the structure
saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
To perform the Gene Pair Analysis, we can plot a heatmap of the COEX values
between two gene sets.
We have to define the different gene sets (list.genes) in a list.
The first set will be used to define the columns;
then we can choose which sets to use in the function parameter sets
(for example, from 1 to 3).
plot(heatmapPlot(obj, genesLists = genesLists, sets = 1L:3L, cores = 3L))
#> Heatmap plot: START
#> Hangling COTAN object with condition: mouse_cortex_E17.5
#> calculating PValues: START
#> Get p-values on a set of genes on columns and on a set of genes on rows
#> Total calculations elapsed time: 4.87478017807007
#> calculating PValues: DONE
#> min COEX: -0.335360548131137 max COEX: 0.329177031378561
#> Total calculations elapsed time: 4.97292065620422
#> Heatmap plot: DONE
layersGenes <- list(
"L1" = c("Reln", "Lhx5"),
"L2/3" = c("Cux1", "Satb2"),
"L4" = c("Rorb", "Sox5"),
"L5/6" = c("Bcl11b", "Fezf2"),
"Prog" = c("Hes1", "Vim", "Dummy")
)
We can also plot a general heatmap of COEX values based on some markers like
the following one.
layersMarkers <- c("Satb2", "Bcl11b", "Vim", "Hes1")
genesHeatmapPlot(
obj,
primaryMarkers = layersMarkers,
pValueThreshold = 0.0001,
symmetric = TRUE,
cores = 3L
)
#> calculating PValues: START
#> Get p-values genome wide on columns and genome wide on rows
#> Total calculations elapsed time: 35.4846775531769
#> calculating PValues: DONE
genesHeatmapPlot(
obj,
primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
pValueThreshold = 0.001,
symmetric = FALSE,
cores = 3L
)
COTAN provides a way to establish genes’ clusters given some lists of markers
c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-%
establishGenesClusters(
obj,
groupMarkers = layersGenes,
numGenesPerMarker = 25L,
kCuts = 5L
)
#> Establishing gene clusters - START
#> Calculating gene co-expression space - START
#> calculating PValues: START
#> Get p-values on a set of genes on columns and genome wide on rows
#> Total calculations elapsed time: 7.20328378677368
#> calculating PValues: DONE
#> Number of selected secondary markers: 190
#> Calculate `GDI`: START
#> Total calculations elapsed time: 0.246947526931763
#> Calculate `GDI`: DONE
#> Total calculations elapsed time: 15.0532441139221
#> Calculating gene co-expression space - DONE
#> Total calculations elapsed time: 15.5592749118805
#> Establishing gene clusters - DONE
plot(eigPlot)
plot(treePlot)
UMAP plotcolSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L))
genesUmapPl <-
UMAPPlot(
pcaGenesClDF[, colSelection, drop = FALSE],
clusters = getColumnFromDF(pcaGenesClDF, "hclust"),
elements = layersGenes,
title = "Genes' clusters UMAP Plot",
numNeighbors = 32L,
minPointsDist = 0.25
)
#> UMAP plot: START
#> 17:10:31 UMAP embedding parameters a = 1.121 b = 1.057
#> 17:10:31 Read 1276 rows and found 12 numeric columns
#> 17:10:31 Using Annoy for neighbor search, n_neighbors = 32
#> 17:10:31 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 17:10:31 Writing NN index file to temp file /tmp/RtmpRABiog/file33d4397c52c4bf
#> 17:10:31 Searching Annoy index using 1 thread, search_k = 3200
#> 17:10:32 Annoy recall = 100%
#> 17:10:33 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 32
#> 17:10:35 Initializing from normalized Laplacian + noise (using RSpectra)
#> 17:10:35 Commencing optimization for 500 epochs, with 55474 positive edges
#> 17:10:35 Using rng type: pcg
#> 17:10:37 Optimization finished
#> Total calculations elapsed time: 6.26974511146545
#> UMAP plot: DONE
plot(genesUmapPl)
The next few lines are just to clean.
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
# delete file if it exists
file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
# delete file if it exists
file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
unlink(file.path(outDir, cond), recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
# unlink(file.path(outDir, GEO), recursive = TRUE)
# }
# stop logging to file
setLoggingFile("")
#> Closing previous log file - Setting log file to be:
file.remove(file.path(outDir, "vignette_genes_clustering.log"))
#> [1] TRUE
options(prevOptState)
Sys.time()
#> [1] "2026-03-29 17:10:38 EDT"
sessionInfo()
#> R Under development (unstable) (2026-03-05 r89546)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] zeallot_0.2.0 COTAN_2.11.4 BiocStyle_2.39.0
#>
#> loaded via a namespace (and not attached):
#> [1] matrixStats_1.5.0 spatstat.sparse_3.1-0
#> [3] httr_1.4.8 RColorBrewer_1.1-3
#> [5] doParallel_1.0.17 tools_4.6.0
#> [7] sctransform_0.4.3 R6_2.6.1
#> [9] lazyeval_0.2.2 uwot_0.2.4
#> [11] ggdist_3.3.3 GetoptLong_1.1.0
#> [13] withr_3.0.2 sp_2.2-1
#> [15] gridExtra_2.3 parallelDist_0.2.7
#> [17] progressr_0.18.0 cli_3.6.5
#> [19] Biobase_2.71.0 textshaping_1.0.5
#> [21] Cairo_1.7-0 spatstat.explore_3.8-0
#> [23] fastDummies_1.7.5 labeling_0.4.3
#> [25] sass_0.4.10 Seurat_5.4.0
#> [27] S7_0.2.1 spatstat.data_3.1-9
#> [29] readr_2.2.0 proxy_0.4-29
#> [31] ggridges_0.5.7 pbapply_1.7-4
#> [33] systemfonts_1.3.2 rentrez_1.2.4
#> [35] dichromat_2.0-0.1 parallelly_1.46.1
#> [37] limma_3.67.0 torch_0.16.3
#> [39] generics_0.1.4 shape_1.4.6.1
#> [41] ica_1.0-3 spatstat.random_3.4-5
#> [43] dplyr_1.2.0 distributional_0.7.0
#> [45] dendextend_1.19.1 Matrix_1.7-5
#> [47] S4Vectors_0.49.0 abind_1.4-8
#> [49] lifecycle_1.0.5 yaml_2.3.12
#> [51] SummarizedExperiment_1.41.1 SparseArray_1.11.11
#> [53] Rtsne_0.17 grid_4.6.0
#> [55] promises_1.5.0 crayon_1.5.3
#> [57] miniUI_0.1.2 lattice_0.22-9
#> [59] beachmat_2.27.3 cowplot_1.2.0
#> [61] magick_2.9.1 pillar_1.11.1
#> [63] knitr_1.51 ComplexHeatmap_2.27.1
#> [65] GenomicRanges_1.63.1 rjson_0.2.23
#> [67] future.apply_1.20.2 codetools_0.2-20
#> [69] glue_1.8.0 spatstat.univar_3.1-7
#> [71] data.table_1.18.2.1 vctrs_0.7.1
#> [73] png_0.1-9 spam_2.11-3
#> [75] gtable_0.3.6 assertthat_0.2.1
#> [77] cachem_1.1.0 zigg_0.0.2
#> [79] xfun_0.57 S4Arrays_1.11.1
#> [81] mime_0.13 Rfast_2.1.5.2
#> [83] Seqinfo_1.1.0 survival_3.8-6
#> [85] SingleCellExperiment_1.33.2 iterators_1.0.14
#> [87] tinytex_0.59 statmod_1.5.1
#> [89] fitdistrplus_1.2-6 ROCR_1.0-12
#> [91] nlme_3.1-169 bit64_4.6.0-1
#> [93] RcppAnnoy_0.0.23 bslib_0.10.0
#> [95] irlba_2.3.7 KernSmooth_2.23-26
#> [97] otel_0.2.0 colorspace_2.1-2
#> [99] BiocGenerics_0.57.0 tidyselect_1.2.1
#> [101] processx_3.8.6 bit_4.6.0
#> [103] compiler_4.6.0 curl_7.0.0
#> [105] httr2_1.2.2 xml2_1.5.2
#> [107] DelayedArray_0.37.0 plotly_4.12.0
#> [109] bookdown_0.46 scales_1.4.0
#> [111] lmtest_0.9-40 callr_3.7.6
#> [113] rappdirs_0.3.4 stringr_1.6.0
#> [115] digest_0.6.39 goftest_1.2-3
#> [117] spatstat.utils_3.2-2 rmarkdown_2.31
#> [119] GEOquery_2.79.0 XVector_0.51.0
#> [121] coro_1.1.0 htmltools_0.5.9
#> [123] pkgconfig_2.0.3 MatrixGenerics_1.23.0
#> [125] fastmap_1.2.0 rlang_1.1.7
#> [127] GlobalOptions_0.1.3 htmlwidgets_1.6.4
#> [129] ggthemes_5.2.0 shiny_1.13.0
#> [131] farver_2.1.2 jquerylib_0.1.4
#> [133] zoo_1.8-15 jsonlite_2.0.0
#> [135] BiocParallel_1.45.0 BiocSingular_1.27.1
#> [137] magrittr_2.0.4 dotCall64_1.2
#> [139] patchwork_1.3.2 Rcpp_1.1.1
#> [141] viridis_0.6.5 reticulate_1.45.0
#> [143] stringi_1.8.7 MASS_7.3-65
#> [145] plyr_1.8.9 parallel_4.6.0
#> [147] listenv_0.10.1 ggrepel_0.9.8
#> [149] deldir_2.0-4 splines_4.6.0
#> [151] tensor_1.5.1 hms_1.1.4
#> [153] circlize_0.4.17 ps_1.9.1
#> [155] igraph_2.2.2 spatstat.geom_3.7-3
#> [157] RcppHNSW_0.6.0 reshape2_1.4.5
#> [159] stats4_4.6.0 ScaledMatrix_1.19.0
#> [161] XML_3.99-0.23 evaluate_1.0.5
#> [163] SeuratObject_5.3.0 RcppParallel_5.1.11-2
#> [165] BiocManager_1.30.27 tzdb_0.5.0
#> [167] foreach_1.5.2 httpuv_1.6.17
#> [169] RANN_2.6.2 tidyr_1.3.2
#> [171] purrr_1.2.1 polyclip_1.10-7
#> [173] future_1.70.0 clue_0.3-68
#> [175] scattermore_1.2 ggplot2_4.0.2
#> [177] conflicted_1.2.0 rsvd_1.0.5
#> [179] xtable_1.8-8 RSpectra_0.16-2
#> [181] later_1.4.8 viridisLite_0.4.3
#> [183] ragg_1.5.2 tibble_3.3.1
#> [185] memoise_2.0.1 IRanges_2.45.0
#> [187] cluster_2.1.8.2 globals_0.19.1