COTAN 2.11.4
# Attach the main analysis package and zeallot (provides the `%<-%`
# multiple-assignment operator used to unpack plot lists below)
library(COTAN)
library(zeallot)
# necessary to solve precedence of overloads: ensure `%<-%` always
# dispatches to zeallot's operator, not to any other attached package's
conflicted::conflict_prefer("%<-%", "zeallot")
# enable multi-processing (not on Windows); the previous options state is
# captured so it can be restored at the end of the vignette
prevOptState <- options(parallelly.fork.enable = TRUE)
This tutorial shows the tools available in COTAN to help with dataset cleaning.
Define a directory where the data and the output will be stored.
# Scratch directory (under the session's temporary directory) that will hold
# both the downloaded data and the produced output
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
if (!dir.exists(dataDir)) {
  dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)
}
print(dataDir)
#> [1] "/tmp/RtmpRABiog/COTAN_vignette_data"
outDir <- dataDir
# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)
#> Setting new log level to 2
# This file will contain all the logs produced by the package
# as if at the highest logging level
# (the on-screen verbosity is still governed by the level set above)
setLoggingFile(file.path(outDir, "vignette_dataset_cleaning.log"))
#> Setting log file to be: /tmp/RtmpRABiog/COTAN_vignette_data/vignette_dataset_cleaning.log
Download the data-set for mouse cortex E17.5 from Yuzwa et al. (2017)
# GEO accession of the mouse cortex E17.5 sample
GEO <- "GSM2861514"
# Sub-directory of `dataDir` where the GEO supplementary file will land
dir.create(file.path(dataDir, GEO), showWarnings = FALSE)
datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")
# retries up to 5 times to get the dataset
maxAttempts <- 5L
attempts <- 0L
ok <- FALSE
# Retry the download until the file is on disk, up to `maxAttempts` times,
# pausing one second between attempts
for (attempt in seq_len(maxAttempts)) {
  attempts <- attempt
  if (!file.exists(datasetFileName)) {
    # Failures are tolerated here: success is determined solely by the
    # file.exists() check below, not by the try() result
    downloadOutcome <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }
  ok <- file.exists(datasetFileName)
  if (ok) {
    break
  }
  if (attempts < maxAttempts) {
    Sys.sleep(1)
  }
}
#> Setting options('download.file.method.GEOquery'='auto')
#> Setting options('GEOquery.inmemory.gpl'=FALSE)
# Abort the vignette with a clear message if the download never succeeded
failureMsg <- paste0(
  "Failed to retrieve file '", datasetFileName,
  "' after ", maxAttempts, " attempts."
)
assertthat::assert_that(ok, msg = failureMsg)
#> [1] TRUE
# The dataset is a tab-separated table: read.delim() is the purpose-built
# reader for TSV files (it defaults to sep = "\t" and header = TRUE, the
# exact settings read.csv was being overridden to). The first column holds
# the genes' names and becomes the row names.
rawDataset <- read.delim(datasetFileName, row.names = 1L)
print(dim(rawDataset))
#> [1] 17085 2000
Initialize the COTAN object with the row count table and
the metadata from the experiment.
# Label for this experimental condition, reused for output file names later
cond <- "mouse_cortex_E17.5"
# Create the COTAN object from the raw counts and immediately record the
# experiment's meta-data in it
obj <- initializeMetaDataset(
  COTAN(raw = rawDataset),
  GEO = GEO,
  sequencingMethod = "Drop_seq",
  sampleCondition = cond
)
#> Initializing `COTAN` meta-data
# Sanity check: log the condition as stored in the object's meta-data
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)
#> Condition mouse_cortex_E17.5
It is also possible to create a COTAN object starting from a
SingleCellExperiment object (and vice-versa).
Note that the default patterns try to look up the relevant columns in the most common locations…
# Demonstration of the round-trip conversion COTAN <-> SingleCellExperiment
sceObj <- convertToSingleCellExperiment(obj)
obj2 <- convertFromSingleCellExperiment(
sceObj,
assayName = "counts", # the assay that contains the raw counts
genesNamesPattern = "", # reg-exp to identify genes' names column
cellsIDsPattern = "", # reg-exp to identify cells' ID column
clNamesPattern = "", # reg-exp to identify clusterizations columns
condNamesPattern = "" # reg-exp to identify cells' conditions columns
)
# The converted objects are not used below: free the memory right away
rm(sceObj, obj2)
# mark cells origin: load the per-cell origin factor shipped with the package
data(vignette.cells.origin)
head(vignette.cells.origin, 18)
#> GCGAATTGTGAA ACCGATAACTGA CCCCGGGTGCGA CTGTAGATGTTA TCATCGAAGCGC TTCTACCGAGTC
#> Other Other Other Other Other Other
#> CTGTTCCCGGCG GCGTGTTAGTTC CTCGCGCGTTTA GATGTATAACTT GCGCTATGATTT CGTTTAGTTTAC
#> Other Other Other Other Other Cortical
#> GTGGAGGCCCAT TGTCACTACATC TCTAGAACAACG ACCTTTGTTCGT TTGTCTTCTTCG TAAAATATCGCC
#> Other Other Cortical Cortical Cortical Cortical
#> Levels: Cortical Other
# Attach the origin factor to the COTAN object as the "origin" condition;
# it is used to split all the QC plots below
obj <-
addCondition(obj, condName = "origin", conditions = vignette.cells.origin)
rm(vignette.cells.origin)
# Sometimes the condition is encoded in the cells ID using some separator
# so it is useful to use the following function to retrieve it
# (kept here as a reference snippet; never executed)
if (FALSE) { # This is not the case with the dataset currently in use
batch <- COTAN::conditionsFromNames(
names = getCells(obj), splitPattern = "_", fragmentNum = 2L)
names(batch) <- getCells(obj)
obj <- addCondition(obj, condName = "batch", conditions = batch)
}
Before proceeding with the analysis, it might be necessary to clean the data.
The analysis uses a matrix of raw UMI counts as the input.
However the given raw data might not satisfy the needed minimal quality
requirements: it is better to remove any potential cell doublets or
multiplets, as well as any low quality or dying cells.
We can check the library size (UMI number) with an Empirical Cumulative
Distribution function discriminating the cells by their origin
Please note that in the following plots it is not necessary to specify a condition
# Empirical Cumulative Distribution of the library sizes, split by origin
libSizeECD <- ECDPlot(obj, condName = "origin")
plot(libSizeECD)
Then we can also plot directly the distribution of the library size
# Distribution of the library size (UMI counts) per cell, split by origin
librarySizes <- cellSizePlot(obj, condName = "origin")
plot(librarySizes)
and we can plot the distribution of the number of detected genes in each cell, useful to identify possible doublets
# Distribution of the number of detected genes per cell, split by origin
detectedGenes <- genesSizePlot(obj, condName = "origin")
plot(detectedGenes)
As last plot we can check the scatter plot between the library size and the number of detected genes: cells far away from the main group are usually problematic
# Library size vs number of detected genes: outliers are usually problematic
sizeVsGenes <- scatterPlot(obj, condName = "origin")
plot(sizeVsGenes)
During the cleaning, every time we want to remove cells or genes
we can use the dropGenesCells() function.
Drop cells with too many reads as they are probably doublets or multiplets and show the updated plot
# Keep track of the applied threshold inside the COTAN object
cellsSizeThr <- 6000L
obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr)
# Flag the cells whose library size exceeds the threshold and drop them
oversizedCells <- getCellsSize(obj) > cellsSizeThr
cellsToRem <- getCells(obj)[oversizedCells]
obj <- dropGenesCells(obj, cells = cellsToRem)
# Updated library-size distribution after the drop
plot(cellSizePlot(obj, condName = "origin"))
To drop cells by gene number: high genes count might also indicate doublets…
# Keep track of the applied threshold inside the COTAN object
genesSizeHighThr <- 3000L
obj <- addElementToMetaDataset(obj, "Num genes high threshold", genesSizeHighThr)
# Cells expressing too many genes are suspect doublets/multiplets: drop them
tooManyGenes <- getNumExpressedGenes(obj) > genesSizeHighThr
cellsToRem <- getCells(obj)[tooManyGenes]
obj <- dropGenesCells(obj, cells = cellsToRem)
# Updated detected-genes distribution after the drop
plot(genesSizePlot(obj, condName = "origin"))
Drop cells with too low genes expression as they are probably dead
# Keep track of the applied threshold inside the COTAN object
genesSizeLowThr <- 500L
obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr)
# Cells expressing too few genes are likely dead: drop them
tooFewGenes <- getNumExpressedGenes(obj) < genesSizeLowThr
cellsToRem <- getCells(obj)[tooFewGenes]
obj <- dropGenesCells(obj, cells = cellsToRem)
# Updated detected-genes distribution after the drop
plot(genesSizePlot(obj, condName = "origin"))
Cells with a too high percentage of mitochondrial genes are likely dead (or at least problematic) cells. So we drop them!
# Mitochondrial genes are matched via the "^mt-" prefix; the helper returns
# both the plot and the per-cell percentages, unpacked via zeallot's %<-%
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")
plot(mitPlot)
# Keep track of the applied threshold inside the COTAN object
mitPercThr <- 3.5
obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr)
# Drop the cells whose mitochondrial percentage exceeds the threshold
highMitCells <- mitSizes[["mit.percentage"]] > mitPercThr
cellsToRem <- rownames(mitSizes)[highMitCells]
obj <- dropGenesCells(obj, cells = cellsToRem)
# Re-compute the percentages and re-draw the plot on the remaining cells
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")
plot(mitPlot)
As last plot we can check the scatter plot between the library size and the number of detected genes: cells far away from the main group are usually problematic
# Library size vs detected genes again, after the drops above
sizeVsGenesAfter <- scatterPlot(obj, condName = "origin")
plot(sizeVsGenesAfter)
We want also to log the current status.
# Log the number of cells surviving the cleaning steps so far
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
#> n cells 1793
# Overview of the sizes of all the slots of the COTAN object; the
# co-expression matrices are still empty at this stage of the analysis
getDims(obj)
#> $raw
#> [1] 17085 1793
#>
#> $genesCoex
#> [1] 0 0
#>
#> $cellsCoex
#> [1] 0 0
#>
#> $metaDataset
#> [1] 10
#>
#> $metaGenes
#> [1] 0
#>
#> $metaCells
#> [1] 1
#>
#> $clustersCoex
#> [1] 0
The clean() function estimates the nu parameters from the data and drops
genes/cells that do not satisfy the minimal expression thresholds.
Therefore, it needs to be run again every time any genes or cells are dropped
from the data.
# Initialize the drop counter before the first clean() pass
obj <- addElementToMetaDataset(obj, "Num drop B group", 0L)
obj <- clean(obj)
#> Genes/cells selection done: dropped [4324] genes and [0] cells
#> Working on [12761] genes and [1793] cells
# cleanPlots() returns several diagnostic plots plus the PCA data frame,
# unpacked at once via zeallot's %<-% operator
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
#> Clean plots: START
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
#> Total calculations elapsed time: 2.7022819519043
#> Clean plots: DONE
# PCA of the cells colored by the hierarchical-clustering groups
plot(pcaCellsPlot)
The above plot shows a sub-group of cells B that is somewhat distinct from
the other cells
With the plot below it is possible to see the most expressed genes
in the cells of the B group
# Most expressed genes within the cells of the B group
plot(genesPlot)
To color the PCA based on nu (so the cells’ efficiency)
# PCA colored by nu (the cells' estimated efficiency)
plot(UDEPlot)
UDE (color) should not correlate with principal components!
This is very important.
The next part is used to remove the cells with efficiency too low.
# Full distribution of the nu (efficiency) estimates
plot(nuPlot)
We can zoom on the smallest values and, if COTAN detects a clear elbow, we can decide to remove the cells.
# Zoom on the lowest nu values, looking for an elbow in the curve
plot(zoomedNuPlot)
We can observe here that the B group cells are really enriched in hemoglobin
genes so we prefer to remove them.
They can be extracted from the pcaCellsData object and removed.
# Cells assigned to group "B" by the hierarchical clustering above
bGroupCells <- pcaCellsData[["groups"]] == "B"
cellsToRem <- rownames(pcaCellsData)[bGroupCells]
obj <- dropGenesCells(obj, cells = cellsToRem)
# Store how many times the B group was dropped
obj <- addElementToMetaDataset(obj, "Num drop B group", 1L)
We also save the defined threshold in the metadata and re-run the estimation
# Keep track of the applied threshold inside the COTAN object
udeLowThr <- 0.34
obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", udeLowThr)
# Re-estimate nu after the B-group removal, then drop low-efficiency cells
obj <- estimateNuLinear(obj)
lowUDECells <- getNu(obj) < udeLowThr
cellsToRem <- getCells(obj)[lowUDECells]
obj <- dropGenesCells(obj, cells = cellsToRem)
We repeat all the plots now that the low quality cells are removed. In many practical cases the last few steps are repeated multiple times until deemed appropriate.
# Re-draw all the QC plots on the cleaned data-set, in the same order as above
ecdAfter <- ECDPlot(obj, condName = "origin")
plot(ecdAfter)
cellSizeAfter <- cellSizePlot(obj, condName = "origin")
plot(cellSizeAfter)
genesSizeAfter <- genesSizePlot(obj, condName = "origin")
plot(genesSizeAfter)
scatterAfter <- scatterPlot(obj, condName = "origin")
plot(scatterAfter)
# Mitochondrial percentages again, unpacked via zeallot's %<-%
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")
plot(mitPlot)
Then we rerun also the cleaning and associated plots and notice that all is essentially fine
# Re-run the cleaning after the last drops: only a handful of genes go away
obj <- clean(obj)
#> Genes/cells selection done: dropped [6] genes and [0] cells
#> Working on [12755] genes and [1783] cells
# Unpack the diagnostic plots again via zeallot's %<-%
c(pcaCellsPlot, pcaCellsData, genesPlot,
UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
#> Clean plots: START
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
#> Total calculations elapsed time: 2.55034136772156
#> Clean plots: DONE
plot(pcaCellsPlot)
plot(genesPlot)
plot(UDEPlot)
plot(nuPlot)
plot(zoomedNuPlot)
# It is also possible to directly plot the PCA results
plot(pcaCellsData)
Log the remaining number of cells
# Log the number of cells surviving the whole cleaning procedure
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
#> n cells 1783
# Break-down of the surviving cells by their recorded origin
table(getCondition(obj, condName = "origin"))
#>
#> Cortical Other
#> 844 939
The next few lines are just to clean.
# Remove any output files/directories possibly left over by previous runs,
# computing each path once
cotanFile <- file.path(outDir, paste0(cond, ".cotan.RDS"))
if (file.exists(cotanFile)) {
  # delete file if it exists
  file.remove(cotanFile)
}
timesFile <- file.path(outDir, paste0(cond, "_times.csv"))
if (file.exists(timesFile)) {
  # delete file if it exists
  file.remove(timesFile)
}
condDir <- file.path(outDir, cond)
if (dir.exists(condDir)) {
  unlink(condDir, recursive = TRUE)
}
# The downloaded GEO data is deliberately kept (uncomment to remove it too)
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }
# stop logging to file
setLoggingFile("")
#> Closing previous log file - Setting log file to be:
# Remove the now-closed log file as well
file.remove(file.path(outDir, "vignette_dataset_cleaning.log"))
#> [1] TRUE
# Restore the options captured at the top (undoes parallelly.fork.enable)
options(prevOptState)
# Record when the vignette was rendered and the full R session state
Sys.time()
#> [1] "2026-03-29 17:02:07 EDT"
sessionInfo()
#> R Under development (unstable) (2026-03-05 r89546)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] zeallot_0.2.0 COTAN_2.11.4 BiocStyle_2.39.0
#>
#> loaded via a namespace (and not attached):
#> [1] RcppAnnoy_0.0.23 splines_4.6.0
#> [3] later_1.4.8 tibble_3.3.1
#> [5] polyclip_1.10-7 XML_3.99-0.23
#> [7] fastDummies_1.7.5 httr2_1.2.2
#> [9] lifecycle_1.0.5 doParallel_1.0.17
#> [11] globals_0.19.1 lattice_0.22-9
#> [13] MASS_7.3-65 ggdist_3.3.3
#> [15] dendextend_1.19.1 magrittr_2.0.4
#> [17] limma_3.67.0 plotly_4.12.0
#> [19] sass_0.4.10 rmarkdown_2.31
#> [21] jquerylib_0.1.4 yaml_2.3.12
#> [23] httpuv_1.6.17 otel_0.2.0
#> [25] Seurat_5.4.0 sctransform_0.4.3
#> [27] spam_2.11-3 sp_2.2-1
#> [29] spatstat.sparse_3.1-0 reticulate_1.45.0
#> [31] cowplot_1.2.0 pbapply_1.7-4
#> [33] RColorBrewer_1.1-3 abind_1.4-8
#> [35] Rtsne_0.17 GenomicRanges_1.63.1
#> [37] purrr_1.2.1 BiocGenerics_0.57.0
#> [39] rappdirs_0.3.4 circlize_0.4.17
#> [41] IRanges_2.45.0 S4Vectors_0.49.0
#> [43] ggrepel_0.9.8 irlba_2.3.7
#> [45] listenv_0.10.1 spatstat.utils_3.2-2
#> [47] rentrez_1.2.4 goftest_1.2-3
#> [49] RSpectra_0.16-2 spatstat.random_3.4-5
#> [51] fitdistrplus_1.2-6 parallelly_1.46.1
#> [53] codetools_0.2-20 DelayedArray_0.37.0
#> [55] xml2_1.5.2 tidyselect_1.2.1
#> [57] shape_1.4.6.1 farver_2.1.2
#> [59] ScaledMatrix_1.19.0 viridis_0.6.5
#> [61] matrixStats_1.5.0 stats4_4.6.0
#> [63] spatstat.explore_3.8-0 Seqinfo_1.1.0
#> [65] jsonlite_2.0.0 GetoptLong_1.1.0
#> [67] progressr_0.18.0 ggridges_0.5.7
#> [69] survival_3.8-6 iterators_1.0.14
#> [71] systemfonts_1.3.2 foreach_1.5.2
#> [73] tools_4.6.0 ragg_1.5.2
#> [75] ica_1.0-3 Rcpp_1.1.1
#> [77] glue_1.8.0 gridExtra_2.3
#> [79] SparseArray_1.11.11 xfun_0.57
#> [81] distributional_0.7.0 MatrixGenerics_1.23.0
#> [83] ggthemes_5.2.0 dplyr_1.2.0
#> [85] withr_3.0.2 BiocManager_1.30.27
#> [87] fastmap_1.2.0 digest_0.6.39
#> [89] rsvd_1.0.5 parallelDist_0.2.7
#> [91] R6_2.6.1 mime_0.13
#> [93] textshaping_1.0.5 colorspace_2.1-2
#> [95] scattermore_1.2 tensor_1.5.1
#> [97] dichromat_2.0-0.1 spatstat.data_3.1-9
#> [99] tidyr_1.3.2 generics_0.1.4
#> [101] data.table_1.18.2.1 httr_1.4.8
#> [103] htmlwidgets_1.6.4 S4Arrays_1.11.1
#> [105] uwot_0.2.4 pkgconfig_2.0.3
#> [107] gtable_0.3.6 ComplexHeatmap_2.27.1
#> [109] lmtest_0.9-40 S7_0.2.1
#> [111] SingleCellExperiment_1.33.2 XVector_0.51.0
#> [113] htmltools_0.5.9 dotCall64_1.2
#> [115] bookdown_0.46 zigg_0.0.2
#> [117] clue_0.3-68 SeuratObject_5.3.0
#> [119] scales_1.4.0 Biobase_2.71.0
#> [121] png_0.1-9 spatstat.univar_3.1-7
#> [123] knitr_1.51 tzdb_0.5.0
#> [125] reshape2_1.4.5 rjson_0.2.23
#> [127] curl_7.0.0 nlme_3.1-169
#> [129] proxy_0.4-29 cachem_1.1.0
#> [131] zoo_1.8-15 GlobalOptions_0.1.3
#> [133] stringr_1.6.0 KernSmooth_2.23-26
#> [135] parallel_4.6.0 miniUI_0.1.2
#> [137] GEOquery_2.79.0 pillar_1.11.1
#> [139] grid_4.6.0 vctrs_0.7.1
#> [141] RANN_2.6.2 promises_1.5.0
#> [143] BiocSingular_1.27.1 beachmat_2.27.3
#> [145] xtable_1.8-8 cluster_2.1.8.2
#> [147] evaluate_1.0.5 magick_2.9.1
#> [149] tinytex_0.59 readr_2.2.0
#> [151] cli_3.6.5 compiler_4.6.0
#> [153] rlang_1.1.7 crayon_1.5.3
#> [155] future.apply_1.20.2 labeling_0.4.3
#> [157] plyr_1.8.9 stringi_1.8.7
#> [159] viridisLite_0.4.3 deldir_2.0-4
#> [161] BiocParallel_1.45.0 assertthat_0.2.1
#> [163] lazyeval_0.2.2 spatstat.geom_3.7-3
#> [165] Matrix_1.7-5 RcppHNSW_0.6.0
#> [167] hms_1.1.4 patchwork_1.3.2
#> [169] future_1.70.0 conflicted_1.2.0
#> [171] ggplot2_4.0.2 statmod_1.5.1
#> [173] shiny_1.13.0 SummarizedExperiment_1.41.1
#> [175] ROCR_1.0-12 Rfast_2.1.5.2
#> [177] memoise_2.0.1 igraph_2.2.2
#> [179] RcppParallel_5.1.11-2 bslib_0.10.0