Contents

0.1 Preamble

# Load COTAN for the analysis and zeallot for multi-assignment via %<-%
library(COTAN)
library(zeallot)

# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")

# enable multi-processing (not on Windows)
# the previous options state is kept so it can be restored at the end
prevOptState <- options(parallelly.fork.enable = TRUE)

1 Introduction

This tutorial shows the available tools in COTAN to help with dataset cleaning

1.0.1 Setup

Define a directory where the data and the output will be stored.

# Use a session-specific temporary folder so the vignette leaves no residue
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)

print(dataDir)
#> [1] "/tmp/RtmpRABiog/COTAN_vignette_data"

# Outputs are written alongside the downloaded data
outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)
#> Setting new log level to 2

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_dataset_cleaning.log"))
#> Setting log file to be: /tmp/RtmpRABiog/COTAN_vignette_data/vignette_dataset_cleaning.log

1.1 Retrieving the data-set

Download the data-set for mouse cortex E17.5 from Yuzwa et al. (2017)

GEO <- "GSM2861514"

# Folder where the GEO supplementary files will be stored
dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")

# retries up to 5 times to get the dataset
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE

while (attempts < maxAttempts && !ok) {
  attempts <- attempts + 1L

  # Only attempt the download when the file is not already on disk
  if (!file.exists(datasetFileName)) {
    # try() keeps a transient network error from aborting the loop;
    # success is determined below by checking for the file on disk
    res <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }

  ok <- file.exists(datasetFileName)

  # Short pause before the next retry
  if (!ok && attempts < maxAttempts) {
    Sys.sleep(1)
  }
}
#> Setting options('download.file.method.GEOquery'='auto')
#> Setting options('GEOquery.inmemory.gpl'=FALSE)

# Abort with a clear message if the dataset could not be retrieved
assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '", datasetFileName,
    "' after ", maxAttempts, " attempts."
  )
)
#> [1] TRUE

# The DGE table is tab-separated with gene names in the first column
rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)

print(dim(rawDataset))
#> [1] 17085  2000

1.1.1 Create the COTAN object

Initialize the COTAN object with the row count table and the metadata from the experiment.

# Label describing the experimental condition of this sample
cond <- "mouse_cortex_E17.5"

# Build the COTAN object from the raw counts matrix
obj <- COTAN(raw = rawDataset)
obj <-
  initializeMetaDataset(
    obj,
    GEO = GEO,
    sequencingMethod = "Drop_seq",
    sampleCondition = cond
  )
#> Initializing `COTAN` meta-data

# Log the stored condition to confirm the meta-data was set correctly
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)
#> Condition mouse_cortex_E17.5

1.1.1.1 Alternative ways to create a COTAN object

It is also possible to create a COTAN object starting from a SingleCellExperiment object (and vice-versa).

Note that default patterns try to look-up in the most common locations…

# Round-trip demonstration: COTAN -> SingleCellExperiment -> COTAN
sce <- convertToSingleCellExperiment(obj)

# Empty patterns fall back to the default look-up locations
objFromSCE <-
  convertFromSingleCellExperiment(
    sce,
    assayName = "counts",   # the assay that contains the raw counts
    genesNamesPattern = "", # reg-exp to identify genes' names column
    cellsIDsPattern = "",   # reg-exp to identify cells' ID column
    clNamesPattern = "",    # reg-exp to identify clusterizations columns
    condNamesPattern = ""   # reg-exp to identify cells' conditions columns
  )

# The converted objects are not needed any further
rm(sce, objFromSCE)

1.1.2 Assign to each cell its origin

# mark cells origin
data(vignette.cells.origin)
head(vignette.cells.origin, 18)
#> GCGAATTGTGAA ACCGATAACTGA CCCCGGGTGCGA CTGTAGATGTTA TCATCGAAGCGC TTCTACCGAGTC 
#>        Other        Other        Other        Other        Other        Other 
#> CTGTTCCCGGCG GCGTGTTAGTTC CTCGCGCGTTTA GATGTATAACTT GCGCTATGATTT CGTTTAGTTTAC 
#>        Other        Other        Other        Other        Other     Cortical 
#> GTGGAGGCCCAT TGTCACTACATC TCTAGAACAACG ACCTTTGTTCGT TTGTCTTCTTCG TAAAATATCGCC 
#>        Other        Other     Cortical     Cortical     Cortical     Cortical 
#> Levels: Cortical Other

obj <- 
  addCondition(obj, condName = "origin", conditions = vignette.cells.origin)

rm(vignette.cells.origin)

# Sometimes the condition is encoded in the cells ID using some separator
# so it is useful to use the following function to retrieve it
if (FALSE) { # This is not the case with the dataset currently in use
  batch <- COTAN::conditionsFromNames(
    names = getCells(obj), splitPattern = "_", fragmentNum = 2L)
  names(batch) <- getCells(obj)
  obj <- addCondition(obj, condName = "batch", conditions = batch)
}

2 Data cleaning

Before proceeding with the analysis, it might be necessary to clean the data.

The analysis uses a matrix of raw UMI counts as the input. However the given raw data might not satisfy the needed minimal quality requirements: it is better to remove any potential cell doublets or multiplets, as well as any low quality or dying cells.

2.1 Plots from the raw counts

We can check the library size (UMI number) with an Empirical Cumulative Distribution function discriminating the cells by their origin

Please note that in the following plots it is not necessary to specify a condition

# Empirical Cumulative Distribution of library sizes, split by origin
plot(ECDPlot(obj, condName = "origin"))

Then we can also directly plot the distribution of the library size

# Distribution of the per-cell library size (total UMI counts)
plot(cellSizePlot(obj, condName = "origin"))

and we can plot the distribution of the number of detected genes in each cell, useful to identify possible doublets

# Distribution of the number of detected genes per cell
plot(genesSizePlot(obj, condName = "origin"))

As a last plot, we can check the scatter plot between the library size and the number of detected genes: cells far away from the main group are usually problematic

# Library size vs. number of detected genes, per cell
plot(scatterPlot(obj, condName = "origin"))

2.2 Dropping problematic cells

During the cleaning, every time we want to remove cells or genes we can use the dropGenesCells() function.

Drop cells with too many reads as they are probably doublets or multiplets and show the updated plot

# Keep track of the chosen library-size threshold inside the COTAN object
cellsSizeThr <- 6000L
obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr)

# Identify and drop cells whose total UMI count exceeds the threshold
tooLargeCells <- getCellsSize(obj) > cellsSizeThr
obj <- dropGenesCells(obj, cells = getCells(obj)[tooLargeCells])

plot(cellSizePlot(obj, condName = "origin"))

To drop cells by gene number: high genes count might also indicate doublets…

# Store the used threshold in the COTAN object for later reference
genesSizeHighThr <- 3000L
obj <-
  addElementToMetaDataset(obj, "Num genes high threshold", genesSizeHighThr)

# Cells expressing more genes than the threshold are putative doublets
cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

plot(genesSizePlot(obj, condName = "origin"))

Drop cells with too low genes expression as they are probably dead

# Store the used threshold in the COTAN object for later reference
genesSizeLowThr <- 500L
obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr)

# Cells expressing too few genes are likely dead or of low quality
cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

plot(genesSizePlot(obj, condName = "origin"))

Cells with a too high percentage of mitochondrial genes are likely dead (or at the least problematic) cells. So we drop them!

# Compute the mitochondrial plot and the per-cell percentages table;
# zeallot's %<-% unpacks the two returned values at once
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")

plot(mitPlot)

# Store the used threshold in the COTAN object for later reference
mitPercThr <- 3.5
obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr)

# Drop the cells whose mitochondrial percentage exceeds the threshold
cellsToRem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

# Re-compute the plot after the drop to verify the effect
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")

plot(mitPlot)

As a last plot, we can check the scatter plot between the library size and the number of detected genes: cells far away from the main group are usually problematic

# Re-check library size vs. detected genes after the drops above
plot(scatterPlot(obj, condName = "origin"))

We want also to log the current status.

# Log how many cells survived the cleaning so far
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
#> n cells 1793

# Inspect the dimensions of all the data slots of the COTAN object
getDims(obj)
#> $raw
#> [1] 17085  1793
#> 
#> $genesCoex
#> [1] 0 0
#> 
#> $cellsCoex
#> [1] 0 0
#> 
#> $metaDataset
#> [1] 10
#> 
#> $metaGenes
#> [1] 0
#> 
#> $metaCells
#> [1] 1
#> 
#> $clustersCoex
#> [1] 0

The clean() function estimates the nu parameters from the data and drops genes/cells that do not satisfy the minimal expression thresholds. Therefore, it needs to be run again every time any genes or cells are dropped from the data.

# Record that the "B" group of cells has not been dropped yet
obj <- addElementToMetaDataset(obj, "Num drop B group", 0L)

# Estimate nu and drop genes/cells below the minimal expression thresholds
obj <- clean(obj)
#> Genes/cells selection done: dropped [4324] genes and [0] cells
#> Working on [12761] genes and [1793] cells

# Unpack the six diagnostic plots/data objects produced by cleanPlots()
c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
#> Clean plots: START
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
#> Total calculations elapsed time: 2.7022819519043
#> Clean plots: DONE

plot(pcaCellsPlot)

The above plot shows a sub-group of cells B that is somewhat distinct from the other cells. With the plot below it is possible to see the most expressed genes in the cells of the B group

# Most expressed genes in the cells of the B group
plot(genesPlot)

To color the PCA based on nu (so the cells’ efficiency)

# PCA colored by nu (the cells' estimated efficiency)
plot(UDEPlot)

UDE (color) should not correlate with principal components! This is very important.

The next part is used to remove the cells with efficiency too low.

# Per-cell nu values, sorted, to spot low-efficiency cells
plot(nuPlot)

We can zoom on the smallest values and, if COTAN detects a clear elbow, we can decide to remove the cells.

# Zoom on the smallest nu values to look for an elbow
plot(zoomedNuPlot)

We can observe here that the B group cells are really enriched in hemoglobin genes so we prefer to remove them. They can be extracted from the pcaCellsData object and removed.

# Extract the IDs of the cells assigned to group "B" by the PCA clustering
cellsToRem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
obj <- dropGenesCells(obj, cells = cellsToRem)

# Store how many times the B group was dropped
obj <- addElementToMetaDataset(obj, "Num drop B group", 1L)

We also save the defined threshold in the metadata and re-run the estimation

# Threshold below which a cell's efficiency (nu) is deemed too low
udeLowThr <- 0.34
obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", udeLowThr)

# Re-estimate nu now that the B group cells are gone
obj <- estimateNuLinear(obj)

# Drop the cells whose re-estimated efficiency falls below the threshold
cellsToRem <- getCells(obj)[getNu(obj) < udeLowThr]
obj <- dropGenesCells(obj, cells = cellsToRem)

2.2.1 Check all good

We repeat all the plots now that the low quality cells are removed. In many practical cases the last few steps are repeated multiple times until deemed appropriate

# Re-draw all the diagnostic plots on the cleaned dataset
plot(ECDPlot(obj, condName = "origin"))


plot(cellSizePlot(obj, condName = "origin"))


plot(genesSizePlot(obj, condName = "origin"))


plot(scatterPlot(obj, condName = "origin"))


# Mitochondrial percentages should now be below the chosen threshold
c(mitPlot, mitSizes) %<-%
  mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin")

plot(mitPlot)

Then we rerun also the cleaning and associated plots and notice that all is essentially fine

# Re-run the cleaning after the last drops (see note above: clean() must
# be run again every time genes or cells are removed)
obj <- clean(obj)
#> Genes/cells selection done: dropped [6] genes and [0] cells
#> Working on [12755] genes and [1783] cells

# Unpack the diagnostic plots/data again for a final visual check
c(pcaCellsPlot, pcaCellsData, genesPlot,
  UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj)
#> Clean plots: START
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
#> Total calculations elapsed time: 2.55034136772156
#> Clean plots: DONE
plot(pcaCellsPlot)


plot(genesPlot)


plot(UDEPlot)


plot(nuPlot)

plot(zoomedNuPlot)


# It is also possible to directly plot the PCA results
plot(pcaCellsData)

Log the remaining number of cells

# Log the number of cells remaining after the full cleaning procedure
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
#> n cells 1783

# Tally the remaining cells by their origin
table(getCondition(obj, condName = "origin"))
#> 
#> Cortical    Other 
#>      844      939

2.3 Vignette clean-up stage

The next few lines are just to clean.

# Remove the files the vignette may have produced, if present
objFile <- file.path(outDir, paste0(cond, ".cotan.RDS"))
timesFile <- file.path(outDir, paste0(cond, "_times.csv"))
for (f in c(objFile, timesFile)) {
  if (file.exists(f)) {
    # delete file if it exists
    file.remove(f)
  }
}

# Remove the condition-named output directory, if present
condDir <- file.path(outDir, cond)
if (dir.exists(condDir)) {
  unlink(condDir, recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# stop logging to file
setLoggingFile("")
#> Closing previous log file - Setting log file to be:
file.remove(file.path(outDir, "vignette_dataset_cleaning.log"))
#> [1] TRUE

# Restore the options changed in the preamble and record the run time
options(prevOptState)
Sys.time()
#> [1] "2026-03-29 17:02:07 EDT"

# Record the package versions used to build this vignette
sessionInfo()
#> R Under development (unstable) (2026-03-05 r89546)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] zeallot_0.2.0    COTAN_2.11.4     BiocStyle_2.39.0
#> 
#> loaded via a namespace (and not attached):
#>   [1] RcppAnnoy_0.0.23            splines_4.6.0              
#>   [3] later_1.4.8                 tibble_3.3.1               
#>   [5] polyclip_1.10-7             XML_3.99-0.23              
#>   [7] fastDummies_1.7.5           httr2_1.2.2                
#>   [9] lifecycle_1.0.5             doParallel_1.0.17          
#>  [11] globals_0.19.1              lattice_0.22-9             
#>  [13] MASS_7.3-65                 ggdist_3.3.3               
#>  [15] dendextend_1.19.1           magrittr_2.0.4             
#>  [17] limma_3.67.0                plotly_4.12.0              
#>  [19] sass_0.4.10                 rmarkdown_2.31             
#>  [21] jquerylib_0.1.4             yaml_2.3.12                
#>  [23] httpuv_1.6.17               otel_0.2.0                 
#>  [25] Seurat_5.4.0                sctransform_0.4.3          
#>  [27] spam_2.11-3                 sp_2.2-1                   
#>  [29] spatstat.sparse_3.1-0       reticulate_1.45.0          
#>  [31] cowplot_1.2.0               pbapply_1.7-4              
#>  [33] RColorBrewer_1.1-3          abind_1.4-8                
#>  [35] Rtsne_0.17                  GenomicRanges_1.63.1       
#>  [37] purrr_1.2.1                 BiocGenerics_0.57.0        
#>  [39] rappdirs_0.3.4              circlize_0.4.17            
#>  [41] IRanges_2.45.0              S4Vectors_0.49.0           
#>  [43] ggrepel_0.9.8               irlba_2.3.7                
#>  [45] listenv_0.10.1              spatstat.utils_3.2-2       
#>  [47] rentrez_1.2.4               goftest_1.2-3              
#>  [49] RSpectra_0.16-2             spatstat.random_3.4-5      
#>  [51] fitdistrplus_1.2-6          parallelly_1.46.1          
#>  [53] codetools_0.2-20            DelayedArray_0.37.0        
#>  [55] xml2_1.5.2                  tidyselect_1.2.1           
#>  [57] shape_1.4.6.1               farver_2.1.2               
#>  [59] ScaledMatrix_1.19.0         viridis_0.6.5              
#>  [61] matrixStats_1.5.0           stats4_4.6.0               
#>  [63] spatstat.explore_3.8-0      Seqinfo_1.1.0              
#>  [65] jsonlite_2.0.0              GetoptLong_1.1.0           
#>  [67] progressr_0.18.0            ggridges_0.5.7             
#>  [69] survival_3.8-6              iterators_1.0.14           
#>  [71] systemfonts_1.3.2           foreach_1.5.2              
#>  [73] tools_4.6.0                 ragg_1.5.2                 
#>  [75] ica_1.0-3                   Rcpp_1.1.1                 
#>  [77] glue_1.8.0                  gridExtra_2.3              
#>  [79] SparseArray_1.11.11         xfun_0.57                  
#>  [81] distributional_0.7.0        MatrixGenerics_1.23.0      
#>  [83] ggthemes_5.2.0              dplyr_1.2.0                
#>  [85] withr_3.0.2                 BiocManager_1.30.27        
#>  [87] fastmap_1.2.0               digest_0.6.39              
#>  [89] rsvd_1.0.5                  parallelDist_0.2.7         
#>  [91] R6_2.6.1                    mime_0.13                  
#>  [93] textshaping_1.0.5           colorspace_2.1-2           
#>  [95] scattermore_1.2             tensor_1.5.1               
#>  [97] dichromat_2.0-0.1           spatstat.data_3.1-9        
#>  [99] tidyr_1.3.2                 generics_0.1.4             
#> [101] data.table_1.18.2.1         httr_1.4.8                 
#> [103] htmlwidgets_1.6.4           S4Arrays_1.11.1            
#> [105] uwot_0.2.4                  pkgconfig_2.0.3            
#> [107] gtable_0.3.6                ComplexHeatmap_2.27.1      
#> [109] lmtest_0.9-40               S7_0.2.1                   
#> [111] SingleCellExperiment_1.33.2 XVector_0.51.0             
#> [113] htmltools_0.5.9             dotCall64_1.2              
#> [115] bookdown_0.46               zigg_0.0.2                 
#> [117] clue_0.3-68                 SeuratObject_5.3.0         
#> [119] scales_1.4.0                Biobase_2.71.0             
#> [121] png_0.1-9                   spatstat.univar_3.1-7      
#> [123] knitr_1.51                  tzdb_0.5.0                 
#> [125] reshape2_1.4.5              rjson_0.2.23               
#> [127] curl_7.0.0                  nlme_3.1-169               
#> [129] proxy_0.4-29                cachem_1.1.0               
#> [131] zoo_1.8-15                  GlobalOptions_0.1.3        
#> [133] stringr_1.6.0               KernSmooth_2.23-26         
#> [135] parallel_4.6.0              miniUI_0.1.2               
#> [137] GEOquery_2.79.0             pillar_1.11.1              
#> [139] grid_4.6.0                  vctrs_0.7.1                
#> [141] RANN_2.6.2                  promises_1.5.0             
#> [143] BiocSingular_1.27.1         beachmat_2.27.3            
#> [145] xtable_1.8-8                cluster_2.1.8.2            
#> [147] evaluate_1.0.5              magick_2.9.1               
#> [149] tinytex_0.59                readr_2.2.0                
#> [151] cli_3.6.5                   compiler_4.6.0             
#> [153] rlang_1.1.7                 crayon_1.5.3               
#> [155] future.apply_1.20.2         labeling_0.4.3             
#> [157] plyr_1.8.9                  stringi_1.8.7              
#> [159] viridisLite_0.4.3           deldir_2.0-4               
#> [161] BiocParallel_1.45.0         assertthat_0.2.1           
#> [163] lazyeval_0.2.2              spatstat.geom_3.7-3        
#> [165] Matrix_1.7-5                RcppHNSW_0.6.0             
#> [167] hms_1.1.4                   patchwork_1.3.2            
#> [169] future_1.70.0               conflicted_1.2.0           
#> [171] ggplot2_4.0.2               statmod_1.5.1              
#> [173] shiny_1.13.0                SummarizedExperiment_1.41.1
#> [175] ROCR_1.0-12                 Rfast_2.1.5.2              
#> [177] memoise_2.0.1               igraph_2.2.2               
#> [179] RcppParallel_5.1.11-2       bslib_0.10.0