## ----knitr options, echo = FALSE, include = FALSE----------------------------- plotDev <- if (requireNamespace("ragg", quietly = TRUE)) { "ragg_png" } else if (requireNamespace("Cairo", quietly = TRUE)) { "cairo_png" } else { "png" } knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7L, fig.height = 5L, dev = plotDev, fig.crop = FALSE, dpi = 100L ) ## ----preamble, message=FALSE, warning=FALSE----------------------------------- library(COTAN) library(zeallot) # necessary to solve precedence of overloads conflicted::conflict_prefer("%<-%", "zeallot") # enable multi-processing (not on Windows) prevOptState <- options(parallelly.fork.enable = TRUE) ## ----setup-and-log------------------------------------------------------------ dataDir <- file.path(tempdir(), "COTAN_vignette_data") dir.create(dataDir, recursive = TRUE, showWarnings = FALSE) print(dataDir) outDir <- dataDir # Log-level 2 was chosen to showcase better how the package works # In normal usage a level of 0 or 1 is more appropriate setLoggingLevel(2L) # This file will contain all the logs produced by the package # as if at the highest logging level setLoggingFile(file.path(outDir, "vignette_dataset_cleaning.log")) ## ----download-data------------------------------------------------------------ GEO <- "GSM2861514" dir.create(file.path(dataDir, GEO), showWarnings = FALSE) datasetFileName <- file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz") # retries up to 5 times to get the dataset attempts <- 0L maxAttempts <- 5L ok <- FALSE while (attempts < maxAttempts && !ok) { attempts <- attempts + 1L if (!file.exists(datasetFileName)) { res <- try( GEOquery::getGEOSuppFiles( GEO = GEO, makeDirectory = TRUE, baseDir = dataDir, filter_regex = base::basename(datasetFileName), fetch_files = TRUE ), silent = TRUE ) } ok <- file.exists(datasetFileName) if (!ok && attempts < maxAttempts) { Sys.sleep(1) } } assertthat::assert_that( ok, msg = paste0( "Failed to retrieve file '", datasetFileName, "' after ", maxAttempts, " attempts." ) ) rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L) print(dim(rawDataset)) ## ----create-cotan-object------------------------------------------------------ cond <- "mouse_cortex_E17.5" obj <- COTAN(raw = rawDataset) obj <- initializeMetaDataset( obj, GEO = GEO, sequencingMethod = "Drop_seq", sampleCondition = cond ) logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])), logLevel = 1L) ## ----conversions-to-and-from-sce---------------------------------------------- sceObj <- convertToSingleCellExperiment(obj) obj2 <- convertFromSingleCellExperiment( sceObj, assayName = "counts", # the assay that contains the raw counts genesNamesPattern = "", # reg-exp to identify genes' names column cellsIDsPattern = "", # reg-exp to identify cells' ID column clNamesPattern = "", # reg-exp to identify clusterizations columns condNamesPattern = "" # reg-exp to identify cells' conditions columns ) rm(sceObj, obj2) ## ----add-origin-to-the-cotan-object------------------------------------------- # mark cells origin data(vignette.cells.origin) head(vignette.cells.origin, 18) obj <- addCondition(obj, condName = "origin", conditions = vignette.cells.origin) rm(vignette.cells.origin) # Sometimes the condition is encoded in the cells ID using some separator # so it is useful to use the following function to retrieve it if (FALSE) { # This is not the case with the dataset currently in use batch <- COTAN::conditionsFromNames( names = getCells(obj), splitPattern = "_", fragmentNum = 2L) names(batch) <- getCells(obj) obj <- addCondition(obj, condName = "batch", conditions = batch) } ## ----ecd-plot----------------------------------------------------------------- plot(ECDPlot(obj, condName = "origin")) ## ----cells-size-plot---------------------------------------------------------- plot(cellSizePlot(obj, condName = "origin")) ## ----number-detected-genes---------------------------------------------------- plot(genesSizePlot(obj, condName = "origin")) ## ----scatter-plot------------------------------------------------------------- plot(scatterPlot(obj, condName = "origin")) ## ----cull-probable-doublets--------------------------------------------------- # Store the used threshold in the COTAN object for later reference cellsSizeThr <- 6000L obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr) cellsToRem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr] obj <- dropGenesCells(obj, cells = cellsToRem) plot(cellSizePlot(obj, condName = "origin")) ## ----cull-probable-doublets_2------------------------------------------------- # Store the used threshold in the COTAN object for later reference genesSizeHighThr <- 3000L obj <- addElementToMetaDataset(obj, "Num genes high threshold", genesSizeHighThr) cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr] obj <- dropGenesCells(obj, cells = cellsToRem) plot(genesSizePlot(obj, condName = "origin")) ## ----cull-probable-dead-cells------------------------------------------------- # Store the used threshold in the COTAN object for later reference genesSizeLowThr <- 500L obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr) cellsToRem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr] obj <- dropGenesCells(obj, cells = cellsToRem) plot(genesSizePlot(obj, condName = "origin")) ## ----calculate-mitochondrial-genes-percentage--------------------------------- c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin") plot(mitPlot) ## ----cull-probable-dead-cells_2----------------------------------------------- # Store the used threshold in the COTAN object for later reference mitPercThr <- 3.5 obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr) cellsToRem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr] obj <- dropGenesCells(obj, cells = cellsToRem) c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin") plot(mitPlot) ## ----scatter-plot_2----------------------------------------------------------- plot(scatterPlot(obj, condName = "origin")) ## ----log-current-dataset-status----------------------------------------------- logThis(paste("n cells", getNumCells(obj)), logLevel = 1L) getDims(obj) ## ----calculate-b-group-from-pca----------------------------------------------- obj <- addElementToMetaDataset(obj, "Num drop B group", 0L) obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) plot(pcaCellsPlot) ## ----show-most-expressed-genes-b-group---------------------------------------- plot(genesPlot) ## ----cells-efficiency-pca-plot------------------------------------------------ plot(UDEPlot) ## ----cells-efficiency-plot---------------------------------------------------- plot(nuPlot) ## ----cells-efficiency-plot_low-tail------------------------------------------- plot(zoomedNuPlot) ## ----cull-b-group-and-recalculate--------------------------------------------- cellsToRem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"] obj <- dropGenesCells(obj, cells = cellsToRem) # Store how many times the B group was dropped obj <- addElementToMetaDataset(obj, "Num drop B group", 1L) ## ----cull-low-efficiency-cells------------------------------------------------ udeLowThr <- 0.34 obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", udeLowThr) obj <- estimateNuLinear(obj) cellsToRem <- getCells(obj)[getNu(obj) < udeLowThr] obj <- dropGenesCells(obj, cells = cellsToRem) ## ----raw-plots---------------------------------------------------------------- plot(ECDPlot(obj, condName = "origin")) plot(cellSizePlot(obj, condName = "origin")) plot(genesSizePlot(obj, condName = "origin")) plot(scatterPlot(obj, condName = "origin")) c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^mt-", condName = "origin") plot(mitPlot) ## ----recalculate-clean-plots-------------------------------------------------- obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) ## ----show-clean-plots--------------------------------------------------------- plot(pcaCellsPlot) plot(genesPlot) plot(UDEPlot) plot(nuPlot) plot(zoomedNuPlot) # It is also possible to directly plot the PCA results plot(pcaCellsData) ## ----log-updated-dataset-status----------------------------------------------- logThis(paste("n cells", getNumCells(obj)), logLevel = 1L) table(getCondition(obj, condName = "origin")) ## ----final-clean-up----------------------------------------------------------- if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) { # delete file if it exists file.remove(file.path(outDir, paste0(cond, ".cotan.RDS"))) } if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) { # delete file if it exists file.remove(file.path(outDir, paste0(cond, "_times.csv"))) } if (dir.exists(file.path(outDir, cond))) { unlink(file.path(outDir, cond), recursive = TRUE) } # if (dir.exists(file.path(outDir, GEO))) { # unlink(file.path(outDir, GEO), recursive = TRUE) # } # stop logging to file setLoggingFile("") file.remove(file.path(outDir, "vignette_dataset_cleaning.log")) options(prevOptState) ## ----closure------------------------------------------------------------------ Sys.time() sessionInfo()