## Purled R code of the COTAN "uniform clustering" vignette.
## Pipeline: download a GEO scRNA-seq dataset -> build a COTAN object ->
## drop low-quality cells -> estimate model parameters (proceedToCoex) ->
## (re)load pre-computed "split" and "merge" clusterizations -> plot
## summaries/heatmaps/UMAPs -> clean up temporary files.
## Chunks marked `eval=FALSE` in the Rmd appear below as commented-out code.

## ----knitr options, echo = FALSE, include = FALSE-----------------------------
# Pick the best available PNG device: ragg, then Cairo, then base grDevices.
# requireNamespace() is used (not library()) because these are optional deps.
# NOTE(review): knitr's Cairo-package PNG device is usually spelled
# "CairoPNG"; confirm "cairo_png" is a valid `dev` value for this knitr
# version before relying on the Cairo fallback.
plotDev <- if (requireNamespace("ragg", quietly = TRUE)) {
  "ragg_png"
} else if (requireNamespace("Cairo", quietly = TRUE)) {
  "cairo_png"
} else {
  "png"
}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7L,
  fig.height = 5L,
  dev = plotDev,
  fig.crop = FALSE,
  dpi = 100L
)

## ----preamble, message=FALSE, warning=FALSE-----------------------------------
library(COTAN)
library(zeallot)
# Necessary to solve precedence of overloads: make sure `%<-%` (multi-assign)
# dispatches to the zeallot implementation.
conflicted::conflict_prefer("%<-%", "zeallot")
# Enable multi-processing via forking (has no effect on Windows).
# The previous options state is kept so it can be restored in the clean-up.
prevOptState <- options(parallelly.fork.enable = TRUE)

## ----setup-and-log-------------------------------------------------------------
# All downloads, logs and intermediate outputs go under a temp directory.
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)
print(dataDir)
outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works.
# In normal usage a level of 0 or 1 is more appropriate.
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level (it is removed in the clean-up chunk).
setLoggingFile(file.path(outDir, "vignette_uniform_clustering.log"))

## ----download-data-------------------------------------------------------------
# Mouse cortex E17.5 Drop-seq dataset from GEO.
GEO <- "GSM2861514"
dir.create(file.path(dataDir, GEO), showWarnings = FALSE)
datasetFileName <- file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")

# Retries up to 5 times to get the dataset: GEO downloads are flaky, so the
# download is attempted in a loop and success is checked on the file system.
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE
while (attempts < maxAttempts && !ok) {
  attempts <- attempts + 1L
  if (!file.exists(datasetFileName)) {
    # try() swallows transient network errors so the loop can retry.
    # NOTE(review): `res` is assigned but never inspected; success is
    # detected solely via file.exists() below.
    res <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }
  ok <- file.exists(datasetFileName)
  if (!ok && attempts < maxAttempts) {
    # Brief pause before retrying to avoid hammering the server.
    Sys.sleep(1)
  }
}
# Abort the vignette with a clear message if the dataset never arrived.
assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '",
    datasetFileName, "' after ", maxAttempts, " attempts."
  )
)
# Tab-separated genes-by-cells count matrix; first column holds gene names.
rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)
print(dim(rawDataset))

## ----create-cotan-object--------------------------------------------------------
cond <- "mouse_cortex_E17.5"
obj <- COTAN(raw = rawDataset)
# Record dataset provenance in the object's meta-dataset.
obj <- initializeMetaDataset(
  obj,
  GEO = GEO,
  sequencingMethod = "Drop_seq",
  sampleCondition = cond
)
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])), logLevel = 1L)

## ----add-origin-to-the-cotan-object---------------------------------------------
# Mark cells origin: attach a per-cell "origin" condition shipped with COTAN.
data(vignette.cells.origin)
head(vignette.cells.origin, 18)
obj <- addCondition(obj, condName = "origin", conditions = vignette.cells.origin)
rm(vignette.cells.origin)

## ----drop-low-quality-cells-----------------------------------------------------
# Use previously established results to determine which cells were dropped in
# the cleaning stage: any cell absent from the stored clusterization is
# considered low quality and removed here.
data(vignette.split.clusters)
cellsToDrop <- getCells(obj)[!(getCells(obj) %in% names(vignette.split.clusters))]
obj <- dropGenesCells(obj, cells = cellsToDrop)
# Log the remaining number of cells.
logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)
table(getCondition(obj, condName = "origin"))
rm(vignette.split.clusters)

## ----calculate-and-store-coex-matrix--------------------------------------------
# Estimate the model parameters; the COEX matrix itself is not computed here
# (calcCoex = FALSE) as it is not needed by the following chunks.
# `deviceStr = "cuda"` silently falls back to CPU-side torch when unavailable.
obj <- proceedToCoex(
  obj,
  calcCoex = FALSE,
  optimizeForSpeed = TRUE,
  cores = 3L,
  deviceStr = "cuda"
)

## ----create-ut-clusterization, eval=FALSE, include=TRUE--------------------------
# # This code is too computationally heavy to be used in a vignette [~20 min.]
# # So the result was stored and will be re-loaded in the next chunk
#
# initialClusters <- NULL
#
# if (TRUE) {
#   # read from the last file among those named partial_clusterization_XX.csv
#   # (allows restarting an interrupted run from its saved intermediate output)
#   mergeDir <- file.path(outDir, cond, "reclustering")
#
#   resFiles <-
#     list.files(
#       path = mergeDir,
#       pattern = "partial_clusterization_.*csv",
#       full.names = TRUE
#     )
#
#   if (length(resFiles) != 0L) {
#     prevClustRes <-
#       read.csv(resFiles[length(resFiles)], header = TRUE, row.names = 1L)
#     initialClusters <- getColumnFromDF(prevClustRes, 1L)
#     # un-clustered cells are marked with the sentinel level "-1"
#     initialClusters[is.na(initialClusters)] <- "-1"
#     initialClusters <- niceFactorLevels(initialClusters)
#   }
# }
#
#
# # default constructed checker is OK
# advChecker <- new("AdvancedGDIUniformityCheck")
#
# c(splitClusters, splitCoexDF) %<-%
#   cellsUniformClustering(
#     obj,
#     initialResolution = 0.8,
#     checker = advChecker,
#     dataMethod = "LogLikelihood",  # of observed against expected data
#     useCoexEigen = TRUE,  # project on `COEX` eigenvectors instead of doing a `PCA`
#     numReducedComp = 50L,
#     minimumUTClusterSize = 50L,  # too large values imply many un-clustered cells
#     initialClusters = initialClusters,
#     initialIteration = 1L,
#     optimizeForSpeed = TRUE,
#     deviceStr = "cuda",
#     cores = 3L,
#     saveObj = TRUE,  # save intermediate data for log and restart
#     outDir = outDir
#   )
#
# # add the newly found clusterization to the `COTAN` object
# obj <-
#   addClusterization(
#     obj,
#     clName = "split",
#     clusters = splitClusters,
#     coexDF = splitCoexDF
#   )
#
# table(splitClusters)

## ----load-pre-calculated-clusterization_split------------------------------------
# Load the stored result of the (skipped) chunk above.
data("vignette.split.clusters", package = "COTAN")
splitClusters <- vignette.split.clusters
# Explicitly calculate the DEA of the clusterization to store it.
splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters)
obj <- addClusterization(
  obj,
  clName = "split",
  clusters = splitClusters,
  coexDF = splitCoexDF,
  override = FALSE
)

## ----extract-clusterization-summary_split----------------------------------------
# Per-cluster summary table + plot, broken down by the "origin" condition.
c(summaryData, summaryPlot) %<-% clustersSummaryPlot(
  obj,
  clName = "split",
  plotTitle = "split summary",
  condName = "origin"
)
head(summaryData, n = 10L)

## ----summary-as-plot--------------------------------------------------------------
plot(summaryPlot)

## ----define-layers-specific-genes--------------------------------------------------
# These are some marker genes associated with each cortical layer
# (plus "Prog" for progenitor cells); used by the heatmap chunks below.
layersGenes <- list(
  "L1" = c("Reln", "Lhx5"),
  "L2/3" = c("Cux1", "Satb2"),
  "L4" = c("Rorb", "Sox5"),
  "L5/6" = c("Bcl11b", "Fezf2"),
  "Prog" = c("Hes1", "Vim")
)

## ----clusterization-data-visualization_split---------------------------------------
c(splitHeatmap, splitScoreDF, splitPValueDF) %<-% clustersMarkersHeatmapPlot(
  obj,
  groupMarkers = layersGenes,
  clName = "split",
  kCuts = 2L,
  adjustmentMethod = "holm"
)
ComplexHeatmap::draw(splitHeatmap)

## ----clusterization-umap-plot-via-drm_split, eval=FALSE, include=TRUE--------------
# c(umapPlot, cellsRDM) %<-%
#   cellsUMAPPlot(
#     obj,
#     clName = "split",
#     clusters = NULL,
#     useCoexEigen = TRUE,  # Aligning these to `split` generation gives best res.
#     dataMethod = "LogLikelihood",  # "AdjBinarized" is also a good choice
#     numComp = 50L,  # number of components of the data reduced matrix
#     colors = NULL,  # a list of user defined colors for the various clusters
#     numNeighbors = 30L,  # UMAP related parameters
#     minPointsDist = 0.3  # UMAP related parameters
#   )
#
# plot(umapPlot)

## ----clusterization-umap-plot-via-drm_split_as-seurat------------------------------
# Same UMAP but using a Seurat-style pipeline: log-normalized data restricted
# to the top highly-variable genes, instead of the COEX eigen-projection.
c(umapPlot, cellsRDM) %<-% cellsUMAPPlot(
  obj,
  clName = "split",
  clusters = NULL,
  useCoexEigen = FALSE,
  dataMethod = "LogNormalized",
  numComp = 50L,  # number of components of the data reduced matrix
  genesSel = "HVG_Seurat",
  numGenes = 500L,
  colors = NULL,  # a list of user defined colors for the various clusters
  numNeighbors = 30L,  # UMAP related parameters
  minPointsDist = 0.3  # UMAP related parameters
)
plot(umapPlot)

## ----merge-clusters-if-ut, eval=FALSE, include=TRUE--------------------------------
# checkersList <-
#   list(
#     advChecker,
#     shiftCheckerThresholds(advChecker, 0.01),  # Sensible threshold shifts really
#     shiftCheckerThresholds(advChecker, 0.03)   # depend on the dataset
#   )
#
# prevCheckRes <- data.frame()
#
# # The following code shows the case when one wants to re-use the already
# # calculated checks from the saved intermediate outputs.
# # This can apply in cases one wants to add a new checker to the list above or
# # in the cases the execution is interrupted prematurely.
# #
# # Note that a similar facility exists for the `cellsUniformClustering()`
# # function above.
#
# if (TRUE) {
#   # read from the last file among those named all_check_results_XX.csv
#   mergeDir <- file.path(outDir, cond, "leafs_merge")
#
#   resFiles <-
#     list.files(
#       path = mergeDir,
#       pattern = "all_check_results_.*csv",
#       full.names = TRUE
#     )
#
#   if (length(resFiles) != 0L) {
#     prevCheckRes <-
#       read.csv(resFiles[length(resFiles)], header = TRUE, row.names = 1L)
#   }
# }
#
# c(mergedClusters, mergedCoexDF) %<-%
#   mergeUniformCellsClusters(
#     obj,
#     clusters = splitClusters,
#     checkers = checkersList,
#     allCheckResults = prevCheckRes,
#     optimizeForSpeed = TRUE,
#     deviceStr = "cuda",
#     cores = 6L,
#     useDEA = TRUE,
#     saveObj = TRUE,
#     outDir = outDir
#   )
#
# # add the newly found clusterization to the `COTAN` object
# obj <-
#   addClusterization(
#     obj,
#     clName = "merge",
#     override = TRUE,
#     clusters = mergedClusters,
#     coexDF = mergedCoexDF
#   )
#
# table(mergedClusters)

## ----load-pre-calculated-clusterization_merge--------------------------------------
# Load the stored result of the (skipped) merge chunk above; subset/reorder it
# to match the cells currently in the object.
data("vignette.merge.clusters", package = "COTAN")
mergedClusters <- vignette.merge.clusters[getCells(obj)]
# Explicitly calculate the DEA of the clusterization to store it.
mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters)
obj <- addClusterization(
  obj,
  clName = "merge",
  clusters = mergedClusters,
  coexDF = mergedCoexDF,
  override = FALSE
)
# This gives immediate visual information about which clusters were merged.
table(splitClusters, mergedClusters)

## ----extract-clusterization-summary_merge------------------------------------------
c(summaryData, summaryPlot) %<-% clustersSummaryPlot(
  obj,
  clName = "merge",
  plotTitle = "merge summary",
  condName = "origin"
)
head(summaryData, n = 10L)
plot(summaryPlot)

## ----clusterization-data-visualization_merge---------------------------------------
c(mergeHeatmap, mergeScoresDF, mergePValuesDF) %<-% clustersMarkersHeatmapPlot(
  obj,
  groupMarkers = layersGenes,
  clName = "merge",
  condNameList = list("origin")
)
ComplexHeatmap::draw(mergeHeatmap)

## ----clusterization-data-visualization_merge-vs-split------------------------------
# Heatmap of "merge" clusters annotated with the "split" clusterization,
# to visualize the merge mapping. `..` drops the remaining return values.
c(mergeVsSplitHeatmap, ..) %<-% clustersMarkersHeatmapPlot(
  obj,
  clName = "merge",
  condNameList = list("split"),
  conditionsList = list(splitClusters)
)
ComplexHeatmap::draw(mergeVsSplitHeatmap)

## ----clusterization-umap-plot-via-drm_merge, eval=FALSE, include=TRUE--------------
# c(umapPlot, .) %<-%
#   cellsUMAPPlot(
#     obj,
#     clName = "merge",
#     clusters = NULL,
#     useCoexEigen = TRUE,
#     dataMethod = "LogLikelihood",  # "AdjBinarized" is also a good choice
#     numComp = 50L,  # number of components of the data reduced matrix
#     colors = NULL,  # a list of user defined colors for the various clusters
#     numNeighbors = 30L,  # UMAP related parameters
#     minPointsDist = 0.3  # UMAP related parameters
#   )
#
# plot(umapPlot)

## ----clusterization-umap-plot-via-drm_merge_as-seurat------------------------------
# Scanpy-style highly-variable-genes selection this time; `.` skips the
# second return value (the reduced data matrix is not needed here).
c(umapPlot, .) %<-% cellsUMAPPlot(
  obj,
  clName = "merge",
  clusters = NULL,
  useCoexEigen = FALSE,
  dataMethod = "LogNormalized",  # "AdjBinarized" is also a good choice
  numComp = 50L,  # number of components of the data reduced matrix
  genesSel = "HVG_Scanpy",
  numGenes = 500L,
  colors = NULL,  # a list of user defined colors for the various clusters
  numNeighbors = 30L,  # UMAP related parameters
  minPointsDist = 0.3  # UMAP related parameters
)
plot(umapPlot)

## ----clean-up------------------------------------------------------------------
# Remove files/directories produced by the pipeline under the temp dir.
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}

if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}

if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}

# Intentionally left disabled: keeping the downloaded GEO data avoids
# re-downloading on a re-run within the same R session.
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# Stop logging to file (must happen before removing the log file).
setLoggingFile("")
file.remove(file.path(outDir, "vignette_uniform_clustering.log"))

# Restore the options saved in the preamble.
options(prevOptState)

## ----closure---------------------------------------------------------------------
Sys.time()
sessionInfo()