## ----knitr options, echo = FALSE, include = FALSE----------------------------- plotDev <- if (requireNamespace("ragg", quietly = TRUE)) { "ragg_png" } else if (requireNamespace("Cairo", quietly = TRUE)) { "cairo_png" } else { "png" } knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7L, fig.height = 5L, dev = plotDev, fig.crop = FALSE, dpi = 100L ) ## ----preamble, message=FALSE, warning=FALSE----------------------------------- library(COTAN) library(zeallot) # necessary to solve precedence of overloads conflicted::conflict_prefer("%<-%", "zeallot") # enable multi-processing (not on Windows) prevOptState <- options(parallelly.fork.enable = TRUE) ## ----setup-and-log------------------------------------------------------------ dataDir <- file.path(tempdir(), "COTAN_vignette_data") dir.create(dataDir, recursive = TRUE, showWarnings = FALSE) print(dataDir) outDir <- dataDir # Log-level 2 was chosen to showcase better how the package works # In normal usage a level of 0 or 1 is more appropriate setLoggingLevel(2L) # This file will contain all the logs produced by the package # as if at the highest logging level setLoggingFile(file.path(outDir, "vignette_DEA.log")) ## ----download-data------------------------------------------------------------ GEO <- "GSM2861514" dir.create(file.path(dataDir, GEO), showWarnings = FALSE) datasetFileName <- file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz") # retries up to 5 times to get the dataset attempts <- 0L maxAttempts <- 5L ok <- FALSE while (attempts < maxAttempts && !ok) { attempts <- attempts + 1L if (!file.exists(datasetFileName)) { res <- try( GEOquery::getGEOSuppFiles( GEO = GEO, makeDirectory = TRUE, baseDir = dataDir, filter_regex = base::basename(datasetFileName), fetch_files = TRUE ), silent = TRUE ) } ok <- file.exists(datasetFileName) if (!ok && attempts < maxAttempts) { Sys.sleep(1) } } assertthat::assert_that( ok, msg = paste0( "Failed to retrieve file '", datasetFileName, "' after ", maxAttempts, " attempts." ) ) rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L) print(dim(rawDataset)) ## ----create-cotan-object------------------------------------------------------ cond <- "mouse_cortex_E17.5" obj <- COTAN(raw = rawDataset) obj <- initializeMetaDataset( obj, GEO = GEO, sequencingMethod = "Drop_seq", sampleCondition = cond ) logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])), logLevel = 1L) ## ----add-origin-to-the-cotan-object------------------------------------------- # mark cells origin data(vignette.cells.origin) head(vignette.cells.origin, 18) obj <- addCondition(obj, condName = "origin", conditions = vignette.cells.origin) rm(vignette.cells.origin) ## ----drop-low-quality-cells--------------------------------------------------- # use previously established results to determine # which cells were dropped in the cleaning stage data(vignette.split.clusters) cellsToDrop <- getCells(obj)[!(getCells(obj) %in% names(vignette.split.clusters))] obj <- dropGenesCells(obj, cells = cellsToDrop) # Log the remaining number of cells logThis(paste("n cells", getNumCells(obj)), logLevel = 1L) table(getCondition(obj, condName = "origin")) rm(vignette.split.clusters) ## ----calculate-and-store-coex-matrix------------------------------------------ obj <- proceedToCoex( obj, calcCoex = FALSE, optimizeForSpeed = TRUE, cores = 3L, deviceStr = "cuda" ) ## ----load-pre-calculated-clusterizations-------------------------------------- data("vignette.split.clusters", package = "COTAN") data("vignette.merge.clusters", package = "COTAN") vignette.split.clusters <- asClusterization( vignette.split.clusters, # a named vector/factor/data.frame allCells = getCells(obj) # used only to check names are coherent ) vignette.merge.clusters <- asClusterization( vignette.merge.clusters, # a named vector/factor/data.frame allCells = getCells(obj) # used only to check names are coherent ) # explicitly calculate the DEA of the clusterization to store it vignette.split.coexDF <- DEAOnClusters(obj, clusters = vignette.split.clusters) vignette.merge.coexDF <- DEAOnClusters(obj, clusters = vignette.merge.clusters) obj <- addClusterization( obj, clName = "split", clusters = vignette.split.clusters, coexDF = vignette.split.coexDF, override = FALSE ) obj <- addClusterization( obj, clName = "merge", clusters = vignette.merge.clusters, coexDF = vignette.merge.coexDF, override = FALSE ) # these will be recovered from the COTAN obj as needed rm(vignette.split.clusters, vignette.split.coexDF) rm(vignette.merge.clusters, vignette.merge.coexDF) ## ----extract-clusterizations-------------------------------------------------- # COTAn always stores clusterizations as factors. splitClusters <- getClusters(obj, clName = "split") # Use the utility factorToVector() to properly decay any named factor # to a named char array # splitClusters <- factorToVector(splitClusters) # In this case the following calls are no-op since the clusterization # were created by COTAN and so they already made nice and reordered splitClusters <- niceFactorLevels(splitClusters) c(splitClusters, splitCoexDF, perm) %<-% reorderClusterization( obj, clName = "split", reverse = FALSE, coexDF = NULL, useDEA = TRUE, # T: Cosine dist. on DEA, F: Eucl. dist. on avg. zero/one distance = NULL, hclustMethod = "ward.D2" ) mergeClusters <- getClusters(obj, clName = "merge") table(splitClusters, mergeClusters) ## ----clusterization-manipulation---------------------------------------------- # this has an inverse `fromClustersList()` splitClustersAsList <- toClustersList(splitClusters) assertthat::assert_that(length(splitClustersAsList) == nlevels(splitClusters)) splitClustersOrigin <- rlang::set_names( rlang::rep_along(x = NA_character_, along = splitClustersAsList), names(splitClustersAsList) ) origin <- getCondition(obj, "origin") for (clName in names(splitClustersAsList)) { cluster <- splitClustersAsList[[clName]] # assign most common origin to the cluster splitClustersOrigin[[clName]] <- names(which.max(table(origin[cluster]))) # print the average non-zero expression in the cluster clRawData <- getRawData(obj)[, cluster, drop = FALSE] clRawData <- clRawData[clRawData > 0.0] cat( paste("Cluster", clName, "of", splitClustersOrigin[[clName]], "\torigin - average non-zero expression:", mean(clRawData)), "\n") rm(clRawData) } # If one needs to reorder the cells by cluster, # labels are ordered as in the clusterization orderCellsByMergeCluster <- groupByClusters(mergeClusters) plot(getNumExpressedGenes(obj)[orderCellsByMergeCluster], ylab = "Num expressed genes", xlab = NA_character_) mergeClustersAsList <- toClustersList(mergeClusters) mergeClustersOrigin <- vapply( mergeClustersAsList, \(cluster, origin) { names(which.max(table(origin[cluster]))) }, FUN.VALUE = character(1L), origin ) names(mergeClustersOrigin) <- names(mergeClustersAsList) # It is also possible to reorder a subset of cells using orderCellsBySomeMergeClusters <- groupByClustersList( getCells(obj), mergeClustersAsList[c(2, 4, 6)] ) plot(getNumExpressedGenes(obj)[orderCellsBySomeMergeClusters], ylab = "Library Size", xlab = NA_character_) ## ----clusters-in-a-tree------------------------------------------------------- treePlot <- clustersTreePlot( obj, kCuts = 2L, clName = "split", useDEA = TRUE # T: Cosine dist. on DEA, F: Eucl. dist. on avg. zero/one )[["dend"]] plot(treePlot) # use origin to mark the clusters dendextend::labels(treePlot) <- splitClustersOrigin[base::labels(treePlot)] plot(treePlot) ## ----define-specific-genes-lists---------------------------------------------- # these are some genes associated to each cortical layer layersGenes <- list( "L1" = c("Reln", "Lhx5"), "L2/3" = c("Cux1", "Satb2"), "L4" = c("Rorb", "Sox5"), "L5/6" = c("Bcl11b", "Fezf2"), "Prog" = c("Hes1", "Vim") ) neuralTypeGenes <- list( # Neural Progenitor Genes "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"), # Pan Neural Genes "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"), # House Keeping "hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a", "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1", "Tars", "Amacr") ) ## ----clusterization-data-visualization, fig.height=8, fig.width=20------------ c(splitHeatmap, splitScoreDF, splitPValueDF) %<-% clustersMarkersHeatmapPlot( obj, groupMarkers = layersGenes, clName = "split", kCuts = 2L, adjustmentMethod = "bonferroni", condNameList = list("origin") ) ComplexHeatmap::draw(splitHeatmap) c(mergeHeatmap, mergeScoreDF, mergePValueDF) %<-% clustersMarkersHeatmapPlot( obj, groupMarkers = neuralTypeGenes, clName = "merge", kCuts = 3L, adjustmentMethod = "bonferroni", condNameList = list("origin") ) ComplexHeatmap::draw(mergeHeatmap) ## ----find-marge-markers------------------------------------------------------- mergeClusterMarkers <- findClustersMarkers( obj, clName = "merge", n = 5L, markers = unlist(layersGenes), adjustmentMethod = "bonferroni" ) foundMarkers <- list() # All relevant genes with strong `p-values` geneIsEnriched <- mergeClusterMarkers[, "adjPVal"] < 1e-10 for (clName in levels(mergeClusters)) { geneIsInCluster <- mergeClusterMarkers[, "CL"] == clName foundMarkers[[clName]] <- mergeClusterMarkers[geneIsEnriched & geneIsInCluster, "Gene", drop = TRUE] } # number of genes per cluster lengths(foundMarkers) ## ----cl-data-visualization_found-markers, fig.height=8, fig.width=20---------- c(mergeHeatmap, ..) %<-% clustersMarkersHeatmapPlot( obj, groupMarkers = foundMarkers, clName = "merge", condNameList = list("origin") ) ComplexHeatmap::draw(mergeHeatmap) ## ----create-sub-object-------------------------------------------------------- # We will focus on the clusters `03` (likely part of layers 5/6) and # `05` (likely part of layers 2/3) of the `split` clusterization cellsToDrop <- getCells(obj)[!(splitClusters %in% c("03", "05"))] obj2 <- dropGenesCells(obj, cells = cellsToDrop) obj2 <- proceedToCoex(obj2, calcCoex = FALSE) table(getClusters(obj2, clName = "split")) ## ----plot-sub-object-cluster-data, eval=FALSE, include=TRUE------------------- # # this does not give more information than the same full-plot above # # c(splitHeatmap2, ., .) %<-% # clustersMarkersHeatmapPlot( # obj2, # groupMarkers = layersGenes, # clName = "split", # kCuts = 2L, # adjustmentMethod = "bonferroni" # ) # # ComplexHeatmap::draw(splitHeatmap2) ## ----find-sub-object-markers-------------------------------------------------- deaMarkers <- findClustersMarkers( obj2, n = 10L, clName = "split", adjustmentMethod = "bonferroni", markers = layersGenes[c("L2/3", "L5/6")] ) # over-expressed genes follow the under-expressed ones deaMarkers[11:20, ] deaMarkers[31:40, ] ## ----clean-up----------------------------------------------------------------- if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) { # delete file if it exists file.remove(file.path(outDir, paste0(cond, ".cotan.RDS"))) } if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) { # delete file if it exists file.remove(file.path(outDir, paste0(cond, "_times.csv"))) } if (dir.exists(file.path(outDir, cond))) { unlink(file.path(outDir, cond), recursive = TRUE) } # if (dir.exists(file.path(outDir, GEO))) { # unlink(file.path(outDir, GEO), recursive = TRUE) # } # stop logging to file setLoggingFile("") file.remove(file.path(outDir, "vignette_uniform_clustering.log")) options(prevOptState) ## ----closure------------------------------------------------------------------ Sys.time() sessionInfo()