---
title: "Guided tutorial to Differential Expression Analysis using COTAN"
author: 
- name: "Silvia Giulia Galfrè"
  affiliation: "Department of Computer Science, University of Pisa"
- name: "Marco Fantozzi"
  affiliation: "University of Parma"

package: COTAN
output:
  BiocStyle::html_document:
    keep_md: true
vignette: >
  %\VignetteIndexEntry{DEA using COTAN}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

## Preamble

```{r knitr options, echo = FALSE, include = FALSE}
plotDev <- if (requireNamespace("ragg", quietly = TRUE)) {
  "ragg_png"
} else if (requireNamespace("Cairo", quietly = TRUE)) {
  "cairo_png"
} else {
  "png"
}

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7L,
  fig.height = 5L,
  dev = plotDev,
  fig.crop = FALSE,
  dpi = 100L
)
```

```{r preamble, message=FALSE, warning=FALSE}
library(COTAN)
library(zeallot)

# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")

# enable multi-processing (not on Windows)
prevOptState <- options(parallelly.fork.enable = TRUE)
```

# Introduction

This tutorial shows the available tools in COTAN to help with genes' clustering

### Setup

Define a directory where the data and the output will be stored

```{r setup-and-log}
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)

print(dataDir)

outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_DEA.log"))
```

## Retrieving the data-set

Download the data-set for `r "mouse cortex E17.5"` from
`r "Yuzwa et al."` (2017)

```{r download-data}
GEO <- "GSM2861514"

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")


# retries up to 5 times to get the dataset
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE

while (attempts < maxAttempts && !ok) {
  attempts <- attempts + 1L

  if (!file.exists(datasetFileName)) {
    res <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }

  ok <- file.exists(datasetFileName)

  if (!ok && attempts < maxAttempts) {
    Sys.sleep(1)
  }
}

assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '", datasetFileName,
    "' after ", maxAttempts, " attempts."
  )
)

rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)

print(dim(rawDataset))
```

### Create the COTAN object

Initialize the `COTAN` object with the row count table and
the metadata from the experiment.

```{r create-cotan-object}
cond <- "mouse_cortex_E17.5"

obj <- COTAN(raw = rawDataset)
obj <-
  initializeMetaDataset(
    obj,
    GEO = GEO,
    sequencingMethod = "Drop_seq",
    sampleCondition = cond
  )

logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)
```

Assign to each cell its origin

```{r add-origin-to-the-cotan-object}
# mark cells origin
data(vignette.cells.origin)
head(vignette.cells.origin, 18)

obj <- 
  addCondition(obj, condName = "origin", conditions = vignette.cells.origin)

rm(vignette.cells.origin)
```

### Drop low quality cells

```{r drop-low-quality-cells}
# use previously established results to determine
# which cells were dropped in the cleaning stage
data(vignette.split.clusters)

cellsToDrop <-
  getCells(obj)[!(getCells(obj) %in% names(vignette.split.clusters))]

obj <- dropGenesCells(obj, cells = cellsToDrop)

# Log the remaining number of cells

logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)

table(getCondition(obj, condName = "origin"))

rm(vignette.split.clusters)
```

## COTAN analysis

In this part, the `COTAN` model is calibrated, but the `COEX` matrix
is not evaluated as it is not strictly necessary

```{r calculate-and-store-coex-matrix}
obj <-
  proceedToCoex(
    obj,
    calcCoex = FALSE,
    optimizeForSpeed = TRUE,
    cores = 3L,
    deviceStr = "cuda"
  )
```

# Differential Expression Analysis

TODO:

### Existing clusterizations

Loading pre-calculated *clusterizations* from data and call the
`DEAOnClusters()`. This method estimates assess the differential expression of
genes when partitioning cells into two subsets: the cluster and all other cells
in the dataset.
For each cluster `COTAN` calculates a *correlation coefficient* based on
contingency tables that counts whether each gene occurs or not in cells inside
or outside of the cluster.

These numbers are organized as one column for each cluster with a row for each
gene and are a measure of the enrichment or depletion of the genes inside the
clusters.

```{r load-pre-calculated-clusterizations}
data("vignette.split.clusters", package = "COTAN")
data("vignette.merge.clusters", package = "COTAN")

vignette.split.clusters <-
  asClusterization(
    vignette.split.clusters, # a named vector/factor/data.frame
    allCells = getCells(obj) # used only to check names are coherent
  )
vignette.merge.clusters <-
  asClusterization(
    vignette.merge.clusters, # a named vector/factor/data.frame
    allCells = getCells(obj) # used only to check names are coherent
  )

# explicitly calculate the DEA of the clusterization to store it
vignette.split.coexDF <-
  DEAOnClusters(obj, clusters = vignette.split.clusters)
vignette.merge.coexDF <-
  DEAOnClusters(obj, clusters = vignette.merge.clusters)

obj <-
  addClusterization(
    obj,
    clName = "split",
    clusters = vignette.split.clusters,
    coexDF = vignette.split.coexDF,
    override = FALSE
  )

obj <-
  addClusterization(
    obj,
    clName = "merge",
    clusters = vignette.merge.clusters,
    coexDF = vignette.merge.coexDF,
    override = FALSE
  )

# these will be recovered from the COTAN obj as needed
rm(vignette.split.clusters, vignette.split.coexDF)
rm(vignette.merge.clusters, vignette.merge.coexDF)
```

## How to manipulate a clusterization

It is possible to improve on the labels in a *clusterization*, so that they
look nicer in plots and lists, for example to ensure all labels have
the **same** length.

Another improvement is to reorder the labels of the *clusterization* so that
*near* clusters have *near* labels

```{r extract-clusterizations}
# COTAn always stores clusterizations as factors.
splitClusters <- getClusters(obj, clName = "split")

# Use the utility factorToVector() to properly decay any named factor
# to a named char array
# splitClusters <- factorToVector(splitClusters)

# In this case the following calls are no-op since the clusterization
# were created by COTAN and so they already made nice and reordered

splitClusters <- niceFactorLevels(splitClusters)

c(splitClusters, splitCoexDF, perm) %<-% 
  reorderClusterization(
    obj,
    clName = "split",
    reverse = FALSE, 
    coexDF = NULL,
    useDEA = TRUE, # T: Cosine dist. on DEA, F: Eucl. dist. on avg. zero/one
    distance = NULL,
    hclustMethod = "ward.D2"
  )

mergeClusters <- getClusters(obj, clName = "merge")

table(splitClusters, mergeClusters)
```

`COTAN` provides also easy facility to convert a *clusterization* from a
*factor* to a *list* of *array* of cells and vice-versa.
This can come useful if one wants to iterate separately
on the cells of each cluster.

```{r clusterization-manipulation}
# this has an inverse `fromClustersList()`
splitClustersAsList <- toClustersList(splitClusters)

assertthat::assert_that(length(splitClustersAsList) == nlevels(splitClusters))

splitClustersOrigin <-
  rlang::set_names(
    rlang::rep_along(x = NA_character_, along = splitClustersAsList),
    names(splitClustersAsList)
  )

origin <- getCondition(obj, "origin")

for (clName in names(splitClustersAsList)) {
  cluster <- splitClustersAsList[[clName]]

  # assign most common origin to the cluster
  splitClustersOrigin[[clName]] <- names(which.max(table(origin[cluster])))

  # print the average non-zero expression in the cluster
  clRawData <- getRawData(obj)[, cluster, drop = FALSE]
  clRawData <- clRawData[clRawData > 0.0]

  cat(
    paste("Cluster", clName, "of", splitClustersOrigin[[clName]],
          "\torigin - average non-zero expression:", mean(clRawData)), "\n")

  rm(clRawData)
}

# If one needs to reorder the cells by cluster,
# labels are ordered as in the clusterization
orderCellsByMergeCluster <- groupByClusters(mergeClusters)

plot(getNumExpressedGenes(obj)[orderCellsByMergeCluster],
     ylab = "Num expressed genes", xlab = NA_character_)

mergeClustersAsList <- toClustersList(mergeClusters)

mergeClustersOrigin <- 
  vapply(
    mergeClustersAsList,
    \(cluster, origin) {
      names(which.max(table(origin[cluster])))
    },
    FUN.VALUE = character(1L),
    origin
  )
names(mergeClustersOrigin) <- names(mergeClustersAsList)


# It is also possible to reorder a subset of cells using
orderCellsBySomeMergeClusters <- 
  groupByClustersList(
    getCells(obj),
    mergeClustersAsList[c(2, 4, 6)]
  )

plot(getNumExpressedGenes(obj)[orderCellsBySomeMergeClusters],
     ylab = "Library Size", xlab = NA_character_)
```

### Clusters dendogram

It is possible to create a *dendogram* of the *clusterization* that uses
a cluster distance based on the clusters' `COEX`.

```{r clusters-in-a-tree}
treePlot <-
  clustersTreePlot(
    obj,
    kCuts = 2L,
    clName = "split",
    useDEA = TRUE  # T: Cosine dist. on DEA, F: Eucl. dist. on avg. zero/one
  )[["dend"]]

plot(treePlot)

# use origin to mark the clusters
dendextend::labels(treePlot) <- splitClustersOrigin[base::labels(treePlot)]

plot(treePlot)
```

It is easy to see that the dendogram splits according to origin except for
cluster `07` that is deemed more similar to the some of the non `cortical`
cells.

## Relevance of given marker genes

It is possible to visualize how relevant are some *marker genes* for the
clusters comprising a given *clusterization*

```{r define-specific-genes-lists}
# these are some genes associated to each cortical layer
layersGenes <- list(
  "L1"   = c("Reln",   "Lhx5"),
  "L2/3" = c("Cux1",   "Satb2"),
  "L4"   = c("Rorb",   "Sox5"),
  "L5/6" = c("Bcl11b", "Fezf2"),
  "Prog" = c("Hes1",    "Vim")
)

neuralTypeGenes <- list(
  # Neural Progenitor Genes
  "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
  # Pan Neural Genes
  "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
  # House Keeping
  "hk"   = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
             "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
             "Tars", "Amacr")
)
```

The following is one of the most comprehensive plot available in `COTAN` to
visually summarize whether any of the given genes is strongly over/under
expressed in the clusters. The heat-map shows the `COEX` score and the
characters signal the corresponding adjusted p-value:
`***` for *p* \< 0.001, `**` for *p* \< 0.01,
`*` for *p* \< 0.05, `.` for *p* \< 0.1.

On the left it also allows to see how much each cluster is characterized by the
passed conditions.

```{r clusterization-data-visualization, fig.height=8, fig.width=20}
c(splitHeatmap, splitScoreDF, splitPValueDF) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = layersGenes,
    clName = "split",
    kCuts = 2L,
    adjustmentMethod = "bonferroni",
    condNameList = list("origin")
  )

ComplexHeatmap::draw(splitHeatmap)

c(mergeHeatmap, mergeScoreDF, mergePValueDF) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = neuralTypeGenes,
    clName = "merge",
    kCuts = 3L,
    adjustmentMethod = "bonferroni",
    condNameList = list("origin")
  )

ComplexHeatmap::draw(mergeHeatmap)
```

In the above graph, it is possible to see that the found clusters align well to
the expression of the layers' genes. Again it is possible to see that cluster
`07` of the `"split"` *clusterization* (`05` of the `"merge"`) is likely
composed by a **progenitor** cells instead of **mature layers** cells, like it
happens for cluster `08` and `09` (`06` of the `"merge"`) even if they have
supposedly different *origin*.


## Find differentially expressed genes

The following function uses the calculated clusters' `COEX` to select, for
each cluster, the *n* genes that are over-expressed and the *n* genes that are
under-expressed with respect to a neutral model assumption.

For each gene found, the function returns the `COEX` value,
the adjusted *p-value* and the log-fold-change.
It also flags whether the found gene is one of the markers
provided by the user.

```{r find-marge-markers}
mergeClusterMarkers <-
  findClustersMarkers(
    obj,
    clName = "merge",
    n = 5L,
    markers = unlist(layersGenes),
    adjustmentMethod = "bonferroni"
  )

foundMarkers <- list()

# All relevant genes with strong `p-values`
geneIsEnriched <- mergeClusterMarkers[, "adjPVal"] < 1e-10

for (clName in levels(mergeClusters)) {
  geneIsInCluster <- mergeClusterMarkers[, "CL"] == clName
  foundMarkers[[clName]] <-
    mergeClusterMarkers[geneIsEnriched & geneIsInCluster, "Gene", drop = TRUE]
}

# number of genes per cluster
lengths(foundMarkers)
```


### Relevance of marker genes for the `merge` clusters

Again, it is possible to visualize how relevant are the *marker genes* from
above:

```{r cl-data-visualization_found-markers, fig.height=8, fig.width=20}
c(mergeHeatmap, ..) %<-%
  clustersMarkersHeatmapPlot(
    obj,
    groupMarkers = foundMarkers,
    clName = "merge",
    condNameList = list("origin")
  )

ComplexHeatmap::draw(mergeHeatmap)
```

## DEA cluster vs. cluster

Sometimes it is useful to compare the genes' expression of a pair of clusters.

In such cases the simplest thing to do is to *drop* all other cells so to
obtain a leaner `COTAN` object with just the 2 clusters and then use the
procedures shown above to analyze the resulting object.

```{r create-sub-object}
# We will focus on the clusters `03` (likely part of layers 5/6) and
# `05` (likely part of layers 2/3) of the `split` clusterization

cellsToDrop <- getCells(obj)[!(splitClusters %in% c("03", "05"))]

obj2 <- dropGenesCells(obj, cells = cellsToDrop)

obj2 <- proceedToCoex(obj2, calcCoex = FALSE)

table(getClusters(obj2, clName = "split"))
```

```{r plot-sub-object-cluster-data, eval=FALSE, include=TRUE}
# this does not give more information than the same full-plot above

c(splitHeatmap2, ., .) %<-%
  clustersMarkersHeatmapPlot(
    obj2,
    groupMarkers = layersGenes,
    clName = "split",
    kCuts = 2L,
    adjustmentMethod = "bonferroni"
  )

ComplexHeatmap::draw(splitHeatmap2)
```

In the case of only two clusters, over-expressed genes in a cluster correspond
exactly to under-expressed genes in the other,
so one can just look at the former 

```{r find-sub-object-markers}
deaMarkers <- findClustersMarkers(
  obj2,
  n = 10L,
  clName = "split",
  adjustmentMethod = "bonferroni",
  markers = layersGenes[c("L2/3", "L5/6")]
)

# over-expressed genes follow the under-expressed ones
deaMarkers[11:20, ]
deaMarkers[31:40, ]
```

We can see that in this case we recover the layers genes.


## Vignette clean-up stage

The next few lines are just to clean.

```{r clean-up}
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_uniform_clustering.log"))

options(prevOptState)
```

```{r closure}
Sys.time()

sessionInfo()
```