---
title: "Guided tutorial to genes' clustering using COTAN"
author: 
- name: "Silvia Giulia Galfrè"
  affiliation: "Department of Computer Science, University of Pisa"
- name: "Marco Fantozzi"
  affiliation: "University of Parma"

package: COTAN
output:
  BiocStyle::html_document:
    keep_md: true
vignette: >
  %\VignetteIndexEntry{Genes' clustering using COTAN}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

## Preamble

```{r knitr options, echo = FALSE, include = FALSE}
plotDev <- if (requireNamespace("ragg", quietly = TRUE)) {
  "ragg_png"
} else if (requireNamespace("Cairo", quietly = TRUE)) {
  "cairo_png"
} else {
  "png"
}

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7L,
  fig.height = 5L,
  dev = plotDev,
  fig.crop = FALSE,
  dpi = 100L
)
```


```{r preamble, message=FALSE, warning=FALSE}
library(COTAN)
library(zeallot)

# necessary to solve precedence of overloads
conflicted::conflict_prefer("%<-%", "zeallot")

# enable multi-processing (not on Windows)
prevOptState <- options(parallelly.fork.enable = TRUE)
```

# Introduction

This tutorial shows the available tools in COTAN to help with genes' clustering

### Setup

Define a directory where the data and the output will be stored

```{r setup-and-log}
dataDir <- file.path(tempdir(), "COTAN_vignette_data")
dir.create(dataDir, recursive = TRUE, showWarnings = FALSE)

print(dataDir)

outDir <- dataDir

# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2L)

# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_genes_clustering.log"))
```


## Retrieving the data-set

Download the data-set for `r "mouse cortex E17.5"` from
`r "Yuzwa et al."` (2017)

```{r download-data}
GEO <- "GSM2861514"

dir.create(file.path(dataDir, GEO), showWarnings = FALSE)

datasetFileName <-
  file.path(dataDir, GEO, "GSM2861514_E175_All_Cells_DGE.txt.gz")


# retries up to 5 times to get the dataset
attempts <- 0L
maxAttempts <- 5L
ok <- FALSE

while (attempts < maxAttempts && !ok) {
  attempts <- attempts + 1L

  if (!file.exists(datasetFileName)) {
    res <- try(
      GEOquery::getGEOSuppFiles(
        GEO = GEO,
        makeDirectory = TRUE,
        baseDir = dataDir,
        filter_regex = base::basename(datasetFileName),
        fetch_files = TRUE
      ),
      silent = TRUE
    )
  }

  ok <- file.exists(datasetFileName)

  if (!ok && attempts < maxAttempts) {
    Sys.sleep(1)
  }
}

assertthat::assert_that(
  ok,
  msg = paste0(
    "Failed to retrieve file '", datasetFileName,
    "' after ", maxAttempts, " attempts."
  )
)

rawDataset <- read.csv(datasetFileName, sep = "\t", row.names = 1L)

print(dim(rawDataset))
```

### Create the COTAN object

Initialize the `COTAN` object with the row count table and
the metadata from the experiment.

```{r create-cotan-object}
cond <- "mouse_cortex_E17.5"

obj <- COTAN(raw = rawDataset)
obj <-
  initializeMetaDataset(
    obj,
    GEO = GEO,
    sequencingMethod = "Drop_seq",
    sampleCondition = cond
  )

logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
        logLevel = 1L)
```

Assign to each cell its origin

```{r add-origin-to-the-cotan-object}
# mark cells origin
data(vignette.cells.origin)
head(vignette.cells.origin, 18)

obj <- 
  addCondition(obj, condName = "origin", conditions = vignette.cells.origin)

rm(vignette.cells.origin)
```


### Drop low quality cells

```{r drop-low-quality-cells}
# use previously established results to determine
# which cells were dropped in the cleaning stage
data(vignette.split.clusters)

cellsToDrop <-
  getCells(obj)[!(getCells(obj) %in% names(vignette.split.clusters))]

obj <- dropGenesCells(obj, cells = cellsToDrop)

# Log the remaining number of cells

logThis(paste("n cells", getNumCells(obj)), logLevel = 1L)

table(getCondition(obj, condName = "origin"))

rm(vignette.split.clusters)
```

# Establish genes' clusters

## COTAN analysis

In this part, the `COTAN` model is calibrated:
all the contingency tables are computed and used to get
the statistics necessary to `COEX` evaluation and storing

```{r calculate-and-store-coex-matrix}
obj <-
  proceedToCoex(
    obj,
    calcCoex = TRUE,
    optimizeForSpeed = TRUE,
    cores = 3L,
    deviceStr = "cuda"
  )
```

### Get data tables and stored `COEX` 

Sometimes we can also be interested in the numbers present directly in the
contingency tables for two specific genes.

```{r contingency-tables-and-coex-for-gene-pair}
cat("Contingency Tables:")
print(contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b"))

cat(paste("Corresponding COEX value:",
          getGenesCoex(obj, zeroDiagonal = FALSE)["Satb2", "Bcl11b"]))
```

Another useful function is `getGenesCoex()`. This can be used to extract
the whole or a partial `COEX` matrix from a `COTAN` object.

```{r display-small-chunk-of-coex-matrix}
# Extract the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1000L:1005L, 1000L:1005L]
```

```{r get-coex-sub-matrix}
# For a partial matrix
coex_sm <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2"),
                        zeroDiagonal = FALSE)
coex_sm[2001:2010, ]
```

## Analysis of the elaborated data: `GDI`

To calculate and store the *Global Differentiation Index* (`GDI`)
use the following:

```{r calculate-and-store-gdi}
gdiDF <- calculateGDI(obj, cores = 3L)

# Show the most *differentially expressed* genes
perm <- order(gdiDF[["GDI"]], decreasing = TRUE)
head(gdiDF[perm, ], n = 10)

# This will store only the $GDI column
obj <- storeGDI(obj, genesGDI = gdiDF)
```

The other two columns return the sum of the normalized data rows
and the percentage of cells expressing the gene respectively.

### Define genes lists

```{r genes-list-definition}
genesLists <- list(
  # Neural Progenitor Genes
  "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
  # Pan Neural Genes
  "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
  # House Keeping
  "hk"   = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
             "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
             "Tars", "Amacr")
)
```

### `GDI` Plot

The next function can either plot the `GDI` from the dataset directly or
use the already stored data.

It marks the given threshold 1.40 (in red) and
the 50% and 75% quantiles (in blue).
We can also specify some gene sets (three in this case) that
we want to label explicitly in the `GDI` plot.

```{r gdi-plot-with-marked-genes}
GDIPlot(obj, genes = genesLists, GDIThreshold = 1.40)
```

In the picture above it is possible to see how, as expected,
the house-keeping genes (`hk`) have generally a low `GDI` value.


### Store the calibrated object for future analysis

```{r save-cotan-obj, eval=FALSE}
# saving the structure
saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
```


## Heatmaps

To perform the Gene Pair Analysis, we can plot a heatmap of the `COEX` values
between two gene sets.

We have to define the different gene sets (`list.genes`) in a list.
The first set will be used to define the columns;
then we can choose which sets to use in the function parameter sets
(for example, from 1 to 3).

```{r coex-heatmap-plot-for-selected-genes}
plot(heatmapPlot(obj, genesLists = genesLists, sets = 1L:3L, cores = 3L))
```


```{r define-layers-specific-genes}
layersGenes <- list(
  "L1"   = c("Reln",   "Lhx5"),
  "L2/3" = c("Cux1",   "Satb2"),
  "L4"   = c("Rorb",   "Sox5"),
  "L5/6" = c("Bcl11b", "Fezf2"),
  "Prog" = c("Hes1",    "Vim", "Dummy")
)
```

We can also plot a general heatmap of `COEX` values based on some markers like
the following one.

```{r coex-heatmap-plot-for-primary-marker-genes}
layersMarkers <- c("Satb2", "Bcl11b", "Vim", "Hes1")

genesHeatmapPlot(
  obj,
  primaryMarkers = layersMarkers,
  pValueThreshold = 0.0001,
  symmetric = TRUE,
  cores = 3L
)
```

```{r coex-heatmap-plot-two-genes-sets, eval=FALSE, include=TRUE}
genesHeatmapPlot(
  obj,
  primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
  secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
  pValueThreshold = 0.001,
  symmetric = FALSE,
  cores = 3L
)
```

## Establishing genes' clusters

`COTAN` provides a way to establish genes' clusters given some lists of markers

```{r establish-genes-clusters}
c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-%
  establishGenesClusters(
    obj,
    groupMarkers = layersGenes,
    numGenesPerMarker = 25L,
    kCuts = 5L
  )

plot(eigPlot)
```

```{r genes-tree-plot}
plot(treePlot)
```

### `UMAP` plot

```{r genes-umap-plot}
colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L))

genesUmapPl <-
  UMAPPlot(
    pcaGenesClDF[, colSelection, drop = FALSE],
    clusters = getColumnFromDF(pcaGenesClDF, "hclust"),
    elements = layersGenes,
    title = "Genes' clusters UMAP Plot",
    numNeighbors = 32L,
    minPointsDist = 0.25
  )

plot(genesUmapPl)
```


## Vignette clean-up stage

The next few lines are just to clean.

```{r final-clean-up}
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) {
  # delete file if it exists
  file.remove(file.path(outDir, paste0(cond, "_times.csv")))
}
if (dir.exists(file.path(outDir, cond))) {
  unlink(file.path(outDir, cond), recursive = TRUE)
}
# if (dir.exists(file.path(outDir, GEO))) {
#   unlink(file.path(outDir, GEO), recursive = TRUE)
# }

# stop logging to file
setLoggingFile("")
file.remove(file.path(outDir, "vignette_genes_clustering.log"))

options(prevOptState)
```

```{r closure}
Sys.time()

sessionInfo()
```