---
title: "scGPS introduction"
description: "scGPS is a single cell RNA analysis framework to decompose a mixed population into clusters (SCORE), followed by analysing the relationship between clusters (scGPS)"
author: "Quan Nguyen and Michael Thompson"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
    %\VignetteIndexEntry{single cell Global fate Potential of Subpopulations}
    %\VignetteEngine{knitr::rmarkdown}
    \usepackage[utf8]{inputenc}
---

```{r setup, eval = TRUE, echo = FALSE}
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)
```

# 1. Installation instruction

```{r installation, eval = FALSE}
# To install scGPS from github (Depending on the configuration of the local
# computer or HPC, possible custom C++ compilation may be required - see
# installation trouble-shootings below)
devtools::install_github("IMB-Computational-Genomics-Lab/scGPS")

# for C++ compilation trouble-shooting, manual download and installation can be
# done from github

git clone https://github.com/IMB-Computational-Genomics-Lab/scGPS

# then check in scGPS/src if any of the precompiled (e.g.  those with *.so and
# *.o) files exist and delete them before recompiling

# then with the scGPS as the R working directory, manually install and load
# using devtools functionality
# Install the package
devtools::install()
#load the package to the workspace 
library(scGPS)

```

# 2. A simple workflow of the scGPS: 
The purpose of this workflow is to solve the following task:

* Given a mixed population with known subpopulations, estimate transition scores between these subpopulation.

## 2.1 Create scGPS objects
```{r scGPS_object, warning = FALSE, message = FALSE}

# load mixed population 1 (loaded from day_2_cardio_cell_sample dataset, 
# named it as day2)
library(scGPS)

day2 <- day_2_cardio_cell_sample
mixedpop1 <- new_scGPS_object(ExpressionMatrix = day2$dat2_counts,
    GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters)

# load mixed population 2 (loaded from day_5_cardio_cell_sample dataset, 
# named it as day5)
day5 <- day_5_cardio_cell_sample
mixedpop2 <- new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
    GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters)

```

## 2.2 Run prediction
```{r prediction, warning = FALSE, message = FALSE}

# select a subpopulation
c_selectID <- 1
# load gene list (this can be any lists of user selected genes)
genes <- training_gene_sample
genes <- genes$Merged_unique
# load cluster information 
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
#run training (running nboots = 3 here, but recommend to use nboots = 50-100)
LSOLDA_dat <- bootstrap_prediction(nboots = 3, mixedpop1 = mixedpop1, 
    mixedpop2 = mixedpop2, genes = genes, c_selectID  = c_selectID,
    listData = list(), cluster_mixedpop1 = cluster_mixedpop1, 
    cluster_mixedpop2 = cluster_mixedpop2, trainset_ratio = 0.7)
names(LSOLDA_dat)
```

## 2.3 Summarise results 
```{r summarise_results, warning = FALSE, message = FALSE}
# summary results LDA
sum_pred_lda <- summary_prediction_lda(LSOLDA_dat = LSOLDA_dat, nPredSubpop = 4)
# summary results Lasso to show the percent of cells
# classified as cells belonging 
sum_pred_lasso <- summary_prediction_lasso(LSOLDA_dat = LSOLDA_dat,
    nPredSubpop = 4)
# plot summary results 
plot_sum <-function(sum_dat){
    sum_dat_tf <- t(sum_dat)
    sum_dat_tf <- na.omit(sum_dat_tf)
    sum_dat_tf <- apply(sum_dat[, -ncol(sum_dat)],1,
        function(x){as.numeric(as.vector(x))})
    sum_dat$names <- gsub("ElasticNet for subpop","sp",  sum_dat$names )
    sum_dat$names <- gsub("in target mixedpop","in p",  sum_dat$names) 
    sum_dat$names <- gsub("LDA for subpop","sp",  sum_dat$names )
    sum_dat$names <- gsub("in target mixedpop","in p",  sum_dat$names)
    colnames(sum_dat_tf) <- sum_dat$names
    boxplot(sum_dat_tf, las=2)
}
plot_sum(sum_pred_lasso)
plot_sum(sum_pred_lda)
# summary accuracy to check the model accuracy in the leave-out test set 
summary_accuracy(object = LSOLDA_dat)
# summary maximum deviance explained by the model 
summary_deviance(object = LSOLDA_dat)
```


# 3. A complete workflow of the scGPS:
The purpose of this workflow is to solve the following task:

* Given an unknown mixed population, find clusters and estimate relationship between clusters
 
## 3.1 Identify clusters in a dataset using CORE
 (skip this step if clusters are known)
```{r CORE, warning = FALSE, message = FALSE}

# find clustering information in an expresion data using CORE
day5 <- day_5_cardio_cell_sample
cellnames <- colnames(day5$dat5_counts)
cluster <-day5$dat5_clusters
cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames)
mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
                    GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames)

CORE_cluster <- CORE_clustering(mixedpop2, remove_outlier = c(0), PCA=FALSE)

# to update the clustering information, users can ...
key_height <- CORE_cluster$optimalClust$KeyStats$Height
optimal_res <- CORE_cluster$optimalClust$OptimalRes
optimal_index = which(key_height == optimal_res)

clustering_after_outlier_removal <- unname(unlist(
 CORE_cluster$Cluster[[optimal_index]]))
corresponding_cells_after_outlier_removal <- CORE_cluster$cellsForClustering
original_cells_before_removal <- colData(mixedpop2)[,2]
corresponding_index <- match(corresponding_cells_after_outlier_removal,
                            original_cells_before_removal )
# check the matching
identical(as.character(original_cells_before_removal[corresponding_index]),
         corresponding_cells_after_outlier_removal)
# create new object with the new clustering after removing outliers
mixedpop2_post_clustering <- mixedpop2[,corresponding_index]
colData(mixedpop2_post_clustering)[,1] <- clustering_after_outlier_removal

```

## 3.2 Identify clusters in a dataset using SCORE (Stable Clustering at Optimal REsolution)
(skip this step if clusters are known)

(SCORE aims to get stable subpopulation results by introducing bagging aggregation and bootstrapping to the CORE algorithm)
```{r SCORE with bagging, warning = FALSE, message = FALSE}

# find clustering information in an expresion data using SCORE
day5 <- day_5_cardio_cell_sample
cellnames <- colnames(day5$dat5_counts)
cluster <-day5$dat5_clusters
cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames)
mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
                    GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames )

SCORE_test <- CORE_bagging(mixedpop2, remove_outlier = c(0), PCA=FALSE,
                                bagging_run = 20, subsample_proportion = .8)

```


## 3.3 Visualise all cluster results in all iterations
```{r visualisation, dev='CairoPDF'}
dev.off()
##3.2.1 plot CORE clustering
p1 <- plot_CORE(CORE_cluster$tree, CORE_cluster$Cluster, 
    color_branch = c("#208eb7", "#6ce9d3", "#1c5e39", "#8fca40", "#154975",
        "#b1c8eb"))
p1
#extract optimal index identified by CORE
key_height <- CORE_cluster$optimalClust$KeyStats$Height
optimal_res <- CORE_cluster$optimalClust$OptimalRes
optimal_index = which(key_height == optimal_res)
#plot one optimal clustering bar
plot_optimal_CORE(original_tree= CORE_cluster$tree,
                 optimal_cluster = unlist(CORE_cluster$Cluster[optimal_index]),
                 shift = -2000)

##3.2.2 plot SCORE clustering
#plot all clustering bars
plot_CORE(SCORE_test$tree, list_clusters = SCORE_test$Cluster)
#plot one stable optimal clustering bar
plot_optimal_CORE(original_tree= SCORE_test$tree,
                 optimal_cluster = unlist(SCORE_test$Cluster[
                    SCORE_test$optimal_index]),
                 shift = -100)

```

## 3.4 Compare clustering results with other dimensional reduction methods (e.g., tSNE)
```{r compare_clustering, fig.width=5, fig.height=5}
t <- tSNE(expression.mat=assay(mixedpop2))
p2 <-plot_reduced(t, color_fac = factor(colData(mixedpop2)[,1]),
                      palletes =1:length(unique(colData(mixedpop2)[,1])))
p2
```

## 3.5 Find gene markers and annotate clusters

```{r find_markers, warning = FALSE, message = FALSE, fig.width=7}
#load gene list (this can be any lists of user-selected genes)
genes <-training_gene_sample
genes <-genes$Merged_unique

#the gene list can also be objectively identified by differential expression
#analysis cluster information is requied for find_markers. Here, we use
#CORE results.

#colData(mixedpop2)[,1] <- unlist(SCORE_test$Cluster[SCORE_test$optimal_index])

suppressMessages(library(locfit))

DEgenes <- find_markers(expression_matrix=assay(mixedpop2),
                            cluster = colData(mixedpop2)[,1],
                            selected_cluster=unique(colData(mixedpop2)[,1]))

#the output contains dataframes for each cluster.
#the data frame contains all genes, sorted by p-values
names(DEgenes)

#you can annotate the identified clusters
DEgeneList_1vsOthers <- DEgenes$DE_Subpop1vsRemaining$id

#users need to check the format of the gene input to make sure they are
#consistent to the gene names in the expression matrix

#the following command saves the file "PathwayEnrichment.xlsx" to the
#working dir
#use 500 top DE genes
suppressMessages(library(DOSE))
suppressMessages(library(ReactomePA))
suppressMessages(library(clusterProfiler))
genes500 <- as.factor(DEgeneList_1vsOthers[seq_len(500)])
enrichment_test <- annotate_clusters(genes, pvalueCutoff=0.05, gene_symbol=TRUE)

#the enrichment outputs can be displayed by running
clusterProfiler::dotplot(enrichment_test, showCategory=10, font.size = 6)

```

# 4. Relationship between clusters within one sample or between two samples
The purpose of this workflow is to solve the following task:

* Given one or two unknown mixed population(s) and clusters in each mixed population, estimate
* Visualise relationship between clusters*
 
## 4.1 Start the scGPS prediction to find relationship between clusters
 
```{r scGPS_prediction, warning = FALSE, message = FALSE}

#select a subpopulation, and input gene list
c_selectID <- 1
#note make sure the format for genes input here is the same to the format
#for genes in the mixedpop1 and mixedpop2
genes = DEgenes$id[1:500]

#run the test bootstrap with nboots = 2 runs

cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]

LSOLDA_dat <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1,
                             mixedpop2 = mixedpop2, genes = genes, 
                             c_selectID  = c_selectID,
                             listData = list(),
                             cluster_mixedpop1 = cluster_mixedpop1,
                             cluster_mixedpop2 = cluster_mixedpop2)

```

## 4.2 Display summary results for the prediction

```{r summarise_prediction}
#get the number of rows for the summary matrix
row_cluster <-length(unique(colData(mixedpop2)[,1]))

#summary results LDA to to show the percent of cells classified as cells
#belonging by LDA classifier
summary_prediction_lda(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster )

#summary results Lasso to show the percent of cells classified as cells
#belonging by Lasso classifier
summary_prediction_lasso(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster)

# summary maximum deviance explained by the model during the model training
summary_deviance(object = LSOLDA_dat)

# summary accuracy to check the model accuracy in the leave-out test set
summary_accuracy(object = LSOLDA_dat)

```

## 4.3 Plot the relationship between clusters in one sample
Here we look at one example use case to find relationship between clusters
within one sample or between two sample

```{r prediction_one_sample, warning = FALSE, message = FALSE, fig.width=5}
#run prediction for 3 clusters
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
#cluster_mixedpop2 <- as.numeric(as.vector(colData(mixedpop2)[,1]))

c_selectID <- 1
#top 200 gene markers distinguishing cluster 1
genes = DEgenes$id[1:200]

LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 2
genes = DEgenes$id[1:200]

LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 3
genes = DEgenes$id[1:200]
LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 4
genes = DEgenes$id[1:200]
LSOLDA_dat4 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)


#prepare table input for sankey plot

LASSO_C1S2  <- reformat_LASSO(c_selectID=1, mp_selectID = 2,
                             LSOLDA_dat=LSOLDA_dat1,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#7570b3")

LASSO_C2S2  <- reformat_LASSO(c_selectID=2, mp_selectID =2,
                             LSOLDA_dat=LSOLDA_dat2,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#1b9e77")

LASSO_C3S2  <- reformat_LASSO(c_selectID=3, mp_selectID =2,
                             LSOLDA_dat=LSOLDA_dat3,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#e7298a")

LASSO_C4S2  <- reformat_LASSO(c_selectID=4, mp_selectID =2,
                             LSOLDA_dat=LSOLDA_dat4,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#00FFFF")

combined <- rbind(LASSO_C1S2,LASSO_C2S2,LASSO_C3S2, LASSO_C4S2 )
combined <- combined[is.na(combined$Value) != TRUE,]

nboots = 2
#links: source, target, value
#source: node, nodegroup
combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)],
                     Links=combined[,c((nboots+2):(nboots+1),ncol(combined))])

library(networkD3)

Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source)))
Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target)))
Node_all <-unique(c(Node_source, Node_target))

#assign IDs for Source (start from 0)
Source <-combined_D3obj$Links$Source
Target <- combined_D3obj$Links$Target

for(i in 1:length(Node_all)){
   Source[Source==Node_all[i]] <-i-1
   Target[Target==Node_all[i]] <-i-1
}
# 
combined_D3obj$Links$Source <- as.numeric(Source)
combined_D3obj$Links$Target <- as.numeric(Target)
combined_D3obj$Links$LinkColor <- combined$NodeGroup

#prepare node info
node_df <-data.frame(Node=Node_all)
node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1)))

suppressMessages(library(dplyr))
Color <- combined %>% count(Node, color=NodeGroup) %>% select(2)
node_df$color <- Color$color

suppressMessages(library(networkD3))
p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df,
                 Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor", 
                 NodeID="Node", Source="Source", Target="Target", fontSize = 22)
p1

#saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html'))

```

## 4.3 Plot the relationship between clusters in two samples
Here we look at one example use case to find relationship between clusters within one sample or between two sample
 
```{r prediction_two_samples,warning = FALSE, message = FALSE, fig.width=5}
#run prediction for 3 clusters
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
row_cluster <-length(unique(colData(mixedpop2)[,1]))

c_selectID <- 1
#top 200 gene markers distinguishing cluster 1
genes = DEgenes$id[1:200]
LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop1,
                        cluster_mixedpop2 = cluster_mixedpop2)


c_selectID <- 2
genes = DEgenes$id[1:200]
LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop1,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 3
genes = DEgenes$id[1:200]
LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop1,
                        cluster_mixedpop2 = cluster_mixedpop2)

#prepare table input for sankey plot

LASSO_C1S1  <- reformat_LASSO(c_selectID=1, mp_selectID = 1,
                             LSOLDA_dat=LSOLDA_dat1, nPredSubpop = row_cluster, 
                             Nodes_group = "#7570b3")

LASSO_C2S1  <- reformat_LASSO(c_selectID=2, mp_selectID = 1,
                             LSOLDA_dat=LSOLDA_dat2, nPredSubpop = row_cluster, 
                             Nodes_group = "#1b9e77")

LASSO_C3S1  <- reformat_LASSO(c_selectID=3, mp_selectID = 1,
                             LSOLDA_dat=LSOLDA_dat3, nPredSubpop = row_cluster, 
                             Nodes_group = "#e7298a")


combined <- rbind(LASSO_C1S1,LASSO_C2S1,LASSO_C3S1)

nboots = 2
#links: source, target, value
#source: node, nodegroup
combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)],
                     Links=combined[,c((nboots+2):(nboots+1),ncol(combined))])
combined <- combined[is.na(combined$Value) != TRUE,]


library(networkD3)

Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source)))
Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target)))
Node_all <-unique(c(Node_source, Node_target))

#assign IDs for Source (start from 0)
Source <-combined_D3obj$Links$Source
Target <- combined_D3obj$Links$Target

for(i in 1:length(Node_all)){
   Source[Source==Node_all[i]] <-i-1
   Target[Target==Node_all[i]] <-i-1
}

combined_D3obj$Links$Source <- as.numeric(Source)
combined_D3obj$Links$Target <- as.numeric(Target)
combined_D3obj$Links$LinkColor <- combined$NodeGroup

#prepare node info
node_df <-data.frame(Node=Node_all)
node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1)))

suppressMessages(library(dplyr))
n <- length(unique(node_df$Node))
getPalette = colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
Color = getPalette(n)
node_df$color <- Color
suppressMessages(library(networkD3))
p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df,
                 Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor",
                 NodeID="Node", Source="Source", Target="Target", fontSize = 22)
p1
#saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html'))
```


```{r session_info, include=TRUE, echo=TRUE, results='markup'}
devtools::session_info()
```