## ----setup, include = FALSE, fig.show='hide'----------------------------------
knitr::knit_hooks$set(pngquant = knitr::hook_pngquant)

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  dev = "ragg_png",
  dpi = 72,
  fig.retina = 2,
  fig.align = "center",
  out.width = "100%",
  pngquant = "--speed=1 --quality=1-5"
)

## ----eval = FALSE, fig.show='hide'--------------------------------------------
#  BiocManager::install("ccb-hms/scDiagnostics")

## ----eval=FALSE, fig.show='hide'----------------------------------------------
#  BiocManager::install("ccb-hms/scDiagnostics",
#                       build_vignettes = TRUE,
#                       dependencies = TRUE)

## ----message=FALSE, fig.show='hide'-------------------------------------------
library(scDiagnostics)

## ----message=FALSE, fig.show='hide'-------------------------------------------
# Load datasets
data("reference_data")
data("query_data")
data("qc_data")

# Set seed for reproducibility
set.seed(0)

## ----message=FALSE, fig.show='hide'-------------------------------------------
# Load library
library(scran)
library(scater)

# Subset to CD4 cells
ref_data_cd4 <- reference_data[, which(
    reference_data$expert_annotation == "CD4")]
query_data_cd4 <- query_data_cd4 <- query_data[, which(
    query_data$expert_annotation == "CD4")]

# Select highly variable genes
ref_top_genes <- getTopHVGs(ref_data_cd4, n = 500)
query_top_genes <- getTopHVGs(query_data_cd4, n = 500)
common_genes <- intersect(ref_top_genes, query_top_genes)

# Subset data by common genes
ref_data_cd4 <- ref_data_cd4[common_genes,]
query_data_cd4 <- query_data_cd4[common_genes,]

# Run PCA on both datasets
ref_data_cd4 <- runPCA(ref_data_cd4)
query_data_cd4 <- runPCA(query_data_cd4)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Plot PCA data
pc_plot <- plotCellTypePCA(
    query_data = query_data, 
    reference_data = reference_data,
    cell_types = c("CD4", "CD8", "B_and_plasma", "Myeloid"),
    query_cell_type_col = "expert_annotation", 
    ref_cell_type_col = "expert_annotation",
    pc_subset = 1:3
)
# Display plot
pc_plot

## ----fig.show='hide'----------------------------------------------------------
disc_output <- calculateDiscriminantSpace(
    reference_data = reference_data,
    query_data = query_data, 
    ref_cell_type_col = "expert_annotation",
    query_cell_type_col = "SingleR_annotation"
)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
plot(disc_output, plot_type = "scatterplot")

## ----fig.height=5, fig.width=10, fig.show='hide', fig.show='hide'-------------
plotMarkerExpression(reference_data = reference_data, 
                     query_data = query_data, 
                     ref_cell_type_col = "expert_annotation", 
                     query_cell_type_col = "SingleR_annotation", 
                     gene_name = "VPREB3", 
                     cell_type = "B_and_plasma")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Generate scatter plot
library(ggplot2)
p1 <- plotQCvsAnnotation(se_object = qc_data,
                         cell_type_col = "SingleR_annotation",
                         qc_col = "total",
                         score_col = "annotation_scores")
p1 + xlab("Library Size")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Compare PCA subspaces between query and reference data
subspace_comparison <- comparePCASubspace(
    query_data = query_data_cd4,
    reference_data = ref_data_cd4, 
    query_cell_type_col = "expert_annotation", 
    ref_cell_type_col = "expert_annotation", 
    pc_subset = 1:5
)

# View weighted cosine similarity score
subspace_comparison$weighted_cosine_similarity

# Plot output for PCA subspace comparison (if a plot method is available)
plot(subspace_comparison)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Example usage of the function
plotPairwiseDistancesDensity(query_data = query_data, 
                             reference_data = reference_data, 
                             query_cell_type_col = "expert_annotation", 
                             ref_cell_type_col = "expert_annotation", 
                             cell_type_query = "CD8", 
                             cell_type_ref = "CD8", 
                             pc_subset = 1:10,
                             distance_metric = "correlation",
                             correlation_method = "pearson")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Generate the Wasserstein distance density plot
wasserstein_data <- calculateWassersteinDistance(
    query_data = query_data_cd4,
    reference_data = ref_data_cd4, 
    query_cell_type_col = "expert_annotation", 
    ref_cell_type_col = "expert_annotation", 
    pc_subset = 1:10,
)
plot(wasserstein_data)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# RF function to compare (between datasets) which genes are best at differentiating cell types
rf_output <- calculateVarImpOverlap(reference_data = reference_data, 
                                    query_data = query_data, 
                                    query_cell_type_col = "SingleR_annotation", 
                                    ref_cell_type_col = "expert_annotation", 
                                    n_tree = 500,
                                    n_top = 50)

# Comparison table
rf_output$var_imp_comparison

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Compute pairwise correlations between specified cell types
cor_matrix_avg <- calculateAveragePairwiseCorrelation(
  query_data = query_data, 
  reference_data = reference_data, 
  query_cell_type_col = "SingleR_annotation", 
  ref_cell_type_col = "expert_annotation", 
  cell_types = c("CD4", "CD8", "B_and_plasma"), 
  pc_subset = 1:5,
  correlation_method = "spearman"
)

# Visualize the average pairwise correlation matrix
plot(cor_matrix_avg)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Perform anomaly detection
anomaly_output <- detectAnomaly(reference_data = reference_data, 
                                query_data = query_data, 
                                ref_cell_type_col = "expert_annotation", 
                                query_cell_type_col = "SingleR_annotation",
                                pc_subset = 1:5)
# Plot the results for a specific cell type
plot(anomaly_output, 
     cell_type = "CD4", 
     data_type = "query",
     pc_subset = 1:5)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Identify outliers for CD4
cd4_anomalies <- detectAnomaly(reference_data = reference_data, 
                               query_data = query_data, 
                               query_cell_type_col = "SingleR_annotation", 
                               ref_cell_type_col = "expert_annotation")
cd4_top6_anomalies <- names(sort(cd4_anomalies$CD4$query_anomaly_scores, 
                                 decreasing = TRUE)[1:6])

# Plot the PC data
distance_data <- calculateCellDistances(
    query_data = query_data, 
    reference_data = reference_data, 
    query_cell_type_col = "SingleR_annotation", 
    ref_cell_type_col = "expert_annotation"
) 

# Plot the densities of the distances
plot(distance_data, ref_cell_type = "CD4", cell_names = cd4_top6_anomalies)

## ----SessionInfo, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.show='hide'----
options(width = 80) 
sessionInfo()