## ----setup, include = FALSE, fig.show='hide'----------------------------------
knitr::knit_hooks$set(pngquant = knitr::hook_pngquant)

knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
dev = "ragg_png",
dpi = 72,
fig.retina = 2,
fig.align = "center",
out.width = "100%",
pngquant = "--speed=1 --quality=1-5"
)

## ----message=FALSE, fig.show='hide'-------------------------------------------
# Load library
library(scDiagnostics)

# Load datasets
data("reference_data")
data("query_data")

# Set seed for reproducibility
set.seed(0)

## ----message=FALSE, fig.show='hide'-------------------------------------------
# Load library
library(scran)
library(scater)

# Subset to CD4 cells
ref_data_cd4 <- reference_data[, which(
    reference_data$expert_annotation == "CD4")]
query_data_cd4 <- query_data_cd4 <- query_data[, which(
    query_data$expert_annotation == "CD4")]

# Select highly variable genes
ref_top_genes <- getTopHVGs(ref_data_cd4, n = 500)
query_top_genes <- getTopHVGs(query_data_cd4, n = 500)
common_genes <- intersect(ref_top_genes, query_top_genes)

# Subset data by common genes
ref_data_cd4 <- ref_data_cd4[common_genes,]
query_data_cd4 <- query_data_cd4[common_genes,]

# Run PCA on both datasets
ref_data_cd4 <- runPCA(ref_data_cd4)
query_data_cd4 <- runPCA(query_data_cd4)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Perform CCA 
cca_comparison <- compareCCA(query_data = query_data_cd4, 
                             reference_data = ref_data_cd4, 
                             query_cell_type_col = "expert_annotation", 
                             ref_cell_type_col = "expert_annotation", 
                             pc_subset = 1:5)
plot(cca_comparison)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Perform PCA 
pca_comparison <- comparePCA(query_data = query_data_cd4, 
                             reference_data = ref_data_cd4, 
                             query_cell_type_col = "expert_annotation", 
                             ref_cell_type_col = "expert_annotation", 
                             pc_subset = 1:5, 
                             metric = "cosine")
plot(pca_comparison)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Compare PCA subspaces between query and reference data
subspace_comparison <- comparePCASubspace(
    query_data = query_data_cd4,
    reference_data = ref_data_cd4, 
    query_cell_type_col = "expert_annotation", 
    ref_cell_type_col = "expert_annotation", 
    pc_subset = 1:5
)

# View weighted cosine similarity score
subspace_comparison$weighted_cosine_similarity

# Plot output for PCA subspace comparison (if a plot method is available)
plot(subspace_comparison)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Example usage of the function
plotPairwiseDistancesDensity(query_data = query_data, 
                             reference_data = reference_data, 
                             query_cell_type_col = "expert_annotation", 
                             ref_cell_type_col = "expert_annotation", 
                             cell_type_query = "CD4", 
                             cell_type_ref = "CD4", 
                             pc_subset = 1:5,
                             distance_metric = "euclidean")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Generate the Wasserstein distance density plot
wasserstein_data <- calculateWassersteinDistance(
    query_data = query_data_cd4,
    reference_data = ref_data_cd4, 
    query_cell_type_col = "expert_annotation", 
    ref_cell_type_col = "expert_annotation", 
    pc_subset = 1:10,
)
plot(wasserstein_data)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Perform Cramer test for the specified cell types and principal components
cramer_test <- calculateCramerPValue(
    reference_data = reference_data,
    query_data = query_data,
    ref_cell_type_col = "expert_annotation", 
    query_cell_type_col = "SingleR_annotation",
    cell_types = c("CD4", "CD8", "B_and_plasma", "Myeloid"),
    pc_subset = 1:5
)

# Display the Cramer test p-values
print(cramer_test)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Calculate Hotelling's T-squared test p-values
p_values <- calculateHotellingPValue(
    query_data = query_data, 
    reference_data = reference_data, 
    query_cell_type_col = "SingleR_annotation", 
    ref_cell_type_col = "expert_annotation",
    pc_subset = 1:10
)

# Display the p-values rounded to five decimal places
print(round(p_values, 5))

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Compute pairwise correlations between specified cell types
cor_matrix_avg <- calculateAveragePairwiseCorrelation(
  query_data = query_data, 
  reference_data = reference_data, 
  query_cell_type_col = "SingleR_annotation", 
  ref_cell_type_col = "expert_annotation", 
  cell_types = c("CD4", "CD8", "B_and_plasma"), 
  pc_subset = 1:5,
  correlation_method = "spearman"
)

# Visualize the average pairwise correlation matrix
plot(cor_matrix_avg)

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Perform regression analysis using only reference data
regress_res <- regressPC(
    reference_data = reference_data,
    ref_cell_type_col = "expert_annotation",
    cell_types = c("CD4", "CD8", "B_and_plasma", "Myeloid"),
    pc_subset = 1:15
)

# Plot results showing R-squared values
plot(regress_res, plot_type = "r_squared")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Plot results showing p-values
plot(regress_res, plot_type = "p-value")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Perform regression analysis using both reference and query data
regress_res <- regressPC(
    reference_data = reference_data,
    query_data = query_data,
    ref_cell_type_col = "expert_annotation",
    query_cell_type_col = "SingleR_annotation",
    cell_types = c("CD4", "CD8", "B_and_plasma", "Myeloid"),
    pc_subset = 1:15
)

# Plot results showing R-squared values
plot(regress_res, plot_type = "r_squared")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Plot results showing p-values
plot(regress_res, plot_type = "p-value")

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# Load library to get top HVGs
library(scran)

# Select the top 500 highly variable genes from both datasets
ref_var_genes <- getTopHVGs(reference_data, n = 500)
query_var_genes <- getTopHVGs(query_data, n = 500)

# Calculate the overlap coefficient between the reference and query HVGs
overlap_coefficient <- calculateHVGOverlap(reference_genes = ref_var_genes, 
                                           query_genes = query_var_genes)

# Display the overlap coefficient
overlap_coefficient

## ----fig.height=5, fig.width=10, fig.show='hide'------------------------------
# RF function to compare which genes are best at differentiating cell types
rf_output <- calculateVarImpOverlap(reference_data = reference_data, 
                                    query_data = query_data, 
                                    query_cell_type_col = "expert_annotation", 
                                    ref_cell_type_col = "expert_annotation", 
                                    n_tree = 500,
                                    n_top = 50)

# Comparison table
rf_output$var_imp_comparison

## ----SessionInfo, echo=FALSE, message=FALSE, warning=FALSE, comment=NA, fig.show='hide'----
options(width = 80) 
sessionInfo()