## ----setup, include=FALSE--------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## --------------------------------------------------------------------------
suppressPackageStartupMessages(library(HDCytoData))

# Load 'SummarizedExperiment' object using named function
Bodenmiller_BCR_XL_SE()

# Load 'flowSet' object using named function
Bodenmiller_BCR_XL_flowSet()

## --------------------------------------------------------------------------
# Create an ExperimentHub instance
ehub <- ExperimentHub()

# Query ExperimentHub instance to find data sets
query(ehub, "HDCytoData")

# Load 'SummarizedExperiment' object using index of data set
ehub[["EH1119"]]

# Load 'flowSet' object using index of data set
ehub[["EH1120"]]

## --------------------------------------------------------------------------
# Load data set in 'SummarizedExperiment' format
d_SE <- Bodenmiller_BCR_XL_SE()

# Inspect the object
d_SE
assay(d_SE)[1:6, 1:6]
rowData(d_SE)
colData(d_SE)

## --------------------------------------------------------------------------
# Apply transformation to marker columns
cofactor <- 5
cols <- colData(d_SE)$marker_class != "none"
assay(d_SE)[, cols] <- asinh(assay(d_SE)[, cols] / cofactor)

summary(assay(d_SE)[, 1:10])

## --------------------------------------------------------------------------
suppressPackageStartupMessages(library(FlowSOM))
suppressPackageStartupMessages(library(Rtsne))
suppressPackageStartupMessages(library(ggplot2))


# --------------
# Run clustering
# --------------

d_FlowSOM <- as.matrix(assay(d_SE)[, colData(d_SE)$marker_class == "type"])
d_FlowSOM <- flowFrame(d_FlowSOM)

set.seed(123)
out <- ReadInput(d_FlowSOM, transform = FALSE, scale = FALSE)
out <- BuildSOM(out, colsToUse = NULL)

labels_pre <- out$map$mapping[, 1]

# number of meta-clusters
k <- 20
out <- metaClustering_consensus(out$map$codes, k = k, seed = 123)

labels <- out[labels_pre]

# check meta-cluster labels
table(labels)


# ------------------------------
# t-SNE plot with cluster labels
# ------------------------------

d_Rtsne <- as.matrix(assay(d_SE)[, colData(d_SE)$marker_class == "type"])
colnames(d_Rtsne) <- colData(d_SE)$marker_name[colData(d_SE)$marker_class == "type"]

# subsampling
n_sub <- 1000
set.seed(123)
ix <- sample(seq_along(labels), n_sub)

d_Rtsne <- d_Rtsne[ix, ]
labels <- labels[ix]

# remove any duplicate rows (required by Rtsne)
dups <- duplicated(d_Rtsne)

d_Rtnse <- d_Rtsne[!dups, ]
labels <- labels[!dups]

# run Rtsne
set.seed(123)
out_Rtsne <- Rtsne(d_Rtsne, pca = FALSE, verbose = TRUE)

d_plot <- as.data.frame(out_Rtsne$Y)
colnames(d_plot) <- c("tSNE_1", "tSNE_2")
d_plot$cluster <- as.factor(labels)

# create plot
ggplot(d_plot, aes(x = tSNE_1, y = tSNE_2, color = cluster)) +
  geom_point() +
  ggtitle("t-SNE plot: clustering") +
  xlab("t-SNE dimension 1") +
  ylab("t-SNE dimension 2") +
  theme_classic()


# -------------------------------------------
# t-SNE plot with reference population labels
# -------------------------------------------

d_plot$population <- rowData(d_SE)$population_id[ix][!dups]

# create plot
ggplot(d_plot, aes(x = tSNE_1, y = tSNE_2, color = population)) +
  geom_point() +
  ggtitle("t-SNE plot: reference populations") +
  xlab("t-SNE dimension 1") +
  ylab("t-SNE dimension 2") +
  theme_classic()