HDCytoData 1.10.0
This vignette demonstrates several examples and use cases for the datasets in the HDCytoData
package.
Using the clustering datasets, we can generate dimension reduction plots with colors indicating the ground truth cell population labels. This provides a visual representation of the cell population structure in these datasets, which is useful during exploratory data analysis and for representing the output of clustering or other downstream analysis algorithms.
Below, we compare three different dimension reduction algorithms (principal component analysis [PCA], t-distributed stochastic neighbor embedding [tSNE], and uniform manifold approximation and projection [UMAP]), for one of the datasets (Levine_32dim
). This dataset contains ground truth cell population labels for 14 immune cell populations.
suppressPackageStartupMessages(library(HDCytoData))
suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(Rtsne))
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
# ---------
# Load data
# ---------
d_SE <- Levine_32dim_SE()
## snapshotDate(): 2020-10-02
## see ?HDCytoData and browseVignettes('HDCytoData') for documentation
## loading from cache
# -------------
# Preprocessing
# -------------
# select 'cell type' marker columns for defining clusters
d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"])
# extract cell population labels
population <- rowData(d_SE)$population_id
dim(d_sub)
## [1] 265627 32
stopifnot(nrow(d_sub) == length(population))
# transform data using asinh with cofactor 5
cofactor <- 5
d_sub <- asinh(d_sub / cofactor)
summary(d_sub)
## CD45RA CD133 CD19 CD22
## Min. :-0.05731 Min. :-0.05808 Min. :-0.05809 Min. :-0.05734
## 1st Qu.: 0.20463 1st Qu.:-0.02294 1st Qu.:-0.01884 1st Qu.:-0.02069
## Median : 0.54939 Median : 0.02535 Median : 0.07521 Median : 0.05879
## Mean : 0.68813 Mean : 0.14596 Mean : 0.50930 Mean : 0.39732
## 3rd Qu.: 1.03120 3rd Qu.: 0.22430 3rd Qu.: 0.54839 3rd Qu.: 0.38648
## Max. : 6.69120 Max. : 5.52750 Max. : 4.99008 Max. : 5.16048
## CD11b CD4 CD8 CD34
## Min. :-0.058236 Min. :-0.05775 Min. :-0.05800 Min. :-0.05801
## 1st Qu.:-0.000294 1st Qu.:-0.01259 1st Qu.:-0.01732 1st Qu.:-0.01117
## Median : 0.257923 Median : 0.13122 Median : 0.07363 Median : 0.11071
## Mean : 0.710319 Mean : 0.36760 Mean : 0.56522 Mean : 0.33989
## 3rd Qu.: 0.923517 3rd Qu.: 0.57812 3rd Qu.: 0.48642 3rd Qu.: 0.39281
## Max. : 5.260789 Max. : 6.58176 Max. : 4.69369 Max. : 5.14800
## Flt3 CD20 CXCR4 CD235ab
## Min. :-0.057884 Min. :-0.05813 Min. :-0.05704 Min. :-0.05761
## 1st Qu.:-0.007793 1st Qu.:-0.02207 1st Qu.: 0.25290 1st Qu.: 0.23100
## Median : 0.110317 Median : 0.03382 Median : 0.66539 Median : 0.54043
## Mean : 0.229768 Mean : 0.38441 Mean : 0.79247 Mean : 0.63189
## 3rd Qu.: 0.336117 3rd Qu.: 0.32551 3rd Qu.: 1.20168 3rd Qu.: 0.92358
## Max. : 7.117323 Max. : 6.05141 Max. : 5.69667 Max. : 6.64670
## CD45 CD123 CD321 CD14
## Min. :2.040 Min. :-0.05800 Min. :-0.05355 Min. :-0.057954
## 1st Qu.:5.116 1st Qu.:-0.01162 1st Qu.: 1.32346 1st Qu.:-0.026326
## Median :5.645 Median : 0.09602 Median : 1.90479 Median :-0.005379
## Mean :5.408 Mean : 0.37241 Mean : 1.93542 Mean : 0.077030
## 3rd Qu.:5.939 3rd Qu.: 0.41310 3rd Qu.: 2.51781 3rd Qu.: 0.089789
## Max. :7.238 Max. : 6.64063 Max. : 6.86739 Max. : 5.006121
## CD33 CD47 CD11c CD7
## Min. :-0.05808 Min. :-0.05509 Min. :-0.058053 Min. :-0.05816
## 1st Qu.:-0.01813 1st Qu.: 2.08788 1st Qu.:-0.002711 1st Qu.:-0.01567
## Median : 0.06107 Median : 2.71442 Median : 0.212063 Median : 0.13002
## Mean : 0.30792 Mean : 2.65608 Mean : 0.703504 Mean : 0.81384
## 3rd Qu.: 0.34147 3rd Qu.: 3.27654 3rd Qu.: 0.861448 3rd Qu.: 1.37083
## Max. : 5.61247 Max. : 6.40249 Max. : 6.520939 Max. : 6.31922
## CD15 CD16 CD44 CD38
## Min. :-0.05808 Min. :-0.05778 Min. :0.02606 Min. :-0.05719
## 1st Qu.:-0.01502 1st Qu.:-0.02255 1st Qu.:3.12712 1st Qu.: 0.40198
## Median : 0.09355 Median : 0.01424 Median :3.87967 Median : 1.02032
## Mean : 0.23136 Mean : 0.16123 Mean :3.76018 Mean : 1.47781
## 3rd Qu.: 0.38331 3rd Qu.: 0.16077 3rd Qu.:4.47392 3rd Qu.: 2.19146
## Max. : 1.53415 Max. : 5.33831 Max. :7.40456 Max. : 7.29308
## CD13 CD3 CD61 CD117
## Min. :-0.05773 Min. :-0.05824 Min. :-0.05764 Min. :-0.05767
## 1st Qu.: 0.02110 1st Qu.: 0.08495 1st Qu.:-0.01285 1st Qu.:-0.02396
## Median : 0.18706 Median : 0.60376 Median : 0.09569 Median :-0.00041
## Mean : 0.36856 Mean : 2.16576 Mean : 0.34446 Mean : 0.13120
## 3rd Qu.: 0.53550 3rd Qu.: 4.66522 3rd Qu.: 0.41579 3rd Qu.: 0.15474
## Max. : 6.98119 Max. : 6.74836 Max. : 7.74850 Max. : 5.50213
## CD49d HLA-DR CD64 CD41
## Min. :-0.05806 Min. :-0.05797 Min. :-0.05820 Min. :-0.05824
## 1st Qu.: 0.28301 1st Qu.: 0.05771 1st Qu.:-0.01058 1st Qu.:-0.02017
## Median : 0.67721 Median : 0.61133 Median : 0.12249 Median : 0.05223
## Mean : 0.79494 Mean : 1.52181 Mean : 0.55151 Mean : 0.26175
## 3rd Qu.: 1.19079 3rd Qu.: 2.88824 3rd Qu.: 0.60413 3rd Qu.: 0.30559
## Max. : 5.15344 Max. : 7.05251 Max. : 4.51784 Max. : 7.71829
# subsample cells for faster runtimes in vignette
n <- 2000
set.seed(123)
ix <- sample(seq_len(nrow(d_sub)), n)
d_sub <- d_sub[ix, ]
population <- population[ix]
dim(d_sub)
## [1] 2000 32
stopifnot(nrow(d_sub) == length(population))
# remove any near-duplicate rows (required by Rtsne)
dups <- duplicated(d_sub)
d_sub <- d_sub[!dups, ]
population <- population[!dups]
dim(d_sub)
## [1] 1998 32
stopifnot(nrow(d_sub) == length(population))
# ------------------------
# Dimension reduction: PCA
# ------------------------
n_dims <- 2
# run PCA
# (note: no scaling, since asinh-transformed dimensions are already comparable)
out_PCA <- prcomp(d_sub, center = TRUE, scale. = FALSE)
dims_PCA <- out_PCA$x[, seq_len(n_dims)]
colnames(dims_PCA) <- c("PC_1", "PC_2")
head(dims_PCA)
## PC_1 PC_2
## [1,] 1.450702 3.1573053
## [2,] 2.453109 -0.9381139
## [3,] -2.705226 0.7090551
## [4,] 2.718284 -2.2801305
## [5,] -2.714230 -0.1954170
## [6,] -3.003650 -0.1938087
stopifnot(nrow(dims_PCA) == length(population))
colnames(dims_PCA) <- c("dimension_x", "dimension_y")
dims_PCA <- cbind(as.data.frame(dims_PCA), population, type = "PCA")
head(dims_PCA)
## dimension_x dimension_y population type
## 1 1.450702 3.1573053 unassigned PCA
## 2 2.453109 -0.9381139 unassigned PCA
## 3 -2.705226 0.7090551 unassigned PCA
## 4 2.718284 -2.2801305 unassigned PCA
## 5 -2.714230 -0.1954170 unassigned PCA
## 6 -3.003650 -0.1938087 unassigned PCA
str(dims_PCA)
## 'data.frame': 1998 obs. of 4 variables:
## $ dimension_x: num 1.45 2.45 -2.71 2.72 -2.71 ...
## $ dimension_y: num 3.157 -0.938 0.709 -2.28 -0.195 ...
## $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
## $ type : chr "PCA" "PCA" "PCA" "PCA" ...
# generate plot
d_plot <- dims_PCA
str(d_plot)
## 'data.frame': 1998 obs. of 4 variables:
## $ dimension_x: num 1.45 2.45 -2.71 2.72 -2.71 ...
## $ dimension_y: num 3.157 -0.938 0.709 -2.28 -0.195 ...
## $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
## $ type : chr "PCA" "PCA" "PCA" "PCA" ...
colors <- c(rainbow(14), "gray75")
ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) +
facet_wrap(~ type, scales = "free") +
geom_point(size = 0.7, alpha = 0.5) +
scale_color_manual(values = colors) +
labs(x = "dimension x", y = "dimension y") +
theme_bw() +
theme(aspect.ratio = 1,
legend.key.height = unit(4, "mm"))
# -------------------------
# Dimension reduction: tSNE
# -------------------------
# run Rtsne
set.seed(123)
out_Rtsne <- Rtsne(as.matrix(d_sub), dims = n_dims)
dims_Rtsne <- out_Rtsne$Y
colnames(dims_Rtsne) <- c("tSNE_1", "tSNE_2")
head(dims_Rtsne)
## tSNE_1 tSNE_2
## [1,] 21.317627 -9.102072
## [2,] -6.457918 21.639648
## [3,] -4.640114 -21.309045
## [4,] -5.916015 26.265756
## [5,] -18.297775 -15.155949
## [6,] -5.113306 -13.962952
stopifnot(nrow(dims_Rtsne) == length(population))
colnames(dims_Rtsne) <- c("dimension_x", "dimension_y")
dims_Rtsne <- cbind(as.data.frame(dims_Rtsne), population, type = "tSNE")
head(dims_Rtsne)
## dimension_x dimension_y population type
## 1 21.317627 -9.102072 unassigned tSNE
## 2 -6.457918 21.639648 unassigned tSNE
## 3 -4.640114 -21.309045 unassigned tSNE
## 4 -5.916015 26.265756 unassigned tSNE
## 5 -18.297775 -15.155949 unassigned tSNE
## 6 -5.113306 -13.962952 unassigned tSNE
str(dims_Rtsne)
## 'data.frame': 1998 obs. of 4 variables:
## $ dimension_x: num 21.32 -6.46 -4.64 -5.92 -18.3 ...
## $ dimension_y: num -9.1 21.6 -21.3 26.3 -15.2 ...
## $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
## $ type : chr "tSNE" "tSNE" "tSNE" "tSNE" ...
# generate plot
d_plot <- dims_Rtsne
ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) +
facet_wrap(~ type, scales = "free") +
geom_point(size = 0.7, alpha = 0.5) +
scale_color_manual(values = colors) +
labs(x = "dimension x", y = "dimension y") +
theme_bw() +
theme(aspect.ratio = 1,
legend.key.height = unit(4, "mm"))
# -------------------------
# Dimension reduction: UMAP
# -------------------------
# run umap
set.seed(123)
out_umap <- umap(d_sub)
dims_umap <- out_umap$layout
colnames(dims_umap) <- c("UMAP_1", "UMAP_2")
head(dims_umap)
## UMAP_1 UMAP_2
## [1,] 7.4298928 3.545640
## [2,] -6.6991260 6.304541
## [3,] 0.9351728 -6.959218
## [4,] -6.2004201 6.935849
## [5,] -1.5881799 -7.895300
## [6,] -0.3589191 -6.045946
stopifnot(nrow(dims_umap) == length(population))
colnames(dims_umap) <- c("dimension_x", "dimension_y")
dims_umap <- cbind(as.data.frame(dims_umap), population, type = "UMAP")
head(dims_umap)
## dimension_x dimension_y population type
## 1 7.4298928 3.545640 unassigned UMAP
## 2 -6.6991260 6.304541 unassigned UMAP
## 3 0.9351728 -6.959218 unassigned UMAP
## 4 -6.2004201 6.935849 unassigned UMAP
## 5 -1.5881799 -7.895300 unassigned UMAP
## 6 -0.3589191 -6.045946 unassigned UMAP
str(dims_umap)
## 'data.frame': 1998 obs. of 4 variables:
## $ dimension_x: num 7.43 -6.699 0.935 -6.2 -1.588 ...
## $ dimension_y: num 3.55 6.3 -6.96 6.94 -7.9 ...
## $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
## $ type : chr "UMAP" "UMAP" "UMAP" "UMAP" ...
# generate plot
d_plot <- dims_umap
ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) +
facet_wrap(~ type, scales = "free") +
geom_point(size = 0.7, alpha = 0.5) +
scale_color_manual(values = colors) +
labs(x = "dimension x", y = "dimension y") +
theme_bw() +
theme(aspect.ratio = 1,
legend.key.height = unit(4, "mm"))
We can also use the clustering datasets to calculate a new clustering using an algorithm of our choice, and then evaluate the performance of the clustering by comparing against the ground truth cell population labels. Common metrics for evaluating clustering performance include the mean F1 score and adjusted Rand index.
Below, we use the FlowSOM clustering algorithm (Van Gassen et al. 2015) to calculate a new clustering on the Samusik_01
dataset, and then calculate clustering performance using the ground truth labels.
Note that for simplicity in this vignette, we only calculate the adjusted Rand index. This calculation does not take into account the number of cells per cluster, so could potentially be dominated by one or two large clusters. For more sophisticated evaluations based on the mean F1 score, where we (i) weight clusters equally (instead of weighting cells equally), and (ii) check for multi-mapping populations (i.e. prevent multiple clusters mapping to the same ground truth population), see the code in our GitHub repository from our previous publication (Weber and Robinson, 2016), available at https://github.com/lmweber/cytometry-clustering-comparison.
suppressPackageStartupMessages(library(HDCytoData))
suppressPackageStartupMessages(library(FlowSOM))
suppressPackageStartupMessages(library(flowCore))
suppressPackageStartupMessages(library(mclust))
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
# ---------
# Load data
# ---------
d_SE <- Samusik_01_SE()
## snapshotDate(): 2020-10-02
## see ?HDCytoData and browseVignettes('HDCytoData') for documentation
## loading from cache
dim(d_SE)
## [1] 86864 51
# -------------
# Preprocessing
# -------------
# select 'cell type' marker columns for defining clusters
d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"])
# extract cell population labels
population <- rowData(d_SE)$population_id
dim(d_sub)
## [1] 86864 39
stopifnot(nrow(d_sub) == length(population))
# transform data using asinh with cofactor 5
cofactor <- 5
d_sub <- asinh(d_sub / cofactor)
# create flowFrame object (required input format for FlowSOM)
d_FlowSOM <- flowFrame(d_sub)
# -----------
# Run FlowSOM
# -----------
# set seed for reproducibility
set.seed(123)
# run FlowSOM (initial steps prior to meta-clustering)
out <- ReadInput(d_FlowSOM, transform = FALSE, scale = FALSE)
out <- BuildSOM(out)
## Building SOM
## Mapping data to SOM
out <- BuildMST(out)
## Building MST
# optional FlowSOM visualization
#PlotStars(out)
# extract cluster labels (pre meta-clustering) from output object
labels_pre <- out$map$mapping[, 1]
# specify final number of clusters for meta-clustering
k <- 40
# run meta-clustering
seed <- 123
out <- metaClustering_consensus(out$map$codes, k = k, seed = seed)
# extract cluster labels from output object
labels <- out[labels_pre]
# summary of cluster sizes and number of clusters
table(labels)
## labels
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 1257 15597 20715 387 5248 3912 287 1499 1035 497 1984 292 322
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## 620 469 909 303 105 369 555 542 1603 260 341 698 9767
## 27 28 29 30 31 32 33 34 35 36 37 38 39
## 477 5913 757 815 231 721 2876 595 293 1469 434 787 739
## 40
## 1184
length(table(labels))
## [1] 40
# -------------------------------
# Evaluate clustering performance
# -------------------------------
# calculate adjusted Rand index
# note: this calculation weights all cells equally, which may not be
# appropriate for some datasets (see above)
stopifnot(nrow(d_sub) == length(labels))
stopifnot(length(population) == length(labels))
# remove "unassigned" cells from cluster evaluation (but note these were
# included for clustering)
ix_unassigned <- population == "unassigned"
d_sub_eval <- d_sub[!ix_unassigned, ]
population_eval <- population[!ix_unassigned]
labels_eval <- labels[!ix_unassigned]
stopifnot(nrow(d_sub_eval) == length(labels_eval))
stopifnot(length(population_eval) == length(labels_eval))
# calculate adjusted Rand index
adjustedRandIndex(population_eval, labels_eval)
## [1] 0.8935566
# ------------
# Plot results
# ------------
# subsample cells for faster runtimes in vignette
n <- 4000
set.seed(1004)
ix <- sample(seq_len(nrow(d_sub)), n)
d_sub <- d_sub[ix, ]
population <- population[ix]
labels <- labels[ix]
dim(d_sub)
## [1] 4000 39
stopifnot(nrow(d_sub) == length(population))
stopifnot(nrow(population) == length(labels))
# run umap
set.seed(1234)
out_umap <- umap(d_sub)
dims_umap <- out_umap$layout
colnames(dims_umap) <- c("UMAP_1", "UMAP_2")
stopifnot(nrow(dims_umap) == length(population))
stopifnot(nrow(population) == length(labels))
d_plot <- cbind(
as.data.frame(dims_umap),
population,
labels = as.factor(labels),
type = "UMAP"
)
# generate plots
colors <- c(rainbow(24), "gray75")
ggplot(d_plot, aes(x = UMAP_1, y = UMAP_2, color = population)) +
geom_point(size = 0.7, alpha = 0.5) +
scale_color_manual(values = colors) +
ggtitle("Ground truth population labels") +
theme_bw() +
theme(aspect.ratio = 1,
legend.key.height = unit(4, "mm"))
ggplot(d_plot, aes(x = UMAP_1, y = UMAP_2, color = labels)) +
geom_point(size = 0.7, alpha = 0.5) +
ggtitle("FlowSOM cluster labels") +
theme_bw() +
theme(aspect.ratio = 1,
legend.key.height = unit(4, "mm"))
In this section, we use the semi-simulated differential analysis datasets (Weber_AML_sim
and Weber_BCR_XL_sim
) to demonstrate how to perform differential analyses using the diffcyt
package (Weber et al. 2019).
For more more details on how to use the diffcyt
package, see the Bioconductor vignette.
For a complete workflow for performing differential discovery analyses in high-dimensional cytometry data, including exploratory analyses, differential testing, and visualizations, see Nowicka et al. (2017, 2019) (also available as a Bioconductor workflow package).
We perform two sets of differential analyses: testing for differential abundance (DA) of cell populations (using the Weber_AML_sim
dataset), and testing for differential states (DS) within cell populations (using the Weber_BCR_XL_sim
dataset). In both cases, clusters are defined using “cell type” markers, while for DS testing we also use additional “cell state” markers to test for differential expression within clusters. See our paper introducing the diffcyt
framework (Weber et al. 2019) for more details. For extended evaluations showing how to calculate performance using the ground truth labels (spike-in cells), see the code in our GitHub repository accompanying our previous publication (Weber et al. 2019), available at https://github.com/lmweber/diffcyt-evaluations.
suppressPackageStartupMessages(library(diffcyt))
suppressPackageStartupMessages(library(SummarizedExperiment))
# ---------
# Load data
# ---------
d_SE <- Weber_AML_sim_main_5pc_SE()
## snapshotDate(): 2020-10-02
## see ?HDCytoData and browseVignettes('HDCytoData') for documentation
## loading from cache
# ---------------
# Set up metadata
# ---------------
# set column names
colnames(d_SE) <- colData(d_SE)$marker_name
# split input data into one matrix per sample
d_input <- split(as.data.frame(assay(d_SE)), rowData(d_SE)$sample_id)
# extract sample information
experiment_info <- metadata(d_SE)$experiment_info
experiment_info
## group_id patient_id sample_id
## 1 healthy H1 healthy_H1
## 2 healthy H2 healthy_H2
## 3 healthy H3 healthy_H3
## 4 healthy H4 healthy_H4
## 5 healthy H5 healthy_H5
## 6 CN H1 CN_H1
## 7 CN H2 CN_H2
## 8 CN H3 CN_H3
## 9 CN H4 CN_H4
## 10 CN H5 CN_H5
## 11 CBF H1 CBF_H1
## 12 CBF H2 CBF_H2
## 13 CBF H3 CBF_H3
## 14 CBF H4 CBF_H4
## 15 CBF H5 CBF_H5
# extract marker information
marker_info <- colData(d_SE)
marker_info
## DataFrame with 45 rows and 3 columns
## channel_name marker_name marker_class
## <character> <character> <factor>
## Time Time Time none
## Cell_length Cell_length Cell_length none
## DNA1 DNA1(Ir191)Di DNA1 none
## DNA2 DNA2(Ir193)Di DNA2 none
## BC1 BC1(Pd104)Di BC1 none
## ... ... ... ...
## CD41 CD41(Lu175)Di CD41 type
## Viability Viability(Pt195)Di Viability none
## file_number file_number file_number none
## event_number event_number event_number none
## barcode barcode barcode none
# -----------------------------------
# Differential abundance (DA) testing
# -----------------------------------
# create design matrix
design <- createDesignMatrix(
experiment_info, cols_design = c("group_id", "patient_id")
)
design
## (Intercept) group_idCN group_idCBF patient_idH2 patient_idH3 patient_idH4
## 1 1 0 0 0 0 0
## 2 1 0 0 1 0 0
## 3 1 0 0 0 1 0
## 4 1 0 0 0 0 1
## 5 1 0 0 0 0 0
## 6 1 1 0 0 0 0
## 7 1 1 0 1 0 0
## 8 1 1 0 0 1 0
## 9 1 1 0 0 0 1
## 10 1 1 0 0 0 0
## 11 1 0 1 0 0 0
## 12 1 0 1 1 0 0
## 13 1 0 1 0 1 0
## 14 1 0 1 0 0 1
## 15 1 0 1 0 0 0
## patient_idH5
## 1 0
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
## 7 0
## 8 0
## 9 0
## 10 1
## 11 0
## 12 0
## 13 0
## 14 0
## 15 1
## attr(,"assign")
## [1] 0 1 1 2 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$group_id
## [1] "contr.treatment"
##
## attr(,"contrasts")$patient_id
## [1] "contr.treatment"
# create contrast matrix
# note: testing condition CN vs. healthy
contrast <- createContrast(c(0, 1, 0, 0, 0, 0, 0))
contrast
## [,1]
## [1,] 0
## [2,] 1
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
# test for differential abundance (DA) of clusters
out_DA <- diffcyt(
d_input,
experiment_info,
marker_info,
design = design,
contrast = contrast,
analysis_type = "DA",
seed_clustering = 1234
)
## preparing data...
## transforming data...
## generating clusters...
## FlowSOM clustering completed in 7.4 seconds
## calculating features...
## calculating DA tests using method 'diffcyt-DA-edgeR'...
# display results for top DA clusters
topTable(out_DA, format_vals = TRUE)
## DataFrame with 20 rows and 3 columns
## cluster_id p_val p_adj
## <factor> <numeric> <numeric>
## 2 2 7.87e-268 7.80e-266
## 97 97 9.17e-04 4.54e-02
## 84 84 3.87e-03 1.28e-01
## 25 25 8.31e-03 2.06e-01
## 17 17 9.94e-02 6.51e-01
## ... ... ... ...
## 98 98 0.079 0.651
## 14 14 0.130 0.759
## 7 7 0.237 0.904
## 28 28 0.236 0.904
## 32 32 0.205 0.904
# ---------
# Load data
# ---------
d_SE <- Weber_BCR_XL_sim_main_SE()
## snapshotDate(): 2020-10-02
## see ?HDCytoData and browseVignettes('HDCytoData') for documentation
## loading from cache
# ---------------
# Set up metadata
# ---------------
# set column names
colnames(d_SE) <- colData(d_SE)$marker_name
# split input data into one matrix per sample
d_input <- split(as.data.frame(assay(d_SE)), rowData(d_SE)$sample_id)
# extract sample information
experiment_info <- metadata(d_SE)$experiment_info
experiment_info
## group_id patient_id sample_id
## 1 base patient1 patient1_base
## 2 base patient2 patient2_base
## 3 base patient3 patient3_base
## 4 base patient4 patient4_base
## 5 base patient5 patient5_base
## 6 base patient6 patient6_base
## 7 base patient7 patient7_base
## 8 base patient8 patient8_base
## 9 spike patient1 patient1_spike
## 10 spike patient2 patient2_spike
## 11 spike patient3 patient3_spike
## 12 spike patient4 patient4_spike
## 13 spike patient5 patient5_spike
## 14 spike patient6 patient6_spike
## 15 spike patient7 patient7_spike
## 16 spike patient8 patient8_spike
# extract marker information
marker_info <- colData(d_SE)
marker_info
## DataFrame with 35 rows and 3 columns
## channel_name marker_name marker_class
## <character> <character> <factor>
## Time Time Time none
## Cell_length Cell_length Cell_length none
## CD3 CD3(110:114)Dd CD3 type
## CD45 CD45(In115)Dd CD45 type
## BC1 BC1(La139)Dd BC1 none
## ... ... ... ...
## HLA-DR HLA-DR(Yb174)Dd HLA-DR type
## BC7 BC7(Lu175)Dd BC7 none
## CD7 CD7(Yb176)Dd CD7 type
## DNA-1 DNA-1(Ir191)Dd DNA-1 none
## DNA-2 DNA-2(Ir193)Dd DNA-2 none
# -------------------------------
# Differential state (DS) testing
# -------------------------------
# create design matrix
design <- createDesignMatrix(
experiment_info, cols_design = c("group_id", "patient_id")
)
design
## (Intercept) group_idspike patient_idpatient2 patient_idpatient3
## 1 1 0 0 0
## 2 1 0 1 0
## 3 1 0 0 1
## 4 1 0 0 0
## 5 1 0 0 0
## 6 1 0 0 0
## 7 1 0 0 0
## 8 1 0 0 0
## 9 1 1 0 0
## 10 1 1 1 0
## 11 1 1 0 1
## 12 1 1 0 0
## 13 1 1 0 0
## 14 1 1 0 0
## 15 1 1 0 0
## 16 1 1 0 0
## patient_idpatient4 patient_idpatient5 patient_idpatient6 patient_idpatient7
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 1 0 0 0
## 5 0 1 0 0
## 6 0 0 1 0
## 7 0 0 0 1
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
## 11 0 0 0 0
## 12 1 0 0 0
## 13 0 1 0 0
## 14 0 0 1 0
## 15 0 0 0 1
## 16 0 0 0 0
## patient_idpatient8
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 1
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 0
## 15 0
## 16 1
## attr(,"assign")
## [1] 0 1 2 2 2 2 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$group_id
## [1] "contr.treatment"
##
## attr(,"contrasts")$patient_id
## [1] "contr.treatment"
# create contrast matrix
# note: testing condition spike vs. base
contrast <- createContrast(c(0, 1, 0, 0, 0, 0, 0, 0, 0))
contrast
## [,1]
## [1,] 0
## [2,] 1
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
# test for differential abundance (DA) of clusters
out_DS <- diffcyt(
d_input,
experiment_info,
marker_info,
design = design,
contrast = contrast,
analysis_type = "DS",
seed_clustering = 1234
)
## preparing data...
## transforming data...
## generating clusters...
## FlowSOM clustering completed in 3.1 seconds
## calculating features...
## calculating DS tests using method 'diffcyt-DS-limma'...
## Warning: Partial NA coefficients for 28 probe(s)
# display results for top DA clusters
topTable(out_DS, format_vals = TRUE)
## DataFrame with 20 rows and 4 columns
## cluster_id marker_id p_val p_adj
## <factor> <factor> <numeric> <numeric>
## 89 89 pS6 7.08e-15 9.91e-12
## 90 90 pS6 5.73e-13 4.01e-10
## 80 80 pS6 8.35e-12 3.90e-09
## 80 80 pPlcg2 3.99e-11 1.40e-08
## 99 99 pS6 1.14e-10 3.19e-08
## ... ... ... ... ...
## 80 80 pAkt 1.85e-06 0.000161
## 80 80 pNFkB 2.00e-06 0.000164
## 99 99 pErk 2.54e-06 0.000198
## 89 89 pErk 3.39e-06 0.000250
## 89 89 pAkt 5.36e-06 0.000375