## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, warning = FALSE, message = FALSE, comment = "#>" ) options( rmarkdown.html_vignette.check_title = FALSE ) ## ----setup-------------------------------------------------------------------- library(tidytof) library(dplyr) library(ggplot2) library(forcats) ## ----eval = FALSE------------------------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # # BiocManager::install("HDCytoData") ## ----message = FALSE, warning = FALSE----------------------------------------- levine <- HDCytoData::Levine_32dim_flowSet() |> as_tof_tbl() |> # a bit of data cleaning dplyr::mutate(population_id = as.character(population_id)) |> dplyr::rename_with( .fn = \(x) stringr::str_to_lower(stringr::str_remove(x, "\\|.+")) ) |> dplyr::mutate(dplyr::across(c(file_number, population_id), as.character)) |> # arcsinh transformation tof_preprocess( channel_cols = c(-time, -cell_length, -event_number, -file_number, -population_id) ) ## ----------------------------------------------------------------------------- # convert 5 counts to asinh value with a cofactor of 5 threshold <- asinh(5 / 5) levine |> tof_assess_channels( negative_threshold = threshold, negative_proportion_flag = 0.975 ) ## ----------------------------------------------------------------------------- levine |> tof_plot_cells_density(marker_col = cd14) ## ----------------------------------------------------------------------------- levine |> tof_assess_flow_rate( time_col = time, num_timesteps = 200, # flag timepoints in which flow rates are high or low at a signicance level # of p = 0.01 alpha_threshold = 0.01, # plot the number of cells in each timestep, and whether or not the # rates were flagged as too high or too low visualize = TRUE ) ## ----------------------------------------------------------------------------- levine |> tof_assess_flow_rate( time_col = time, # analyze two files in the levine dataset separately group_cols = file_number, alpha_threshold = 0.01, visualize = TRUE ) ## ----------------------------------------------------------------------------- set.seed(2020L) # simulate large population of cells that truly belong in their assigned cluster sim_data_base <- dplyr::tibble( cd45 = c(rnorm(n = 600), rnorm(n = 500, mean = -4)), cd38 = c( rnorm(n = 100, sd = 0.5), rnorm(n = 500, mean = -3), rnorm(n = 500, mean = 8) ), cd34 = c( rnorm(n = 100, sd = 0.2, mean = -10), rnorm(n = 500, mean = 4), rnorm(n = 500, mean = 60) ), cd19 = c(rnorm(n = 100, sd = 0.3, mean = 10), rnorm(n = 1000)), cluster_id = c(rep("a", 100), rep("b", 500), rep("c", 500)), dataset = "non-outlier" ) # simulate outlier cells that do not belong in their assigned cluster sim_data_outlier <- dplyr::tibble( cd45 = c(rnorm(n = 10), rnorm(50, mean = 3), rnorm(n = 50, mean = -12)), cd38 = c( rnorm(n = 10, sd = 0.5), rnorm(n = 50, mean = -10), rnorm(n = 50, mean = 10) ), cd34 = c( rnorm(n = 10, sd = 0.2, mean = -15), rnorm(n = 50, mean = 15), rnorm(n = 50, mean = 70) ), cd19 = c(rnorm(n = 10, sd = 0.3, mean = 19), rnorm(n = 100)), cluster_id = c(rep("a", 10), rep("b", 50), rep("c", 50)), dataset = "outlier" ) # bind simulated data together sim_data <- bind_rows(sim_data_base, sim_data_outlier) ## ----------------------------------------------------------------------------- sim_data |> tof_plot_cells_embedding(color_col = cluster_id) ## ----------------------------------------------------------------------------- sim_data |> tof_plot_cells_embedding(color_col = dataset) ## ----------------------------------------------------------------------------- sim_data |> tof_assess_clusters_distance( cluster_col = cluster_id, # flag cells with a mahalanobis distance z-score over 3 z_threshold = 3, augment = TRUE ) |> # visualize result as above dplyr::select(-dplyr::starts_with(".mahala"), -z_score) |> dplyr::mutate(flagged_cell = as.character(flagged_cell)) |> tof_plot_cells_embedding(color_col = flagged_cell) + scale_fill_manual(values = tof_generate_palette(num_colors = 2)) ## ----------------------------------------------------------------------------- sim_data <- dplyr::tibble( cd45 = c( rnorm(n = 1000, sd = 2), rnorm(n = 1000, mean = 2), rnorm(n = 1000, mean = -2) ), cd38 = c( rnorm(n = 1000, sd = 2), rnorm(n = 1000, mean = 2), rnorm(n = 1000, mean = -2) ), cd34 = c( rnorm(n = 1000, sd = 2), rnorm(n = 1000, mean = 2), rnorm(n = 1000, mean = -2) ), cd19 = c( rnorm(n = 1000, sd = 2), rnorm(n = 1000, mean = 2), rnorm(n = 1000, mean = -2) ), cluster_id = c(rep("a", 1000), rep("b", 1000), rep("c", 1000)) ) ## ----------------------------------------------------------------------------- sim_data |> tof_reduce_dimensions(method = "pca") |> tof_plot_cells_embedding( embedding_cols = c(.pc1, .pc2), color_col = cluster_id ) ## ----------------------------------------------------------------------------- set.seed(17L) entropy_result <- sim_data |> # cluster into 2 clusters tof_cluster( num_clusters = 2, method = "kmeans" ) |> # calculate the entropy of all cells tof_assess_clusters_entropy( cluster_col = .kmeans_cluster, marker_cols = starts_with("cd"), entropy_quantile = 0.8, augment = TRUE ) # plot the clusters in PCA space entropy_result |> select(-starts_with(".mahala"), -flagged_cell) |> tof_reduce_dimensions(pca_cols = starts_with("cd"), method = "pca") |> tof_plot_cells_embedding(embedding_cols = c(.pc1, .pc2), color_col = .kmeans_cluster) # show the entropy values for each cell entropy_result |> select(-starts_with(".mahala"), -flagged_cell) |> tof_reduce_dimensions(pca_cols = starts_with("cd"), method = "pca") |> tof_plot_cells_embedding(embedding_cols = c(.pc1, .pc2), color_col = entropy) + scale_fill_viridis_c() ## ----------------------------------------------------------------------------- entropy_result |> select(-starts_with(".mahala")) |> tof_reduce_dimensions(pca_cols = starts_with("cd"), method = "pca") |> tof_plot_cells_embedding(embedding_cols = c(.pc1, .pc2), color_col = flagged_cell) + scale_fill_viridis_d() ## ----------------------------------------------------------------------------- entropy_result |> ggplot(aes(x = entropy, fill = cluster_id)) + geom_density(alpha = 0.4) + theme_bw() ## ----------------------------------------------------------------------------- clusters_to_keep <- levine |> dplyr::count(population_id) |> dplyr::slice_max(order_by = n, n = 5L) |> dplyr::arrange(n) |> pull(population_id) smallest_cluster <- clusters_to_keep[1] largest_cluster <- clusters_to_keep[[length(clusters_to_keep)]] small_levine <- levine |> dplyr::filter(population_id %in% clusters_to_keep) ## ----------------------------------------------------------------------------- # perform the perturbation small_levine <- small_levine |> dplyr::mutate( new_population_id = dplyr::if_else( population_id %in% smallest_cluster, sample( clusters_to_keep[-which(clusters_to_keep %in% smallest_cluster)], size = nrow(small_levine), replace = TRUE ), population_id ) ) # perform the entropy assessment entropy_levine <- small_levine |> tof_assess_clusters_entropy( cluster_col = new_population_id, marker_cols = starts_with("cd"), augment = TRUE ) ## ----------------------------------------------------------------------------- entropy_levine |> mutate(population_id = fct_reorder(population_id, entropy)) |> tof_plot_cells_density( marker_col = entropy, group_col = population_id, use_ggridges = TRUE, scale = 0.1 ) + ggplot2::theme(legend.position = "none") + ggplot2::labs(x = "Entropy", y = "Cluster ID") ## ----------------------------------------------------------------------------- entropy_levine |> mutate(flagged_cell = entropy > quantile(entropy, prob = 0.9)) |> dplyr::count(population_id, flagged_cell) |> group_by(population_id) |> mutate(prop = n / sum(n)) |> ungroup() |> dplyr::filter(flagged_cell) ## ----------------------------------------------------------------------------- sessionInfo()