## ----options, include=FALSE, echo=FALSE--------------------------------------- knitr::opts_chunk$set(warning = FALSE, error = FALSE, message = FALSE) ## ---- eval=FALSE-------------------------------------------------------------- # devtools::install_github("mcalgaro93/benchdamic") ## ----load_packs--------------------------------------------------------------- library(benchdamic) # Data management library(phyloseq) library(plyr) # Graphics library(ggplot2) library(cowplot) ## ----dataloading-------------------------------------------------------------- data("ps_stool_16S") ps_stool_16S ## ----16S_stool_download, eval=FALSE------------------------------------------- # ## 16S HMP data download # library(HMP16SData) # # Extracting V3-V5 16S sequenced regions' count data # ps_stool_16S_raw <- V35() %>% # subset(select = HMP_BODY_SUBSITE == "Stool" & # Only fecal samples # RUN_CENTER == "BI" & # Only sequenced at the BI RUN CENTER # SEX == "Male" & # Only male subject # VISITNO == 1 & # Only the first visit # !duplicated(RSID)) %>% # Duplicated SubjectID removal # as_phyloseq() # # Remove low depth samples # ps_stool_16S_pruned <- prune_samples(sample_sums(ps_stool_16S_raw) >= 10^3, ps_stool_16S_raw) # # Remove features with zero counts # ps_stool_16S_filtered <- filter_taxa(ps_stool_16S_pruned, function(x) { # sum(x > 0) > 0 # }, 1) # # Collapse counts to the genus level # ps_stool_16S <- tax_glom(ps_stool_16S_filtered, taxrank = "GENUS") ## ----------------------------------------------------------------------------- example_HURDLE <- fitHURDLE( counts = ps_stool_16S, scale = "median" ) head(example_HURDLE) ## ----------------------------------------------------------------------------- observed_hurdle <- prepareObserved( counts = ps_stool_16S, scale = "median") head(observed_hurdle) ## ----------------------------------------------------------------------------- head(prepareObserved(counts = ps_stool_16S)) ## ----------------------------------------------------------------------------- head(meanDifferences( estimated = example_HURDLE, observed = observed_hurdle )) ## ----fitting------------------------------------------------------------------ GOF_stool_16S <- fitModels( counts = ps_stool_16S, models = c("NB", "ZINB", "DM", "ZIG", "HURDLE"), scale_ZIG = c("median", "default"), scale_HURDLE = c("median", "default"), verbose = FALSE ) ## ----RMSE_MD------------------------------------------------------------------ plotRMSE(GOF_stool_16S, difference = "MD", plotIt = FALSE) ## ----RMSE_ZPD----------------------------------------------------------------- plotRMSE(GOF_stool_16S, difference = "ZPD", plotIt = FALSE) ## ----plotGOF_MD, fig.width=15, fig.height=4----------------------------------- plotMD( data = GOF_stool_16S, difference = "MD", split = TRUE ) ## ----plotGOF_MD_noHurdleDefault, fig.width=15, fig.height=4------------------- plotMD( data = GOF_stool_16S[1:6], difference = "MD", split = TRUE ) ## ----plotGOF_ZPD, fig.width=15, fig.height=4---------------------------------- plotMD( data = GOF_stool_16S[1:6], difference = "ZPD", split = TRUE ) ## ----plotGOF_MD_collapsed, fig.width=10, fig.height=5------------------------- plot_grid(plotMD(data = GOF_stool_16S[1:6], difference = "MD", split = FALSE), plotMD(data = GOF_stool_16S[1:6], difference = "ZPD", split = FALSE), ncol = 2 ) ## ----plotGOF_RMSE, fig.width=10,fig.height=7---------------------------------- plot_grid(plotRMSE(GOF_stool_16S, difference = "MD"), plotRMSE(GOF_stool_16S, difference = "ZPD"), ncol = 2 ) ## ----createMocks-------------------------------------------------------------- set.seed(123) my_mocks <- createMocks( nsamples = phyloseq::nsamples(ps_stool_16S), N = 10 ) # At least N = 1000 is suggested ## ----normalizationManual, eval=FALSE------------------------------------------ # ps_stool_16S <- norm_edgeR( # object = ps_stool_16S, # method = "TMM" # ) # ps_stool_16S <- norm_DESeq2( # object = ps_stool_16S, # method = "poscounts" # ) # ps_stool_16S <- norm_CSS( # object = ps_stool_16S, # "median" # ) ## ----setNormalization--------------------------------------------------------- my_normalizations <- setNormalizations(fun = c("norm_edgeR", "norm_DESeq2", "norm_CSS"), method = c("TMM", "poscounts", "median")) ps_stool_16S <- runNormalizations(normalization_list = my_normalizations, object = ps_stool_16S, verbose = TRUE) ## ----weights------------------------------------------------------------------ zinbweights <- weights_ZINB( object = ps_stool_16S, K = 0, design = "~ 1" ) ## ----set_Methods-------------------------------------------------------------- my_edgeR <- set_edgeR( pseudo_count = FALSE, group_name = "group", design = ~ group, robust = FALSE, coef = 2, norm = "TMM", weights_logical = c(TRUE, FALSE), expand = TRUE ) my_DESeq2 <- set_DESeq2( pseudo_count = FALSE, design = ~ group, contrast = c("group", "grp2", "grp1"), norm = "poscounts", weights_logical = c(TRUE, FALSE), alpha = 0.05, expand = TRUE ) my_limma <- set_limma( pseudo_count = FALSE, design = ~ group, coef = 2, norm = c("TMM", "TMM", "CSSmedian"), weights_logical = c(FALSE, TRUE, FALSE), expand = FALSE) my_methods <- c(my_edgeR, my_DESeq2, my_limma) ## ----runMocks----------------------------------------------------------------- # Random grouping each time Stool_16S_mockDA <- runMocks(mocks = my_mocks, method_list = my_methods, object = ps_stool_16S, weights = zinbweights, verbose = FALSE) ## ----customExample, eval=FALSE------------------------------------------------ # DA_yourMethod <- function(object, parameters) # others # { # ### your method code ### # # ### extract important statistics ### # vector_of_pval <- NA # contains the p-values # vector_of_adjusted_pval <- NA # contains the adjusted p-values # name_of_your_features <- NA # contains the OTU, or ASV, or other feature # # names. Usually extracted from the rownames of # # the count data # vector_of_logFC <- NA # contains the logFCs # vector_of_statistics <- NA # contains other statistics # # ### prepare the output ### # pValMat <- data.frame("rawP" = vector_of_pval, # "adjP" = vector_of_adjusted_pval) # statInfo <- data.frame("logFC" = vector_of_logFC, # "statistics" = vector_of_statistics) # name <- "write.here.the.name" # # Be sure that your method hasn't changed the order of the features. If it # # happens, you'll need to re-establish the original order. # rownames(pValMat) <- rownames(statInfo) <- name_of_your_features # # # Return the output as a list # return(list("pValMat" = pValMat, "statInfo" = statInfo, "name" = name)) # } # END - function: DA_yourMethod ## ----customExampleInstances, eval=FALSE--------------------------------------- # my_custom_method <- list( # customMethod.1 = list(method = "DA_yourMethod", parameters), # customMethod.2 = list(method = "DA_yourMethod", parameters) # ) ## ----customExampleRun, eval=FALSE--------------------------------------------- # # Add the custom method instances to the others # my_methods <- c(my_edgeR, my_DESeq2, my_limma, my_custom_method) # # Run all the methods on a specific data object... # runDA(my_methods = my_methods, object = dataObject) # # ... Or on the mock datasets to investigate TIEC # runMocks(mocks = mock_df, my_methods = my_methods, object = dataObject) ## ----createTIEC, warning=FALSE------------------------------------------------ TIEC_summary <- createTIEC(Stool_16S_mockDA) ## ----FPRplot------------------------------------------------------------------ cols <- createColors(variable = levels(TIEC_summary$df_pval$Method)) plotFPR(df_FPR = TIEC_summary$df_FPR, cols = cols) ## ----QQplot------------------------------------------------------------------- plotQQ(df_QQ = TIEC_summary$df_QQ, zoom = c(0, 0.1), cols = cols) ## ----KSplot------------------------------------------------------------------- plotKS(df_KS = TIEC_summary$df_KS, cols = cols) ## ----dataloading_concordance-------------------------------------------------- data("ps_plaque_16S") ## ----16S_plaque_download, eval=FALSE------------------------------------------ # ps_plaque_16S_raw <- V35() %>% # Extracting V3-V5 16S sequenced regions' count data # subset(select = HMP_BODY_SUBSITE %in% c("Supragingival Plaque", "Subgingival Plaque") & # Only gingival plaque samples # RUN_CENTER == "WUGC" & # Only sequenced at the WUCG RUN CENTER # SEX == "Male" & # Only male subject # VISITNO == 1) %>% # Only the first visit # as_phyloseq() # # # Only paired samples # paired <- names(which(table(sample_data(ps_plaque_16S_raw)[, "RSID"]) == 2)) # ps_plaque_16S_paired <- subset_samples(ps_plaque_16S_raw, RSID %in% paired) # # # Remove low depth samples # ps_plaque_16S_pruned <- prune_samples(sample_sums(ps_plaque_16S_paired) >= 10^3, ps_plaque_16S_paired) # # # Remove features with zero counts # ps_plaque_16S_filtered <- filter_taxa(ps_plaque_16S_pruned, function(x) sum(x > 0) > 0, 1) # # # Collapse counts to the genus level # ps_plaque_16S <- tax_glom(ps_plaque_16S_filtered, taxrank = "GENUS") ## ----createSplits------------------------------------------------------------- set.seed(123) sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE <- factor(sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE) my_splits <- createSplits( object = ps_plaque_16S, varName = "HMP_BODY_SUBSITE", paired = "RSID", balanced = TRUE, N = 10 ) # At least 100 is suggested ## ----set_Methods_noweights---------------------------------------------------- my_edgeR_noWeights <- set_edgeR(group_name = "HMP_BODY_SUBSITE", design = ~ HMP_BODY_SUBSITE, coef = 2, norm = "TMM") my_DESeq2_noWeights <- set_DESeq2(contrast = c("HMP_BODY_SUBSITE","Supragingival Plaque", "Subgingival Plaque"), design = ~ HMP_BODY_SUBSITE, norm = "poscounts") my_limma_noWeights <- set_limma(design = ~ HMP_BODY_SUBSITE, coef = 2, norm = c("TMM", "CSSmedian")) my_methods_noWeights <- c(my_edgeR_noWeights, my_DESeq2_noWeights, my_limma_noWeights) ## ----info_normalizations------------------------------------------------------ str(my_normalizations) ## ----runSplits---------------------------------------------------------------- Plaque_16S_splitsDA <- runSplits(split_list = my_splits, method_list = my_methods_noWeights, normalization_list = my_normalizations, object = ps_plaque_16S, verbose = FALSE) ## ----createConcordance-------------------------------------------------------- concordance <- createConcordance( object = Plaque_16S_splitsDA, slot = "pValMat", colName = "rawP", type = "pvalue" ) head(concordance) ## ----getNames----------------------------------------------------------------- names(Plaque_16S_splitsDA$Subset1$Comparison1) ## ----logFC_names-------------------------------------------------------------- names(Plaque_16S_splitsDA$Subset1$Comparison1$edgeR.TMM$statInfo) names(Plaque_16S_splitsDA$Subset1$Comparison1$DESeq2.poscounts$statInfo) names(Plaque_16S_splitsDA$Subset1$Comparison1$limma.CSSmedian$statInfo) names(Plaque_16S_splitsDA$Subset1$Comparison1$limma.TMM$statInfo) ## ----alternativeConcordance, eval=FALSE--------------------------------------- # concordance_alternative <- createConcordance( # object = Plaque_16S_splitsDA, # slot = "statInfo", # colName = c("logFC", "log2FoldChange", "logFC", "logFC"), # type = "logfc" # ) ## ----plotConcordance---------------------------------------------------------- pC <- plotConcordance(concordance = concordance, threshold = 30) cowplot::plot_grid(plotlist = pC, ncol = 2, align = "h", axis = "tb", rel_widths = c(1, 3)) ## ----priorKnowledge----------------------------------------------------------- data("microbial_metabolism") head(microbial_metabolism) ## ----exampleOfIntegration----------------------------------------------------- # Extract genera from the phyloseq tax_table slot genera <- tax_table(ps_plaque_16S)[, "GENUS"] # Genera as rownames of microbial_metabolism data.frame rownames(microbial_metabolism) <- microbial_metabolism$Genus # Match OTUs to their metabolism priorInfo <- data.frame(genera, "Type" = microbial_metabolism[genera, "Type"]) unknown_metabolism <- is.na(priorInfo$Type) priorInfo[unknown_metabolism, "Type"] <- "Unknown" # Relabel 'F Anaerobic' to 'F_Anaerobic' to remove space priorInfo$Type <- factor(priorInfo$Type, levels = c("Aerobic","Anaerobic","F Anaerobic","Unknown"), labels = c("Aerobic","Anaerobic","F_Anaerobic","Unknown")) # Add a more informative names column priorInfo[, "newNames"] <- paste0(rownames(priorInfo), "|", priorInfo[, "GENUS"]) ## ----setNormalizations_enrichment--------------------------------------------- none_normalization <- setNormalizations(fun = "norm_edgeR", method = "none") my_normalizations_enrichment <- c(my_normalizations, none_normalization) ps_plaque_16S <- runNormalizations(normalization_list = my_normalizations_enrichment, object = ps_plaque_16S, verbose = FALSE) ## ----set_Methods_enrichment--------------------------------------------------- my_metagenomeSeq <- set_metagenomeSeq(design = ~ HMP_BODY_SUBSITE, coef = 2, norm = "CSSmedian") my_ALDEx2 <- set_ALDEx2(conditions = "HMP_BODY_SUBSITE", test = "t", norm = "none") my_corncob <- set_corncob(formula = ~ HMP_BODY_SUBSITE, phi.formula = ~ HMP_BODY_SUBSITE, formula_null = ~ 1, phi.formula_null = ~ HMP_BODY_SUBSITE, test = "Wald", coefficient = "HMP_BODY_SUBSITESupragingival Plaque", norm = "none") my_MAST <- set_MAST(rescale = "median", design = ~ HMP_BODY_SUBSITE, coefficient = "HMP_BODY_SUBSITESupragingival Plaque", norm = "none") my_Seurat <- set_Seurat(test.use = "wilcox", contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque"), norm = "none") my_methods_enrichment <- c( my_methods_noWeights, my_metagenomeSeq, my_ALDEx2, my_corncob, my_MAST, my_Seurat ) ## ----runDA_enrichment, message=FALSE------------------------------------------ # Convert to factor sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE <- factor(sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE) # Reference level = "Subgingival Plaque" sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE <- relevel( x = sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE, ref = "Subgingival Plaque" ) Plaque_16S_DA <- runDA(method_list = my_methods_enrichment, object = ps_plaque_16S, weights = zinbweights) ## ----info_DA------------------------------------------------------------------ names(Plaque_16S_DA) ## ----createEnrichment--------------------------------------------------------- enrichment <- createEnrichment( object = Plaque_16S_DA, priorKnowledge = priorInfo, enrichmentCol = "Type", namesCol = "newNames", slot = "pValMat", colName = "adjP", type = "pvalue", direction = c("logFC", # edgeR "log2FoldChange", # DEseq2 "logFC", # limma "logFC", # limma "HMP_BODY_SUBSITESupragingival Plaque", # metagenomeSeq "effect", # ALDEx2 "Estimate", # corncob "logFC",# MAST "avg_log2FC"), # Seurat threshold_pvalue = 0.1, threshold_logfc = 0, top = NULL, alternative = "greater", verbose = TRUE ) ## ----plotContingency---------------------------------------------------------- plotContingency(enrichment = enrichment, levels_to_plot = c("Aerobic", "Anaerobic"), method = "metagenomeSeq.CSSmedian") ## ----plotEnrichment, fig.width=7, fig.height=7-------------------------------- plotEnrichment(enrichment = enrichment, enrichmentCol = "Type", levels_to_plot = c("Aerobic", "Anaerobic")) ## ----plotMutualFindings, fig.width=6, fig.height=6---------------------------- plotMutualFindings(enrichment, enrichmentCol = "Type", levels_to_plot = c("Aerobic", "Anaerobic"), n_methods = 1) ## ----createPositives---------------------------------------------------------- positives <- createPositives( object = Plaque_16S_DA, priorKnowledge = priorInfo, enrichmentCol = "Type", namesCol = "newNames", slot = "pValMat", colName = "rawP", type = "pvalue", direction = c("logFC", # edgeR "log2FoldChange", # DEseq2 "logFC", # limma "logFC", # limma "HMP_BODY_SUBSITESupragingival Plaque", # metagenomeSeq "effect", # ALDEx2 "Estimate", # corncob "logFC",# MAST "avg_log2FC"), # Seurat threshold_pvalue = 1, threshold_logfc = 0, top = seq.int(from = 0, to = 50, by = 5), alternative = "greater", verbose = FALSE, TP = list(c("DOWN Abundant", "Anaerobic"), c("UP Abundant", "Aerobic")), FP = list(c("DOWN Abundant", "Aerobic"), c("UP Abundant", "Anaerobic")) ) head(positives) ## ----plotPositives------------------------------------------------------------ plotPositives(positives) ## ----sessionInfo-------------------------------------------------------------- sessionInfo()