--- title: "ANCOM-BC" author: - Huang Lin$^1$ - $^1$Department of Biostatistics, University of Pittsburgh, 130 De Soto Street, Pittsburgh, PA 15261 USA date: '`r format(Sys.Date(), "%B %d, %Y")`' output: html_document: toc: true theme: united bibliography: bibliography.bib vignette: > %\VignetteIndexEntry{ANCOMBC} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r setup, include = FALSE} knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA, fig.width = 6.25, fig.height = 5) library(tidyverse) library(microbiome) library(magrittr) library(qwraps2) library(ANCOMBC) library(DT) options(DT.options = list( initComplete = JS("function(settings, json) {", "$(this.api().table().header()).css({'background-color': '#000', 'color': '#fff'});","}"))) ``` # 1. Introduction Analysis of Compositions of Microbiomes with Bias Correction (ANCOM-BC) [@lin2020analysis] is a methodology of differential abundance (DA) analysis for microbial absolute abundance data. ANCOM-BC estimates the unknown sampling fractions, corrects the bias induced by their differences among samples, and identifies taxa that are differentially abundant according to the covariate of interest. For more details, please refer to the [ANCOM-BC](https://doi.org/10.1038/s41467-020-17041-7) paper. # 2. Installation Download package. ```{r getPackage, eval=FALSE} if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("ANCOMBC") ``` Load the package. ```{r load, eval=FALSE} library(ANCOMBC) ``` # 3. Running ANCOMBC ## 3.1 Intestinal microbiota profiling data The HITChip Atlas data set [@lahti2014tipping] is available via the microbiome R package [@lahti2017tools] in phyloseq [@mcmurdie2013phyloseq] format, and via Data Dryad in tabular format. This data set comes with 130 genus-like taxonomic groups across 1006 western adults with no reported health complications. Some subjects have also short time series. ```{r importData} data(atlas1006) # Subset to baseline pseq = subset_samples(atlas1006, time == 0) # Re-code the bmi group sample_data(pseq)$bmi_group = recode(sample_data(pseq)$bmi_group, underweight = "lean", lean = "lean", overweight = "overweight", obese = "obese", severeobese = "obese", morbidobese = "obese") # Re-code the nationality group sample_data(pseq)$nation = recode(sample_data(pseq)$nationality, Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE", CentralEurope = "CE", EasternEurope = "EE") # Aggregate to phylum level phylum_data = aggregate_taxa(pseq, "Phylum") ``` ## 3.2 Data summary ```{r dataSummary, results = "asis"} options(qwraps2_markup = "markdown") summary_template = list("Age" = list("min" = ~ min(age, na.rm = TRUE), "max" = ~ max(age, na.rm = TRUE), "mean (sd)" = ~ mean_sd(age, na_rm = TRUE, show_n = "never")), "Gender" = list("F" = ~ n_perc0(sex == "female", na_rm = TRUE), "M" = ~ n_perc0(sex == "male", na_rm = TRUE), "NA" = ~ n_perc0(is.na(sex))), "Nationality" = list("Central Europe" = ~ n_perc0(nation == "CE", na_rm = TRUE), "Eastern Europe" = ~ n_perc0(nation == "EE", na_rm = TRUE), "Northern Europe" = ~ n_perc0(nation == "NE", na_rm = TRUE), "Southern Europe" = ~ n_perc0(nation == "SE", na_rm = TRUE), "US" = ~ n_perc0(nation == "US", na_rm = TRUE), "NA" = ~ n_perc0(is.na(nation))), "BMI" = list("Lean" = ~ n_perc0(bmi_group == "lean", na_rm = TRUE), "Overweight" = ~ n_perc0(bmi_group == "overweight", na_rm = TRUE), "Obese" = ~ n_perc0(bmi_group == "obese", na_rm = TRUE), "NA" = ~ n_perc0(is.na(bmi_group))) ) data_summary = summary_table(meta(pseq), summary_template) data_summary ``` 1. The number of samples: `r nsamples(phylum_data)`. 2. The number of phyla: `r ntaxa(phylum_data)`. ## 3.3 Running ancombc function ```{r ancombc} out = ancombc(phyloseq = phylum_data, formula = "age + nation + bmi_group", p_adj_method = "holm", zero_cut = 0.90, lib_cut = 1000, group = "nation", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE) res = out$res res_global = out$res_global ``` ## 3.4 ANCOMBC primary result Result from the ANCOM-BC log-linear model to determine taxa that are differentially abundant according to the covariate of interest. It contains: 1) coefficients; 2) standard errors; 3) test statistics; 4) p-values; 5) adjusted p-values; 6) indicators whether the taxon is differentially abundant (TRUE) or not (FALSE). ### 3.41 Coefficients ```{r} tab_coef = res$beta col_name = c("Age", "EE - CE", "NE - CE", "SE - CE", "US - CE", "Oerweight - Lean", "Obese - Lean") colnames(tab_coef) = col_name tab_coef %>% datatable(caption = "Coefficients from the Primary Result") %>% formatRound(col_name, digits = 2) ``` ### 3.42 SEs ```{r} tab_se = res$se colnames(tab_se) = col_name tab_se %>% datatable(caption = "SEs from the Primary Result") %>% formatRound(col_name, digits = 2) ``` ### 3.43 Test statistics ```{r} tab_w = res$W colnames(tab_w) = col_name tab_w %>% datatable(caption = "Test Statistics from the Primary Result") %>% formatRound(col_name, digits = 2) ``` ### 3.44 P-values ```{r} tab_p = res$p_val colnames(tab_p) = col_name tab_p %>% datatable(caption = "P-values from the Primary Result") %>% formatRound(col_name, digits = 2) ``` ### 3.45 Adjusted p-values ```{r} tab_q = res$q colnames(tab_q) = col_name tab_q %>% datatable(caption = "Adjusted p-values from the Primary Result") %>% formatRound(col_name, digits = 2) ``` ### 3.46 Differentially abundant taxa ```{r} tab_diff = res$diff_abn colnames(tab_diff) = col_name tab_diff %>% datatable(caption = "Differentially Abundant Taxa from the Primary Result") ``` ### 3.47 Bias-adjusted abundances Step 1: estimate sample-specific sampling fractions (log scale). Note that for each sample, if it contains missing values for any variable specified in the formula, the corresponding sampling fraction estimate for this sample will return NA since the sampling fraction is not estimable with the presence of missing values. Step 2: adjust the log observed abundances by subtracting the estimated sampling fraction from log observed abundances of each sample. ```{r} samp_frac = out$samp_frac # Replace NA with 0 samp_frac[is.na(samp_frac)] = 0 # Add pesudo-count (1) to avoid taking the log of 0 log_obs_abn = log(abundances(phylum_data) + 1) # Adjust the log observed abundances log_obs_abn_adj = t(t(log_obs_abn) - samp_frac) # Show the first 6 samples round(log_obs_abn_adj[, 1:6], 2) %>% datatable(caption = "Bias-adjusted log observed abundances") ``` ### 3.48 Visualizations for "age" *"Age" is a continuous variable.* ```{r} df_fig1 = data.frame(res$beta * res$diff_abn, check.names = FALSE) %>% rownames_to_column("taxon_id") df_fig2 = data.frame(res$se * res$diff_abn, check.names = FALSE) %>% rownames_to_column("taxon_id") colnames(df_fig2)[-1] = paste0(colnames(df_fig2)[-1], "SD") df_fig = df_fig1 %>% left_join(df_fig2, by = "taxon_id") %>% transmute(taxon_id, age, ageSD) %>% filter(age != 0) %>% arrange(desc(age)) %>% mutate(group = ifelse(age > 0, "g1", "g2")) df_fig$taxon_id = factor(df_fig$taxon_id, levels = df_fig$taxon_id) p = ggplot(data = df_fig, aes(x = taxon_id, y = age, fill = group, color = group)) + geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.4)) + geom_errorbar(aes(ymin = age - ageSD, ymax = age + ageSD), width = 0.2, position = position_dodge(0.05), color = "black") + labs(x = NULL, y = "Log fold change", title = "Waterfall Plot for the Age Effect") + theme_bw() + theme(legend.position = "none", plot.title = element_text(hjust = 0.5), panel.grid.minor.y = element_blank(), axis.text.x = element_text(angle = 60, hjust = 1)) p ``` ## 3.5 ANCOMBC global test result Result from the ANCOM-BC global test to determine taxa that are differentially abundant between three or more groups of multiple samples. It contains: 1) test statistics; 2) p-values; 3) adjusted p-values; 4) indicators whether the taxon is differentially abundant (TRUE) or not (FALSE). ### 3.51 Test statistics ```{r} tab_w = res_global[, "W", drop = FALSE] colnames(tab_w) = "Nationality" tab_w %>% datatable(caption = "Test Statistics from the Global Test Result") %>% formatRound(c("Nationality"), digits = 2) ``` ### 3.52 P-values ```{r} tab_p = res_global[, "p_val", drop = FALSE] colnames(tab_p) = "Nationality" tab_p %>% datatable(caption = "P-values from the Global Test Result") %>% formatRound(c("Nationality"), digits = 2) ``` ### 3.53 Adjusted p-values ```{r} tab_q = res_global[, "q_val", drop = FALSE] colnames(tab_q) = "Nationality" tab_q %>% datatable(caption = "Adjusted p-values from the Global Test Result") %>% formatRound(c("Nationality"), digits = 2) ``` ### 3.54 Differentially abundant taxa ```{r} tab_diff = res_global[, "diff_abn", drop = FALSE] colnames(tab_diff) = "Nationality" tab_diff %>% datatable(caption = "Differentially Abundant Taxa from the Global Test Result") ``` # Session information ```{r sessionInfo, message = FALSE, warning = FALSE, comment = NA} sessionInfo() ``` # References