## ----setup, message = FALSE, warning = FALSE, comment = NA--------------------
knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA,
                      fig.width = 6.25, fig.height = 5)
library(ANCOMBC)
library(tidyverse)

## ----getPackage, eval=FALSE---------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("ANCOMBC")

## ----load, eval=FALSE---------------------------------------------------------
#  library(ANCOMBC)

## -----------------------------------------------------------------------------
data(atlas1006, package = "microbiome")

# Subset to baseline
pseq = phyloseq::subset_samples(atlas1006, time == 0)

# Re-code the bmi group
meta_data = microbiome::meta(pseq)
meta_data$bmi = recode(meta_data$bmi_group,
                       obese = "obese",
                       severeobese = "obese",
                       morbidobese = "obese")

# Note that by default, levels of a categorical variable in R are sorted 
# alphabetically. In this case, the reference level for `bmi` will be 
# `lean`. To manually change the reference level, for instance, setting `obese`
# as the reference level, use:
meta_data$bmi = factor(meta_data$bmi, levels = c("obese", "overweight", "lean"))
# You can verify the change by checking:
# levels(meta_data$bmi)

# Create the region variable
meta_data$region = recode(as.character(meta_data$nationality),
                          Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE", 
                          CentralEurope = "CE", EasternEurope = "EE",
                          .missing = "unknown")

phyloseq::sample_data(pseq) = meta_data

# Subset to lean, overweight, and obese subjects
pseq = phyloseq::subset_samples(pseq, bmi %in% c("lean", "overweight", "obese"))
# Discard "EE" as it contains only 1 subject
# Discard subjects with missing values of region
pseq = phyloseq::subset_samples(pseq, ! region %in% c("EE", "unknown"))

print(pseq)

## -----------------------------------------------------------------------------
set.seed(123)
out = ancom(data = pseq, tax_level = "Family", meta_data = NULL,
            p_adj_method = "holm", prv_cut = 0.10,
            lib_cut = 1000, main_var = "bmi", adj_formula = "age + region", 
            rand_formula = NULL, lme_control = NULL, struc_zero = TRUE,
            neg_lb = TRUE, alpha = 0.05, n_cl = 2, verbose = TRUE)

res = out$res

# Similarly, if the main variable of interest is continuous, such as age, the
# ancom model can be specified as
# out = ancom(data = pseq, tax_level = "Family", meta_data = NULL,
#             p_adj_method = "holm", prv_cut = 0.10,
#             lib_cut = 1000, main_var = "age", adj_formula = "bmi + region",
#             rand_formula = NULL, lme_control = NULL, struc_zero = FALSE,
#             neg_lb = FALSE, alpha = 0.05, n_cl = 2, verbose = TRUE)

## -----------------------------------------------------------------------------
q_val = out$q_data
beta_val = out$beta_data
# Only consider the effect sizes with the corresponding q-value less than alpha
beta_val = beta_val * (q_val < 0.05) 
# Choose the maximum of beta's as the effect size
beta_pos = apply(abs(beta_val), 2, which.max) 
beta_max = vapply(seq_along(beta_pos), function(i) 
    beta_val[beta_pos[i], i], FUN.VALUE = double(1))
# Number of taxa except structural zeros
n_taxa = ifelse(is.null(out$zero_ind), 
                nrow(tse), 
                sum(apply(out$zero_ind[, -1], 1, sum) == 0))
# Cutoff values for declaring differentially abundant taxa
cut_off = 0.7 * (n_taxa - 1)

df_fig_w = res %>%
  dplyr::mutate(beta = beta_max,
                direct = case_when(
                  detected_0.7 == TRUE & beta > 0 ~ "Positive",
                  detected_0.7 == TRUE & beta <= 0 ~ "Negative",
                  TRUE ~ "Not Significant"
                  )) %>%
  dplyr::arrange(W)
df_fig_w$taxon = factor(df_fig_w$taxon, levels = df_fig_w$taxon)
df_fig_w$W = replace(df_fig_w$W, is.infinite(df_fig_w$W), n_taxa - 1)
df_fig_w$direct = factor(df_fig_w$direct, 
                         levels = c("Negative", "Positive", "Not Significant"))

p_w = df_fig_w %>%
  ggplot(aes(x = taxon, y = W, color = direct)) +
  geom_point(size = 2, alpha = 0.6) +
  labs(x = "Taxon", y = "W") +
  scale_color_discrete(name = NULL) + 
  geom_hline(yintercept = cut_off, linetype = "dotted", 
             color = "blue", size = 1.5) +
  geom_text(aes(x = 2, y = cut_off + 0.5, label = "W[0.7]"), 
            size = 5, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major = element_blank())
p_w

## ----eval=FALSE---------------------------------------------------------------
#  tse = mia::makeTreeSummarizedExperimentFromPhyloseq(pseq)
#  
#  set.seed(123)
#  out = ancom(data = tse, assay_name = "counts",
#              tax_level = "Family", meta_data = NULL,
#              p_adj_method = "holm", prv_cut = 0.10,
#              lib_cut = 1000, main_var = "bmi", adj_formula = "age + region",
#              rand_formula = NULL, lme_control = NULL, struc_zero = TRUE,
#              neg_lb = TRUE, alpha = 0.05, n_cl = 2, verbose = TRUE)
#  
#  res = out$res

## ----eval=FALSE---------------------------------------------------------------
#  abundance_data = microbiome::abundances(pseq)
#  aggregate_data = microbiome::abundances(microbiome::aggregate_taxa(pseq, "Family"))
#  meta_data = microbiome::meta(pseq)
#  
#  set.seed(123)
#  out = ancom(data = abundance_data, aggregate_data = aggregate_data,
#              meta_data = meta_data, p_adj_method = "holm", prv_cut = 0.10,
#              lib_cut = 1000, main_var = "bmi", adj_formula = "age + region",
#              rand_formula = NULL, lme_control = NULL, struc_zero = TRUE,
#              neg_lb = TRUE, alpha = 0.05, n_cl = 2, verbose = TRUE)
#  
#  res = out$res

## -----------------------------------------------------------------------------
data(dietswap, package = "microbiome")
print(dietswap)

## -----------------------------------------------------------------------------
set.seed(123)
out = ancom(data = dietswap, tax_level = "Family", 
            p_adj_method = "holm", prv_cut = 0.10, lib_cut = 1000, 
            main_var = "group",
            adj_formula = "nationality + timepoint", 
            rand_formula = "(timepoint | subject)", 
            lme_control = lme4::lmerControl(), 
            struc_zero = TRUE, neg_lb = TRUE, alpha = 0.05, n_cl = 2)

res = out$res

## -----------------------------------------------------------------------------
q_val = out$q_data
beta_val = out$beta_data
# Only consider the effect sizes with the corresponding q-value less than alpha
beta_val = beta_val * (q_val < 0.05) 
# Choose the maximum of beta's as the effect size
beta_pos = apply(abs(beta_val), 2, which.max) 
beta_max = vapply(seq_along(beta_pos), function(i) beta_val[beta_pos[i], i],
                  FUN.VALUE = double(1))
# Number of taxa except structural zeros
n_taxa = ifelse(is.null(out$zero_ind), 
                nrow(tse), 
                sum(apply(out$zero_ind[, -1], 1, sum) == 0))
# Cutoff values for declaring differentially abundant taxa
cut_off = 0.7 * (n_taxa - 1)

df_fig_w = res %>%
  dplyr::mutate(beta = beta_max,
                direct = case_when(
                  detected_0.7 == TRUE & beta > 0 ~ "Positive",
                  detected_0.7 == TRUE & beta <= 0 ~ "Negative",
                  TRUE ~ "Not Significant"
                  )) %>%
  dplyr::arrange(W)
df_fig_w$taxon = factor(df_fig_w$taxon, levels = df_fig_w$taxon)
df_fig_w$W = replace(df_fig_w$W, is.infinite(df_fig_w$W), n_taxa - 1)
df_fig_w$direct = factor(df_fig_w$direct, 
                     levels = c("Negative", "Positive", "Not Significant"))

p_w = df_fig_w %>%
  ggplot(aes(x = taxon, y = W, color = direct)) +
  geom_point(size = 2, alpha = 0.6) +
  labs(x = "Taxon", y = "W") +
  scale_color_discrete(name = NULL) + 
  geom_hline(yintercept = cut_off, linetype = "dotted", 
             color = "blue", size = 1.5) +
  geom_text(aes(x = 2, y = cut_off + 0.5, label = "W[0.7]"), 
            size = 5, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major = element_blank())
p_w

## ----eval=FALSE---------------------------------------------------------------
#  tse = mia::makeTreeSummarizedExperimentFromPhyloseq(dietswap)
#  
#  set.seed(123)
#  out = ancom(data = tse, assay_name = "counts", tax_level = "Family",
#              p_adj_method = "holm", prv_cut = 0.10, lib_cut = 1000,
#              main_var = "group",
#              adj_formula = "nationality + timepoint",
#              rand_formula = "(timepoint | subject)",
#              lme_control = lme4::lmerControl(),
#              struc_zero = TRUE, neg_lb = TRUE, alpha = 0.05, n_cl = 2)
#  
#  res = out$res

## ----eval=FALSE---------------------------------------------------------------
#  abundance_data = microbiome::abundances(dietswap)
#  aggregate_data = microbiome::abundances(microbiome::aggregate_taxa(dietswap, "Family"))
#  meta_data = microbiome::meta(dietswap)
#  
#  set.seed(123)
#  out = ancom(data = abundance_data, aggregate_data = aggregate_data,
#              meta_data = meta_data, p_adj_method = "holm",
#              prv_cut = 0.10, lib_cut = 1000, main_var = "group",
#              adj_formula = "nationality + timepoint",
#              rand_formula = "(timepoint | subject)",
#              lme_control = lme4::lmerControl(),
#              struc_zero = TRUE, neg_lb = TRUE, alpha = 0.05, n_cl = 2)
#  
#  res = out$res

## ----sessionInfo, message = FALSE, warning = FALSE, comment = NA--------------
sessionInfo()