---
title: "Case study: Nutrient thresholds"
author: "The miaSim package authors."
date: "`r Sys.Date()`"
package: miaSim
output: 
  BiocStyle::html_document:
    fig_height: 7
    fig_width: 10
    toc: yes
    toc_depth: 2
    number_sections: true
vignette: >
  %\VignetteIndexEntry{caseStudy2-NutrientThreshold}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
  \usepackage[utf8]{inputenc}
---

```{r, echo=FALSE}
knitr::opts_chunk$set(
  cache = FALSE,
  fig.width = 9,
  message = FALSE,
  eval=FALSE,
  warning = FALSE)

knitr::opts_knit$set(global.par = TRUE)
```

# Case study with miaSim: Nutrient concentration threshold

Reference: [Available energy fluxes drive a transition in the
diversity, stability, and functional structure of microbial
communities](https://doi.org/10.1371/journal.pcbi.1006793).

The aim of this case study is to design and demonstrate the existence
of nutrient concentration threshold which limits the beta-diversity of
communities.

To fulfill this aim, we designed a gradient of environments, as well
as a gradient of communities.

## Load dependencies

```{r deps}
library(ggplot2)
library(vegan)
library(reshape2)
library(miaSim)
library(philentropy)
library(cluster)
```

This batch of simulations is time-consuming. To reduce the calculation
burden, we have decreased the numbers of environments, resources, and
communities from the original 10 to 5, and made other minor
modifications.

## Set random seed and initial shared parameters

```{r pars}
set.seed(42)
n_species <- 5
n_resources <- 5
E <- randomE(n_species, n_resources, mean_consumption = 1, mean_production = 3)
growth_rates <- runif(n_species)
monod_constant <- matrix(rbeta(n_species*n_resources, 10,10),
                               nrow=n_species,
			       ncol=n_resources)
t_store <- 50
n.instances <- 1 # no stochastic process: no need to repeat
```

## Generating function

This function generates a data frame, where each row is arranged in an
increasing dissimilarity to the first row.

```{r funcs}
gradient.df.generator <- function(n_row, n_col, density_row, max_gradient, error_interval){
    list_initial <- list()
    dissimilarity.gradient <- seq(from = 0, to = max_gradient, length.out = n_row)
    for (i in seq_len(n_row)){
        print(i)
        if (i == 1){
            row_temp <- rbeta(n_col, 1, 1/n_col)
            col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
            row_temp[col_to_remove] <- 0
            list_initial[[i]] <- row_temp
        } else {
            while (length(list_initial) < i) {
                row_temp <- rbeta(n_col, 1, 1/n_col)
                col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
                row_temp[col_to_remove] <- 0
                diff_temp <- abs(vegdist(rbind(list_initial[[1]], row_temp), method = "bray") - dissimilarity.gradient[i])
                if (diff_temp < error_interval) {
                    list_initial[[i]] <- row_temp
                }
            }
        }
    }
    dataframe_to_return <- as.data.frame(t(matrix(unlist(list_initial), ncol = n_row)))
    return(dataframe_to_return)
}
```

## generate communities

```{r gen}
n.community <- 5 # you can also try 20 or even 50.
density.community <- 0.8
set.seed(42)
community.initial.df <- gradient.df.generator(n_row = n.community,
                                              n_col = n_species,
                                              density_row = density.community,
					      max_gradient = 0.7,
					      error_interval = 0.1)
dist.community.initial.df <- vegdist(community.initial.df, method = "bray")
community.initial.tse <- TreeSummarizedExperiment(assays=SimpleList(abundances=t(as.matrix(community.initial.df))))
```

## Load plotting functions

These will be replaced soon by the TreeSummarizedExperiment equivalents.

```{r plots}
makePlot <- function(out_matrix, title = "abundance of species by time", obj = "species", y.label = "x.t"){
    df <- as.data.frame(out_matrix)
    dft <-  melt(df, id="time")
    names(dft)[2] = obj
    names(dft)[3] = y.label
    lgd = ncol(df)<= 20
    ggplot(dft, aes_string(names(dft)[1], names(dft)[3], col = names(dft)[2])) +
        geom_line(show.legend = lgd, lwd=0.5) +
        ggtitle(title) +
        theme_linedraw() +
        theme(plot.title = element_text(hjust = 0.5, size = 14))
}
makePlotRes <- function(out_matrix, title = "quantity of compounds by time"){
    df <- as.data.frame(out_matrix)
    dft <-  melt(df, id="time")
    names(dft)[2] = "resources"
    names(dft)[3] = "S.t"
    lgd = ncol(df)<= 20
    ggplot(dft, aes(time, S.t, col = resources)) +
        geom_line(show.legend = lgd, lwd=0.5) +
        ggtitle(title) +
        theme_linedraw() +
        theme(plot.title = element_text(hjust = 0.5, size = 14))
}
makeHeatmap <-function(matrix.A,
                       title = "Consumption/production matrix",
                       y.label = 'resources',
                       x.label = 'species',
                       midpoint_color = NULL,
                       lowColor = "red",
                       midColor = "white",
                       highColor = "blue"){
    df <- melt(t(matrix.A))
    if (is.null(midpoint_color)) {
        midpoint_color <- 0
    }
    names(df)<- c("x", "y", "strength")
    df$y <- factor(df$y, levels=rev(unique(sort(df$y))))
    fig <- ggplot(df, aes(x,y,fill=strength)) + geom_tile() + coord_equal() +
        theme(axis.title = element_blank()) +
        scale_fill_gradient2('strength', low = lowColor,
	    mid = midColor, high = highColor, midpoint = midpoint_color)+
        theme_void() + ggtitle(title)

    if (ncol(matrix.A)<=10 & nrow(matrix.A)<=10){
        fig <- fig + geom_text(aes(label = round(strength, 2)))
    } else if (ncol(matrix.A)<=15 & nrow(matrix.A)<=15){
        fig <- fig + geom_text(aes(label = round(strength, 1)))
    } else {
        fig <- fig
    }

    fig <- fig + labs(x = x.label, y = y.label)+
        theme_linedraw() +
        theme(plot.title = element_text(hjust = 0.5, size = 14),
	      axis.text.x = element_text(
            angle = 90))

    if (nrow(matrix.A) >= 20){
        # too many species
        fig <- fig + theme(
            axis.title.y=element_blank(),
            axis.text.y=element_blank(),
            axis.ticks.y=element_blank(),
        )
    }
    if (ncol(matrix.A) >= 20){
        # too many resources
        fig <- fig + theme(
            axis.title.x=element_blank(),
            axis.text.x=element_blank(),
            axis.ticks.x=element_blank()
        )
    }
    fig
}

makeHeatmap(as.matrix(dist.community.initial.df),
            title = "dissimilarity matrix",
            x.label = "community.1",
            y.label = "community.2")
```

## initialize shared parameters

```{r crm}
crm_params <- list(
    n_species = n_species,
    n_resources = n_resources,
    x0 = NULL,
    E = E,
    resources = rep(1,n_resources),
    monod_constant = monod_constant,
    migration_p = 0,
    stochastic = FALSE,
    t_start = 0,
    t_end = 50,
    t_step = 1,
    t_store = t_store,
    growth_rates = growth_rates,
    norm=FALSE)
```

## Generate resource gradients

```{r grads}
resourceConcentration <- 10^seq(0,4,1) # 1 to 10000
n.medium <- 5
density.medium <- 0.8
n_species <- 5
set.seed(42)
resource.initial.df <- gradient.df.generator(n_row = n.medium,
    n_col = n_resources, density_row = density.medium,
    max_gradient = 0.7, error_interval = 0.1)
crmExample <- simulateConsumerResource(
    n_species = n_species,
    n_resources = n_resources,
    E = E,
    x0 = as.numeric(community.initial.df[1,]),
    resources = as.numeric(resourceConcentration[3]*resource.initial.df[1,]),
    growth_rates = growth_rates,
    monod_constant = monod_constant,
    stochastic = FALSE,
    t_end = 50,
    t_step = 1,
    t_store = 50,
    norm = FALSE)
#makePlot(crmExample$matrix)
#makePlotRes(crmExample$resources)

## Generate simulations and store the final community in community.simulation
## In this step, the final relative abundance table is basisComposition_prop

set.seed(42)
community.simulation <- list()
counter_i <- 1
resourceConcentration <- 10^seq(0,4,1) # 1 to 10000
n.medium <- 5
for (resConc in resourceConcentration) {
    for (medium in seq_len(n.medium)){
        crm_params$resources <- as.numeric(resource.initial.df[medium,]*resConc)
        paramx0 <- as.list(as.data.frame(t(community.initial.df)))
        crm_param_iter <- list(x0 = paramx0)
        print(paste("resConc", resConc, "medium", medium))
        crmMoments <- generateSimulations(model = "simulateConsumerResource",
                                          params_list = crm_params,
                                          param_iter = crm_param_iter,
                                          n_instances = n.instances,
                                          t_end = 50)
        # pick community composition at the last time point		  
        community.simulation[[counter_i]] <- as.data.frame(do.call(rbind, lapply(crmMoments, function (x) {assay(x, "counts")[, ncol(x)]})))

        counter_i <- counter_i + 1
    }
}
basisComposition <- do.call(rbind.data.frame, community.simulation)
rm(counter_i, community.simulation)
basisComposition_prop <- basisComposition / rowSums(basisComposition)


## Make UMAP plots
## In this step, plot result is stored in umap_CRM_gradient_plot, and
##   this is visualized in different facets.

resourceConcentration <- 10^seq(0,4,1) # 1 to 10000
n.medium <- 5
n.community <- 5
concentration <- as.factor(rep(resourceConcentration, each = n.medium*n.community))
medium <- as.factor(rep(seq_len(n.medium), each = n.community ,times = length(resourceConcentration) ))
community <- as.factor(rep(seq_len(n.community), times = length(resourceConcentration)*n.medium))

# Visualize with UMAP 

## Provide the community data as TreeSE object
library(scater)
tse <- TreeSummarizedExperiment(
                assays=SimpleList(abundances=t(as.matrix(basisComposition))),
		colData=DataFrame(Medium=medium,
		                      Concentration=concentration,
				      Community=community
				      )
		    )
## Add UMAP
tse <- runUMAP(tse, name = "UMAP", exprs_values = "abundances")
## Plot UMAP
plotReducedDim(tse, "UMAP", colour_by="Medium", shape_by="Concentration")

# Same for compositional abundance data
library(mia)
## -- add relative abundances;
tse <- transformSamples(tse, assay.type="abundances", method="relabundance")
tse <- runUMAP(tse, name = "UMAP_compositional", exprs_values = "relabundance")
plotReducedDim(tse, "UMAP_compositional", colour_by="Medium", shape_by="Concentration")
# Finally with communities
umap_CRM_gradient_plot <<- plotReducedDim(tse, "UMAP_compositional",
   colour_by="Medium", shape_by="Community", size_by="Concentration")
```

## Visualization of the results

In this part, different visualization of results demonstrate (in
various facets) the gradual change of communities' beta diversity.
The first figure indicates that the initial community composition is
more important than the combinations of initial available resources.

The first sub-figure in the second figure demonstrates that in an
oligotrophic (less available nutrients) environment, communities won't
change much in a given time, whilst the last two sub-figures resemble
each other, implying that the nutrient is no longer the limiting
factor of the beta-diversity of the community. This pattern is further
displayed in the following "curve plot".

In the third figure, the second and the th community always stays more
similar, despite their initial dissimilarity, indicating that they
might belong to one community type. This can be validated by input 20
or even 50 as n.community in this case study: communities turns into
clusters in each sub-figures.

```{r umapplots, eval=FALSE}
# FIXME: the visual output can be polished later.
print(umap_CRM_gradient_plot)
umap_CRM_gradient_plot + facet_grid(size_by ~ ., labeller = label_both)
umap_CRM_gradient_plot + facet_grid(colour_by ~ size_by, labeller = label_both)
umap_CRM_gradient_plot + facet_grid(shape_by ~ size_by, labeller = label_both)
umap_CRM_gradient_plot + facet_grid(shape_by ~ colour_by, labeller = label_both)
```

## Saturation curve 

Saturation curve of average beta-diversity between communities with
community 1.

In this part, we demonstrate that the average distance from
other communities to community 1 will reach to a threshold of
nutrients, after which the average distance won't increase along with
the total concentration of nutrients.

Let us first define a function calculating the mean distance to the
first community.

## construct a function taking umap_CRM_coor as df and return the mean distance

```{r misc, eval=FALSE}
average_distance <- function(df, res_conc_type, com_type, method = "euclidean"){
    sub_df <- df[df$concentration == res_conc_type & df$community == com_type,]
    combines <- combn(sub_df$medium, 2)
    distances <- NULL
    for (i in seq_len(ncol(combines))) {
        distances[i] <- dist(sub_df[combines[,i], c(1, 2)])
    }
    return(mean(distances))
}
```

## UMAP distance saturation analysis

This shows how distance saturations could be calculated. Not evaluated
currently.

```{r df, eval=FALSE}
distance_saturation_data <- data.frame(concentration = integer(),
                                       community = integer(),
                                       average_distance = numeric())

#umap_CRM_coor <- cbind(umap_CRM_coor, concentration, medium, community)
umap_CRM_coor <- data.frame(reducedDim(tse), colData(tse)) # cbind(umap_CRM_coor, concentration, medium, community)
for (res_conc_type in unique(concentration)){
    for (com_type in unique(community)){
        ave_dist <- average_distance(umap_CRM_coor, res_conc_type, com_type)
        distance_saturation_data[nrow(distance_saturation_data)+1,] <-
            c(res_conc_type, com_type, ave_dist)
    }
}
# View(distance_saturation_data)
distance_saturation_data$average_distance <- as.numeric(distance_saturation_data$average_distance)
distance_saturation_data$concentration <- as.factor(distance_saturation_data$concentration)
distance_saturation_data$community <- as.factor(distance_saturation_data$community)

# distance_saturation_data_plot
p <- ggplot(distance_saturation_data,
                                 aes(concentration, average_distance,
                                     color = community,
                                     group = community)) +
    geom_line() +
    geom_point() +
    scale_shape_manual(values = c(0, 1, 2, 5, 6, 8, 15, 16, 17, 18)) +
    labs(x = "Resource concentration",
         y = "Average distance between communities in UMAP") +
    theme_bw()

print(p)
```