---
title: "POMA EDA Example"
author: 
- name: Pol Castellano-Escuder
  affiliation: Duke University
  email: polcaes@gmail.com
date: "`r BiocStyle::doc_date()`"
output: 
    BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{POMA EDA Example}
  %\VignetteEngine{knitr::rmarkdown}
  %\usepackage[utf8]{inputenc}
  %\VignetteEncoding{UTF-8}
biblio-style: apalike
link-citations: true
---

**Compiled date**: `r Sys.Date()`

**Last edited**: 2021-12-12

**License**: `r packageDescription("POMA")[["License"]]`

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  # fig.align = "center",
  comment = ">"
)
```

# Installation

Run the following code to install the Bioconductor version of package.        

```{r, eval = FALSE}
# install.packages("BiocManager")
BiocManager::install("POMA")
```

# Load POMA 

```{r, warning = FALSE, message = FALSE, comment = FALSE}
library(POMA)
```

# Automatic EDA Report 

The following function will return an Exploratory Data Analysis (EDA) PDF report. The input object must be a **SummarizedExperiment** object.     

```{r, eval = FALSE}
data("st000336")
PomaEDA(st000336)
```

**Generated EDA PDF report starts here.**  

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
# This file is part of POMA.

# POMA is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# POMA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with POMA. If not, see <https://www.gnu.org/licenses/>.

library(knitr)
library(dplyr)
library(tibble)
library(ggplot2)
library(SummarizedExperiment)
library(POMA)

data <- st000336
imputation <- "knn"
normalization <- "log_pareto"
clean_outliers <- TRUE
coeff_outliers <- 1.5

##

e <- t(SummarizedExperiment::assay(data))
target <- SummarizedExperiment::colData(data) %>% 
  as.data.frame() %>% 
  rownames_to_column("ID") %>% 
  dplyr::rename(Group = 2) %>% 
  dplyr::select(ID, Group)
```

\pagebreak

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
if(clean_outliers){

  imputed <- PomaImpute(data, method = imputation)
  pre_processed <- PomaNorm(imputed, method = normalization) %>%
    PomaOutliers(coef = coeff_outliers)
  
} else {
  
    imputed <- PomaImpute(data, method = imputation)
    pre_processed <- PomaNorm(imputed, method = normalization)
  
}
```

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
# zeros
zeros <- data.frame(number = colSums(e == 0, na.rm = TRUE)) %>%
  rownames_to_column("names") %>%
  filter(number != 0)

all_zero <- zeros %>% 
  filter(number == nrow(e))

# missing values
nas <- data.frame(number = colSums(is.na(e))) %>%
  rownames_to_column("names") %>%
  filter(number != 0)

# zero variance
var_zero <- e %>%
  as.data.frame() %>%
  summarise_all(~ var(., na.rm = TRUE)) %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("names") %>%
  filter(V1 == 0)
```

\pagebreak

# Know your data

  + Your data have **`r nrow(e)`** samples, **`r ncol(e)`** features and **`r length(table(target$Group))`** groups, that are **`r noquote(paste(shQuote(levels(as.factor(target$Group))), collapse=", "))`**. `r ifelse(ncol(SummarizedExperiment::colData(data)) > 1, paste0("Furthermore, **", ncol(SummarizedExperiment::colData(data)) - 1,"** covariates have been found in your data. These covariates are **",noquote(paste(shQuote(paste0(colnames(SummarizedExperiment::colData(data))[2:ncol(SummarizedExperiment::colData(data))])), collapse=", ")),"**."), "")`          

  + A **`r round((sum(is.na(e))/(nrow(e)*ncol(e)))*100, 2)`%** of values in your data are NAs (missing values). `r ifelse(nrow(nas) >= 1, paste0("Variables that have NA values are **",noquote(paste(shQuote(paste0(nas$names," (",nas$number,")")), collapse=", ")),"**."), "")`

  + A **`r round((sum(zeros$number)/(nrow(e)*ncol(e)))*100, 2)`%** of values in your data are zeros. `r ifelse(nrow(zeros) >= 1, paste0("Variables that have zeros are **",noquote(paste(shQuote(paste0(zeros$names," (",zeros$number,")")), collapse=", ")),"**."), "")`
  
  + Removed from the exploratory data analysis **`r nrow(all_zero)`** features that only have zeros. `r ifelse(nrow(all_zero) >= 1, paste0("These variables are **",noquote(paste(shQuote(all_zero$names), collapse=", ")),"**."), "")` 
  
  + Removed from the exploratory data analysis **`r nrow(var_zero)`** features that have zero variance. `r ifelse(nrow(var_zero) >= 1, paste0("These variables are **",noquote(paste(shQuote(var_zero$names), collapse=", ")),"**."), "")`

## Summary Table

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
summary_table1 <- data.frame(Samples = nrow(e),
                             Features = ncol(e),
                             Covariates = ncol(SummarizedExperiment::colData(data)) - 1)
summary_table2 <- data.frame(Counts_Zero = sum(zeros$number),
                             Percent_Zero = paste(round((sum(zeros$number)/(nrow(e)*ncol(e)))*100, 2), "%"))
summary_table3 <- data.frame(Counts_NA = sum(is.na(e)),
                             Percent_NA = paste(round((sum(is.na(e))/(nrow(e)*ncol(e)))*100, 2), "%")) 
knitr::kable(summary_table1)
knitr::kable(summary_table2)
knitr::kable(summary_table3)
```

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
if (nrow(nas) >= 1){
  ggplot(nas, aes(reorder(names, number), number, fill = number)) +
    geom_col() +
    ylab("Missing values") +
    xlab("") +
    ggtitle("Missing Value Plot") +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none")
}
```

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
if (nrow(zeros) >= 1){
  ggplot(zeros, aes(reorder(names, number), number, fill = number)) +
    geom_col() +
    ylab("Zeros") +
    xlab("") +
    ggtitle("Zeros Plot") +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none") +
    scale_fill_viridis_d(begin = 0, end = 0.8)
}
```

## Samples by Group

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
counts <- data.frame(table(target$Group))
colnames(counts) <- c("Group", "Counts")

ggplot(counts, aes(reorder(Group, Counts), Counts, fill = Group)) +
  geom_col() +
  ylab("Counts") +
  xlab("") +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none") +
    scale_fill_viridis_d(begin = 0, end = 0.8)
```

\pagebreak

# Normalization Plots

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
indNum <- nrow(SummarizedExperiment::colData(pre_processed))
jttr <- ifelse(indNum <= 10, TRUE, FALSE)

p1 <- PomaBoxplots(imputed, jitter = jttr, label_size = 8) +
  xlab("Samples") +
  ylab("Value") +
  ggtitle("Not Normalized") +
  theme(legend.position = "bottom")

p2 <- PomaBoxplots(pre_processed, jitter = jttr, label_size = 8) +
  xlab("Samples") +
  ylab("Value") +
  ggtitle(paste0("Normalized (", normalization, ")")) +
  theme(legend.position = "bottom")

p1
p2
```

\pagebreak

# Group Distribution Plots

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
p3 <- PomaDensity(imputed) +
  ggtitle("Not Normalized")

p4 <- PomaDensity(pre_processed) +
  ggtitle(paste0("Normalized (", normalization, ")"))

p3
p4
```

\pagebreak

# Outlier Detection

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
outliers <- data %>% 
  PomaImpute(method = imputation) %>%
  PomaNorm(method = normalization) %>%
  PomaOutliers(do = "analyze", coef = coeff_outliers)
outliers$polygon_plot
```

**`r nrow(outliers$outliers)`** possible outliers detected in your data. `r ifelse(nrow(outliers$outliers) >= 1, paste0("These outliers are **",noquote(paste(shQuote(paste0(outliers$outliers$sample)), collapse=", ")),"**."), "")`

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
if(nrow(outliers$outliers) >= 1){
  knitr::kable(outliers$outliers)
  }
```

\pagebreak

# High Correlated Features (r > 0.97)

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
correlations <- PomaCorr(pre_processed, 
                         label_size = 8)
high_correlations <- correlations$correlations %>% filter(abs(R) > 0.97)
```

There are **`r nrow(high_correlations)`** high correlated feature pairs in your data. `r ifelse(nrow(high_correlations) >= 1, paste0("These features are **",noquote(paste(shQuote(paste0(high_correlations$feature1, " - " , high_correlations$feature2)), collapse=", ")),"**."), "")`

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE, fig.align = 'center'}
correlations$corrplot
```

\pagebreak

# Heatmap and Clustering

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
PomaHeatmap(pre_processed)
```

\pagebreak

# Principal Component Analysis

```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE}
PomaMultivariate(pre_processed, method = "pca", ellipse = FALSE)$scoresplot
```