--- title: "POMA EDA Example" author: - name: Pol Castellano-Escuder affiliation: Duke University email: polcaes@gmail.com date: "`r BiocStyle::doc_date()`" output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{POMA EDA Example} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} %\VignetteEncoding{UTF-8} biblio-style: apalike link-citations: true --- **Compiled date**: `r Sys.Date()` **Last edited**: 2021-12-12 **License**: `r packageDescription("POMA")[["License"]]` ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, # fig.align = "center", comment = ">" ) ``` # Installation Run the following code to install the Bioconductor version of package. ```{r, eval = FALSE} # install.packages("BiocManager") BiocManager::install("POMA") ``` # Load POMA ```{r, warning = FALSE, message = FALSE, comment = FALSE} library(POMA) ``` # Automatic EDA Report The following function will return an Exploratory Data Analysis (EDA) PDF report. The input object must be a **SummarizedExperiment** object. ```{r, eval = FALSE} data("st000336") PomaEDA(st000336) ``` **Generated EDA PDF report starts here.** ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} # This file is part of POMA. # POMA is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # POMA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with POMA. If not, see . library(knitr) library(dplyr) library(tibble) library(ggplot2) library(SummarizedExperiment) library(POMA) data <- st000336 imputation <- "knn" normalization <- "log_pareto" clean_outliers <- TRUE coeff_outliers <- 1.5 ## e <- t(SummarizedExperiment::assay(data)) target <- SummarizedExperiment::colData(data) %>% as.data.frame() %>% rownames_to_column("ID") %>% dplyr::rename(Group = 2) %>% dplyr::select(ID, Group) ``` \pagebreak ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} if(clean_outliers){ imputed <- PomaImpute(data, method = imputation) pre_processed <- PomaNorm(imputed, method = normalization) %>% PomaOutliers(coef = coeff_outliers) } else { imputed <- PomaImpute(data, method = imputation) pre_processed <- PomaNorm(imputed, method = normalization) } ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} # zeros zeros <- data.frame(number = colSums(e == 0, na.rm = TRUE)) %>% rownames_to_column("names") %>% filter(number != 0) all_zero <- zeros %>% filter(number == nrow(e)) # missing values nas <- data.frame(number = colSums(is.na(e))) %>% rownames_to_column("names") %>% filter(number != 0) # zero variance var_zero <- e %>% as.data.frame() %>% summarise_all(~ var(., na.rm = TRUE)) %>% t() %>% as.data.frame() %>% rownames_to_column("names") %>% filter(V1 == 0) ``` \pagebreak # Know your data + Your data have **`r nrow(e)`** samples, **`r ncol(e)`** features and **`r length(table(target$Group))`** groups, that are **`r noquote(paste(shQuote(levels(as.factor(target$Group))), collapse=", "))`**. `r ifelse(ncol(SummarizedExperiment::colData(data)) > 1, paste0("Furthermore, **", ncol(SummarizedExperiment::colData(data)) - 1,"** covariates have been found in your data. These covariates are **",noquote(paste(shQuote(paste0(colnames(SummarizedExperiment::colData(data))[2:ncol(SummarizedExperiment::colData(data))])), collapse=", ")),"**."), "")` + A **`r round((sum(is.na(e))/(nrow(e)*ncol(e)))*100, 2)`%** of values in your data are NAs (missing values). `r ifelse(nrow(nas) >= 1, paste0("Variables that have NA values are **",noquote(paste(shQuote(paste0(nas$names," (",nas$number,")")), collapse=", ")),"**."), "")` + A **`r round((sum(zeros$number)/(nrow(e)*ncol(e)))*100, 2)`%** of values in your data are zeros. `r ifelse(nrow(zeros) >= 1, paste0("Variables that have zeros are **",noquote(paste(shQuote(paste0(zeros$names," (",zeros$number,")")), collapse=", ")),"**."), "")` + Removed from the exploratory data analysis **`r nrow(all_zero)`** features that only have zeros. `r ifelse(nrow(all_zero) >= 1, paste0("These variables are **",noquote(paste(shQuote(all_zero$names), collapse=", ")),"**."), "")` + Removed from the exploratory data analysis **`r nrow(var_zero)`** features that have zero variance. `r ifelse(nrow(var_zero) >= 1, paste0("These variables are **",noquote(paste(shQuote(var_zero$names), collapse=", ")),"**."), "")` ## Summary Table ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} summary_table1 <- data.frame(Samples = nrow(e), Features = ncol(e), Covariates = ncol(SummarizedExperiment::colData(data)) - 1) summary_table2 <- data.frame(Counts_Zero = sum(zeros$number), Percent_Zero = paste(round((sum(zeros$number)/(nrow(e)*ncol(e)))*100, 2), "%")) summary_table3 <- data.frame(Counts_NA = sum(is.na(e)), Percent_NA = paste(round((sum(is.na(e))/(nrow(e)*ncol(e)))*100, 2), "%")) knitr::kable(summary_table1) knitr::kable(summary_table2) knitr::kable(summary_table3) ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} if (nrow(nas) >= 1){ ggplot(nas, aes(reorder(names, number), number, fill = number)) + geom_col() + ylab("Missing values") + xlab("") + ggtitle("Missing Value Plot") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") } ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} if (nrow(zeros) >= 1){ ggplot(zeros, aes(reorder(names, number), number, fill = number)) + geom_col() + ylab("Zeros") + xlab("") + ggtitle("Zeros Plot") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") + scale_fill_viridis_d(begin = 0, end = 0.8) } ``` ## Samples by Group ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} counts <- data.frame(table(target$Group)) colnames(counts) <- c("Group", "Counts") ggplot(counts, aes(reorder(Group, Counts), Counts, fill = Group)) + geom_col() + ylab("Counts") + xlab("") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") + scale_fill_viridis_d(begin = 0, end = 0.8) ``` \pagebreak # Normalization Plots ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} indNum <- nrow(SummarizedExperiment::colData(pre_processed)) jttr <- ifelse(indNum <= 10, TRUE, FALSE) p1 <- PomaBoxplots(imputed, jitter = jttr, label_size = 8) + xlab("Samples") + ylab("Value") + ggtitle("Not Normalized") + theme(legend.position = "bottom") p2 <- PomaBoxplots(pre_processed, jitter = jttr, label_size = 8) + xlab("Samples") + ylab("Value") + ggtitle(paste0("Normalized (", normalization, ")")) + theme(legend.position = "bottom") p1 p2 ``` \pagebreak # Group Distribution Plots ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} p3 <- PomaDensity(imputed) + ggtitle("Not Normalized") p4 <- PomaDensity(pre_processed) + ggtitle(paste0("Normalized (", normalization, ")")) p3 p4 ``` \pagebreak # Outlier Detection ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} outliers <- data %>% PomaImpute(method = imputation) %>% PomaNorm(method = normalization) %>% PomaOutliers(do = "analyze", coef = coeff_outliers) outliers$polygon_plot ``` **`r nrow(outliers$outliers)`** possible outliers detected in your data. `r ifelse(nrow(outliers$outliers) >= 1, paste0("These outliers are **",noquote(paste(shQuote(paste0(outliers$outliers$sample)), collapse=", ")),"**."), "")` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} if(nrow(outliers$outliers) >= 1){ knitr::kable(outliers$outliers) } ``` \pagebreak # High Correlated Features (r > 0.97) ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} correlations <- PomaCorr(pre_processed, label_size = 8) high_correlations <- correlations$correlations %>% filter(abs(R) > 0.97) ``` There are **`r nrow(high_correlations)`** high correlated feature pairs in your data. `r ifelse(nrow(high_correlations) >= 1, paste0("These features are **",noquote(paste(shQuote(paste0(high_correlations$feature1, " - " , high_correlations$feature2)), collapse=", ")),"**."), "")` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE, fig.align = 'center'} correlations$corrplot ``` \pagebreak # Heatmap and Clustering ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} PomaHeatmap(pre_processed) ``` \pagebreak # Principal Component Analysis ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} PomaMultivariate(pre_processed, method = "pca", ellipse = FALSE)$scoresplot ```