--- title: "POMA EDA Example" author: - name: Pol Castellano-Escuder affiliation: University of Barcelona, Spain. email: polcaes@gmail.com date: "`r BiocStyle::doc_date()`" output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{POMA EDA Example} %\VignetteEngine{knitr::rmarkdown} %\usepackage[utf8]{inputenc} %\VignetteEncoding{UTF-8} biblio-style: apalike link-citations: true --- **Compiled date**: `r Sys.Date()` **Last edited**: 2020-08-03 **License**: `r packageDescription("POMA")[["License"]]` ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, # fig.align = "center", comment = ">" ) ``` # Installation Run the following code to install the Bioconductor version of package. ```{r, eval = FALSE} # install.packages("BiocManager") BiocManager::install("POMA") ``` # Load POMA ```{r, warning = FALSE, message = FALSE, comment = FALSE} library(POMA) ``` # Automatic EDA Report The following function will return an Exploratory Data Analysis (EDA) PDF report. The input object must be an **MSnSet** object. ```{r, eval = FALSE} data("st000336") PomaEDA(st000336) ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} library(patchwork) library(knitr) library(dplyr) library(tibble) library(ggplot2) library(reshape2) library(Biobase) e <- t(Biobase::exprs(st000336)) target <- Biobase::pData(st000336) %>% rownames_to_column("ID") %>% rename(Group = 2) %>% select(ID, Group) ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} imputed <- PomaImpute(st000336, method = "knn") pre_processed <- PomaNorm(imputed, method = "log_pareto") %>% PomaOutliers(coef = 1.5) ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} # zeros zeros <- data.frame(number = colSums(e == 0, na.rm = TRUE)) %>% rownames_to_column("names") %>% filter(number != 0) all_zero <- zeros %>% filter(number == nrow(e)) # missing values nas <- data.frame(number = colSums(is.na(e))) %>% rownames_to_column("names") %>% filter(number != 0) # zero variance var_zero <- e %>% as.data.frame() %>% summarise_all(~ var(., na.rm = TRUE)) %>% t() %>% as.data.frame() %>% rownames_to_column("names") %>% filter(V1 == 0) ``` **Generated EDA PDF report starts here.** # Know your data - Your data have **57** samples, **31** features and **2** groups, that are **‘Controls’, ‘DMD’**. Furthermore, **1** covariates have been found in your data. These covariates are **‘steroids’**. - A **3.45%** of values in your data are NAs (missing values). Variables that have NA values are **‘aspartic_acid (5)’, ‘citrulline (28)’, ‘glutamic_acid (15)’, ‘methionine (3)’, ‘x3_hba (1)’, ‘a_kg (1)’, ‘citrate (2)’, ‘fumarate (1)’, ‘lactate (2)’, ‘malate (1)’, ‘pyruvate (1)’, ‘succinate (1)’**. - A **0%** of values in your data are zeros. - Removed from the exploratory data analysis **0** features that only have zeros. - Removed from the exploratory data analysis **0** features that have zero variance. ## Summary Table ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} summary_table1 <- data.frame(Samples = nrow(e), Features = ncol(e), Covariates = ncol(Biobase::pData(st000336)) - 1) summary_table2 <- data.frame(Counts_Zero = sum(zeros$number), Percent_Zero = paste(round((sum(zeros$number)/(nrow(e)*ncol(e)))*100, 2), "%")) summary_table3 <- data.frame(Counts_NA = sum(is.na(e)), Percent_NA = paste(round((sum(is.na(e))/(nrow(e)*ncol(e)))*100, 2), "%")) knitr::kable(summary_table1) knitr::kable(summary_table2) knitr::kable(summary_table3) ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} if (nrow(nas) >= 1){ ggplot(nas, aes(reorder(names, number), number, fill = number)) + geom_col() + ylab("Missing values") + xlab("") + ggtitle("Missing Value Plot") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") } ``` ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} if (nrow(zeros) >= 1){ ggplot(zeros, aes(reorder(names, number), number, fill = number)) + geom_col() + ylab("Zeros") + xlab("") + ggtitle("Zeros Plot") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") } ``` ## Samples by Group ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} counts <- data.frame(table(target$Group)) colnames(counts) <- c("Group", "Counts") ggplot(counts, aes(reorder(Group, Counts), Counts, fill = Group)) + geom_col() + ylab("Counts") + xlab("") + theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") ``` # Normalization Plots ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} indNum <- nrow(Biobase::pData(pre_processed)) jttr <- ifelse(indNum <= 10, TRUE, FALSE) p1 <- PomaBoxplots(imputed, jitter = jttr) + xlab("Samples") + ylab("Value") + ggtitle("Not Normalized") + theme(legend.position = "bottom") p2 <- PomaBoxplots(pre_processed, jitter = jttr) + xlab("Samples") + ylab("Value") + ggtitle("Normalized ('log_pareto')") + theme(legend.position = "bottom") p1 + p2 ``` # Group Distribution Plots ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} p3 <- PomaDensity(imputed) + ggtitle("Not Normalized") p4 <- PomaDensity(pre_processed) + ggtitle("Normalized ('log_pareto')") p3/p4 ``` # Outlier Detection ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} outliers <- st000336 %>% PomaImpute(method = "knn") %>% PomaNorm(method = "log_pareto") %>% PomaOutliers(do = "analyze", coef = 1.5) outliers$polygon_plot ``` **7** possible outliers detected in your data. These outliers are **‘DMD119.2.U02’, ‘DMD084.11.U02’, ‘DMD087.12.U02’, ‘DMD023.10.U02’, ‘DMD046.11.U02’, ‘DMD133.9.U02’, ‘DMD135.10.U02’**. ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} if(nrow(outliers$outliers) >= 1){ knitr::kable(outliers$outliers) } ``` # High Correlated Features (r > 0.97) ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} correlations <- PomaCorr(pre_processed) high_correlations <- correlations$correlations %>% filter(abs(corr) > 0.97) ``` There are **0** high correlated feature pairs in your data. # Heatmap and Clustering ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} PomaHeatmap(pre_processed) ``` # Principal Component Analysis ```{r, echo = FALSE, warning = FALSE, comment = NA, message = FALSE} PomaMultivariate(pre_processed, method = "pca")$scoresplot ```