## ----global_options, include=FALSE-------------------------------------------- knitr::opts_chunk$set( echo = TRUE, warning = FALSE, message = FALSE, fig.pos = "h", collapse = TRUE, comment = "#>" ) ## ----setup, include = FALSE--------------------------------------------------- chooseCRANmirror(graphics = FALSE, ind = 1) options(tinytex.verbose = TRUE) ## ----batch_workflow, include = TRUE, fig.align = 'center', echo=FALSE, fig.cap='proBatch in batch correction workflow', out.width = '50%'---- knitr::include_graphics("Batch_effects_workflow_staircase.png") ## ----install_proBatch, fig.show='hold', eval = FALSE-------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # BiocManager::install("proBatch") # # library(proBatch) ## ----load_packages------------------------------------------------------------ require(dplyr) require(tibble) require(ggplot2) ## ----col_names---------------------------------------------------------------- feature_id_col <- "peptide_group_label" measure_col <- "Intensity" sample_id_col <- "FullRunName" essential_columns <- c(feature_id_col, measure_col, sample_id_col) ## ----tech_bio_cols------------------------------------------------------------ technical_factors <- c("MS_batch", "digestion_batch") biological_factors <- c("Strain", "Diet", "Sex") biospecimen_id_col <- "EarTag" ## ----set_batch_col------------------------------------------------------------ batch_col <- "MS_batch" ## ----load_data, fig.show='hold'----------------------------------------------- library(proBatch) data( "example_proteome", "example_sample_annotation", "example_peptide_annotation", package = "proBatch" ) ## ----date_to_order, fig.show='hold'------------------------------------------- generated_sample_annotation <- date_to_sample_order( example_sample_annotation, time_column = c("RunDate", "RunTime"), new_time_column = "generated_DateTime", dateTimeFormat = c("%b_%d", "%H:%M:%S"), new_order_col = "generated_order", instrument_col = NULL ) library(knitr) kable(generated_sample_annotation[1:5, ] %>% select(c("RunDate", "RunTime", "order", "generated_DateTime", "generated_order"))) ## ----pep_annotation, fig.show='hold'------------------------------------------ generated_peptide_annotation <- create_peptide_annotation( example_proteome, feature_id_col = "peptide_group_label", protein_col = "Protein" ) ## ----reduce_proteome---------------------------------------------------------- example_proteome <- example_proteome %>% select(one_of(essential_columns)) gc() ## ----long_to_matrix----------------------------------------------------------- example_matrix <- long_to_matrix( example_proteome, feature_id_col = "peptide_group_label", measure_col = "Intensity", sample_id_col = "FullRunName" ) ## ----log_transform------------------------------------------------------------ log_transformed_matrix <- log_transform_dm( example_matrix, log_base = 2, offset = 1 ) ## ----color_scheme, fig.show='hold'-------------------------------------------- color_list <- sample_annotation_to_colors( example_sample_annotation, factor_columns = c( "MS_batch", "digestion_batch", "EarTag", "Strain", "Diet", "Sex" ), numeric_columns = c("DateTime", "order") ) ## ----plot_mean, fig.show='hold', fig.width=5, fig.height=2-------------------- plot_sample_mean( log_transformed_matrix, example_sample_annotation, order_col = "order", batch_col = batch_col, color_by_batch = TRUE, ylimits = c(12, 16.5), color_scheme = color_list[[batch_col]], base_size = 10 ) ## ----plot_boxplots, fig.show='hold', fig.width=10, fig.height=5--------------- log_transformed_long <- matrix_to_long(log_transformed_matrix) batch_col <- "MS_batch" plot_boxplot( log_transformed_long, example_sample_annotation, batch_col = batch_col, color_scheme = color_list[[batch_col]] ) ## ----median_normalization_log, fig.show='hold'-------------------------------- median_normalized_matrix <- normalize_data_dm( log_transformed_matrix, normalize_func = "medianCentering" ) ## ----median_normalization_raw, fig.show='hold'-------------------------------- median_normalized_matrix <- normalize_data_dm( example_matrix, normalize_func = "medianCentering", log_base = 2, offset = 1 ) ## ----quantile_norm, fig.show='hold'------------------------------------------- quantile_normalized_matrix <- normalize_data_dm( log_transformed_matrix, normalize_func = "quantile" ) ## ----plot_mean_normalized, fig.show='hold', fig.width=5, fig.height=2--------- plot_sample_mean( quantile_normalized_matrix, example_sample_annotation, color_by_batch = TRUE, ylimits = c(12, 16), color_scheme = color_list[[batch_col]] ) ## ----plot_hierarchical, fig.show='hold', fig.width=10, fig.height=5----------- selected_annotations <- c("MS_batch", "digestion_batch", "Diet") # Plot clustering between samples plot_hierarchical_clustering( quantile_normalized_matrix, sample_annotation = example_sample_annotation, color_list = color_list, factors_to_plot = selected_annotations, distance = "euclidean", agglomeration = "complete", label_samples = FALSE ) ## ----plot_heatmap, fig.show='hold', fig.width=10, fig.height=11--------------- plot_heatmap_diagnostic( quantile_normalized_matrix, example_sample_annotation, factors_to_plot = selected_annotations, cluster_cols = TRUE, color_list = color_list, show_rownames = FALSE, show_colnames = FALSE ) ## ----plot_PCA, fig.show='hold', fig.width=14, fig.height=8.5------------------ pca1 <- plot_PCA( quantile_normalized_matrix, example_sample_annotation, color_by = "MS_batch", plot_title = "MS batch", color_scheme = color_list[["MS_batch"]] ) pca2 <- plot_PCA( quantile_normalized_matrix, example_sample_annotation, color_by = "digestion_batch", plot_title = "Digestion batch", color_scheme = color_list[["digestion_batch"]] ) pca3 <- plot_PCA( quantile_normalized_matrix, example_sample_annotation, color_by = "Diet", plot_title = "Diet", color_scheme = color_list[["Diet"]] ) pca4 <- plot_PCA( quantile_normalized_matrix, example_sample_annotation, color_by = "DateTime", plot_title = "DateTime", color_scheme = color_list[["DateTime"]] ) library(gridExtra) grid.arrange(pca1, pca2, pca3, pca4, ncol = 2, nrow = 2) ## ----plot_PCA_spec, fig.show='hold', fig.width=5, fig.height=3.5-------------- pca_spec <- plot_PCA( quantile_normalized_matrix, example_sample_annotation, color_by = "digestion_batch", plot_title = "Digestion batch" ) pca_spec ## ----plot_PVCA, fig.show='hold', fig.width=5, fig.height=5-------------------- plot_PVCA( quantile_normalized_matrix, example_sample_annotation, technical_factors = technical_factors, biological_factors = biological_factors ) ## ----plot_spikeIn, fig.show='hold'-------------------------------------------- quantile_normalized_long <- matrix_to_long(quantile_normalized_matrix) plot_spike_in( quantile_normalized_long, example_sample_annotation, peptide_annotation = example_peptide_annotation, protein_col = "Gene", spike_ins = "BOVINE_A1ag", plot_title = "Spike-in BOVINE protein peptides", color_by_batch = TRUE, color_scheme = color_list[[batch_col]], base_size = 8 ) ## ----loess_fit, fig.show='hold', fig.width=5, fig.height=2.4------------------ loess_fit_df <- adjust_batch_trend_df(quantile_normalized_long, example_sample_annotation) ## ----loess_30, fig.show='hold', fig.width=5, fig.height=2.4------------------- loess_fit_30 <- adjust_batch_trend_df( quantile_normalized_long, example_sample_annotation, span = 0.3 ) plot_with_fitting_curve( feature_name = "10231_QDVDVWLWQQEGSSK_2", fit_df = loess_fit_30, fit_value_col = "fit", df_long = quantile_normalized_long, sample_annotation = example_sample_annotation, color_by_batch = TRUE, color_scheme = color_list[[batch_col]], plot_title = "Span = 30%", base_size = 10 # text size ) ## ----loess_70, fig.show='hold', fig.width=5, fig.height=2.4------------------- loess_fit_70 <- adjust_batch_trend_df( quantile_normalized_long, example_sample_annotation, span = 0.7 ) plot_with_fitting_curve( feature_name = "10231_QDVDVWLWQQEGSSK_2", fit_df = loess_fit_70, fit_value_col = "fit", df_long = quantile_normalized_long, sample_annotation = example_sample_annotation, color_by_batch = TRUE, color_scheme = color_list[[batch_col]], plot_title = "Span = 70%", base_size = 10 # text size ) ## ----median_batch, fig.show='hold', fig.width=3, fig.height=2.4--------------- peptide_median_df <- center_feature_batch_medians_df( loess_fit_df, example_sample_annotation ) plot_single_feature( feature_name = "10231_QDVDVWLWQQEGSSK_2", df_long = peptide_median_df, sample_annotation = example_sample_annotation, measure_col = "Intensity", plot_title = "Feature-level Median Centered", base_size = 10 # text size ) ## ----comBat, fig.show='hold'-------------------------------------------------- comBat_df <- correct_with_ComBat_df(loess_fit_df, example_sample_annotation) ## ----combat_result, fig.show='hold', fig.width=3, fig.height=2.4------------- plot_single_feature( feature_name = "10231_QDVDVWLWQQEGSSK_2", df_long = loess_fit_df, sample_annotation = example_sample_annotation, plot_title = "Loess Fitted", base_size = 10 # text and dots size ) plot_single_feature( feature_name = "10231_QDVDVWLWQQEGSSK_2", df_long = comBat_df, sample_annotation = example_sample_annotation, plot_title = "ComBat corrected", base_size = 10 # text and dots size ) ## ----batch_corr_general, fig.show='hold'-------------------------------------- batch_corrected_df <- correct_batch_effects_df( df_long = quantile_normalized_long, sample_annotation = example_sample_annotation, discrete_func = "ComBat", continuous_func = "loess_regression", abs_threshold = 5, pct_threshold = 0.20 ) batch_corrected_matrix <- long_to_matrix(batch_corrected_df) ## ----setup_corr_heatmap, fig.show='hold', fig.height=5, fig.width=8----------- earTags <- c( "ET1524", "ET2078", "ET1322", "ET1566", "ET1354", "ET1420", "ET2154", "ET1515", "ET1506", "ET2577", "ET1681", "ET1585", "ET1518", "ET1906" ) replicate_filenames <- example_sample_annotation %>% filter(MS_batch %in% c("Batch_2", "Batch_3")) %>% filter(EarTag %in% earTags) %>% pull(!!sym("FullRunName")) ## ----corr_heatmap, fig.show='hold', fig.width=10, fig.height=11--------------- p1_exp <- plot_sample_corr_heatmap( log_transformed_matrix, samples_to_plot = replicate_filenames, plot_title = "Correlation of selected samples" ) ## ----color_scale, fig.show='hold'--------------------------------------------- breaksList <- seq(0.7, 1, by = 0.01) # color scale of pheatmap heatmap_colors <- colorRampPalette( rev(RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")) )(length(breaksList) + 1) ## ----corr_samples_heatmap, fig.show='hold', fig.height=4, fig.width=5--------- # Plot the heatmap with annotations for the chosen samples factors_to_show <- c(batch_col, biospecimen_id_col) p1 <- plot_sample_corr_heatmap( log_transformed_matrix, samples_to_plot = replicate_filenames, sample_annotation = example_sample_annotation, factors_to_plot = factors_to_show, plot_title = "Log transformed correlation matrix of\nselected replicated samples", color_list = color_list, heatmap_color = heatmap_colors, breaks = breaksList, cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 6, annotation_names_col = TRUE, annotation_legend = FALSE, show_colnames = FALSE ) p2 <- plot_sample_corr_heatmap( batch_corrected_matrix, samples_to_plot = replicate_filenames, sample_annotation = example_sample_annotation, factors_to_plot = factors_to_show, plot_title = "Batch Corrected\nselected replicated samples", color_list = color_list, heatmap_color = heatmap_colors, breaks = breaksList, cluster_rows = FALSE, cluster_cols = FALSE, fontsize = 6, annotation_names_col = TRUE, annotation_legend = FALSE, show_colnames = FALSE ) ## ----corr_samples_heatmap_2, fig.show='hold', fig.height=4, fig.width=10------ library(gridExtra) grid.arrange(grobs = list(p1$gtable, p2$gtable), ncol = 2) ## ----corr_samples_distrib----------------------------------------------------- sample_cor_raw <- plot_sample_corr_distribution( log_transformed_matrix, example_sample_annotation, # repeated_samples = replicate_filenames, batch_col = "MS_batch", biospecimen_id_col = "EarTag", plot_title = "Correlation of samples (raw)", plot_param = "batch_replicate" ) raw_corr <- sample_cor_raw + theme( axis.text.x = element_text(angle = 45, hjust = 1), text = element_text(size = 8) ) + ylim(0.7, 1) + xlab(NULL) sample_cor_batchCor <- plot_sample_corr_distribution( batch_corrected_matrix, example_sample_annotation, batch_col = "MS_batch", plot_title = "Batch corrected", plot_param = "batch_replicate" ) corr_corr <- sample_cor_batchCor + theme( axis.text.x = element_text(angle = 45, hjust = 1), text = element_text(size = 8) ) + ylim(0.7, 1) + xlab(NULL) ## ----combine_corr, fig.height=2.5, fig.show='hold', fig.width=6, warning=FALSE---- grid.arrange( grobs = list(raw_corr, corr_corr), size = "first", ncol = 2 ) ## ----correlation_of_peptides, fig.height=2.5, fig.show='hold', fig.width=6---- peptide_cor_raw <- plot_peptide_corr_distribution( log_transformed_matrix, example_peptide_annotation, protein_col = "Gene", plot_title = "Peptide correlation (raw)" ) peptide_cor_batchCor <- plot_peptide_corr_distribution( batch_corrected_matrix, example_peptide_annotation, protein_col = "Gene", plot_title = "Peptide correlation (batch corrected)" ) g2 <- peptide_cor_raw + theme(text = element_text(size = 8)) + ylim(c(-1, 1)) g3 <- peptide_cor_batchCor + ylim(c(-1, 1)) + theme(text = element_text(size = 8)) grid.arrange(grobs = list(g2, g3), size = "first", ncol = 2) ## ----sessionInfo, eval=TRUE--------------------------------------------------- sessionInfo() ## ----citation----------------------------------------------------------------- citation("proBatch")