## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL ) ## ----install_demo, eval=FALSE------------------------------------------------- # if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") # # BiocManager::install("squallms") ## ----library_squallms, message=FALSE------------------------------------------ library(squallms) ## ----run_xcms, message=FALSE-------------------------------------------------- suppressPackageStartupMessages({ library(dplyr) library(tidyr) library(tibble) library(ggplot2) library(xcms) library(RaMS) }) mzML_files <- system.file("extdata", package = "RaMS") %>% list.files(full.names = TRUE, pattern = "[A-F].mzML") register(BPPARAM = SerialParam()) cwp <- CentWaveParam(snthresh = 0, extendLengthMSW = TRUE, integrate = 2) obp <- ObiwarpParam(binSize = 0.1, response = 1, distFun = "cor_opt") pdp <- PeakDensityParam( sampleGroups = 1:3, bw = 12, minFraction = 0, binSize = 0.001, minSamples = 0 ) xcms_major_version <- as.numeric(substr(as.character(packageVersion("xcms")), 1, 1)) if (xcms_major_version >= 4) { library(MSnbase) msexp <- readMSData(mzML_files, msLevel. = 1, mode = "onDisk") fpp <- FillChromPeaksParam(ppm = 5) } else { library(MsExperiment) msexp <- readMsExperiment(mzML_files, msLevel. = 1, mode = "onDisk") fpp <- ChromPeakAreaParam() } xcms_filled <- msexp %>% findChromPeaks(cwp) %>% adjustRtime(obp) %>% groupChromPeaks(pdp) %>% fillChromPeaks(fpp) ## ----makeXcmsObjFlat---------------------------------------------------------- feat_peak_info <- makeXcmsObjFlat(xcms_filled) %>% select(feature, starts_with("mz"), starts_with("rt"), filename, filepath) feat_peak_info %>% head() %>% mutate(filepath = paste0(substr(filepath, 1, 13), "~")) %>% knitr::kable(caption = "Output from makeXcmsObjFlat.") ## ----pooled_files_only-------------------------------------------------------- feat_peak_info_subset <- feat_peak_info %>% filter(grepl("AB|CD", filename)) ## ----extractChromMetrics, warning=FALSE--------------------------------------- msdata <- grabMSdata(unique(feat_peak_info$filepath), verbosity = 0) shape_metrics <- extractChromMetrics(feat_peak_info, ms1_data = msdata$MS1) knitr::kable(head(shape_metrics), caption = "Format of the output from extractChromMetrics") ## ----plotmetrics, warning=FALSE, fig.cap="Mass by time diagram of all features returned by XCMS. Points have been colored by their calculated peak shape metric and are sized by their calculated signal-to-noise ratio. Large, pale points tend to be higher quality than small dark ones."---- shape_metrics %>% arrange(desc(med_snr)) %>% ggplot() + geom_point(aes(x = med_rt, y = med_mz, color = med_cor, size = med_snr), alpha = 0.5) + theme_bw() ## ----plotrandompeaks, fig.cap="Eight randomly selected features from the XCMS object shown as extracted ion chromatograms. Colors correspond to the med_cor metric and the med_snr value is shown as a label in the upper right. As shown above, high-quality peaks tend to be pale and have large SNR values but several features show very poor quality."---- # Set seed for reproducibility, ensuring that slice_sample always returns the # same features for discussion below. set.seed(123) some_random_feats <- shape_metrics %>% slice_sample(n = 8) some_random_feats %>% mutate(mzmin = med_mz - med_mz * 5 / 1e6) %>% mutate(mzmax = med_mz + med_mz * 5 / 1e6) %>% mutate(rtmin = med_rt - 1) %>% mutate(rtmax = med_rt + 1) %>% left_join(msdata$MS1, join_by( between(y$rt, x$rtmin, x$rtmax), between(y$mz, x$mzmin, x$mzmax) )) %>% qplotMS1data(color_col = "med_cor") + geom_vline(aes(xintercept = med_rt), color = "red") + geom_text(aes(x = Inf, y = Inf, label = paste0("SNR: ", round(med_snr))), data = some_random_feats, hjust = 1, vjust = 1, color = "black" ) + facet_wrap(~feature, scales = "free", nrow = 3) + scale_color_continuous(limits = c(0, 1), name = "Peak shape metric") + scale_y_continuous(expand = expansion(c(0, 0.25))) + theme(legend.position.inside = c(0.82, 0.12), legend.position = "inside") + guides(color = guide_colorbar(direction = "horizontal", title.position = "top")) ## ----labelFeatsManual, eval=FALSE--------------------------------------------- # # Chunk not run during vignette build because interactive input required # # Annotations may differ from the saved output loaded below based on user preference # manual_classes <- labelFeatsManual(feat_peak_info, ms1_data = msdata$MS1) ## ----plotManualFeatWindow, echo=FALSE, fig.cap="Screenshot of lasso labeling dashboard showing the first feature in the dataset as seen via labelFeatsManual."---- knitr::include_graphics(file.path("intro_manual_ss.png")) ## ----load_prior_manual_labels------------------------------------------------- manual_classes <- readRDS(system.file("extdata", "intro_manual_labels.rds", package = "squallms")) table(manual_classes, useNA = "ifany") ## ----pcaoutput creation------------------------------------------------------- pcaoutput <- feat_peak_info %>% filter(feature %in% sprintf("FT%03d", 30:50)) %>% pickyPCA( ms1_data = msdata$MS1, rt_window_width = 1, ppm_window_width = 5, verbosity = 0 ) ## ----pixelmatrixggplot, fig.cap="File by time views of extracted ion chromatograms for a subset of features shown as a pixel matrix. Colors correspond to the relative intensity of the chromatogram across the gridded retention time axis, with good features consistently showing up as a bright streak down the center of the plot, while noise features look like static. XCMS feature IDs are shown in the facet labels."---- pcaoutput$interp_df %>% ggplot() + geom_tile(aes(x = approx_rt, y = filename, fill = approx_int)) + facet_wrap(~feature, nrow = 4) + scale_x_continuous( breaks = c(1, 25, 50), labels = c("0", "0.5", "1"), expand = expansion() ) + scale_y_discrete(expand = expansion()) + theme( axis.text = element_blank(), axis.ticks = element_blank(), legend.position = "none" ) + labs(x = "Scaled retention time", y = "Individual files") ## ----principalcomponentplot, fig.cap="Principal component analysis of the pixel matrix features. Each 'pixel' in the previous plot was a dimension along which the PCA detected redundancy, allowing for a majority of the variance to be explained by the first axis in which good features separate from bad ones."---- pcaoutput$pcamat %>% prcomp() %>% .$rotation %>% .[, 1:2] %>% as.data.frame() %>% rownames_to_column("feature") %>% ggplot(aes(x = PC1, y = PC2, label = feature, key = feature)) + geom_vline(xintercept = 0) + geom_hline(yintercept = 0) + geom_text() + coord_fixed() + scale_x_continuous(expand = expansion(0.2)) + labs(x = "Principal component 1", y = "Principal component 2") + theme_bw() ## ----labelFeatsLasso, eval=FALSE---------------------------------------------- # # Chunk not run during vignette build because interactive input required # # Annotations may differ from the saved output loaded below based on user preference # lasso_classes <- labelFeatsLasso(feat_peak_info, ms1_data = msdata$MS1) ## ----goodss, echo=FALSE, fig.cap="Screenshot of lasso labeling dashboard when 'Good' features are being lassoed in the principal components graph. Central plot shows the interactive PCA with feature IDs set to react to both mouse hover and lasso selection. Hovering over a given feature shows its extracted ion chromatogram in the lower left plot. Clicking and dragging to select multiple features with the lasso option produces the lower right figure which represents the aggregate chromatogram after normalization. Plot in the upper right shows the aggregate chromatogram for each cluster of the central PCA."---- knitr::include_graphics(file.path("intro_good_ss.png")) ## ----badss, echo=FALSE, fig.cap="Screenshot of lasso labeling dashboard when 'Bad' features are being lassoed in the principal components graph."---- knitr::include_graphics(file.path("intro_bad_ss.png")) ## ----load_prior_lasso_labels-------------------------------------------------- lasso_classes <- readRDS(system.file("extdata", "intro_lasso_labels.rds", package = "squallms")) table(lasso_classes, useNA = "ifany") ## ----caret_confusion---------------------------------------------------------- suppressPackageStartupMessages(library(caret)) data.frame(manual = manual_classes, lasso = lasso_classes) %>% filter(!is.na(manual) & !is.na(lasso)) data.frame(manual = factor(manual_classes), lasso = factor(lasso_classes)) %>% filter(!is.na(manual)) %>% with(confusionMatrix(lasso, manual, positive = "Good")) ## ----update_existing---------------------------------------------------------- class_labels <- lasso_classes class_labels[!is.na(manual_classes)] <- manual_classes[!is.na(manual_classes)] ## ----logModelFeatProb, warning=FALSE, fig.cap="Scatterplot of features showing the relationship between med_cor and med_snr, with colors indicating the model likelihood estimation for known good features (filled circles), known bad features (X characters), and unclassified feature (open circles). The logistic fit can be seen as diagonal stripes of a given color descending from the upper left to the bottom right."---- pred_probs <- logModelFeatProb(shape_metrics, class_labels) ## ----logModelFeatQuality, warning=FALSE--------------------------------------- pred_classes <- logModelFeatQuality(shape_metrics, class_labels, verbosity = 1) ## ----updateXcmsObjFeats------------------------------------------------------- final_xcms <- updateXcmsObjFeats(xcms_filled, shape_metrics, class_labels, verbosity = 0, likelihood_threshold = 0.5 ) ## ----------------------------------------------------------------------------- sessionInfo()