---
title: "Predicting New MCIA scores"
author:
  - Max Mattessich
  - Joaquin Reyna
  - Edel Aron
  - Anna Konstorum
date: "Compiled: `r format(Sys.time(), '%B %d, %Y')`"
output:
  BiocStyle::html_document:
    dev: 'jpeg'
    fig_retina: 1
    number_sections: FALSE
    toc_depth: 2
    toc_float: TRUE
vignette: >
  %\VignetteIndexEntry{Predicting New MCIA scores}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Predicting MCIA global (factor) scores for new test samples

It may be of interest to use the embedding that is calculated on a training
sample set to predict scores on a test set (or, equivalently, on new data).
After loading the `nipalsMCIA` library, we randomly split the NCI60 cancer cell
line data into training and test sets.

## Installation

```{r installation-github, eval = FALSE}
# install.packages("devtools")
devtools::install_github("Muunraker/nipalsMCIA", ref = "code-development",
                         force = TRUE, build_vignettes = TRUE) # devel version
```

```{r installation-bioconductor, eval = FALSE}
# after acceptance
# install.packages("BiocManager")
BiocManager::install("nipalsMCIA")
```

```{r load-packages, message = FALSE}
library(ggplot2)
library(MultiAssayExperiment)
library(nipalsMCIA)
```

## Split the data

```{r split-data}
data(NCI60)

# Fix the seed so the random train/test split is reproducible
set.seed(8)

# Draw 70% of the samples (rows of the first block) for training
num_samples <- nrow(data_blocks[[1]])
num_train <- round(num_samples * 0.7)
train_samples <- sample.int(num_samples, num_train)

# Apply the same row split to every block; drop = FALSE keeps each block
# two-dimensional even if a block has a single column
data_blocks_train <- lapply(data_blocks, function(block) {
  block[train_samples, , drop = FALSE]
})
data_blocks_test <- lapply(data_blocks, function(block) {
  block[-train_samples, , drop = FALSE]
})

# Split corresponding metadata, keyed by the sample names of the mrna block
metadata_train <- data.frame(metadata_NCI60[train_samples, ],
                             row.names = rownames(data_blocks_train$mrna))
colnames(metadata_train) <- "cancerType"

metadata_test <- data.frame(metadata_NCI60[-train_samples, ],
                            row.names = rownames(data_blocks_test$mrna))
colnames(metadata_test) <- "cancerType"

# Create train and test mae objects
data_blocks_train_mae <- simple_mae(data_blocks_train, row_format = "sample",
                                    colData = metadata_train)
data_blocks_test_mae <- simple_mae(data_blocks_test, row_format = "sample",
                                   colData = metadata_test)
```

## Run nipalsMCIA on training data

```{r computing-MCIA-on-training-data, message = FALSE, fig.show = "hide"}
MCIA_train <- nipals_multiblock(data_blocks = data_blocks_train_mae,
                                col_preproc_method = "colprofile",
                                num_PCs = 10, plots = "none", tol = 1e-9)
```

## Visualize model on training data using metadata on cancer type

The `get_metadata_colors()` function returns an assignment of a color for the
metadata columns. The `nmb_get_gs()` function returns the global scores from the
input `NipalsResult` object.

```{r visualize-training-model, fig.height = 2.5}
meta_colors <- get_metadata_colors(mcia_results = MCIA_train, color_col = 1,
                                   color_pal_params = list(option = "E"))

# Keep the first two global scores and attach the cancer type of each sample
global_scores <- nmb_get_gs(MCIA_train)
MCIA_out <- data.frame(global_scores[, 1:2])
MCIA_out$cancerType <- nmb_get_metadata(MCIA_train)$cancerType
colnames(MCIA_out) <- c("Factor.1", "Factor.2", "cancerType")

# plot the results
ggplot(data = MCIA_out,
       aes(x = Factor.1, y = Factor.2, color = cancerType)) +
  geom_point(size = 3) +
  labs(title = "MCIA for NCI60 training data") +
  scale_color_manual(values = meta_colors) +
  theme_bw()
```

## Generate factor scores for test data using the MCIA_train model

We use the `predict_gs()` function to generate new factor scores on the test
data set using the MCIA_train model. The new dataset in the form of an MAE
object is input using the parameter `test_data`.
```{r generate-new-scores, message = FALSE}
# Score the held-out samples using the results stored in MCIA_train
MCIA_test_scores <- predict_gs(mcia_results = MCIA_train,
                               test_data = data_blocks_test_mae)
```

## Visualize new scores with old

We once again plot the top two factor scores for both the training and test
datasets.

```{r visualize-both-models, fig.height = 2.5}
# Build the test-set table with the same three columns as MCIA_out
MCIA_out_test <- data.frame(MCIA_test_scores[, 1:2])
MCIA_out_test$cancerType <-
  MultiAssayExperiment::colData(data_blocks_test_mae)$cancerType
names(MCIA_out_test) <- c("Factor.1", "Factor.2", "cancerType")

# Label each table with its origin, then stack them into one data frame
MCIA_out$set <- "train"
MCIA_out_test$set <- "test"
MCIA_out_full <- rbind(MCIA_out, MCIA_out_test)
rownames(MCIA_out_full) <- NULL

# plot the results
ggplot(
  MCIA_out_full,
  aes(x = Factor.1, y = Factor.2, color = cancerType, shape = set)
) +
  geom_point(size = 3) +
  labs(title = "MCIA for NCI60 training and test data") +
  scale_color_manual(values = meta_colors) +
  theme_bw()
```

# Session Info
<details>
  <summary>**Session Info**</summary>

```{r session-info}
sessionInfo()
```

</details>