Predicting MCIA global (factor) scores for new test samples

It may be of interest to use the embedding that is calculated on a training sample set to predict scores on a test set (or, equivalently, on new data).

After loading the nipalsMCIA library, we randomly split the NCI60 cancer cell line data into training and test sets.

Installation

# Two alternative ways to install nipalsMCIA: the development version from
# GitHub, or the release version from Bioconductor.

# Development version
# install.packages("devtools")
devtools::install_github("Muunraker/nipalsMCIA", ref = "devel",
                         force = TRUE, build_vignettes = TRUE)

# Release version
# requireNamespace() only checks that BiocManager is available; unlike
# require(), it does not attach the package to the search path.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install("nipalsMCIA")
library(ggplot2)
library(MultiAssayExperiment)
library(nipalsMCIA)

Split the data

# Load the NCI60 data set and fix the RNG seed so the split is reproducible.
data(NCI60)
set.seed(8)

# Draw 70% of the samples (rows of the first block) for the training set.
num_samples <- nrow(data_blocks[[1]])
num_train <- round(num_samples * 0.7)
train_samples <- sample.int(num_samples, size = num_train)

# Subset every block by row: sampled rows form the training set and the
# remaining rows form the test set.
data_blocks_train <- lapply(
  data_blocks,
  function(block) block[train_samples, ]
)
data_blocks_test <- lapply(
  data_blocks,
  function(block) block[-train_samples, ]
)

# Split the corresponding metadata with the same row indices. Rows are labelled
# with the row names of the matching mrna block, and the single column is
# named "cancerType".
metadata_train <- data.frame(
  cancerType = metadata_NCI60[train_samples, ],
  row.names = rownames(data_blocks_train$mrna)
)

metadata_test <- data.frame(
  cancerType = metadata_NCI60[-train_samples, ],
  row.names = rownames(data_blocks_test$mrna)
)

# Build the train and test MAE objects from the split blocks (samples in rows)
# and their metadata.
data_blocks_train_mae <- simple_mae(
  data_blocks_train,
  row_format = "sample",
  colData = metadata_train
)
data_blocks_test_mae <- simple_mae(
  data_blocks_test,
  row_format = "sample",
  colData = metadata_test
)

Run nipalsMCIA on training data

# Fit the MCIA model on the training MAE only (num_PCs = 10, no plots).
MCIA_train <- nipals_multiblock(
  data_blocks = data_blocks_train_mae,
  col_preproc_method = "colprofile",
  num_PCs = 10,
  tol = 1e-9,
  plots = "none"
)

Visualize model on training data using metadata on cancer type

The get_metadata_colors() function returns a color assignment for the values of the selected metadata column (here, the first column). The nmb_get_gs() function returns the global scores from the input NipalsResult object.

# Color assignment for the first metadata column.
meta_colors <- get_metadata_colors(
  mcia_results = MCIA_train,
  color_col = 1,
  color_pal_params = list(option = "E")
)

# Pair the first two global scores with the cancer type of each training sample.
global_scores <- nmb_get_gs(MCIA_train)
MCIA_out <- data.frame(global_scores[, 1:2])
MCIA_out[["cancerType"]] <- nmb_get_metadata(MCIA_train)$cancerType
names(MCIA_out) <- c("Factor.1", "Factor.2", "cancerType")

# Scatter plot of the training samples in the first two factors.
ggplot(MCIA_out, aes(x = Factor.1, y = Factor.2, color = cancerType)) +
  geom_point(size = 3) +
  scale_color_manual(values = meta_colors) +
  ggtitle("MCIA for NCI60 training data") +
  theme_bw()

Generate factor scores for test data using the MCIA_train model

We use the predict_gs() function to generate new factor scores for the test data set using the MCIA_train model. The new data set, in the form of an MAE object, is passed in via the test_data parameter.

# Compute factor scores for the test MAE using the model fitted on the
# training data.
MCIA_test_scores <- predict_gs(
  mcia_results = MCIA_train,
  test_data = data_blocks_test_mae
)

Visualize new scores with old

We once again plot the top two factor scores, this time for both the training and test data sets.

# First two predicted factor scores for the test samples, with their cancer
# type taken from the test MAE's colData.
MCIA_out_test <- data.frame(MCIA_test_scores[, 1:2])
MCIA_out_test[["cancerType"]] <-
  MultiAssayExperiment::colData(data_blocks_test_mae)$cancerType
names(MCIA_out_test) <- c("Factor.1", "Factor.2", "cancerType")

# Tag each row with the set it came from, then stack training and test rows.
MCIA_out$set <- "train"
MCIA_out_test$set <- "test"
MCIA_out_full <- rbind(MCIA_out, MCIA_out_test)
rownames(MCIA_out_full) <- NULL

# Scatter plot of both sets: color encodes cancer type, shape encodes the set.
ggplot(
  MCIA_out_full,
  aes(x = Factor.1, y = Factor.2, color = cancerType, shape = set)
) +
  geom_point(size = 3) +
  scale_color_manual(values = meta_colors) +
  ggtitle("MCIA for NCI60 training and test data") +
  theme_bw()

Session Info

# Print the R version, platform, and package versions used for this run.
sessionInfo()
## R version 4.4.0 RC (2024-04-16 r86468 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows Server 2022 x64 (build 20348)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=C                          
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MultiAssayExperiment_1.31.0 SummarizedExperiment_1.35.0
##  [3] Biobase_2.65.0              GenomicRanges_1.57.0       
##  [5] GenomeInfoDb_1.41.0         IRanges_2.39.0             
##  [7] S4Vectors_0.43.0            BiocGenerics_0.51.0        
##  [9] MatrixGenerics_1.17.0       matrixStats_1.3.0          
## [11] Seurat_5.0.3                SeuratObject_5.0.1         
## [13] sp_2.1-4                    piggyback_0.1.5            
## [15] BiocFileCache_2.13.0        dbplyr_2.5.0               
## [17] stringr_1.5.1               nipalsMCIA_1.3.0           
## [19] ggpubr_0.6.0                ggplot2_3.5.1              
## [21] fgsea_1.31.0                dplyr_1.1.4                
## [23] ComplexHeatmap_2.21.0       BiocStyle_2.33.0           
## 
## loaded via a namespace (and not attached):
##   [1] spatstat.sparse_3.0-3   lubridate_1.9.3         httr_1.4.7             
##   [4] RColorBrewer_1.1-3      doParallel_1.0.17       gh_1.4.1               
##   [7] tools_4.4.0             sctransform_0.4.1       backports_1.4.1        
##  [10] utf8_1.2.4              R6_2.5.1                lazyeval_0.2.2         
##  [13] uwot_0.2.2              GetoptLong_1.0.5        withr_3.0.0            
##  [16] gridExtra_2.3           progressr_0.14.0        cli_3.6.2              
##  [19] Cairo_1.6-2             spatstat.explore_3.2-7  fastDummies_1.7.3      
##  [22] labeling_0.4.3          sass_0.4.9              spatstat.data_3.0-4    
##  [25] ggridges_0.5.6          pbapply_1.7-2           parallelly_1.37.1      
##  [28] RSQLite_2.3.6           generics_0.1.3          shape_1.4.6.1          
##  [31] ica_1.0-3               spatstat.random_3.2-3   car_3.1-2              
##  [34] Matrix_1.7-0            ggbeeswarm_0.7.2        fansi_1.0.6            
##  [37] abind_1.4-5             lifecycle_1.0.4         yaml_2.3.8             
##  [40] carData_3.0-5           SparseArray_1.5.0       Rtsne_0.17             
##  [43] blob_1.2.4              promises_1.3.0          crayon_1.5.2           
##  [46] miniUI_0.1.1.1          lattice_0.22-6          cowplot_1.1.3          
##  [49] magick_2.8.3            pillar_1.9.0            knitr_1.46             
##  [52] rjson_0.2.21            future.apply_1.11.2     codetools_0.2-20       
##  [55] fastmatch_1.1-4         leiden_0.4.3.1          glue_1.7.0             
##  [58] data.table_1.15.4       vctrs_0.6.5             png_0.1-8              
##  [61] spam_2.10-0             gtable_0.3.5            cachem_1.0.8           
##  [64] xfun_0.43               S4Arrays_1.5.0          mime_0.12              
##  [67] pracma_2.4.4            survival_3.6-4          iterators_1.0.14       
##  [70] tinytex_0.50            fitdistrplus_1.1-11     ROCR_1.0-11            
##  [73] nlme_3.1-164            bit64_4.0.5             filelock_1.0.3         
##  [76] RcppAnnoy_0.0.22        bslib_0.7.0             irlba_2.3.5.1          
##  [79] vipor_0.4.7             KernSmooth_2.23-22      colorspace_2.1-0       
##  [82] DBI_1.2.2               ggrastr_1.0.2           tidyselect_1.2.1       
##  [85] bit_4.0.5               compiler_4.4.0          curl_5.2.1             
##  [88] httr2_1.0.1             DelayedArray_0.31.0     plotly_4.10.4          
##  [91] bookdown_0.39           scales_1.3.0            lmtest_0.9-40          
##  [94] rappdirs_0.3.3          digest_0.6.35           goftest_1.2-3          
##  [97] spatstat.utils_3.0-4    rmarkdown_2.26          XVector_0.45.0         
## [100] htmltools_0.5.8.1       pkgconfig_2.0.3         highr_0.10             
## [103] fastmap_1.1.1           rlang_1.1.3             GlobalOptions_0.1.2    
## [106] htmlwidgets_1.6.4       UCSC.utils_1.1.0        shiny_1.8.1.1          
## [109] farver_2.1.1            jquerylib_0.1.4         zoo_1.8-12             
## [112] jsonlite_1.8.8          BiocParallel_1.39.0     magrittr_2.0.3         
## [115] GenomeInfoDbData_1.2.12 dotCall64_1.1-1         patchwork_1.2.0        
## [118] munsell_0.5.1           Rcpp_1.0.12             reticulate_1.36.1      
## [121] stringi_1.8.3           zlibbioc_1.51.0         MASS_7.3-60.2          
## [124] plyr_1.8.9              parallel_4.4.0          listenv_0.9.1          
## [127] ggrepel_0.9.5           deldir_2.0-4            splines_4.4.0          
## [130] tensor_1.5              circlize_0.4.16         igraph_2.0.3           
## [133] spatstat.geom_3.2-9     ggsignif_0.6.4          RcppHNSW_0.6.0         
## [136] reshape2_1.4.4          evaluate_0.23           BiocManager_1.30.22    
## [139] foreach_1.5.2           httpuv_1.6.15           RANN_2.6.1             
## [142] tidyr_1.3.1             purrr_1.0.2             polyclip_1.10-6        
## [145] future_1.33.2           clue_0.3-65             scattermore_1.2        
## [148] BiocBaseUtils_1.7.0     broom_1.0.5             xtable_1.8-4           
## [151] RSpectra_0.16-1         gitcreds_0.1.2          rstatix_0.7.2          
## [154] later_1.3.2             viridisLite_0.4.2       snow_0.4-4             
## [157] tibble_3.2.1            memoise_2.0.1           beeswarm_0.4.0         
## [160] cluster_2.1.6           timechange_0.3.0        globals_0.16.3