---
title: "Predicting New MCIA scores"
author:
- Max Mattessich
- Joaquin Reyna
- Edel Aron
- Anna Konstorum
date: "Compiled: `r format(Sys.time(), '%B %d, %Y')`"
output:
  BiocStyle::html_document:
    dev: 'jpeg'
    fig_retina: 1
    number_sections: FALSE
    toc_depth: 2
    toc_float: TRUE
vignette: >
  %\VignetteIndexEntry{Predicting New MCIA scores}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Echo the source code of every chunk in the rendered vignette by default
knitr::opts_chunk$set(echo = TRUE)
```

# Predicting MCIA global (factor) scores for new test samples

It may be of interest to use the embedding that is calculated on a training
sample set to predict scores on a test set (or, equivalently, on new data).

After installing and loading the `nipalsMCIA` package, we randomly split the
NCI60 cancer cell line data into training (70%) and test (30%) sets.

## Installation

```{r installation-github, eval = FALSE}
# Development version from GitHub; install devtools first if it is missing
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("Muunraker/nipalsMCIA", ref = "code-development",
                         force = TRUE, build_vignettes = TRUE)
```

```{r installation-bioconductor, eval = FALSE}
# Release version, available once the package is accepted into Bioconductor;
# install BiocManager first if it is missing
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("nipalsMCIA")
```

```{r load-packages, message = FALSE}
library(ggplot2)
library(MultiAssayExperiment)
library(nipalsMCIA)
```

## Split the data

```{r split-data}
data(NCI60)
set.seed(8)

# Randomly pick 70% of the samples (the rows of each block) for training
num_samples <- nrow(data_blocks[[1]])
num_train <- round(0.7 * num_samples)
train_samples <- sample.int(num_samples, num_train)

# Apply the same row selection to every block; assigning through `[]`
# keeps the list's names and attributes intact
subset_blocks <- function(blocks, rows) {
  blocks[] <- lapply(blocks, function(block) block[rows, ])
  blocks
}
data_blocks_train <- subset_blocks(data_blocks, train_samples)
data_blocks_test <- subset_blocks(data_blocks, -train_samples)

# Split the corresponding metadata, labelling rows with the sample names of
# the matching mrna block
# NOTE(review): assumes metadata_NCI60 has a single column -- confirm
metadata_train <- data.frame(
  cancerType = metadata_NCI60[train_samples, ],
  row.names = rownames(data_blocks_train[["mrna"]])
)
metadata_test <- data.frame(
  cancerType = metadata_NCI60[-train_samples, ],
  row.names = rownames(data_blocks_test[["mrna"]])
)

# Wrap each split (blocks + metadata) in its own MAE object
data_blocks_train_mae <- simple_mae(
  data_blocks_train,
  row_format = "sample",
  colData = metadata_train
)
data_blocks_test_mae <- simple_mae(
  data_blocks_test,
  row_format = "sample",
  colData = metadata_test
)
```

## Run nipalsMCIA on training data

```{r computing-MCIA-on-training-data, message = FALSE, fig.show = "hide"}
# Fit the MCIA model on the training samples only (num_PCs = 10, no plots)
MCIA_train <- nipals_multiblock(
  data_blocks = data_blocks_train_mae,
  col_preproc_method = "colprofile",
  num_PCs = 10,
  plots = "none",
  tol = 1e-9
)
```

## Visualize model on training data using metadata on cancer type

The `get_metadata_colors()` function assigns a color to each value of the
chosen metadata column (here the first column, `cancerType`). The
`nmb_get_gs()` function returns the global scores from the input
`NipalsResult` object.

```{r visualize-training-model, fig.height = 2.5}
# Colors keyed on the first metadata column, reused for every plot below
meta_colors <- get_metadata_colors(
  mcia_results = MCIA_train,
  color_col = 1,
  color_pal_params = list(option = "E")
)

# First two global factor scores of the training samples plus their labels
global_scores <- nmb_get_gs(MCIA_train)
MCIA_out <- data.frame(global_scores[, c(1, 2)])
MCIA_out[["cancerType"]] <- nmb_get_metadata(MCIA_train)[["cancerType"]]
names(MCIA_out) <- c("Factor.1", "Factor.2", "cancerType")

# Build the scatter plot, then print it
train_plot <- ggplot(MCIA_out, aes(Factor.1, Factor.2, color = cancerType)) +
  geom_point(size = 3) +
  scale_color_manual(values = meta_colors) +
  labs(title = "MCIA for NCI60 training data") +
  theme_bw()
train_plot
```

## Generate factor scores for test data using the MCIA_train model

We use the `predict_gs()` function to generate new factor scores on the test
data set using the `MCIA_train` model. The new dataset, in the form of an MAE
object, is passed via the `test_data` parameter.

```{r generate-new-scores, message = FALSE}
# Score the held-out samples with the model fitted on the training data
MCIA_test_scores <- predict_gs(
  mcia_results = MCIA_train,
  test_data = data_blocks_test_mae
)
```

## Visualize new scores with old

We once again plot the top two factor scores, this time for both the training
and test datasets, distinguishing the two sets by point shape.

```{r visualize-both-models, fig.height = 2.5}
# First two predicted factor scores of the test samples plus their labels
MCIA_out_test <- data.frame(MCIA_test_scores[, c(1, 2)])
test_metadata <- MultiAssayExperiment::colData(data_blocks_test_mae)
MCIA_out_test[["cancerType"]] <- test_metadata[["cancerType"]]
names(MCIA_out_test) <- c("Factor.1", "Factor.2", "cancerType")

# Tag every row with the set it came from, then stack train above test
MCIA_out$set <- "train"
MCIA_out_test$set <- "test"
MCIA_out_full <- rbind(MCIA_out, MCIA_out_test)
rownames(MCIA_out_full) <- NULL

# Build the combined scatter plot (shape encodes train/test), then print it
combined_plot <- ggplot(
  MCIA_out_full,
  aes(Factor.1, Factor.2, color = cancerType, shape = set)
) +
  geom_point(size = 3) +
  scale_color_manual(values = meta_colors) +
  labs(title = "MCIA for NCI60 training and test data") +
  theme_bw()
combined_plot
```

# Session Info

<details>
  <summary>**Session Info**</summary>
  
```{r session-info}
# Report the R and package versions used to build this vignette
sessionInfo()
```

</details>