## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL,
    message = FALSE
)

## ----eval = FALSE-------------------------------------------------------------
#  if(!requireNamespace("BiocManager", quietly = TRUE))
#          install.packages("BiocManager")
#  BiocManager::install("ProteinGymR")

## ----message = FALSE----------------------------------------------------------
library(ProteinGymR)
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)
library(ComplexHeatmap)
library(AnnotationHub)

## ----import dms---------------------------------------------------------------
dms_data <- dms_substitutions()

## ----view studies-------------------------------------------------------------
head(names(dms_data))

## ----view assay---------------------------------------------------------------
head(dms_data[[1]])

## ----queryEH------------------------------------------------------------------
eh <- ExperimentHub::ExperimentHub()
AnnotationHub::query(eh, "ProteinGymR")

dms_metadata <- eh[["EH9607"]]
names(dms_metadata)

## ----ACE2---------------------------------------------------------------------
ACE2 <- dms_data[["ACE2_HUMAN_Chan_2020"]]

## ----heatmap, warning = FALSE, fig.wide = TRUE--------------------------------
ACE2 <- 
    ACE2 |>
    dplyr::mutate(
        ref = str_sub(ACE2$mutant, 1, 1),
        pos = as.integer(
            gsub(".*?([0-9]+).*", "\\1", ACE2$mutant)
            ),
        alt = str_sub(ACE2$mutant, -1)
    )

ACE2 <- ACE2 |> select("ref", "pos", "alt", "DMS_score")

head(ACE2)

## Reshape the data to wide format
ACE2_wide <- ACE2 |> 
    select(-ref) |> 
    pivot_wider(names_from = alt, values_from = DMS_score) |> 
    arrange(pos)

## Subset to first 100 position
ACE2_wide <- ACE2_wide |> 
    filter(pos <= 100)

head(ACE2_wide)

## Convert to matrix
pos <- ACE2_wide$pos
alt <- colnames(ACE2_wide)
alt <- alt[-c(1)]

heatmap_matrix <- ACE2_wide |> 
    select(2:length(ACE2_wide)) |> 
    as.matrix()

## Set amino acid position as rownames of matrix
rownames(heatmap_matrix) <- pos

## Transpose so position is x-axis
heatmap_matrix <- t(heatmap_matrix)

## Reorder rows based on physiochemical properties
phyiochem_order <- "DEKRHNQSTPGAVILMCFYW"
phyiochem_order <- unlist(strsplit(phyiochem_order, split = ""))

reordered_matrix <- heatmap_matrix[match(phyiochem_order, 
    rownames(heatmap_matrix)), ]

## Create the heatmap
ComplexHeatmap::Heatmap(reordered_matrix,
        name = "DMS Score",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        show_row_names = TRUE,
        show_column_names = TRUE)

## ----available_models---------------------------------------------------------
available_models()

## ----warning=FALSE, fig.wide = TRUE-------------------------------------------
benchmark_models(metric = "AUC", 
    models = c("GEMME", "CARP_600K", "ESM_1b", "EVmutation", "ProtGPT2"))

## ----sesh info----------------------------------------------------------------
sessionInfo()