## ----echo=FALSE, results="hide", message=FALSE--------------------------------
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
library(BiocStyle)
set.seed(42)

# Check Python modules once at the start
keras_installed <- reticulate::py_module_available("keras")
numpy_installed <- reticulate::py_module_available("numpy")

# If not installed, skip evaluation of all subsequent chunks
knitr::opts_chunk$set(
  eval = keras_installed && numpy_installed
)

## -----------------------------------------------------------------------------
# suppressMessages(library(immApex))
# suppressMessages(library(keras3))
# suppressMessages(library(ggplot2))
# suppressMessages(library(viridis))
# suppressMessages(library(magrittr))

## ----tidy = FALSE-------------------------------------------------------------
# sequences <- generateSequences(prefix.motif = "CAS",
#                                suffix.motif = "YF",
#                                number.of.sequences = 1000,
#                                min.length = 8,
#                                max.length = 16)
# head(sequences)

## ----tidy = FALSE-------------------------------------------------------------
# nucleotide.sequences <- generateSequences(number.of.sequences = 1000,
#                                           min.length = 8,
#                                           max.length = 16,
#                                           sequence.dictionary = c("A", "C", "T", "G"))
# head(nucleotide.sequences)

## ----tidy = FALSE, eval = reticulate::py_module_available("keras") && reticulate::py_module_available("numpy")----
# variational.sequences <- variationalSequences(sequences,
#                                               encoder = "onehotEncoder",
#                                               number.of.sequences = 100,
#                                               encoder.hidden.dim = c(256, 128),
#                                               latent.dim = 16,
#                                               batch.size = 16,
#                                               call.threshold = 0.1)
# head(variational.sequences)

## ----tidy = FALSE-------------------------------------------------------------
# mutated.sequences <- mutateSequences(sequences,
#                                      n.sequence = 1,
#                                      position.start = 3,
#                                      position.end = 8)
# head(sequences)
# head(mutated.sequences)

## ----tidy = FALSE-------------------------------------------------------------
# data("immapex_example.data")
# Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]],
#                                 region = "v",
#                                 technology = "Adaptive",
#                                 simplify.format = TRUE)
# 
# head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")])

## ----tidy = FALSE-------------------------------------------------------------
# TRBV_aa <- getIMGT(species = "human",
#                    chain = "TRB",
#                    frame = "inframe",
#                    region = "v",
#                    sequence.type = "aa")
# 
# TRBV_aa[[1]][1]

## ----tidy = FALSE-------------------------------------------------------------
# Adaptive_example <- inferCDR(Adaptive_example,
#                              chain = "TRB",
#                              reference = TRBV_aa,
#                              technology = "Adaptive",
#                              sequence.type = "aa",
#                              sequences = c("CDR1", "CDR2"))
# 
# Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")]

## ----tidy = FALSE-------------------------------------------------------------
# sequence.matrix <- onehotEncoder(input.sequences =  c(sequences, mutated.sequences),
#                                  convert.to.matrix = TRUE)
# head(sequence.matrix[,1:20])

## ----tidy = FALSE-------------------------------------------------------------
# property.matrix <- propertyEncoder(input.sequences =  c(sequences, mutated.sequences),
#                                    method.to.use = "FASGAI",
#                                    convert.to.matrix = TRUE)
# 
# head(property.matrix[,1:20])

## ----tidy = FALSE-------------------------------------------------------------
# mulit.property.matrix <- propertyEncoder(input.sequences =  c(sequences, mutated.sequences),
#                                          method.to.use = c("atchleyFactors", "kideraFactors"),
#                                          convert.to.matrix = TRUE)
# 
# head(mulit.property.matrix[,1:20])

## ----tidy = FALSE-------------------------------------------------------------
# median.property.matrix <- propertyEncoder(input.sequences =  c(sequences, mutated.sequences),
#                                           method.to.use = "crucianiProperties",
#                                           summary.function = "median")
# 
# head(median.property.matrix[,1:3])

## ----tidy = FALSE-------------------------------------------------------------
# geometric.matrix <- geometricEncoder(sequences,
#                                      method.to.use = "BLOSUM62",
#                                      theta = pi/3)
# head(geometric.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# token.matrix <- tokenizeSequences(input.sequences =  c(sequences, mutated.sequences),
#                                   add.startstop = TRUE,
#                                   start.token = "!",
#                                   stop.token = "^",
#                                   convert.to.matrix = TRUE)
# head(token.matrix[,1:18])

## ----tidy = FALSE-------------------------------------------------------------
# ppm.matrix <- probabilityMatrix(sequences)
# head(ppm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# set.seed(42)
# back.freq <- sample(1:1000, 20)
# back.freq <- back.freq/sum(back.freq)
# 
# pwm.matrix <- probabilityMatrix(sequences,
#                                 max.length = 20,
#                                 convert.PWM = TRUE,
#                                 background.frequencies = back.freq)
# head(pwm.matrix)

## ----tidy = FALSE-------------------------------------------------------------
# adj.matrix <- adjacencyMatrix(sequences,
#                               normalize = FALSE)
# adj.matrix

## ----tidy = FALSE-------------------------------------------------------------
# property.matrix <- propertyEncoder(input.sequences =  c(sequences, mutated.sequences),
#                                    method.to.use = "FASGAI",
#                                    convert.to.matrix = TRUE)
# 
# property.sequences <- sequenceDecoder(property.matrix,
#                                       encoder = "propertyEncoder",
#                                       aa.method.to.use = "FASGAI",
#                                       call.threshold = 1)
# head(sequences)
# head(property.sequences)

## ----tidy=FALSE---------------------------------------------------------------
# sequence.matrix <- onehotEncoder(input.sequences =  c(sequences, mutated.sequences),
#                                  convert.to.matrix = TRUE)
# 
# OHE.sequences <- sequenceDecoder(sequence.matrix,
#                                  encoder = "onehotEncoder")
# 
# head(OHE.sequences)

## ----tidy = FALSE-------------------------------------------------------------
# #Sampling to make Training/Validation Data Cohorts
# set.seed(42)
# num_sequences <- nrow(sequence.matrix)
# indices <- 1:num_sequences
# train_indices <- sample(indices, size = floor(0.8 * num_sequences))
# val_indices <- setdiff(indices, train_indices)
# 
# x_train <- sequence.matrix[train_indices,]
# x_val <- sequence.matrix[val_indices,]
# 
# # Parameters
# input_shape <- dim(x_train)[2]
# epochs <- 20
# batch_size <- 128
# encoding_dim <- 40
# hidden_dim1 <- 256 # Hidden layer 1 size
# hidden_dim2 <- 128  # Hidden layer 2 size
# 
# es <- callback_early_stopping(monitor = "val_loss",
#                               min_delta = 0,
#                               patience = 4,
#                               verbose = 1,
#                               mode = "min")
# 
# # Define the Model
# input_seq <- layer_input(shape = c(input_shape))
# 
# # Encoder Layers
# encoded <- input_seq %>%
#           layer_dense(units = hidden_dim1, name = "e.1") %>%
#           layer_batch_normalization(name = "bn.1") %>%
#           layer_activation('leaky_relu', name = "act.1") %>%
#           layer_dense(units = hidden_dim2, name = "e.2") %>%
#           layer_batch_normalization(name = "bn.2") %>%
#           layer_activation('leaky_relu', name = "act.2") %>%
#           layer_dense(units = encoding_dim, activation = 'selu', name = "latent")
# 
# # Decoder Layers
# decoded <- encoded %>%
#           layer_dense(units = hidden_dim2, name = "d.2") %>%
#           layer_batch_normalization(name = "bn.3") %>%
#           layer_activation('leaky_relu', name = "act.3") %>%
#           layer_dense(units = hidden_dim1, name = "d.1") %>%
#           layer_batch_normalization(name = "bn.4") %>%
#           layer_activation('leaky_relu', name = "act.4") %>%
#           layer_dense(units = input_shape, activation = 'sigmoid')
# 
# # Autoencoder Model
# autoencoder <- keras_model(input_seq, decoded)
# autoencoder %>% keras3::compile(optimizer = optimizer_adam(learning_rate = 0.0001),
#                                    loss = "mse")
# 
# # Train the model
# history <- autoencoder %>% fit(x = x_train,
#                                y = x_train,
#                                validation_data = list(x_val, x_val),
#                                epochs = epochs,
#                                batch_size = batch_size,
#                                shuffle = TRUE,
#                                callbacks = es,
#                                verbose = 0)
# 
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()

## ----tidy = FALSE-------------------------------------------------------------
# class1.sequences <- generateSequences(prefix.motif = "CAS",
#                                       suffix.motif = "YF",
#                                       number.of.sequences = 10000,
#                                       min.length = 8,
#                                       max.length = 16)
# 
# class2.sequences <- generateSequences(prefix.motif = "CASF",
#                                       suffix.motif = "YF",
#                                       number.of.sequences = 10000,
#                                       min.length = 8,
#                                       max.length = 16)
# 
# labels <- as.numeric(c(rep(0, 10000), rep(1, 10000)))
# 
# classifier.matrix <- onehotEncoder(input.sequences =  c(class1.sequences, class2.sequences),
#                                    convert.to.matrix = TRUE)

## ----tidy = FALSE-------------------------------------------------------------
# #Input shape will be 1D as we are using a matrix
# input.shape <- dim(classifier.matrix)[2]
# 
# #Simple model structure
# classifier.model <- keras_model_sequential() %>%
#                         layer_dense(units = 128, activation = "relu",
#                                     input_shape = c(input.shape)) %>%
#                         layer_dense(units = 32, activation = "relu") %>%
#                         layer_dense(units = 1, activation = "sigmoid")
# 
# classifier.model %>% compile(
#         optimizer = optimizer_adam(learning_rate = 0.00001),
#         loss = "binary_crossentropy",
#         metrics = c("accuracy")
# )
# 
# #Separating data and labels
# set.seed(42)
# val_indices <- sample(nrow(classifier.matrix), 10000*0.2)
# x_val <- classifier.matrix[val_indices,]
# x_train <- classifier.matrix[-val_indices,]
# 
# val_labels <- labels[val_indices]
# train_labels <- labels[-val_indices]
# 
# #Training the classifier.model
# history <- classifier.model %>% fit(x_train,
#                                     train_labels,
#                                     epochs = 20,
#                                     batch_size = 32,
#                                     validation_data = list(x_val, val_labels),
#                                     verbose = 0
# )
# 
# plot(history) +
#   scale_color_viridis(option = "B", discrete = TRUE) +
#   scale_fill_manual(values = c("black","black")) +
#   theme_classic()

## -----------------------------------------------------------------------------
# sessionInfo()