## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL
)

## ----installation, eval = FALSE-----------------------------------------------
#  if(!requireNamespace("BiocManager", quietly = TRUE)) {
#      install.packages("BiocManager")
#  }
#  
#  BiocManager::install("doubletrouble")
#  
#  ## Check that you have a valid Bioconductor installation
#  BiocManager::valid()

## ----load_package-------------------------------------------------------------
library(doubletrouble)

## ----data1--------------------------------------------------------------------
# Load list of DIAMOND tabular output
data(yeast_seq)
head(yeast_seq)

## ----ex_data------------------------------------------------------------------
# Load annotation list processed with syntenet::process_input()
data(yeast_annot)
head(yeast_annot)

## ----process_input------------------------------------------------------------
library(syntenet)

# Process input data
pdata <- process_input(yeast_seq, yeast_annot)

# Inspect the output
names(pdata)
pdata$seq
pdata$annotation

## ----run_diamond_intraspecies-------------------------------------------------
data(diamond_intra)

# Run DIAMOND in sensitive mode for S. cerevisiae only
if(diamond_is_installed()) {
    diamond_intra <- run_diamond(
        seq = pdata$seq["Scerevisiae"],
        compare = "intraspecies", 
        outdir = file.path(tempdir(), "diamond_intra"),
        ... = "--sensitive"
    )
}

# Inspect output
names(diamond_intra)
head(diamond_intra$Scerevisiae_Scerevisiae)

## ----binary_classification----------------------------------------------------
# Binary scheme
c_binary <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "binary"
)

# Inspecting the output
names(c_binary)
head(c_binary$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_binary$Scerevisiae$type)

## ----expanded_classification--------------------------------------------------
# Standard scheme
c_standard <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "standard"
)

# Inspecting the output
names(c_standard)
head(c_standard$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_standard$Scerevisiae$type)

## ----blast_interspecies-------------------------------------------------------
data(diamond_inter) # load pre-computed output in case DIAMOND is not installed

# Create data frame of comparisons to be made
comparisons <- data.frame(
    species = "Scerevisiae",
    outgroup = "Cglabrata"
)
comparisons

# Run DIAMOND for the comparison we specified
if(diamond_is_installed()) {
    diamond_inter <- run_diamond(
        seq = pdata$seq,
        compare = comparisons,
        outdir = file.path(tempdir(), "diamond_inter"),
        ... = "--sensitive"
    )
}

names(diamond_inter)
head(diamond_inter$Scerevisiae_Cglabrata)

## ----full_classification------------------------------------------------------
# Extended scheme
c_extended <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "extended",
    blast_inter = diamond_inter
)

# Inspecting the output
names(c_extended)
head(c_extended$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_extended$Scerevisiae$type)

## -----------------------------------------------------------------------------
library(txdbmaker)
# Create a list of `TxDb` objects from a list of `GRanges` objects
txdb_list <- lapply(yeast_annot, txdbmaker::makeTxDbFromGRanges)
txdb_list

## -----------------------------------------------------------------------------
# Get a list of data frames with intron counts per gene for each species
intron_counts <- lapply(txdb_list, get_intron_counts)

# Inspecting the list
names(intron_counts)
head(intron_counts$Scerevisiae)

## -----------------------------------------------------------------------------
# Full scheme
c_full <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "full",
    blast_inter = diamond_inter,
    intron_counts = intron_counts
)

# Inspecting the output
names(c_full)
head(c_full$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_full$Scerevisiae$type)

## ----classify_genes-----------------------------------------------------------
# Classify genes into unique modes of duplication
c_genes <- classify_genes(c_full)

# Inspecting the output
names(c_genes)
head(c_genes$Scerevisiae)

# Number of genes per mode
table(c_genes$Scerevisiae$type)

## ----kaks_calculation---------------------------------------------------------
data(cds_scerevisiae)
head(cds_scerevisiae)

# Store DNAStringSet object in a list
cds_list <- list(Scerevisiae = cds_scerevisiae)

# Keep only top five TD-derived gene pairs for demonstration purposes
td_pairs <- c_full$Scerevisiae[c_full$Scerevisiae$type == "TD", ]
gene_pairs <- list(Scerevisiae = td_pairs[seq(1, 5, by = 1), ])

# Calculate Ka, Ks, and Ka/Ks
kaks <- pairs2kaks(gene_pairs, cds_list)

# Inspect the output
head(kaks)

## ----ks_eda-------------------------------------------------------------------
# Load data and inspect it
data(gmax_ks)
head(gmax_ks)

# Plot distribution
plot_ks_distro(gmax_ks)

## ----find_ks_peaks------------------------------------------------------------
# Find 2 and 3 peaks and test which one has more support
peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = c(2, 3), verbose = TRUE)
names(peaks)
str(peaks)

# Visualize Ks distribution
plot_ks_peaks(peaks)

## ----find_peaks_explicit------------------------------------------------------
# Find 2 peaks ignoring Ks values > 1
peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = 2, max_ks = 1)
plot_ks_peaks(peaks)

## ----sizer--------------------------------------------------------------------
# Get numeric vector of Ks values <= 1
ks <- gmax_ks$Ks[gmax_ks$Ks <= 1]

# Get SiZer map
feature::SiZer(ks)

## ----split_by_peak------------------------------------------------------------
# Gene pairs without age-based classification
head(gmax_ks)

# Classify gene pairs by age group
pairs_age_group <- split_pairs_by_peak(gmax_ks, peaks)

# Inspecting the output
names(pairs_age_group)

# Take a look at the classified gene pairs
head(pairs_age_group$pairs)

# Visualize Ks distro with age boundaries
pairs_age_group$plot

## -----------------------------------------------------------------------------
# Load data set with pre-computed duplicates for 3 fungi species
data(fungi_kaks)
names(fungi_kaks)
head(fungi_kaks$saccharomyces_cerevisiae)

# Get a data frame of counts per mode in all species
counts_table <- duplicates2counts(fungi_kaks |> classify_genes())

counts_table

## -----------------------------------------------------------------------------
# A) Facets
p1 <- plot_duplicate_freqs(counts_table)

# B) Stacked barplot, absolute frequencies
p2 <- plot_duplicate_freqs(counts_table, plot_type = "stack")

# C) Stacked barplot, relative frequencies
p3 <- plot_duplicate_freqs(counts_table, plot_type = "stack_percent")

# Combine plots, one per row
patchwork::wrap_plots(p1, p2, p3, nrow = 3) + 
    patchwork::plot_annotation(tag_levels = "A")

## ----fig.height = 3, fig.width = 8--------------------------------------------
# Frequency of duplicated genes by mode
classify_genes(fungi_kaks) |>   # classify genes into unique duplication types
    duplicates2counts() |>      # get a data frame of counts (long format)
    plot_duplicate_freqs()      # plot frequencies

## ----fig.height=3, fig.width=9------------------------------------------------
ks_df <- fungi_kaks$saccharomyces_cerevisiae

# A) Histogram, whole paranome
p1 <- plot_ks_distro(ks_df, plot_type = "histogram")

# B) Density, whole paranome
p2 <- plot_ks_distro(ks_df, plot_type = "density") 

# C) Histogram with density lines, whole paranome
p3 <- plot_ks_distro(ks_df, plot_type = "density_histogram")

# Combine plots side by side
patchwork::wrap_plots(p1, p2, p3, nrow = 1) +
    patchwork::plot_annotation(tag_levels = "A")

## ----fig.width = 8, fig.height=4----------------------------------------------
# A) Duplicates by type, histogram
p1 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "histogram")

# B) Duplicates by type, violin
p2 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "violin")

# Combine plots side by side
patchwork::wrap_plots(p1, p2) +
    patchwork::plot_annotation(tag_levels = "A")

## ----fig.width = 6, fig.height = 4--------------------------------------------
# A) Ks for each species
p1 <- plot_rates_by_species(fungi_kaks)

# B) Ka/Ks by duplication type for each species
p2 <- plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks", bytype = TRUE)

# Combine plots - one per row
patchwork::wrap_plots(p1, p2, nrow = 2) +
    patchwork::plot_annotation(tag_levels = "A")

## ----session_info-------------------------------------------------------------
sessioninfo::session_info()