## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL
)

## ----installation, eval = FALSE-----------------------------------------------
# if(!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# 
# BiocManager::install("doubletrouble")
# 
# ## Check that you have a valid Bioconductor installation
# BiocManager::valid()

## ----load_package-------------------------------------------------------------
library(doubletrouble)

## ----eval=FALSE---------------------------------------------------------------
# # Checking if names of lists match
# setequal(names(seqs), names(annotation)) # should return TRUE

## ----eval=FALSE---------------------------------------------------------------
# species <- c("Saccharomyces cerevisiae", "Candida glabrata")
# 
# # Download data from Ensembl with {biomartr}
# ## Whole-genome protein sequences (.fa)
# fasta_dir <- file.path(tempdir(), "proteomes")
# fasta_files <- biomartr::getProteomeSet(
#     db = "ensembl", organisms = species, path = fasta_dir
# )
# 
# ## Gene annotation (.gff3)
# gff_dir <- file.path(tempdir(), "annotation")
# gff_files <- biomartr::getGFFSet(
#     db = "ensembl", organisms = species, path = gff_dir
# )
# 
# ## CDS (.fa)
# cds_dir <- file.path(tempdir(), "CDS")
# cds_files <- biomartr::getCDSSet(
#     db = "ensembl", organisms = species, path = cds_dir
# )
# 
# # Import data to the R session
# ## Read .fa files with proteomes as a list of AAStringSet + clean names
# seq <- syntenet::fasta2AAStringSetlist(fasta_dir)
# names(seq) <- gsub("\\..*", "", names(seq))
# 
# ## Read .gff3 files as a list of GRanges
# annot <- syntenet::gff2GRangesList(gff_dir)
# names(annot) <- gsub("\\..*", "", names(annot))
# 
# ## Read .fa files with CDS as a list of DNAStringSet objects
# cds <- lapply(cds_files, Biostrings::readDNAStringSet)
# names(cds) <- gsub("\\..*", "", basename(cds_files))
# 
# # Process data
# ## Keep ranges for protein-coding genes only
# yeast_annot <- lapply(annot, function(x) {
#     gene_ranges <- x[x$biotype == "protein_coding" & x$type == "gene"]
#     gene_ranges <- IRanges::subsetByOverlaps(x, gene_ranges)
#     return(gene_ranges)
# })
# 
# ## Keep only longest sequence for each protein-coding gene (no isoforms)
# yeast_seq <- lapply(seq, function(x) {
#     # Keep only protein-coding genes
#     x <- x[grep("protein_coding", names(x))]
# 
#     # Leave only gene IDs in sequence names
#     names(x) <- gsub(".*gene:| .*", "", names(x))
# 
#     # If isoforms are present (same gene ID multiple times), keep the longest
#     x <- x[order(Biostrings::width(x), decreasing = TRUE)]
#     x <- x[!duplicated(names(x))]
# 
#     return(x)
# })

## ----example_data-------------------------------------------------------------
# Load example data
data(yeast_seq)
data(yeast_annot)

yeast_seq
yeast_annot

## ----process_input------------------------------------------------------------
library(syntenet)

# Process input data
pdata <- process_input(yeast_seq, yeast_annot)

# Inspect the output
names(pdata)
pdata$seq
pdata$annotation

## ----run_diamond_intraspecies-------------------------------------------------
data(diamond_intra)

# Run DIAMOND in sensitive mode for S. cerevisiae only
if(diamond_is_installed()) {
    diamond_intra <- run_diamond(
        seq = pdata$seq["Scerevisiae"],
        compare = "intraspecies", 
        outdir = file.path(tempdir(), "diamond_intra"),
        ... = "--sensitive"
    )
}

# Inspect output
names(diamond_intra)
head(diamond_intra$Scerevisiae_Scerevisiae)

## ----binary_classification----------------------------------------------------
# Binary scheme
c_binary <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "binary"
)

# Inspecting the output
names(c_binary)
head(c_binary$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_binary$Scerevisiae$type)

## ----expanded_classification--------------------------------------------------
# Standard scheme
c_standard <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "standard"
)

# Inspecting the output
names(c_standard)
head(c_standard$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_standard$Scerevisiae$type)

## ----blast_interspecies-------------------------------------------------------
data(diamond_inter) # load pre-computed output in case DIAMOND is not installed

# Create data frame of comparisons to be made
comparisons <- data.frame(
    species = "Scerevisiae",
    outgroup = "Cglabrata"
)
comparisons

# Run DIAMOND for the comparison we specified
if(diamond_is_installed()) {
    diamond_inter <- run_diamond(
        seq = pdata$seq,
        compare = comparisons,
        outdir = file.path(tempdir(), "diamond_inter"),
        ... = "--sensitive"
    )
}

names(diamond_inter)
head(diamond_inter$Scerevisiae_Cglabrata)

## ----full_classification------------------------------------------------------
# Extended scheme
c_extended <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "extended",
    blast_inter = diamond_inter
)

# Inspecting the output
names(c_extended)
head(c_extended$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_extended$Scerevisiae$type)

## -----------------------------------------------------------------------------
# Example: multiple outgroups for the same species
comparisons <- data.frame(
    species = rep("speciesA", 3),
    outgroup = c("speciesB", "speciesC", "speciesD")
)

comparisons

## ----message=FALSE------------------------------------------------------------
library(txdbmaker)
# Create a list of `TxDb` objects from a list of `GRanges` objects
txdb_list <- lapply(yeast_annot, txdbmaker::makeTxDbFromGRanges)
txdb_list

## -----------------------------------------------------------------------------
# Get a list of data frames with intron counts per gene for each species
intron_counts <- lapply(txdb_list, get_intron_counts)

# Inspecting the list
names(intron_counts)
head(intron_counts$Scerevisiae)

## -----------------------------------------------------------------------------
# Full scheme
c_full <- classify_gene_pairs(
    annotation = pdata$annotation,
    blast_list = diamond_intra,
    scheme = "full",
    blast_inter = diamond_inter,
    intron_counts = intron_counts
)

# Inspecting the output
names(c_full)
head(c_full$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_full$Scerevisiae$type)

## ----classify_genes-----------------------------------------------------------
# Classify genes into unique modes of duplication
c_genes <- classify_genes(c_full)

# Inspecting the output
names(c_genes)
head(c_genes$Scerevisiae)

# Number of genes per mode
table(c_genes$Scerevisiae$type)

## ----kaks_calculation---------------------------------------------------------
data(cds_scerevisiae)
head(cds_scerevisiae)

# Store DNAStringSet object in a list
cds_list <- list(Scerevisiae = cds_scerevisiae)

# Keep only top five TD-derived gene pairs for demonstration purposes
td_pairs <- c_full$Scerevisiae[c_full$Scerevisiae$type == "TD", ]
gene_pairs <- list(Scerevisiae = td_pairs[seq(1, 5, by = 1), ])

# Calculate Ka, Ks, and Ka/Ks
kaks <- pairs2kaks(gene_pairs, cds_list)

# Inspect the output
head(kaks)

## ----ks_eda-------------------------------------------------------------------
# Load data and inspect it
data(gmax_ks)
head(gmax_ks)

# Plot distribution
plot_ks_distro(gmax_ks)

## ----find_ks_peaks------------------------------------------------------------
# Find 2 and 3 peaks and test which one has more support
peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = c(2, 3), verbose = TRUE)
names(peaks)
str(peaks)

# Visualize Ks distribution
plot_ks_peaks(peaks)

## ----find_peaks_explicit------------------------------------------------------
# Find 2 peaks ignoring Ks values > 1
peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = 2, max_ks = 1)
plot_ks_peaks(peaks)

## ----sizer--------------------------------------------------------------------
# Get numeric vector of Ks values <= 1
ks <- gmax_ks$Ks[gmax_ks$Ks <= 1]

# Get SiZer map
feature::SiZer(ks)

## ----split_by_peak------------------------------------------------------------
# Gene pairs without age-based classification
head(gmax_ks)

# Classify gene pairs by age group
pairs_age_group <- split_pairs_by_peak(gmax_ks[, c(1,2,3)], peaks)

# Inspecting the output
names(pairs_age_group)

# Take a look at the classified gene pairs
head(pairs_age_group$pairs)

# Visualize Ks distro with age boundaries
pairs_age_group$plot

## -----------------------------------------------------------------------------
# Get all pairs in age group 2
pairs_ag2 <- pairs_age_group$pairs[pairs_age_group$pairs$peak == 2, c(1,2)]

# Get all SD pairs
sd_pairs <- gmax_ks[gmax_ks$type == "SD", c(1,2)]

# Merge tables
pairs_wgd_legumes <- merge(pairs_ag2, sd_pairs)

head(pairs_wgd_legumes)

## -----------------------------------------------------------------------------
# Load data set with pre-computed duplicates for 3 fungi species
data(fungi_kaks)
names(fungi_kaks)
head(fungi_kaks$saccharomyces_cerevisiae)

# Get a data frame of counts per mode in all species
counts_table <- duplicates2counts(fungi_kaks |> classify_genes())

counts_table

## -----------------------------------------------------------------------------
# A) Facets
p1 <- plot_duplicate_freqs(counts_table)

# B) Stacked barplot, absolute frequencies
p2 <- plot_duplicate_freqs(counts_table, plot_type = "stack")

# C) Stacked barplot, relative frequencies
p3 <- plot_duplicate_freqs(counts_table, plot_type = "stack_percent")

# Combine plots, one per row
patchwork::wrap_plots(p1, p2, p3, nrow = 3) + 
    patchwork::plot_annotation(tag_levels = "A")

## ----fig.height = 3, fig.width = 8--------------------------------------------
# Frequency of duplicated genes by mode
classify_genes(fungi_kaks) |>   # classify genes into unique duplication types
    duplicates2counts() |>      # get a data frame of counts (long format)
    plot_duplicate_freqs()      # plot frequencies

## ----fig.height=3, fig.width=9------------------------------------------------
ks_df <- fungi_kaks$saccharomyces_cerevisiae

# A) Histogram, whole paranome
p1 <- plot_ks_distro(ks_df, plot_type = "histogram")

# B) Density, whole paranome
p2 <- plot_ks_distro(ks_df, plot_type = "density") 

# C) Histogram with density lines, whole paranome
p3 <- plot_ks_distro(ks_df, plot_type = "density_histogram")

# Combine plots side by side
patchwork::wrap_plots(p1, p2, p3, nrow = 1) +
    patchwork::plot_annotation(tag_levels = "A")

## ----fig.width = 8, fig.height=4----------------------------------------------
# A) Duplicates by type, histogram
p1 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "histogram")

# B) Duplicates by type, violin
p2 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "violin")

# Combine plots side by side
patchwork::wrap_plots(p1, p2) +
    patchwork::plot_annotation(tag_levels = "A")

## ----fig.width = 6, fig.height = 4--------------------------------------------
# A) Ks for each species
p1 <- plot_rates_by_species(fungi_kaks)

# B) Ka/Ks by duplication type for each species
p2 <- plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks", bytype = TRUE)

# Combine plots - one per row
patchwork::wrap_plots(p1, p2, nrow = 2) +
    patchwork::plot_annotation(tag_levels = "A")

## ----session_info-------------------------------------------------------------
sessioninfo::session_info()