## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL ) ## ----installation, eval = FALSE----------------------------------------------- # if(!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # # BiocManager::install("doubletrouble") # # ## Check that you have a valid Bioconductor installation # BiocManager::valid() ## ----load_package------------------------------------------------------------- library(doubletrouble) ## ----data1-------------------------------------------------------------------- # Load list of DIAMOND tabular output data(yeast_seq) head(yeast_seq) ## ----ex_data------------------------------------------------------------------ # Load annotation list processed with syntenet::process_input() data(yeast_annot) head(yeast_annot) ## ----process_input------------------------------------------------------------ library(syntenet) # Process input data pdata <- process_input(yeast_seq, yeast_annot) # Inspect the output names(pdata) pdata$seq pdata$annotation ## ----run_diamond_intraspecies------------------------------------------------- data(diamond_intra) # Run DIAMOND in sensitive mode for S. cerevisiae only if(diamond_is_installed()) { diamond_intra <- run_diamond( seq = pdata$seq["Scerevisiae"], compare = "intraspecies", outdir = file.path(tempdir(), "diamond_intra"), ... = "--sensitive" ) } # Inspect output names(diamond_intra) head(diamond_intra$Scerevisiae_Scerevisiae) ## ----binary_classification---------------------------------------------------- # Binary scheme c_binary <- classify_gene_pairs( annotation = pdata$annotation, blast_list = diamond_intra, scheme = "binary" ) # Inspecting the output names(c_binary) head(c_binary$Scerevisiae) # How many pairs are there for each duplication mode? table(c_binary$Scerevisiae$type) ## ----expanded_classification-------------------------------------------------- # Standard scheme c_standard <- classify_gene_pairs( annotation = pdata$annotation, blast_list = diamond_intra, scheme = "standard" ) # Inspecting the output names(c_standard) head(c_standard$Scerevisiae) # How many pairs are there for each duplication mode? table(c_standard$Scerevisiae$type) ## ----blast_interspecies------------------------------------------------------- data(diamond_inter) # load pre-computed output in case DIAMOND is not installed # Create data frame of comparisons to be made comparisons <- data.frame( species = "Scerevisiae", outgroup = "Cglabrata" ) comparisons # Run DIAMOND for the comparison we specified if(diamond_is_installed()) { diamond_inter <- run_diamond( seq = pdata$seq, compare = comparisons, outdir = file.path(tempdir(), "diamond_inter"), ... = "--sensitive" ) } names(diamond_inter) head(diamond_inter$Scerevisiae_Cglabrata) ## ----full_classification------------------------------------------------------ # Extended scheme c_extended <- classify_gene_pairs( annotation = pdata$annotation, blast_list = diamond_intra, scheme = "extended", blast_inter = diamond_inter ) # Inspecting the output names(c_extended) head(c_extended$Scerevisiae) # How many pairs are there for each duplication mode? table(c_extended$Scerevisiae$type) ## ----------------------------------------------------------------------------- library(txdbmaker) # Create a list of `TxDb` objects from a list of `GRanges` objects txdb_list <- lapply(yeast_annot, txdbmaker::makeTxDbFromGRanges) txdb_list ## ----------------------------------------------------------------------------- # Get a list of data frames with intron counts per gene for each species intron_counts <- lapply(txdb_list, get_intron_counts) # Inspecting the list names(intron_counts) head(intron_counts$Scerevisiae) ## ----------------------------------------------------------------------------- # Full scheme c_full <- classify_gene_pairs( annotation = pdata$annotation, blast_list = diamond_intra, scheme = "full", blast_inter = diamond_inter, intron_counts = intron_counts ) # Inspecting the output names(c_full) head(c_full$Scerevisiae) # How many pairs are there for each duplication mode? table(c_full$Scerevisiae$type) ## ----classify_genes----------------------------------------------------------- # Classify genes into unique modes of duplication c_genes <- classify_genes(c_full) # Inspecting the output names(c_genes) head(c_genes$Scerevisiae) # Number of genes per mode table(c_genes$Scerevisiae$type) ## ----kaks_calculation--------------------------------------------------------- data(cds_scerevisiae) head(cds_scerevisiae) # Store DNAStringSet object in a list cds_list <- list(Scerevisiae = cds_scerevisiae) # Keep only top five TD-derived gene pairs for demonstration purposes td_pairs <- c_full$Scerevisiae[c_full$Scerevisiae$type == "TD", ] gene_pairs <- list(Scerevisiae = td_pairs[seq(1, 5, by = 1), ]) # Calculate Ka, Ks, and Ka/Ks kaks <- pairs2kaks(gene_pairs, cds_list) # Inspect the output head(kaks) ## ----ks_eda------------------------------------------------------------------- # Load data and inspect it data(gmax_ks) head(gmax_ks) # Plot distribution plot_ks_distro(gmax_ks) ## ----find_ks_peaks------------------------------------------------------------ # Find 2 and 3 peaks and test which one has more support peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = c(2, 3), verbose = TRUE) names(peaks) str(peaks) # Visualize Ks distribution plot_ks_peaks(peaks) ## ----find_peaks_explicit------------------------------------------------------ # Find 2 peaks ignoring Ks values > 1 peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = 2, max_ks = 1) plot_ks_peaks(peaks) ## ----sizer-------------------------------------------------------------------- # Get numeric vector of Ks values <= 1 ks <- gmax_ks$Ks[gmax_ks$Ks <= 1] # Get SiZer map feature::SiZer(ks) ## ----split_by_peak------------------------------------------------------------ # Gene pairs without age-based classification head(gmax_ks) # Classify gene pairs by age group pairs_age_group <- split_pairs_by_peak(gmax_ks, peaks) # Inspecting the output names(pairs_age_group) # Take a look at the classified gene pairs head(pairs_age_group$pairs) # Visualize Ks distro with age boundaries pairs_age_group$plot ## ----------------------------------------------------------------------------- # Load data set with pre-computed duplicates for 3 fungi species data(fungi_kaks) names(fungi_kaks) head(fungi_kaks$saccharomyces_cerevisiae) # Get a data frame of counts per mode in all species counts_table <- duplicates2counts(fungi_kaks |> classify_genes()) counts_table ## ----------------------------------------------------------------------------- # A) Facets p1 <- plot_duplicate_freqs(counts_table) # B) Stacked barplot, absolute frequencies p2 <- plot_duplicate_freqs(counts_table, plot_type = "stack") # C) Stacked barplot, relative frequencies p3 <- plot_duplicate_freqs(counts_table, plot_type = "stack_percent") # Combine plots, one per row patchwork::wrap_plots(p1, p2, p3, nrow = 3) + patchwork::plot_annotation(tag_levels = "A") ## ----fig.height = 3, fig.width = 8-------------------------------------------- # Frequency of duplicated genes by mode classify_genes(fungi_kaks) |> # classify genes into unique duplication types duplicates2counts() |> # get a data frame of counts (long format) plot_duplicate_freqs() # plot frequencies ## ----fig.height=3, fig.width=9------------------------------------------------ ks_df <- fungi_kaks$saccharomyces_cerevisiae # A) Histogram, whole paranome p1 <- plot_ks_distro(ks_df, plot_type = "histogram") # B) Density, whole paranome p2 <- plot_ks_distro(ks_df, plot_type = "density") # C) Histogram with density lines, whole paranome p3 <- plot_ks_distro(ks_df, plot_type = "density_histogram") # Combine plots side by side patchwork::wrap_plots(p1, p2, p3, nrow = 1) + patchwork::plot_annotation(tag_levels = "A") ## ----fig.width = 8, fig.height=4---------------------------------------------- # A) Duplicates by type, histogram p1 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "histogram") # B) Duplicates by type, violin p2 <- plot_ks_distro(ks_df, bytype = TRUE, plot_type = "violin") # Combine plots side by side patchwork::wrap_plots(p1, p2) + patchwork::plot_annotation(tag_levels = "A") ## ----fig.width = 6, fig.height = 4-------------------------------------------- # A) Ks for each species p1 <- plot_rates_by_species(fungi_kaks) # B) Ka/Ks by duplication type for each species p2 <- plot_rates_by_species(fungi_kaks, rate_column = "Ka_Ks", bytype = TRUE) # Combine plots - one per row patchwork::wrap_plots(p1, p2, nrow = 2) + patchwork::plot_annotation(tag_levels = "A") ## ----session_info------------------------------------------------------------- sessioninfo::session_info()