### R code from vignette source 'MutationalPatterns.Rnw'

###################################################
### code chunk number 1: style
###################################################
BiocStyle::latex()


###################################################
### code chunk number 2: options
###################################################
options(width=72)


###################################################
### code chunk number 3: loading_reference_data
###################################################
library(BSgenome)

# Let's display the first five entries only to save space.
available.genomes()[1:5]


###################################################
### code chunk number 4: loading_reference_data
###################################################
ref_genome <- "BSgenome.Hsapiens.UCSC.hg19"
library(ref_genome, character.only = TRUE)


###################################################
### code chunk number 5: load_package
###################################################
library(MutationalPatterns)


###################################################
### code chunk number 6: locate_files
###################################################
vcf_files <- list.files(system.file("extdata", package="MutationalPatterns"),
                        pattern = ".vcf", full.names = TRUE)


###################################################
### code chunk number 7: set_sample_names
###################################################
sample_names <- c(
    "colon1", "colon2", "colon3",
    "intestine1", "intestine2", "intestine3",
    "liver1", "liver2", "liver3")


###################################################
### code chunk number 8: read_vcfs_as_granges
###################################################
vcfs <- read_vcfs_as_granges(vcf_files, sample_names, genome = "hg19")
summary(vcfs)


###################################################
### code chunk number 9: select_subset_of_chromosomes
###################################################
auto <- extractSeqlevelsByGroup(species="Homo_sapiens", 
                                style="UCSC", 
                                group="auto")

vcfs <- lapply(vcfs, function(x) keepSeqlevels(x, auto))


###################################################
### code chunk number 10: mutations_from_vcf
###################################################
mutations_from_vcf(vcfs[[1]])


###################################################
### code chunk number 11: mutation_types
###################################################
mutation_types(vcfs[[1]])


###################################################
### code chunk number 12: mutation_context
###################################################
mutation_context(vcfs[[1]], ref_genome)


###################################################
### code chunk number 13: type_context
###################################################
type_context(vcfs[[1]], ref_genome)


###################################################
### code chunk number 14: mut_type_occurrences
###################################################
# We will use the output of this function later to make plots a little
# bit later.
type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
type_occurrences


###################################################
### code chunk number 15: plot_spectrum
###################################################
p1 <- plot_spectrum(type_occurrences)


###################################################
### code chunk number 16: plot_spectrum_2
###################################################
p2 <- plot_spectrum(type_occurrences, CT = TRUE)


###################################################
### code chunk number 17: plot_spectrum_3
###################################################
p3 <- plot_spectrum(type_occurrences, CT = TRUE, legend = FALSE)


###################################################
### code chunk number 18: combine_plot_spectrum
###################################################
library("gridExtra")
grid.arrange(p1, p2, p3, ncol=3, widths=c(3,3,1.8))


###################################################
### code chunk number 19: plot_spectrum_4
###################################################
tissue <- c(rep("colon", 3), rep("intestine", 3), rep("liver", 3))
p4 <- plot_spectrum(type_occurrences, by = tissue, CT = TRUE, legend = FALSE)


###################################################
### code chunk number 20: plot_spectrum_5
###################################################
my_colors <- c("pink", "orange", "blue", "lightblue", "green", "red", "purple")
p5 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE, 
                    colors = my_colors)


###################################################
### code chunk number 21: combine_plot_spectrum_2
###################################################
grid.arrange(p4, p5, ncol=2, widths=c(3,2))


###################################################
### code chunk number 22: mut_matrix
###################################################
mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome)


###################################################
### code chunk number 23: plot_96_profile
###################################################
plot_96_profile(mut_mat[,c(1,4,7)])


###################################################
### code chunk number 24: estimate_rank
###################################################
# Add a tiny psuedocount to avoid a 0 in the matrix.
mut_mat = mut_mat + 0.0001

# Use the NMF package to generate an estimate plot.
library("NMF")
estimate = nmf(mut_mat, rank=2:5, method="brunet", nrun=100, seed=123456)
plot(estimate)


###################################################
### code chunk number 25: extract_signatures
###################################################
nmf_res <- extract_signatures(mut_mat, rank = 2)


###################################################
### code chunk number 26: add_column_names
###################################################
colnames(nmf_res$signatures) <- c("Signature A", "Signature B")


###################################################
### code chunk number 27: plot_96_profile
###################################################
plot_96_profile(nmf_res$signatures)


###################################################
### code chunk number 28: plot_contribution
###################################################
rownames(nmf_res$contribution) <- c("Signature A", "Signature B")
plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "relative")


###################################################
### code chunk number 29: plot_contribution_2
###################################################
# Plot absolute signature contribution
plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "absolute")

# Plot contribution of signatures for subset of samples with index parameter
plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "absolute",
                    index = c(1,2))

# flip X and Y coordinates
plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "absolute",
                    coord_flip = TRUE)


###################################################
### code chunk number 30: plot_compare_profiles
###################################################
plot_compare_profiles(mut_mat[,1], 
                        nmf_res$reconstructed[,1], 
                        profile_names = c("Original", "Reconstructed"))


###################################################
### code chunk number 31: download_cancer_signatures
###################################################
sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
                "signatures_probabilities.txt", sep = "")

cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
# Reorder (to make the order of the trinucleotide changes the same)
cancer_signatures = cancer_signatures[order(cancer_signatures[,1]),]
# Only signatures in matrix
cancer_signatures = as.matrix(cancer_signatures[,4:33])


###################################################
### code chunk number 32: fit_to_signatures
###################################################
fit_res <- fit_to_signatures(mut_mat, cancer_signatures)


###################################################
### code chunk number 33: plot_contribution_3
###################################################
# select signatures with some contribution
select <- which(rowSums(fit_res$contribution) > 0)
# plot contribution
plot_contribution(fit_res$contribution[select,], 
                    cancer_signatures[,select], 
                    coord_flip = FALSE, 
                    mode = "absolute")


###################################################
### code chunk number 34: plot_compare_profiles_2
###################################################
plot_compare_profiles(mut_mat[,1], fit_res$reconstructed[,1],
                        profile_names = c("Original", 
                            "Reconstructed \n cancer signatures"))


###################################################
### code chunk number 35: get_genes
###################################################
# Get ``known genes'' table from UCSC for hg19
# biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
genes_hg19 <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)


###################################################
### code chunk number 36: strand_from_vcf
###################################################
strand_from_vcf(vcfs[[1]], genes_hg19)


###################################################
### code chunk number 37: mut_matrix_stranded
###################################################
mut_mat_s <- mut_matrix_stranded(vcfs, ref_genome, genes_hg19)
head(mut_mat_s, 10)


###################################################
### code chunk number 38: strand_occurrences
###################################################
strand_counts <- strand_occurrences(mut_mat_s, by=tissue)


###################################################
### code chunk number 39: plot_strand
###################################################
plot_strand(strand_counts, mode = "relative")


###################################################
### code chunk number 40: strand_bias_test
###################################################
strand_bias <- strand_bias_test(strand_counts)


###################################################
### code chunk number 41: plot_strand_bias
###################################################
plot_strand_bias(strand_bias)


###################################################
### code chunk number 42: extract_signatures
###################################################
# Extract 2 signatures
nmf_res_strand <- extract_signatures(mut_mat_s, rank = 2)

# Provide signature names
colnames(nmf_res_strand$signatures) <- c("Signature A", "Signature B")


###################################################
### code chunk number 43: plot_192_profile
###################################################
# Plot signatures with 192 features
a <- plot_192_profile(nmf_res_strand$signatures)

# Plot strand bias per mutation type for each signature with significance test
b <- plot_signature_strand_bias(nmf_res_strand$signatures)

# Load the gridExtra library for combining plots.
grid.arrange(a, b, ncol=2, widths=c(1,1))


###################################################
### code chunk number 44: plot_rainfall
###################################################
# Define autosomal chromosomes
chromosomes <- seqnames(get(ref_genome))[1:22]

# Make a rainfall plot
plot_rainfall(vcfs[[1]], title = names(vcfs[1]),
                chromosomes = chromosomes, cex = 1.5)


###################################################
### code chunk number 45: plot_rainfall_2
###################################################
chromosomes <- seqnames(get(ref_genome))[1]
plot_rainfall(vcfs[[1]], title = names(vcfs[1]),
                chromosomes = chromosomes[1], cex = 2)


###################################################
### code chunk number 46: install_biomaRt
###################################################
## In case you hadn't installed biomaRt yet, you should comment out
## the following lines:
# source("https://bioconductor.org/biocLite.R")
# biocLite("biomaRt")

# Load the biomaRt package.
library(biomaRt)


###################################################
### code chunk number 47: download_using_biomaRt
###################################################
## We can query the BioMart database, but this may take a long time
## though, so we take some shortcuts by loading the results from our
## examples.  The corresponding code for downloading data can be
## found above the command we run.

# regulatory <- useEnsembl(biomart="regulation",
#                          dataset="hsapiens_regulatory_feature",
#                          GRCh = 37)

## Download the regulatory CTCF binding sites and convert them to
## a GRanges object.
# CTCF <- getBM(attributes = c('chromosome_name',
#                             'chromosome_start',
#                             'chromosome_end',
#                             'feature_type_name',
#                             'cell_type_name'),
#              filters = "regulatory_feature_type_name", 
#              values = "CTCF Binding Site", 
#              mart = regulatory)
#
# CTCF_g <- reduce(GRanges(CTCF$chromosome_name,
#                 IRanges(CTCF$chromosome_start,
#                 CTCF$chromosome_end)))
CTCF_g <- readRDS(system.file("states/CTCF_g_data.rds",
                    package="MutationalPatterns"))

## Download the promoter regions and conver them to a GRanges object.
# promoter = getBM(attributes = c('chromosome_name', 'chromosome_start',
#                                 'chromosome_end', 'feature_type_name'),
#                  filters = "regulatory_feature_type_name", 
#                  values = "Promoter", 
#                  mart = regulatory)
# promoter_g = reduce(GRanges(promoter$chromosome_name,
#                     IRanges(promoter$chromosome_start,
#                             promoter$chromosome_end)))
promoter_g <- readRDS(system.file("states/promoter_g_data.rds",
                        package="MutationalPatterns"))

# flanking = getBM(attributes = c('chromosome_name',
#                                 'chromosome_start',
#                                 'chromosome_end',
#                                 'feature_type_name'),
#                  filters = "regulatory_feature_type_name", 
#                  values = "Promoter Flanking Region", 
#                  mart = regulatory)
# flanking_g = reduce(GRanges(
#                        flanking$chromosome_name,
#                        IRanges(flanking$chromosome_start,
#                        flanking$chromosome_end)))

flanking_g <- readRDS(system.file("states/promoter_flanking_g_data.rds",
                                    package="MutationalPatterns"))


###################################################
### code chunk number 48: combine_genomic_regions
###################################################
regions <- GRangesList(promoter_g, flanking_g, CTCF_g)

names(regions) <- c("Promoter", "Promoter flanking", "CTCF")


###################################################
### code chunk number 49: combine_genomic_regions_2
###################################################
seqlevelsStyle(regions) <- "UCSC"


###################################################
### code chunk number 50: download_bed_data
###################################################
## Get the filename with surveyed/callable regions
surveyed_file <- list.files(system.file("extdata",
                            package = "MutationalPatterns"),
                            pattern = ".bed",
                            full.names = TRUE)

## Import the file using rtracklayer and use the UCSC naming standard
library(rtracklayer)
surveyed <- import(surveyed_file)
seqlevelsStyle(surveyed) <- "UCSC"

## For this example we use the same surveyed file for each sample.
surveyed_list <- rep(list(surveyed), 9)


###################################################
### code chunk number 51: genomic_distribution
###################################################
## Calculate the number of observed and expected number of mutations in
## each genomic regions for each sample.
distr <- genomic_distribution(vcfs, surveyed_list, regions)


###################################################
### code chunk number 52: enrichment_depletion_test
###################################################
## Perform the enrichment/depletion test by tissue type.
distr_test <- enrichment_depletion_test(distr, by = tissue)

## Or without specifying the 'by' parameter.
distr_test2 <- enrichment_depletion_test(distr)


###################################################
### code chunk number 53: plot_enrichment_depletion
###################################################
plot_enrichment_depletion(distr_test)


###################################################
### code chunk number 54: sessionInfo
###################################################
sessionInfo()