## ----setup, include = FALSE---------------------------------------------------
library(fluentGenomics)
dir <- system.file("extdata", package="macrophage")

library(tximeta)
makeLinkedTxome(
  indexDir=file.path(dir, "gencode.v29_salmon_0.12.0"),
  source="Gencode",
  organism="Homo sapiens",
  release="29",
  genome="GRCh38",
  fasta="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz",
  gtf=file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version
  write=FALSE
)

knitr::opts_chunk$set(
  fig.align = "center"
)

## ----workflow, fig.cap = "(ref:workflow)", fig.align='center', out.width="\\textwidth", echo = FALSE----
knitr::include_graphics("workflow.png")

## ----read-cache, eval = FALSE-------------------------------------------------
#  library(fluentGenomics)
#  atac <- readRDS(cache_atac_se())

## ----dir, eval=FALSE----------------------------------------------------------
#  dir <- "/path/to/quant/files"

## ----setdir-------------------------------------------------------------------
dir <- system.file("extdata", package="macrophage")

## ----coldata-rna--------------------------------------------------------------
library(dplyr)
library(readr)
colfile <- file.path(dir, "coldata.csv")
coldata <- read_csv(colfile) %>%
  dplyr::select(
    names,
    id = sample_id,
    line = line_id,
    condition = condition_name
  ) %>%
  dplyr::mutate(
    files = file.path(dir, "quants", names, "quant.sf.gz"),
    line = factor(line),
    condition = relevel(factor(condition), "naive")
  )
coldata

## ----tximeta-run--------------------------------------------------------------
suppressPackageStartupMessages(library(SummarizedExperiment))
library(tximeta)
se <- tximeta(coldata, dropInfReps=TRUE)
se

## ----linkedtxome-ex, eval = FALSE---------------------------------------------
#  makeLinkedTxome(
#    indexDir=file.path(dir, "gencode.v29_salmon_0.12.0"),
#    source="Gencode",
#    organism="Homo sapiens",
#    release="29",
#    genome="GRCh38",
#    fasta="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz",
#    gtf=file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version
#    write=FALSE
#  )

## ----gse----------------------------------------------------------------------
gse <- summarizeToGene(se)

## ----coldata-atac, eval=FALSE-------------------------------------------------
#  atac_coldata <- read_tsv("ATAC_sample_metadata.txt.gz") %>%
#    select(
#      sample_id,
#      donor,
#      condition = condition_name
#    ) %>%
#    mutate(condition = relevel(factor(condition), "naive"))

## ----mat-atac, eval=FALSE-----------------------------------------------------
#  atac_mat <- read_tsv("ATAC_cqn_matrix.txt.gz",
#                       skip = 1,
#                       col_names =c("rownames", atac_coldata[["sample_id"]]))
#  rownames <- atac_mat[["rownames"]]
#  atac_mat <- as.matrix(atac_mat[,-1])
#  rownames(atac_mat) <- rownames

## ----peaks-atac, eval=FALSE---------------------------------------------------
#  library(plyranges)
#  peaks_df <- read_tsv("ATAC_peak_metadata.txt.gz",
#                       col_types = c("cidciicdc")
#  )
#  
#  peaks_gr <- peaks_df %>%
#    as_granges(seqnames = chr) %>%
#    select(peak_id=gene_id) %>%
#    set_genome_info(genome = "GRCh38")

## ----atac-se, eval=FALSE------------------------------------------------------
#  atac <- SummarizedExperiment(assays = list(cqndata=atac_mat),
#                               rowRanges=peaks_gr,
#                               colData=atac_coldata)

## ----setup-deseq--------------------------------------------------------------
library(DESeq2)
dds <- DESeqDataSet(gse, ~line + condition)
# filter out lowly expressed genes
# at least 10 counts in at least 6 samples
keep <- rowSums(counts(dds) >= 10) >= 6
dds <- dds[keep,]

## ----fit-model----------------------------------------------------------------
dds <- DESeq(dds)

## ----results-DFrame-----------------------------------------------------------
res <- results(dds,
               contrast=c("condition","IFNg","naive"),
               lfcThreshold=1, alpha=0.01)

## ----ma-plot, fig.cap="(ref:maplot)"------------------------------------------
summary(res)
DESeq2::plotMA(res, ylim=c(-10,10))

## ----results-GRanges----------------------------------------------------------
suppressPackageStartupMessages(library(plyranges))
de_genes <- results(dds,
                    contrast=c("condition","IFNg","naive"),
                    lfcThreshold=1,
                    format="GRanges") %>%
  names_to_column("gene_id")
de_genes

## ----de-genes-----------------------------------------------------------------
de_genes <- de_genes %>%
  filter(padj < 0.01) %>%
  select(gene_id, de_log2FC = log2FoldChange, de_padj = padj)

## ----not-de-genes-------------------------------------------------------------
other_genes <- results(dds,
                       contrast=c("condition","IFNg","naive"),
                       lfcThreshold=1,
                       altHypothesis="lessAbs",
                       format="GRanges") %>%
  filter(padj < 0.01) %>%
  names_to_column("gene_id") %>%
  dplyr::select(gene_id,
                de_log2FC = log2FoldChange,
                de_padj = padj)

## ----limma, eval = FALSE------------------------------------------------------
#  library(limma)
#  design <- model.matrix(~donor + condition, colData(atac))
#  fit <- lmFit(assay(atac), design)
#  fit <- eBayes(fit)
#  idx <- which(colnames(fit$coefficients) == "conditionIFNg")
#  tt <- topTable(fit, coef=idx, sort.by="none", n=nrow(atac))

## ----peaks-tidy, eval = FALSE-------------------------------------------------
#  atac_peaks <- rowRanges(atac) %>%
#    remove_names() %>%
#    mutate(
#      da_log2FC = tt$logFC,
#      da_padj = tt$adj.P.Val
#    ) %>%
#    set_genome_info(genome = "hg38")
#  
#  seqlevelsStyle(atac_peaks) <- "UCSC"

## ----load-peaks---------------------------------------------------------------
library(fluentGenomics)
peaks

## ----filter-peaks-------------------------------------------------------------
da_peaks <- peaks %>%
  filter(da_padj < 0.01)

## ----slice-example------------------------------------------------------------
size <- length(de_genes)
slice(other_genes, sample.int(n(), size))

## ----boot-set-01--------------------------------------------------------------
# set a seed for the results
set.seed(2019-08-02)
boot_genes <- replicate(10,
                        slice(other_genes, sample.int(n(), size)),
                        simplify = FALSE)

## ----boot-set-02--------------------------------------------------------------
boot_genes <- bind_ranges(boot_genes, .id = "resample")

## ----combine-results----------------------------------------------------------
all_genes <- bind_ranges(
  de=de_genes,
  not_de = boot_genes,
  .id="origin"
) %>%
  mutate(
    origin = factor(origin, c("not_de", "de")),
    resample = ifelse(is.na(resample), 0L, as.integer(resample))
  )
all_genes

## ----resize-01----------------------------------------------------------------
all_genes <- all_genes %>%
  anchor_5p() %>%
  mutate(width = 1)

## ----resize-02----------------------------------------------------------------
all_genes <- all_genes %>%
  anchor_center() %>%
  mutate(width=2*1e4)

## ----olap-join----------------------------------------------------------------
genes_olap_peaks <- all_genes %>%
  join_overlap_left(da_peaks)
genes_olap_peaks

## ----reduce-ex01--------------------------------------------------------------
gene_peak_max_lfc <- genes_olap_peaks %>%
  group_by(gene_id, origin)  %>%
  reduce_ranges_directed(
    peak_count = sum(!is.na(da_padj)) / n_distinct(resample),
    peak_max_lfc = max(abs(da_log2FC))
  )

## ----boxplot, fig.cap = "(ref:boxplot)"---------------------------------------
library(ggplot2)
gene_peak_max_lfc %>%
  filter(peak_count > 0) %>%
  as.data.frame() %>%
  ggplot(aes(origin, peak_max_lfc)) +
  geom_boxplot()

## ----summarize-ex01-----------------------------------------------------------
origin_peak_lfc <- genes_olap_peaks %>%
  group_by(origin) %>%
  summarize(
    peak_count = sum(!is.na(da_padj)) / n_distinct(resample),
    lfc1_peak_count =sum(abs(da_log2FC) > 1, na.rm=TRUE)/ n_distinct(resample),
    lfc2_peak_count = sum(abs(da_log2FC) > 2, na.rm=TRUE)/ n_distinct(resample)
  )
origin_peak_lfc

## ----pivot-enrich-------------------------------------------------------------
origin_peak_lfc %>%
  as.data.frame() %>%
  tidyr::pivot_longer(cols = -origin) %>%
  tidyr::pivot_wider(names_from = origin, values_from = value) %>%
  mutate(enrichment = de / not_de)

## ----reduce-summarize---------------------------------------------------------
genes_olap_peaks %>%
  group_by(gene_id, origin, resample) %>%
  reduce_ranges_directed(
    lfc1 = sum(abs(da_log2FC) > 1, na.rm=TRUE),
    lfc2 = sum(abs(da_log2FC) > 2, na.rm=TRUE)
  ) %>%
  group_by(origin) %>%
  summarize(
    lfc1_gene_count = sum(lfc1 > 0) / n_distinct(resample),
    lfc1_peak_count = sum(lfc1) / n_distinct(resample),
    lfc2_gene_count = sum(lfc2 > 0) / n_distinct(resample),
    lfc2_peak_count = sum(lfc2) / n_distinct(resample)
  )

## ----count-fn-----------------------------------------------------------------
count_if_above_threshold <- function(var, thresholds) {
  lapply(thresholds, function(.) sum(abs(var) > ., na.rm = TRUE))
}

## ----thresholds---------------------------------------------------------------
thresholds <- da_peaks %>%
  mutate(abs_lfc = abs(da_log2FC)) %>%
  with(
    seq(min(abs_lfc), max(abs_lfc), length.out = 100)
  )

## ----reduce-ex02--------------------------------------------------------------
genes_peak_all_thresholds <- genes_olap_peaks %>%
  group_by(gene_id, origin, resample) %>%
  reduce_ranges_directed(
    value = count_if_above_threshold(da_log2FC, thresholds),
    threshold = list(thresholds)
  )
genes_peak_all_thresholds

## ----expand-summarize---------------------------------------------------------
origin_peak_all_thresholds <- genes_peak_all_thresholds %>%
  expand_ranges() %>%
  group_by(origin, threshold) %>%
  summarize(
    gene_count = sum(value > 0) / n_distinct(resample),
    peak_count = sum(value) / n_distinct(resample)
  )
origin_peak_all_thresholds

## ----line-chart, fig.cap = "(ref:linechart)"----------------------------------
origin_threshold_counts <- origin_peak_all_thresholds %>%
  as.data.frame() %>%
  tidyr::pivot_longer(cols = -c(origin, threshold),
                      names_to = c("type", "var"),
                      names_sep = "_",
                      values_to = "count") %>%
  select(-var)

origin_threshold_counts %>%
  filter(type == "peak") %>%
  tidyr::pivot_wider(names_from = origin, values_from = count) %>%
  mutate(enrichment =  de / not_de) %>%
  ggplot(aes(x = threshold, y = enrichment)) +
  geom_line() +
  labs(x = "logFC threshold", y = "Relative Enrichment")

## ----line-chart2, fig.cap = "(ref:linechart2)"--------------------------------
origin_threshold_counts %>%
  ggplot(aes(x = threshold,
             y = count + 1,
             color = origin,
             linetype = type)) +
  geom_line() +
  scale_y_log10()

## ----eval = FALSE-------------------------------------------------------------
#  # development version from Github
#  BiocManager::install("sa-lee/fluentGenomics")
#  # version available from Bioconductor
#  BiocManager::install("fluentGenomics")

## -----------------------------------------------------------------------------
sessionInfo()