## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(fig.width=5, fig.height=5, message=FALSE, warning=FALSE)

## ----eval=FALSE---------------------------------------------------------------
# eh <- ExperimentHub()
# ah <- AnnotationHub()
# # some default resources:
# seg <- eh[["EH7307"]] # pre-built genome segmentation for hg38
# exclude <- ah[["AH107305"]] # Kundaje excluded regions for hg38, see below
# 
# set.seed(5) # set seed for reproducibility
# blockLength <- 5e5 # size of blocks to bootstrap
# R <- 10 # number of iterations of the bootstrap
# 
# # input `ranges` require seqlengths, if missing see `GenomeInfoDb::Seqinfo`
# seqlengths(ranges)
# # next, we remove non-standard chromosomes ...
# ranges <- keepStandardChromosomes(ranges, pruning.mode="coarse")
# # ... and mitochondrial genome, as these are too short
# seqlevels(ranges, pruning.mode="coarse") <- setdiff(seqlevels(ranges), "MT")
# 
# # generate bootstraps
# boots <- bootRanges(ranges, blockLength=blockLength, R=R,
#                     seg=seg, exclude=exclude)
# # `boots` can then be used with plyranges commands, e.g. join_overlap_*

## ----echo=FALSE---------------------------------------------------------------
knitr::include_graphics("images/bootRanges.jpeg")

## ----excluderanges------------------------------------------------------------
suppressPackageStartupMessages(library(AnnotationHub))
ah <- AnnotationHub()
# hg38.Kundaje.GRCh38_unified_Excludable
exclude_1 <- ah[["AH107305"]]
# hg38.UCSC.centromere
exclude_2 <- ah[["AH107354"]]
# hg38.UCSC.telomere
exclude_3 <- ah[["AH107355"]]
# hg38.UCSC.short_arm
exclude_4 <- ah[["AH107356"]]
# combine them
suppressWarnings({
  exclude <- trim(c(exclude_1, exclude_2, exclude_3, exclude_4))
})
exclude <- sort(GenomicRanges::reduce(exclude))

## ----message=FALSE, warning=FALSE---------------------------------------------
suppressPackageStartupMessages(library(ExperimentHub))
eh <- ExperimentHub()
seg_cbs <- eh[["EH7307"]] # prefer CBS for hg38
seg_hmm <- eh[["EH7308"]]
seg <- seg_cbs

## -----------------------------------------------------------------------------
suppressPackageStartupMessages(library(ensembldb))
suppressPackageStartupMessages(library(EnsDb.Hsapiens.v86))
edb <- EnsDb.Hsapiens.v86
filt <- AnnotationFilterList(GeneIdFilter("ENSG", "startsWith"))
g <- genes(edb, filter = filt)

## -----------------------------------------------------------------------------
library(GenomeInfoDb)
g <- keepStandardChromosomes(g, pruning.mode = "coarse")
# MT is too small for bootstrapping, so must be removed
seqlevels(g, pruning.mode="coarse") <- setdiff(seqlevels(g), "MT")
# normally we would assign a new style, but for recent host issues
# that produced vignette build problems, we use `paste0`
## seqlevelsStyle(g) <- "UCSC" 
seqlevels(g) <- paste0("chr", seqlevels(g))
genome(g) <- "hg38"
g <- sortSeqlevels(g)
g <- sort(g)
table(seqnames(g))

## -----------------------------------------------------------------------------
library(nullranges)
suppressPackageStartupMessages(library(plyranges))
library(patchwork)

## ----seg-cbs, fig.width=5, fig.height=4---------------------------------------
set.seed(5)
exclude <- exclude %>%
  plyranges::filter(width(exclude) >= 500)
L_s <- 1e6
seg_cbs <- segmentDensity(g, n = 3, L_s = L_s,
                          exclude = exclude, type = "cbs")
plots <- lapply(c("ranges","barplot","boxplot"), function(t) {
  plotSegment(seg_cbs, exclude, type = t)
})
plots[[1]]
plots[[2]] + plots[[3]]

## ----fig.width=4, fig.height=3------------------------------------------------
region <- GRanges("chr16", IRanges(3e7,4e7))
plotSegment(seg_cbs, exclude, type="ranges", region=region)

## ----seg-hmm, fig.width=5, fig.height=4---------------------------------------
seg_hmm <- segmentDensity(g, n = 3, L_s = L_s,
                          exclude = exclude, type = "hmm")
plots <- lapply(c("ranges","barplot","boxplot"), function(t) {
  plotSegment(seg_hmm, exclude, type = t)
})
plots[[1]]
plots[[2]] + plots[[3]]

## -----------------------------------------------------------------------------
suppressPackageStartupMessages(library(nullrangesData))
dhs <- DHSA549Hg38()
dhs <- dhs %>% plyranges::filter(signalValue > 100) %>%
  mutate(id = seq_along(.)) %>%
  plyranges::select(id, signalValue)
length(dhs)
table(seqnames(dhs))

## -----------------------------------------------------------------------------
set.seed(5) # for reproducibility
R <- 50
blockLength <- 5e5
boots <- bootRanges(dhs, blockLength, R = R, seg = seg, exclude=exclude)
boots

## -----------------------------------------------------------------------------
suppressPackageStartupMessages(library(tidyr))
combined <- dhs %>% 
  mutate(iter=0) %>%
  bind_ranges(boots) %>% 
  plyranges::select(iter)
stats <- combined %>% 
  group_by(iter) %>%
  summarize(n = n()) %>%
  as_tibble()
head(stats)

## ----eval=FALSE---------------------------------------------------------------
# # pseudocode for the general paradigm:
# boots <- bootRanges(y) # make bootstrapped y
# x %>% join_overlap_inner(boots) %>% # overlaps of x with bootstrapped y
#   group_by(x_id, iter) %>% # collate by x ID and the bootstrap iteration
#   summarize(some_statistic = ...) %>% # compute some summary on metadata
#   as_tibble() %>% # pass to tibble
#   complete(
#     x_id, iter, # for any missing combinations of x ID and iter...
#     fill=list(some_statistic = 0) # ...fill in missing values
#   )

## -----------------------------------------------------------------------------
x <- GRanges("chr2", IRanges(1 + 50:99 * 1e6, width=1e6), x_id=1:50)

## -----------------------------------------------------------------------------
x <- x %>% mutate(n_overlaps = count_overlaps(., dhs))
sum( x$n_overlaps )

## -----------------------------------------------------------------------------
boot_stats <- x %>% join_overlap_inner(boots) %>%
  group_by(x_id, iter) %>%
  summarize(n_overlaps = n()) %>%
  as_tibble() %>%
  complete(x_id, iter, fill=list(n_overlaps = 0)) %>%
  group_by(iter) %>%
  summarize(sumOverlaps = sum(n_overlaps))

## ----boot-histI---------------------------------------------------------------
suppressPackageStartupMessages(library(ggplot2))
ggplot(boot_stats, aes(sumOverlaps)) +
  geom_histogram(binwidth=5)+
  geom_vline(xintercept = sum(x$n_overlaps), linetype = "dashed")

## -----------------------------------------------------------------------------
x_obs <- x %>% join_overlap_inner(dhs,maxgap=1e3)
sum(x_obs$signalValue )
boot_stats <- x %>% join_overlap_inner(boots,maxgap=1e3)  %>%
  group_by(x_id, iter) %>%
  summarize(Signal = sum(signalValue)) %>%
  as_tibble() %>% 
  complete(x_id, iter, fill=list(Signal = 0)) %>%
  group_by(iter) %>%
  summarize(sumSignal = sum(Signal))

## ----boot-histII--------------------------------------------------------------
ggplot(boot_stats, aes(sumSignal)) +
  geom_histogram()+
  geom_vline(xintercept = sum(x_obs$signalValue), linetype = "dashed")

## -----------------------------------------------------------------------------
suppressPackageStartupMessages(library(nullrangesData))
dhs <- DHSA549Hg38()
region <- GRanges("chr1", IRanges(10e6 + 1, width=1e6))
x <- GRanges("chr1", IRanges(10e6 + 0:9 * 1e5 + 1, width=1e4))
y <- dhs %>% filter_by_overlaps(region) %>% select(NULL)
x %>% mutate(num_overlaps = count_overlaps(., y))

## -----------------------------------------------------------------------------
seg <- oneRegionSegment(region, seqlength=248956422)
y <- keepSeqlevels(y, "chr1")
set.seed(1)
boot <- bootRanges(y, blockLength=1e5, R=1, seg=seg,
                   proportionLength=FALSE)
boot
x %>% mutate(num_overlaps = count_overlaps(., boot))

## -----------------------------------------------------------------------------
suppressPackageStartupMessages(library(plotgardener))
my_palette <- function(n) {
  head(c("red","green3","red3","dodgerblue",
         "blue2","green4","darkred"), n)
}
plotGRanges <- function(gr) {
  pageCreate(width = 5, height = 5, xgrid = 0,
                ygrid = 0, showGuides = TRUE)
  for (i in seq_along(seqlevels(gr))) {
    chrom <- seqlevels(gr)[i]
    chromend <- seqlengths(gr)[[chrom]]
    suppressMessages({
      p <- pgParams(chromstart = 0, chromend = chromend,
                    x = 0.5, width = 4*chromend/500, height = 2,
                    at = seq(0, chromend, 50),
                    fill = colorby("state_col", palette=my_palette))
      prngs <- plotRanges(data = gr, params = p,
                          chrom = chrom,
                          y = 2 * i,
                          just = c("left", "bottom"))
      annoGenomeLabel(plot = prngs, params = p, y = 0.1 + 2 * i)
    })
  }
}

## -----------------------------------------------------------------------------
library(GenomicRanges)
seq_nms <- rep(c("chr1","chr2"), c(4,3))
seg <- GRanges(
  seqnames = seq_nms,
  IRanges(start = c(1, 101, 201, 401, 1, 201, 301),
          width = c(100, 100, 200, 100, 200, 100, 100)),
  seqlengths=c(chr1=500,chr2=400),
  state = c(1,2,1,3,3,2,1),
  state_col = factor(1:7)
)

## ----toysegments, fig.width=5, fig.height=4-----------------------------------
plotGRanges(seg)

## ----toyrangesI, fig.width=5, fig.height=4------------------------------------
set.seed(1)
n <- 200
gr <- GRanges(
  seqnames=sort(sample(c("chr1","chr2"), n, TRUE)),
  IRanges(start=round(runif(n, 1, 500-10+1)), width=10)
)
suppressWarnings({
  seqlengths(gr) <- seqlengths(seg)
})
gr <- gr[!(seqnames(gr) == "chr2" & end(gr) > 400)]
gr <- sort(gr)
idx <- findOverlaps(gr, seg, type="within", select="first")
gr <- gr[!is.na(idx)]
idx <- idx[!is.na(idx)]
gr$state <- seg$state[idx]
gr$state_col <- factor(seg$state_col[idx])
plotGRanges(gr)

## ----toy-no-prop, fig.width=5, fig.height=4-----------------------------------
set.seed(1)
gr_prime <- bootRanges(gr, blockLength = 25, seg = seg,
                       proportionLength = FALSE)
plotGRanges(gr_prime)

## ----toy-prop, fig.width=5, fig.height=4--------------------------------------
set.seed(1)
gr_prime <- bootRanges(gr, blockLength = 50, seg = seg,
                       proportionLength = TRUE)
plotGRanges(gr_prime)

## -----------------------------------------------------------------------------
sessionInfo()