Author: Martin Morgan
Date: 26 July, 2019
Attach the [TENxBrainData][] experiment data package
library(TENxBrainData)
Load a very large SingleCellExperiment (an extension of SummarizedExperiment)
tenx <- TENxBrainData()
## snapshotDate(): 2019-07-10
## see ?TENxBrainData and browseVignettes('TENxBrainData') for documentation
## downloading 0 resources
## loading from cache
tenx
## class: SingleCellExperiment
## dim: 27998 1306127
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): Ensembl Symbol
## colnames(1306127): AAACCTGAGATAGGAG-1 AAACCTGAGCGGCTTC-1 ...
## TTTGTCAGTTAAAGTG-133 TTTGTCATCTGAAAGA-133
## colData names(4): Barcode Sequence Library Mouse
## reducedDimNames(0):
## spikeNames(0):
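The counts themselves remain on disk in an HDF5 file; the R object merely references them, so its in-memory footprint is small compared to its 27,998 x 1,306,127 dimensions. A quick check (the exact size will vary):
print(object.size(tenx), units = "MB")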
assay(tenx)
## <27998 x 1306127> DelayedMatrix object of type "integer":
## AAACCTGAGATAGGAG-1 ... TTTGTCATCTGAAAGA-133
## [1,] 0 . 0
## [2,] 0 . 0
## [3,] 0 . 0
## [4,] 0 . 0
## [5,] 0 . 0
## ... . . .
## [27994,] 0 . 0
## [27995,] 1 . 0
## [27996,] 0 . 0
## [27997,] 0 . 0
## [27998,] 0 . 0
Quickly perform basic operations; the result is another DelayedMatrix, so no values are computed until they are needed
log1p(assay(tenx))
## <27998 x 1306127> DelayedMatrix object of type "double":
## AAACCTGAGATAGGAG-1 ... TTTGTCATCTGAAAGA-133
## [1,] 0 . 0
## [2,] 0 . 0
## [3,] 0 . 0
## [4,] 0 . 0
## [5,] 0 . 0
## ... . . .
## [27994,] 0.0000000 . 0
## [27995,] 0.6931472 . 0
## [27996,] 0.0000000 . 0
## [27997,] 0.0000000 . 0
## [27998,] 0.0000000 . 0
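A small illustration, not part of the original output, that values are only realized when explicitly requested, here for a 5 x 3 corner of the transformed matrix:
as.matrix(log1p(assay(tenx))[1:5, 1:3])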
Subset and summarize ‘library size’ (total counts per cell) for the first 1000 cells
tenx_subset <- tenx[,1:1000]
lib_size <- colSums(assay(tenx_subset))
hist(log10(lib_size))
The data are sparse, with more than 92% of the matrix entries in this subset equal to 0
sum(assay(tenx_subset) == 0) / prod(dim(tenx_subset))
## [1] 0.9276541
Avoid unnecessary copying
n <- 50000
## makes a copy of `res1` on each iteration
set.seed(123)
system.time({
res1 <- NULL
for (i in 1:n)
res1 <- c(res1, rnorm(1))
})
## user system elapsed
## 7.098 2.908 10.013
## pre-allocation
set.seed(123)
system.time({
res2 <- numeric(n)
for (i in 1:n)
res2[i] <- rnorm(1)
})
## user system elapsed
## 0.078 0.004 0.082
identical(res1, res2)
## [1] TRUE
## no need to think about allocation!
set.seed(123)
system.time({
res3 <- sapply(1:n, function(i) rnorm(1))
})
## user system elapsed
## 0.094 0.002 0.096
identical(res1, res3)
## [1] TRUE
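A variant using vapply() also needs no explicit pre-allocation, and additionally checks that each result is a length-one numeric; a small sketch, not part of the original timing comparison:
set.seed(123)
res4 <- vapply(1:n, function(i) rnorm(1), numeric(1))
identical(res1, res4)  ## should also be TRUE (same random-number stream)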
Vectorize your own scripts
n <- 2000000
## iteration: n calls to `rnorm(1)`
set.seed(123)
system.time({
res1 <- sapply(1:n, function(i) rnorm(1))
})
## user system elapsed
## 5.678 0.693 6.374
## vectorize: 1 call to `rnorm(n)`
set.seed(123)
system.time( res2 <- rnorm(n) )
## user system elapsed
## 0.137 0.000 0.136
identical(res1, res2)
## [1] TRUE
Reuse other people’s efficient code, e.g., limma::lmFit() to fit 10,000s of linear models very quickly. Examples in the lab this afternoon, and from Tuesday evening.
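A minimal sketch of the lmFit() idiom on simulated data (the design and sample sizes are illustrative, not from the original):
library(limma)
set.seed(42)
y <- matrix(rnorm(10000 * 6), nrow = 10000)  ## 10,000 'genes' x 6 samples of log-expression
group <- factor(rep(c("A", "B"), each = 3))  ## two groups of three samples
design <- model.matrix(~ group)              ## intercept + group effect
fit <- eBayes(lmFit(y, design))              ## fit all 10,000 linear models at once
topTable(fit, coef = "groupB")               ## top-ranked 'genes' for the group effect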
Example: nucleotide (GC) frequency of mapped reads
Working through example data; not really large…
library(RNAseqData.HNRNPC.bam.chr14)
fname <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
basename(fname)
## [1] "ERR127306_chr14.bam"
Rsamtools::countBam(fname)
## space start end width file records nucleotides
## 1 NA NA NA NA ERR127306_chr14.bam 800484 57634848
Input into a GAlignments object, including the seq (sequence) of each read.
library(GenomicAlignments)
param <- ScanBamParam(what = "seq")
galn <- readGAlignments(fname, param = param)
Write a function to determine GC content of reads
library(Biostrings)
gc_content <- function(galn) {
seq <- mcols(galn)[["seq"]]
gc <- letterFrequency(seq, "GC", as.prob=TRUE)
as.vector(gc)
}
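letterFrequency() does the real work here; a tiny stand-alone illustration of its behavior (the example sequences are made up):
library(Biostrings)
dna <- DNAStringSet(c("ACGT", "GGCC", "AATT"))
letterFrequency(dna, "GC", as.prob = TRUE)  ## per-sequence GC proportions: 0.5, 1, 0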
Calculate and display GC content
param <- ScanBamParam(what = "seq")
galn <- readGAlignments(fname, param = param)
res1 <- gc_content(galn)
hist(res1)
Open file for reading, specifying ‘yield’ size
bfl <- BamFile(fname, yieldSize = 100000)
open(bfl)
Repeatedly read chunks of data and calculate GC content
res2 <- numeric()
repeat {
message(".")
galn <- readGAlignments(bfl, param = param)
if (length(galn) == 0)
break
## inefficient copy of res2, but only a few iterations...
res2 <- c(res2, gc_content(galn))
}
## .
## .
## .
## .
## .
## .
## .
## .
## .
## .
Clean up and compare approaches
close(bfl)
identical(res1, res2)
## [1] TRUE
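As an aside, the copy flagged in the loop’s comment can be avoided by collecting each chunk’s result in a list and concatenating once at the end; a sketch (the file must be re-opened, and the names are illustrative):
bfl <- BamFile(fname, yieldSize = 100000)
open(bfl)
param <- ScanBamParam(what = "seq")
chunks <- list()
repeat {
    galn <- readGAlignments(bfl, param = param)
    if (length(galn) == 0)
        break
    chunks[[length(chunks) + 1L]] <- gc_content(galn)  ## grow the list, not the vector
}
close(bfl)
res2_alt <- unlist(chunks, use.names = FALSE)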
Explicit, chunk-wise iteration has many down-sides: the book-keeping is verbose and error-prone, and the chunks are processed one after another.
Parallel evaluation can help; the maximum possible speed-up is easiest to see with a toy task that does nothing but sleep for one second.
fun <- function(i) {
Sys.sleep(1) # a time-consuming calculation
i # and then the result
}
system.time({
res1 <- lapply(1:10, fun)
})
## user system elapsed
## 0.003 0.000 10.036
library(BiocParallel)
system.time({
res2 <- bplapply(1:10, fun)
})
## user system elapsed
## 2.034 0.062 3.170
identical(res1, res2)
## [1] TRUE
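bplapply() uses whichever back-end is currently registered; a back-end can also be chosen explicitly for a single call. A minimal sketch (the choice of 4 forked workers is an assumption; SnowParam() would be used on Windows):
bp_param <- MulticoreParam(workers = 4)  ## forked worker processes
system.time({
    res3 <- bplapply(1:10, fun, BPPARAM = bp_param)
})
identical(res1, res3)  ## should again be TRUE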
Parallel, chunk-wise iteration through genomic files. Set up:
library(GenomicFiles)
Define a yield function that provides a chunk of data for processing
yield <- function(x) {
param <- ScanBamParam(what = "seq")
readGAlignments(x, param = param)
}
Define a map function that transforms the input data to the desired result (here, the same computation as gc_content() above)
map <- function(x) {
seq <- mcols(x)[["seq"]]
gc <- letterFrequency(seq, "GC", as.prob = TRUE)
as.vector(gc)
}
Define a reduce function that combines two successive results
reduce <- c
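With reduce <- c, successive chunk results are simply concatenated; a tiny illustration of one reduction step (the values are made up):
reduce(c(0.41, 0.52), c(0.48, 0.39))  ## same as c(0.41, 0.52, 0.48, 0.39)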
Perform the calculation, chunk-wise and in parallel
library(RNAseqData.HNRNPC.bam.chr14)
fname <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
bfl <- BamFile(fname, yieldSize = 100000)
res <- reduceByYield(bfl, yield, map, reduce, parallel = TRUE)
hist(res)
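The same yield / map / reduce functions could be applied across all eight example BAM files, one file per worker, via bplapply(); a sketch assuming a forked (multicore) back-end so that the functions defined above are visible to the workers:
fnames <- RNAseqData.HNRNPC.bam.chr14_BAMFILES
res_all <- bplapply(fnames, function(fname) {
    reduceByYield(BamFile(fname, yieldSize = 100000), yield, map, reduce)
})
lengths(res_all)  ## one vector of per-read GC content per file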
sessionInfo()
## R version 3.6.1 Patched (2019-07-16 r76845)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Users/ma38727/bin/R-3-6-branch/lib/libRblas.dylib
## LAPACK: /Users/ma38727/bin/R-3-6-branch/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GenomicFiles_1.21.0 rtracklayer_1.45.2
## [3] GenomicAlignments_1.21.4 Rsamtools_2.1.3
## [5] Biostrings_2.53.2 XVector_0.25.0
## [7] RNAseqData.HNRNPC.bam.chr14_0.23.0 TENxBrainData_1.5.0
## [9] HDF5Array_1.13.4 rhdf5_2.29.0
## [11] SingleCellExperiment_1.7.0 SummarizedExperiment_1.15.5
## [13] DelayedArray_0.11.4 BiocParallel_1.19.0
## [15] matrixStats_0.54.0 Biobase_2.45.0
## [17] GenomicRanges_1.37.14 GenomeInfoDb_1.21.1
## [19] IRanges_2.19.10 S4Vectors_0.23.17
## [21] BiocGenerics_0.31.5 BiocStyle_2.13.2
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.0 bit64_0.9-7
## [3] AnnotationHub_2.17.6 shiny_1.3.2
## [5] assertthat_0.2.1 askpass_1.1
## [7] interactiveDisplayBase_1.23.0 BiocManager_1.30.4
## [9] BiocFileCache_1.9.1 blob_1.2.0
## [11] BSgenome_1.53.0 GenomeInfoDbData_1.2.1
## [13] progress_1.2.2 yaml_2.2.0
## [15] pillar_1.4.2 RSQLite_2.1.2
## [17] backports_1.1.4 lattice_0.20-38
## [19] glue_1.3.1 digest_0.6.20
## [21] promises_1.0.1 htmltools_0.3.6
## [23] httpuv_1.5.1 Matrix_1.2-17
## [25] XML_3.98-1.20 pkgconfig_2.0.2
## [27] biomaRt_2.41.7 bookdown_0.12
## [29] zlibbioc_1.31.0 purrr_0.3.2
## [31] xtable_1.8-4 later_0.8.0
## [33] openssl_1.4.1 tibble_2.1.3
## [35] GenomicFeatures_1.37.4 magrittr_1.5
## [37] crayon_1.3.4 mime_0.7
## [39] memoise_1.1.0 evaluate_0.14
## [41] prettyunits_1.0.2 tools_3.6.1
## [43] hms_0.5.0 stringr_1.4.0
## [45] Rhdf5lib_1.7.3 AnnotationDbi_1.47.0
## [47] compiler_3.6.1 rlang_0.4.0
## [49] grid_3.6.1 RCurl_1.95-4.12
## [51] VariantAnnotation_1.31.3 rappdirs_0.3.1
## [53] bitops_1.0-6 rmarkdown_1.14
## [55] ExperimentHub_1.11.3 codetools_0.2-16
## [57] DBI_1.0.0 curl_4.0
## [59] R6_2.4.0 knitr_1.23
## [61] dplyr_0.8.3 bit_1.1-14
## [63] zeallot_0.1.0 stringi_1.4.3
## [65] Rcpp_1.0.1 vctrs_0.2.0
## [67] dbplyr_1.4.2 tidyselect_0.2.5
## [69] xfun_0.8