%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{3. A DelayedArray / HDF5Array update (slides from April 2021)}
%\VignetteDepends{knitr,Matrix,DelayedArray,HDF5Array,lobstr}

% 2019-12-22: A temporary fix to avoid the following pdflatex error caused by
% an issue in LaTeX package filehook-scrlfile (used by beamer):
%   ! Package filehook Error: Detected unknown definition of \InputIfFileExists.
%   Use the 'force' option of 'filehook' to overwrite it..
% The error appeared on tokay2 in Dec 2019 after reinstalling MiKTeX 2.9.
% See comment by Phelype Oleinik here for the fix:
%   https://tex.stackexchange.com/questions/512189/problem-with-chemmacros-beamer-and-filehook-scrlfile-sty
\PassOptionsToPackage{force}{filehook}

\documentclass[8pt]{beamer}

% Theme settings apply to presentation mode only.
\mode<presentation>
{
  \usetheme{Madrid}
  \usecolortheme{crane}
}

% NOTE(review): \Rcode, \Biocpkg and \Rclass are presumably provided by the
% local 'slides' package -- confirm against slides.sty.
\usepackage{slides}

% Typeset a class name in typewriter font and add a "<name> (class)" index
% entry for it.
\renewcommand\Rclass[1]{{\texttt{#1}\index{#1 (class)}}}

% Insert an outline slide, with the current section highlighted, at the
% beginning of every section.
\AtBeginSection[]
{
  \begin{frame}
    \tableofcontents[currentsection]
  \end{frame}
}

\title{DelayedArray / HDF5Array update}
\author{Herv\'e Pag\`es}
\institute{Fred Hutch, Seattle}
\date{April 2021}

\begin{document}

% Setup chunk: sets knitr/R display defaults and attaches the packages used
% by the slides. include=FALSE keeps its code and output off the slides.
<<setup, include=FALSE>>=
library(knitr)
opts_chunk$set(size="scriptsize")
options(width=80)
library(Matrix)
library(DelayedArray)
library(HDF5Array)
library(lobstr)
@

\maketitle

\frame{\tableofcontents}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Recent additions to package DelayedArray}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{ConstantArray objects (by Aaron)}

  \begin{block}{}
    This would ordinarily take up 8 TB of memory:
<<ConstantArray>>=
library(DelayedArray)
CM <- ConstantArray(c(1e6, 1e6), value=NA_real_)
CM
lobstr::obj_size(CM)
@
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{sinkApply()}

  \begin{block}{}
    \Rcode{sinkApply()}: a convenience function for walking on a
    \Rcode{RealizationSink} derivative for the purpose of filling it with
    blocks of data
  \end{block}

  \bigskip

  % Static listing: typeset verbatim, not evaluated by knitr.
  \begin{block}{}
    Example: Fill a 1e6 x 1e6 on-disk matrix with random data
\begin{knitrout}\scriptsize
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
sink <- HDF5RealizationSink(c(1e6L, 1e6L))  # or TileDBRealizationSink
sink_grid <- defaultSinkAutoGrid(sink)

FUN <- function(sink, viewport) {
    block <- array(runif(length(viewport)), dim=dim(viewport))
    write_block(sink, viewport, block)
}
sink <- sinkApply(sink, FUN, grid=sink_grid)
close(sink)

M <- as(sink, "DelayedArray")
\end{verbatim}
\end{kframe}
\end{knitrout}
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{rbind(), cbind(), and sparsity}

  \begin{block}{}
    \Rcode{rbind()} and \Rcode{cbind()} of \Rcode{DelayedArray} objects
    now propagate sparsity
  \end{block}

  \bigskip

  % Static listing: typeset verbatim, not evaluated by knitr.
  \begin{block}{}
\begin{knitrout}\scriptsize
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{verbatim}
tenx1 <- HDF5Array::TENxMatrix("tenx1.h5")  # is_sparse(tenx1) is TRUE
tenx2 <- HDF5Array::TENxMatrix("tenx2.h5")  # is_sparse(tenx2) is TRUE

bigtenx <- cbind(tenx1, tenx2)
is_sparse(bigtenx)  # TRUE

blockApply(bigtenx, FUN, ...)  # will take advantage of sparsity
\end{verbatim}
\end{kframe}
\end{knitrout}
  \end{block}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Recent additions to package HDF5Array}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Recent additions to package HDF5Array}

  \begin{block}{}
    \Rcode{HDF5Array()}: can now take a URL to a file on Amazon S3
    (kind of slow!)
  \end{block}

  \bigskip

  \begin{block}{}
    \Rcode{H5SparseMatrix}: a \Rcode{DelayedMatrix} subclass for representing
    and operating on an HDF5 sparse matrix stored in CSR/CSC/Yale format
    (e.g.\ 10x Genomics and h5ad formats)
  \end{block}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Work in progress and future work}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Work in progress and future work}

  Work in progress:
  \begin{block}{}
    \Rcode{h5summarize(..., op="sum")}: Optimized summarization of an HDF5
    dataset or subset:
    \begin{itemize}
      \item Implemented in C (direct calls to HDF5 C lib in
            \Biocpkg{Rhdf5lib})
      \item Operates at the level of the physical chunks
      \item More efficient than \Rcode{blockApply()}
      \item Integration with \Biocpkg{DelayedArray}/\Biocpkg{DelayedMatrixStats}:
            \Rcode{h5summarize()} will be used behind the scenes by things
            like \Rcode{rowVars()}
    \end{itemize}
  \end{block}

  \bigskip

  Future work:
  \begin{block}{}
    \Rcode{SparseArray} objects: In-memory sparse representation of arrays
    of arbitrary dimensions
    \begin{itemize}
      \item Already used internally by block processing of sparse
            \Rcode{DelayedArray} objects (current name is
            \Rcode{SparseArraySeed})
      \item Will go to their own package (currently in \Rcode{DelayedArray})
      \item Implement fast native operations: arithmetic, \Rcode{Math} group
            (e.g.\ \Rcode{log}), summarization, etc. This will benefit block
            processing of sparse \Rcode{DelayedArray} objects
    \end{itemize}
  \end{block}
\end{frame}

\end{document}