%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{1. Working with large arrays in R (slides from July 2017)}
%\VignetteDepends{knitr,Matrix,DelayedArray,HDF5Array,SummarizedExperiment,airway,lobstr}

% 2019-12-22: A temporary fix to avoid the following pdflatex error caused by
% an issue in LaTeX package filehook-scrlfile (used by beamer):
%   ! Package filehook Error: Detected unknown definition of \InputIfFileExists.
%   Use the 'force' option of 'filehook' to overwrite it..
% The error appeared on tokay2 in Dec 2019 after reinstalling MiKTeX 2.9.
% See comment by Phelype Oleinik here for the fix:
%   https://tex.stackexchange.com/questions/512189/problem-with-chemmacros-beamer-and-filehook-scrlfile-sty
\PassOptionsToPackage{force}{filehook}

\documentclass[8pt]{beamer}

\mode<presentation> {
\usetheme{Madrid}
\usecolortheme{whale}
}

\usepackage{slides}
\renewcommand\Rclass[1]{{\texttt{#1}\index{#1 (class)}}}

\AtBeginSection[]
{
  \begin{frame}<beamer>
    \tableofcontents[currentsection]
  \end{frame}
}

\title{Working with large arrays in R}
\subtitle{A look at HDF5Array/RleArray/DelayedArray objects}

\author{Herv\'e Pag\`es\\
        \href{mailto:hpages.on.github@gmail.com}{hpages.on.github@gmail.com}}

\institute{Bioconductor conference\\Boston}

\date{July 2017}

\begin{document}

<<setup, include=FALSE>>=
library(knitr)
opts_chunk$set(size="scriptsize")
mydata_dir <- file.path(tempdir(), "mydata")
if (!dir.exists(mydata_dir)) dir.create(mydata_dir)
options(width=80)
library(Matrix)
library(DelayedArray)
library(HDF5Array)
library(SummarizedExperiment)
library(airway)
library(lobstr)
@

\maketitle

\frame{\tableofcontents}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Motivation and challenges}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Motivation and challenges}

  R ordinary {\bf matrix} or {\bf array} is not suitable for big datasets:
  \begin{block}{}
    \begin{itemize}
      \item 10x Genomics dataset (single cell experiment):
            30,000 genes x 1.3 million cells = 36.5 billion values
      \item in an ordinary integer matrix ==> 136G in memory!
    \end{itemize}
  \end{block}

  \bigskip

  Need for alternative containers:
  \begin{block}{}
    \begin{itemize}
      \item but at the same time, the object should be (almost) as easy to
            manipulate as an ordinary matrix or array
      \item {\em standard R matrix/array API}: \Rcode{dim}, \Rcode{dimnames},
            \Rcode{t}, \Rcode{is.na}, \Rcode{==}, \Rcode{+}, \Rcode{log},
            \Rcode{cbind}, \Rcode{max}, \Rcode{sum}, \Rcode{colSums}, etc...
      \item not limited to 2 dimensions ==> also support arrays of arbitrary
            number of dimensions
    \end{itemize}
  \end{block}

  \bigskip

  2 approaches: {\bf in-memory data} vs {\bf on-disk data}
\end{frame}


\begin{frame}[fragile]
  \frametitle{Motivation and challenges}

  \centerline{\bf In-memory data}

  \begin{block}{}
    \begin{itemize}
      \item a 30k x 1.3M matrix might still fit in memory if the data can
            be efficiently compressed
      \item example: sparse data (small percentage of nonzero values) ==>
            {\em sparse representation} (storage of nonzero values only)
      \item example: data with long runs of identical values ==> {\em RLE
            compression (Run Length Encoding)}
      \item choose the {\em smallest type} to store the values: \Rcode{raw}
            (1 byte) < \Rcode{integer} (4 bytes) < \Rcode{double} (8 bytes)
      \item if using {\em RLE compression}:
            \begin{itemize}
              \item choose the {\em best orientation} to store the values:
                    {\em by row} or {\em by column} (one might give better
                    compression than the other)
              \item store the data by chunk ==> opportunity to pick up
                    {\em best type} and {\em best orientation} on a chunk
                    basis (instead of for the whole data)
            \end{itemize}
      \item size of 30k x 1.3M matrix in memory can be reduced from 136G
            to 16G!
    \end{itemize}
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Motivation and challenges}

  \centerline{\bf Examples of in-memory containers}

  \bigskip

  {\bf dgCMatrix} container from the \Biocpkg{Matrix} package:
  \begin{block}{}
    \begin{itemize}
      \item sparse matrix representation
      \item nonzero values stored as \Rcode{double}
    \end{itemize}
  \end{block}

  \bigskip

  {\bf RleArray} and {\bf RleMatrix} containers from the
  \Biocpkg{DelayedArray} package:
  \begin{block}{}
    \begin{itemize}
      \item use RLE compression
      \item arbitrary number of dimensions
      \item type of values: any R atomic type (\Rcode{integer},
            \Rcode{double}, \Rcode{logical}, \Rcode{complex},
            \Rcode{character}, and \Rcode{raw})
    \end{itemize}
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Motivation and challenges}

  \centerline{\bf On-disk data}

  \bigskip

  However...
  \begin{itemize}
    \item if data is too big to fit in memory (even after compression) ==>
          must use {\em on-disk representation}
    \item challenge: should still be (almost) as easy to manipulate as
          an ordinary matrix! ({\em standard R matrix/array API})
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Motivation and challenges}

  \centerline{\bf Examples of on-disk containers}

  \bigskip

  Direct manipulation of an {\bf HDF5 dataset} via the
  \Biocpkg{rhdf5} API. Low level API!

  \bigskip

  {\bf HDF5Array} and {\bf HDF5Matrix} containers from the
  \Biocpkg{HDF5Array} package:
  \begin{block}{}
    Provide access to the HDF5 dataset via an API that mimics the standard
    R matrix/array API
  \end{block}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Memory footprint}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Memory footprint}

  \centerline{\bf The "airway" dataset}

  \begin{columns}[t]
    \begin{column}{0.36\textwidth}
      \begin{exampleblock}{}
<<airway>>=
library(airway)
data(airway)
m <- unname(assay(airway))
dim(m)
typeof(m)
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.52\textwidth}
      \begin{exampleblock}{}
<<airway2>>=
head(m, n=4)
tail(m, n=4)
sum(m != 0) / length(m)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Memory footprint}

  \centerline{{\bf dgCMatrix} vs {\bf RleMatrix} vs {\bf HDF5Matrix}}

  \begin{columns}[t]
    \begin{column}{0.60\textwidth}
      \begin{exampleblock}{}
<<obj_size>>=
library(lobstr)  # for obj_size()
obj_size(m)

library(Matrix)
obj_size(as(m, "dgCMatrix"))

library(DelayedArray)
obj_size(as(m, "RleMatrix"))
obj_size(as(t(m), "RleMatrix"))

library(HDF5Array)
obj_size(as(m, "HDF5Matrix"))
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Memory footprint}

  Some limitations of the sparse matrix implementation in the \Biocpkg{Matrix}
  package:

  \begin{block}{}
    \begin{itemize}
      \item nonzero values always stored as \Rcode{double}, the most memory
            consuming type
      \item number of nonzero values must be $< 2^{31}$
      \item limited to 2 dimensions: no support for arrays of arbitrary number
            of dimensions
    \end{itemize}
  \end{block}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{RleArray and HDF5Array objects}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  RleMatrix/RleArray and HDF5Matrix/HDF5Array provide:
  \begin{block}{}
    \begin{itemize}
      \item support all R atomic types
      \item no limits in size (but each dimension must be $< 2^{31}$)
      \item arbitrary number of dimensions
    \end{itemize}
  \end{block}

  \bigskip

  And also:
  \begin{block}{}
    \begin{itemize}
      \item {\bf delayed operations}
      \item {\bf block processing} (behind the scene)
      \item TODO: multicore block processing (sequential only at the moment)
    \end{itemize}
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \centerline{\bf Delayed operations}

  \bigskip

  \centerline{We start with HDF5Matrix object \Rcode{M}:}

  \begin{columns}[t]
    \begin{column}{0.60\textwidth}
      \begin{exampleblock}{}
<<M>>=
M <- as(m, "HDF5Matrix")
M
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  Subsetting is delayed:

  \begin{columns}[t]
    \begin{column}{0.40\textwidth}
      \begin{exampleblock}{}
<<M2>>=
M2 <- M[10:12, 1:5]
M2
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.48\textwidth}
      \begin{exampleblock}{}
<<seed_of_M2>>=
seed(M2)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  Transposition is delayed:

  \begin{columns}[t]
    \begin{column}{0.40\textwidth}
      \begin{exampleblock}{}
<<>>=
M3 <- t(M2)
M3
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.48\textwidth}
      \begin{exampleblock}{}
<<>>=
seed(M3)
@
      \end{exampleblock}
    \end{column}
    \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \Rcode{cbind()} / \Rcode{rbind()} are delayed:

  \begin{columns}[t]
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<>>=
M4 <- cbind(M3, M[1:5, 6:8])
M4
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<eval=FALSE>>=
seed(M4)  # Error! (more than one seed)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  All the operations in the following groups are delayed:
  \begin{itemize}
    \item \Rcode{Arith} (\Rcode{+}, \Rcode{-}, ...)
    \item \Rcode{Compare} (\Rcode{==}, \Rcode{<}, ...)
    \item \Rcode{Logic} (\Rcode{\&}, \Rcode{|})
    \item \Rcode{Math} (\Rcode{log}, \Rcode{sqrt})
    \item and more ...
  \end{itemize}

  \begin{columns}[t]
    \begin{column}{0.42\textwidth}
      \begin{exampleblock}{}
<<>>=
M5 <- M == 0
M5
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.47\textwidth}
      \begin{exampleblock}{}
<<>>=
seed(M5)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \begin{columns}[t]
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<>>=
M6 <- round(M[11:14, ] / M[1:4, ], digits=3)
M6
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<eval=FALSE>>=
seed(M6)  # Error! (more than one seed)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \centerline{\bf Realization}

  \bigskip

  Delayed operations can be {\bf realized} by coercing the DelayedMatrix
  object to HDF5Array:

  \begin{columns}[t]
    \begin{column}{0.40\textwidth}
      \begin{exampleblock}{}
<<>>=
M6a <- as(M6, "HDF5Array")
M6a
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.48\textwidth}
      \begin{exampleblock}{}
<<>>=
seed(M6a)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \bigskip

  ... or by coercing it to RleArray:

  \begin{columns}[t]
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<>>=
M6b <- as(M6, "RleArray")
M6b
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<>>=
seed(M6b)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \centerline{\bf Controlling where HDF5 datasets are realized}

  \bigskip

  {\em HDF5 dump management utilities}: a set of utilities to control where
  HDF5 datasets are written to disk.

  \begin{columns}[t]
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<>>=
hdf5_dumpfile <- file.path(mydata_dir, "M6c.h5")
setHDF5DumpFile(hdf5_dumpfile)
setHDF5DumpName("M6c")
M6c <- as(M6, "HDF5Array")
@
      \end{exampleblock}
    \end{column}
    \begin{column}{0.44\textwidth}
      \begin{exampleblock}{}
<<>>=
seed(M6c)
h5ls(hdf5_dumpfile)
@
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \centerline{\Rcode{showHDF5DumpLog()}}

  \begin{exampleblock}{}
<<>>=
showHDF5DumpLog()
@
  \end{exampleblock}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \centerline{\bf Block processing}

  \bigskip

  The following operations are NOT delayed. They are implemented via a
  {\em block processing} mechanism that loads and processes one block
  at a time:
  \begin{itemize}
    \item operations in the \Rcode{Summary} group (\Rcode{max}, \Rcode{min},
          \Rcode{sum}, \Rcode{any}, \Rcode{all})
    \item \Rcode{mean}
    \item Matrix row/col summarization operations (\Rcode{col/rowSums},
          \Rcode{col/rowMeans}, ...)
    \item \Rcode{anyNA}, \Rcode{which}
    \item \Rcode{apply}
    \item and more ...
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{RleArray and HDF5Array objects}

  \begin{columns}[t]
    \begin{column}{0.75\textwidth}
      \begin{exampleblock}{}
<<>>=
DelayedArray:::set_verbose_block_processing(TRUE)
colSums(M)
@
      \end{exampleblock}

      Control the block size:

      \begin{exampleblock}{}
<<>>=
getAutoBlockSize()
setAutoBlockSize(1e6)
colSums(M)
@ 
      \end{exampleblock}
    \end{column}
  \end{columns}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Hands-on}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Hands-on}

  \begin{block}{}
    1. Load the "airway" dataset.
  \end{block}

  \begin{block}{}
    2. It's wrapped in a SummarizedExperiment object. Get the count data as
       an ordinary matrix.
  \end{block}

  \begin{block}{}
    3. Wrap it in an HDF5Matrix object: (1) using \Rcode{writeHDF5Array()};
       then (2) using coercion.
  \end{block}

  \begin{block}{}
    4. When using coercion, where has the data been written on disk?
  \end{block}

  \begin{block}{}
    5. See \Rcode{?setHDF5DumpFile} for how to control the location of
       "automatic" HDF5 datasets. Try to control the destination of the
       data when coercing.
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Hands-on}

  \begin{block}{}
    6. Use \Rcode{showHDF5DumpLog()} to see all the HDF5 datasets written to
       disk during the current session.
  \end{block}

  \bigskip

  \begin{block}{}
    7. Try some operations on the HDF5Matrix object: (1) some delayed ones;
       (2) some non-delayed ones (block processing).
  \end{block}

  \bigskip

  \begin{block}{}
    8. Use \Rcode{DelayedArray:::set\_verbose\_block\_processing(TRUE)}
       to see block processing in action.
  \end{block}

  \bigskip

  \begin{block}{}
    9. Control the block size with \Rcode{setAutoBlockSize()}.
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Hands-on}

  \begin{block}{}
    10. Stick the HDF5Matrix object back in the SummarizedExperiment object.
        The resulting object is an "HDF5-backed SummarizedExperiment object".
  \end{block}

  \bigskip

  \begin{block}{}
    11. The HDF5-backed SummarizedExperiment object can be manipulated
        (almost) like an in-memory SummarizedExperiment object.
        Try \Rcode{[}, \Rcode{cbind}, \Rcode{rbind} on it.
  \end{block}

  \bigskip

  \begin{block}{}
    12. The \Biocpkg{SummarizedExperiment} package provides
        \Rcode{saveHDF5SummarizedExperiment} to save a SummarizedExperiment
        object (HDF5-backed or not) as an HDF5-backed SummarizedExperiment
        object. Try it.
  \end{block}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{DelayedArray/HDF5Array: Future developments}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}[fragile]
  \frametitle{Future developments}

  \centerline{\bf Block processing improvements}

  \begin{block}{}
    Block genometry: (1) better by default, (2) let the user have more
    control on it
  \end{block}

  \begin{block}{}
    Support multicore
  \end{block}

  \begin{block}{}
    Expose it: \Rcode{blockApply()}
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Future developments}

  \centerline{\bf HDF5Array improvements}

  \begin{block}{}
    Store the \Rcode{dimnames} in the HDF5 file (in {\em HDF5 Dimension Scale
    datasets} - \url{https://www.hdfgroup.org/HDF5/Tutor/h5dimscale.html})
  \end{block}

  \begin{block}{}
    Use better automatic chunk geometry when realizing an HDF5Array object
  \end{block}

  \begin{block}{}
    Block processing should take advantage of the chunk geometry (e.g.
    \Rcode{realize()} should use blocks that are clusters of chunks)
  \end{block}

  \begin{block}{}
    Unfortunately: not possible to support multicore realization at the
    moment (HDF5 does not support concurrent writing to a dataset yet)
  \end{block}
\end{frame}

\begin{frame}[fragile]
  \frametitle{Future developments}

  \centerline{\bf RleArray improvements}

  \begin{block}{}
    Let the user have more control on the chunk geometry when
    constructing/realizing an RleArray object
  \end{block}

  \begin{block}{}
    Like for HDF5Array objects, block processing should take advantage
    of the chunk geometry
  \end{block}

  \begin{block}{}
    Support multicore realization
  \end{block}

  \begin{block}{}
    Provide C/C++ low-level API for direct row/column access from C/C++ code
    (e.g. from the \Biocpkg{beachmat} package)
  \end{block}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

<<cleanup, include=FALSE>>=
setHDF5DumpFile()
unlink(mydata_dir, recursive=TRUE, force=TRUE)
@

\end{document}