% \VignetteEngine{knitr::knitr}
% \VignetteIndexEntry{05. Genomic Ranges -- slides}

\documentclass[xcolor=dvipsnames]{beamer}
\usepackage{BioconductorSlides}
\hypersetup{colorlinks,linkcolor=,urlcolor=Blue}

\title{\Rclass{GenomicRanges} for Data and Annotation}
\author{Martin T.\ Morgan\footnote{\url{mtmorgan@fhcrc.org}}}
\date{27--28 February 2014}

% Setup chunk: echo=FALSE keeps chunk output out of the preamble.
<<setup, echo=FALSE>>=
suppressPackageStartupMessages({
    library(knitr)
    library(GenomicRanges)
})
opts_chunk$set(cache=TRUE, tidy=FALSE)
@

\begin{document}

\maketitle

\begin{frame}{Introduction}
  Importance of range concepts: conceptually\ldots
  \begin{itemize}
  \item Genomic data and annotation can be represented by ranges
  \item Biological questions reflect range-based queries
  \end{itemize}
  Examples
  \begin{itemize}
  \item How many reads overlap each gene?
  \item How many reads span splice junctions?
  \item Where do regulatory elements bind in ChIP-seq experiments?
  \item Which regulatory elements are closest to differentially
    expressed genes?
  \item What sequences are common under discovered regulatory marks?
  \end{itemize}
\end{frame}

\begin{frame}{Where do \Rclass{GRanges}-like objects come from?}
  Data
  \begin{itemize}
  \item From BAM files via \Rfunction{readGAlignments} in
    \Biocpkg{GenomicAlignments}
  \item From BED files via \Rfunction{import} in \Biocpkg{rtracklayer}
  \end{itemize}
  Annotation
  \begin{itemize}
  \item \Biocpkg{rtracklayer} import BED, WIG, GTF, \ldots{} files
  \item \Rpackage{TxDb.*} model organism gene models;
    \Biocpkg{GenomicFeatures} \Rfunction{makeTranscriptDbFrom*}
  \item \Biocpkg{AnnotationHub} -- pre-computed instances from large
    public resources (later in course)
  \end{itemize}
\end{frame}

\begin{frame}[fragile]{Key reference}
  Lawrence et al., 2013, Software for Computing and Annotating Genomic Ranges.
  PLoS Comput Biol 9(8): e1003118\footnote{\url{http://www.ploscompbiol.org/article/info\%3Adoi\%2F10.1371\%2Fjournal.pcbi.1003118}}
  \begin{itemize}
  \item Initial developers: Michael Lawrence, Herv\'e Pag\`es, Patrick Aboyoun
  \end{itemize}
\end{frame}

\section{Ranges}

\subsection{IRanges}

\begin{frame}[fragile]{\Rclass{Ranges}}
  What is a range?
  \begin{itemize}
  \item `start' and `end' coordinate vectors
  \item Closed interval (i.e., includes end points)
  \item Zero-width convention
  \item Can be `named'
  \end{itemize}
<<>>=
library(IRanges)
eg <- IRanges(start= c(1, 10, 20),
              end = c(4, 10, 19),
              names= c("A", "B", "C"))
## bigger
start <- floor(runif(10000, 1, 1000))
end <- start + floor(runif(10000, 0, 100))
ir <- IRanges(start, end)
@
\end{frame}

\begin{frame}[fragile]{`Accessors' and simple manipulation}
  Accessors
  \begin{itemize}
  \item \Rfunction{start}, \Rfunction{end}, \Rfunction{width},
    \Rfunction{names}
  \end{itemize}
  `Vector'-like behavior
  \begin{itemize}
  \item \Rfunction{length}, \Rcode{[}
  \end{itemize}
<<>>=
length(ir)
ir[1:4]
start(ir[1:4])
ir[width(ir) > 10 & width(ir) < 20]
@
\end{frame}

\begin{frame}[fragile]{Operations}
  \begin{enumerate}
  \item Intra-range: operate on each range independently, e.g.,
    \Rfunction{shift}
  \item Inter-range: operate on several ranges of a single instance,
    e.g., \Rfunction{reduce}, \Rfunction{coverage}
  \item Between-range: operate on two instances, e.g.,
    \Rfunction{findOverlaps}
  \end{enumerate}
  See table in afternoon lab!
<<>>=
ir <- IRanges(start=c(7, 9, 12, 14, 22:24),
              end=c(15, 11, 12, 18, 26, 27, 28))
shift(ir, 1)
rir <- reduce(ir)
findOverlaps(ir, rir)
@
\end{frame}

\begin{frame}[fragile]{\Rclass{IRangesList}}
  \begin{itemize}
  \item Often useful to group \Rclass{IRanges} into a list, with each
    element of the list containing 0 or more \Rclass{IRanges}
    instances
  \item Operations usually work on each list element
  \end{itemize}
<<>>=
irl <- split(ir, width(ir))
reduce(irl)
@
\end{frame}

\subsection{GRanges}

\begin{frame}[fragile]{GRanges}
  Builds on \Rclass{IRanges}, \Rclass{IRangesList}\ldots
  \begin{itemize}
  \item `seqnames' (e.g., chromosome) and `strand'
  \item (optional) `seqlengths' for genome information
  \item (optional) `mcols' for `metadata' data frame on each range
  \end{itemize}
<<>>=
library(GenomicRanges)
genes <- GRanges(seqnames=c("chr3R", "chrX"),
                 ranges=IRanges(
                     start=c(19967117, 18962306),
                     end =c(19973212, 18962925),
                     names=c("FBgn0039155", "FBgn0085359")),
                 strand=c("+", "-"),
                 seqlengths=c(chr3R=27905053L, chrX=22422827L))
mcols(genes) <- DataFrame(EntrezId=c("42865", "2768869"),
                          Symbol=c("kal-1", "CG34330"))
@
\end{frame}

\begin{frame}[fragile]{Coordinates and accessors}
  Genome coordinates
  \begin{itemize}
  \item 1-based
  \item `left-most' -- `start' of ranges on the minus strand is the
    left-most coordinate, rather than the 5' coordinate.
  \end{itemize}
  Accessors
  \begin{itemize}
  \item \Rfunction{seqnames}, \Rfunction{strand},
    \Rfunction{seqlengths}, \Rfunction{seqlevels} and, like
    \Rclass{IRanges}, \Rfunction{start}, \Rfunction{end},
    \Rfunction{width}, \Rfunction{names}
  \item \Rfunction{mcols}; \Rfunction{\$} for direct access to metadata
  \end{itemize}
<<>>=
width(genes)
genes$Symbol
@
\end{frame}

\begin{frame}[fragile]{Operations}
  \begin{itemize}
  \item Like \Rclass{IRanges}, but generally seqnames- and strand-aware
  \item E.g., \Rfunction{flank} identifies \emph{upstream} (5') region
  \item E.g., \Rfunction{findOverlaps} checks \Rcode{seqnames} and
    \Rcode{strand}
  \end{itemize}
<<>>=
flank(genes, 1000) ## 5' flanking range
@
\end{frame}

\subsection{Other Idioms}

\begin{frame}[fragile]{\Rclass{*List} classes}
  \begin{itemize}
  \item Often useful to have a list, where all elements of the list
    are restricted to be of the same type -- like
    \Rclass{IRangesList}
  \item Support for common `atomic' types (\Rclass{LogicalList},
    \Rclass{IntegerList}, \Rclass{NumericList},
    \Rclass{CharacterList}, \ldots) in addition to
    \Rclass{IRangesList}, \Rclass{GRangesList}, \ldots
  \item Operations on list elements are usually vectorized across
    elements
  \end{itemize}
<<>>=
rl <- splitAsList(1:5, c("A", "B", "A", "B", "B"))
elementLengths(rl)
log(rl)
@
\end{frame}

\begin{frame}{Three advanced concepts}
  \begin{enumerate}
  \item \Rclass{GRanges} extends \Rclass{IRanges::Vector}, from which
    it inherits vector-like operations and metadata.
  \item \Rclass{*List} data structures are actually vectors $+$ a
    partitioning, so operations like \Rcode{unlist}, \Rcode{relist}
    and \Rcode{split} are fast.
  \item Many computationally expensive operations, e.g.,
    \Rcode{findOverlaps}, are implemented in C, and are fast.
  \end{enumerate}
\end{frame}

\end{document}