\documentclass{beamer} \usepackage[normalem]{ulem} \usepackage{Sweave} \mode { \usetheme{Darmstadt} \useinnertheme{circles} \useoutertheme{infolines} \setbeamercovered{transparent} } \newcommand{\Rfunction}[1]{{\texttt{#1}}} \newcommand{\Rfunarg}[1]{{\texttt{#1}}} \newcommand{\Robject}[1]{{\texttt{#1}}} \newcommand{\Rpackage}[1]{{\textit{#1}}} \newcommand{\Rclass}[1]{{\textit{#1}}} \newcommand{\code}[1]{{\texttt{#1}}} \newcommand{\software}[1]{{\textit{#1}}} \title{The *Ranges Suite} \subtitle{Application to the analysis of RNA-seq data} \author{Michael Lawrence} \institute{Genentech} \AtBeginSubsection[] { \begin{frame}{Outline} \tableofcontents[currentsection,currentsubsection] \end{frame} } \AtBeginSection[] { \begin{frame}{Outline} \tableofcontents[currentsection,currentsubsection] \end{frame} } \begin{document} <>= options(width=50) library(RangesRNAseqTutorial) @ \frame{\titlepage} %% OUTLINE %%% Introduction: *Ranges packages, RNA-seq approaches and questions %%% Simple expression counts (in yeast?) %%% Isoform-specific expression in CRC ALDOA data %%% Transcript structure: novel junctions in ALDOA data \section{Introduction} \begin{frame} \frametitle{Sequencing Approaches} \begin{description} \item[Source] Genome, transcriptome, synthetic, ... \item[Enrichment] WGS, ChIP, PCR, poly-A, exome capture, ... \end{description} \end{frame} \begin{frame} \frametitle{RNA-seq} \begin{itemize} \item High-throughput sequencing of cDNA libraries \item Usually there is some enrichment for mRNAs (e.g. poly-A) \item Usually aligned to genome with splicing tolerance (e.g. GSNAP) \end{itemize} \end{frame} \begin{frame} \frametitle{General Sequence Analysis Workflow} \begin{enumerate} \item QA on raw instrument output, see \Rpackage{ShortRead} \item Usually, external alignment of data, i.e., gSNAP via upcoming \Rpackage{gmapR} \item Import of alignments and/or sequences into R \item Analysis of sequences, alignments and enrichment patterns \end{enumerate} \end{frame} \begin{frame} \frametitle{The *Ranges Packages} \begin{block}{\Rpackage{IRanges}} Base of the sequence analysis infrastructure in Bioconductor \begin{itemize} \item Data structures for interval datasets and genome-scale vectors \item Routines for finding regions of enrichment and overlap between features \end{itemize} \end{block} \begin{block}{\Rpackage{GenomicRanges}} Extension of \Rpackage{IRanges} for genomic (biological) datasets, including sequence annotations and read alignments, with biology-specific overlap algorithms and other utilities. \end{block} \end{frame} % \subsection{Questions} % \label{sec:questions} %% Some sort of high-level, motivating questions \begin{frame} \frametitle{Questions of Interest in RNA-seq} RNA-seq alignments permit us to ask questions about transcript: \begin{block}{} \begin{itemize} \item<2-5> \alert<2>{Expression levels (from counting overlaps with gene annotations) and their differences between samples, alleles and splicing isoforms.} \item<3-5> \alert<3>{Structure, novel splicing, gene fusions (from the alignments)} \item<4> \alert<4>{Variants, RNA-editing (from the sequences)} \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms-b} \includegraphics<2>[width=11cm]{isoform-expr-intro-data-b} \includegraphics<3>[width=11cm]{isoform-expr-intro-reads} \includegraphics<4->[width=11cm]{isoform-expr-intro-variants} \end{frame} \section{Gene Expression} \begin{frame} \frametitle{RNA-seq Expression Analysis Workflow} \begin{itemize} \item<1> QA (\Rpackage{ShortRead}), alignment (\Rpackage{gmapR}) \item<1-2> Import of alignments \item<1-2> Counting of alignments over various features, including genes, transcripts and exons. \item<1> Differential expression between samples (\Rpackage{DEseq}, \Rpackage{edgeR}), isoforms (\Rpackage{DEXseq}), or alleles \item<1> Further down-stream analyses, including GSEA \end{itemize} \end{frame} \begin{frame} \frametitle{Data from \Rpackage{leeBamViews}} \begin{itemize} \item Four samples from a Yeast RNA-seq experiment \item Two wildtype, two RLP mutants \item Illumina 36bp single-end reads \item Alignments in \Rpackage{leeBamViews} for positions 800000 to 900000 on chromosome XIII \item Stored as BAM files \end{itemize} \end{frame} \begin{frame} \frametitle{Disclaimer} \begin{alertblock}{For Demonstration Purposes Only} The alignments in \Rpackage{leeBamViews} are for a reference that is slightly different from (about two weeks older than) the UCSC sacCer2 build, for which we have gene annotations. \end{alertblock} \end{frame} \subsection{Importing Alignments} \begin{frame} \frametitle{Representing SAM/BAM in R} Could store SAM alignments in a \Rclass{GRanges}, but alignments are common enough for a formal representation. \begin{block}{The \Rclass{GappedAlignments} Class} A \Rclass{Vector} of \emph{unpaired} read alignments with SAM-specific fields \begin{itemize} \item Load with \code{readGappedAlignments(file)} \item Access \Rfunction{cigar}, \Rfunction{qpos}, \Rfunction{start}, etc. \item Some utilities, like \Rfunction{coverage}. \item For overlap computations, convert to gapped ranges with \Rfunction{grglist}, get intronic gaps with \Rfunction{introns}. \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Loading a BAM File} \begin{block}{} <>= bams <- dir(system.file("bam", package="leeBamViews"), full = TRUE, pattern = "bam$") reads_ga <- readGappedAlignments(bams[[1]]) head(reads_ga, 1) @ %$ \end{block} \end{frame} \subsection{Counting Overlaps} \begin{frame} \frametitle{Representing Read Alignments as Ranges} For finding overlaps between reads and genomic features, we need to represent read alignments as intervals on stranded sequences. \\ \bigskip \begin{tabular}{llll|l} \hline Sequence Name & Start & End & Strand & metadata... \\ \hline chr10 & 3012936 & 3012959 & + & \\ chr10 & 3012941 & 3012964 & + & \\ chr10 & 3012944 & 3012967 & + & \end{tabular} \\ \bigskip \begin{onlyenv}<2> \begin{alertblock}{} All genomic data fits this basic format \end{alertblock} \end{onlyenv} \end{frame} \begin{frame} \frametitle{Genomic Datasets in R} \begin{block}{The \Rclass{GRanges} Class} A \Rclass{Vector} of genomic intervals, with metadata \begin{itemize} \item Constructor: \code{GRanges(seqnames, ranges, strand, ...)} \item \code{seqnames(x)}: sequence name \item \code{start(x)}, \code{end(x)}, \code{width(x)}: interval information \item \code{strand(x)}: strand (+/-/*) \item \code{values(x)}: a \Rclass{DataFrame} of metadata columns, like score or gene \item \code{seqinfo(x)}: a \Rclass{Seqinfo} with information about the sequences \end{itemize} Group multiple \Rclass{GRanges} in a \Rclass{GRangesList}, representing ranges with gaps. \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Representing Alignments as Ranges with Gaps} \begin{itemize} \item Read alignments may contain gaps, e.g., cross a splice junction. \item The possibly multiple ranges per read need to be grouped together. \item Use \Rclass{GRangesList}, with one \Rclass{GRanges} per read. \end{itemize} \begin{block}{Creating a \Rclass{GRangesList} from \Rclass{GappedAlignments}} <>= reads_grl <- grglist(reads_ga) @ \end{block} \end{frame} \begin{frame} \frametitle{Representing Transcript Models} \begin{itemize} \item Many ways to represent transcript models as ranges: \begin{itemize} \item Transcripts \item Exons, introns \item CDS, UTRs \end{itemize} \item Reference annotations, so prefer persistent storage \end{itemize} \end{frame} \begin{frame} \frametitle{Transcript Models in R} \begin{block}{The \Rclass{TxDb} Class} Reference to SQLite DB with transcript information \begin{itemize} \item \code{transcripts(x)}: whole transcript ranges \item \code{exons(x)}: exon ranges \item \code{cds(x)}: coding exon ranges \item \code{exonsBy(x)}, \code{cdsBy(x)}: \Rclass{GRangesList} object, grouping by transcript or gene \end{itemize} For common organisms/models, \Rpackage{TxDb.*} packages are available. \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Obtaining the Yeast Transcripts} We get the coding exons grouped by gene, and individually. Using the coding exons is usually a good idea, since the UTRs can be problematic. \begin{block}{} <>= library(TxDb.Scerevisiae.UCSC.sacCer2.sgdGene) sc2_txdb <- TxDb.Scerevisiae.UCSC.sacCer2.sgdGene sc2_cds <- cds(sc2_txdb) sc2_cds_gene <- cdsBy(sc2_txdb, by = "gene") @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Exercise} \begin{alertblock}{Problem} Get the coding exons grouped by transcript, instead of gene. \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= sc2_cds_tx <- cdsBy(sc2_txdb, by = "tx") @ \end{block} \end{onlyenv} \end{frame} \begin{frame}[fragile] \frametitle{Getting Gene IDs for the Transcripts} For interpretation, it is useful to have the gene ID for each transcript: \begin{block}{} <>= sc2_tx <- transcripts(sc2_txdb, columns = c("tx_id", "gene_id")) tx_id_match <- match(names(sc2_cds_tx), values(sc2_tx)$tx_id) gene_id <- values(sc2_tx)$gene_id[tx_id_match] @ %$ \end{block} Theoretically possible for transcript to belong to multiple genes, so \texttt{gene\_id} is a \Rclass{CharacterList}. Typically there is one gene per transcript, so we \Rfunction{drop} it to a \Rclass{character} vector: \begin{block}{} <>= all(elementLengths(gene_id) <= 1) values(sc2_tx)$gene_id <- drop(gene_id) @ % $ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Count Reads in Exons, by Gene} \begin{block}{Reconcile Chromosome Names} <>= sc2_cds_gene <- keepSeqlevels(sc2_cds_gene, "chrXIII") sc2_cds_gene <- renameSeqlevels(sc2_cds_gene, c(chrXIII = "Scchr13")) @ \end{block} \begin{block}{Call \Rfunction{countOverlaps}} <>= sc2_counts <- countOverlaps(sc2_cds_gene, reads_grl, ignore.strand = TRUE) @ %$ \end{block} We pass \code{ignore.strand = TRUE}, because the strand is the strand of alignment, not transcription. \end{frame} \begin{frame} \frametitle{Some Limitations of our Simple Approach} We have the count of reads overlapping with the coding regions of every gene, but: \begin{itemize} \item It only works for a single BAM file (need a convenience function), \item A read is counted for every feature it overlaps, which works reasonably well for genes but does not help with distinguishing transcript isoforms, and \item There is no consideration of splicing, i.e., checking for compatibility with known transcript structures. \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Processing Multiple BAM Files} \begin{block}{} <>= sc2_counts <- seqapply(bams, function(bam) { countOverlaps(sc2_cds_gene, grglist(readGappedAlignments(bam)), ignore.strand = TRUE) }) @ \end{block} The results land in a simple list of integer vectors; we need something more formal. \end{frame} \begin{frame} \frametitle{Representing Rectangular Counts} \begin{block}{The \Rclass{SummarizedExperiment} class} The \Rclass{SummarizedExperiment} class is the \Rclass{eSet} analog in the \Rclass{IRanges} framework. It is a rectangular, self-contained data structure with the following components: \begin{description} \item[\Robject{assays}] A list of matrices, each feature X sample, containing counts or other measurements \item[\Robject{rowData}] A \Rclass{GRanges} or \Rclass{GRangesList} with annotations on the features; \emph{each feature must then be a genomic range (possibly with gaps).} \item[\Robject{colData}] A \Rclass{DataFrame} with phenotypic data on the samples \item[\Robject{exptData}] A list of metadata objects describing the experiment \end{description} \end{block} \end{frame} \begin{frame} \frametitle{\Rclass{SummarizedExperiment} API} \begin{itemize} \item \code{SummarizedExperiment(assays, rowData, colData, exptData, ...)}: construct an instance with the given components \item \code{assay(x, i)}: Get the \code{i}th assay matrix \item \code{rowData(x)}, \code{colData(x)}, \code{exptData(x)}: get the corresponding metadata component \item \code{x[i, j]}: Rectangular subset by feature and sample. \item \code{x\$name}: Get the named column in \Robject{colData} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Storing the Counts in a \Rclass{SummarizedExperiment}} \begin{block}{} <>= assay <- list(counts = do.call(cbind, as.list(sc2_counts))) rowData <- sc2_cds_gene samples <- gsub("_.*", "", basename(bams)) colData <- DataFrame(genotype = gsub(".$", "", samples), lane = gsub(".*(.)$", "\\1", samples), row.names = samples) se <- SummarizedExperiment(assay, rowData, colData) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Calculating RPKMs} \begin{block}{} <>= sc2_count_mat <- assay(se) k <- sum(width(rowData(se))) m <- countBam(BamFileList(bams))[,"records"] / 1e6 rpkm <- sc2_count_mat / k / rep(m, each = nrow(sc2_count_mat)) @ \end{block} \end{frame} \begin{frame} \frametitle{Potential Shortcut: \Rfunction{summarizeOverlaps}} The \Rfunction{summarizeOverlaps} function: \begin{itemize} \item Counts overlaps between BAM files and a set of genomic features, \item \alert<2->{Always discards reads that map to multiple features}, with several different algorithms for determining uniqueness, and \item Returns the result as a \Rclass{SummarizedExperiment}. \end{itemize} \begin{onlyenv}<3> \begin{alertblock}{} Filters do not consider splicing, i.e., splicing compatibility and junction hits. \end{alertblock} \end{onlyenv} \end{frame} \begin{frame}[fragile] \frametitle{Feeding into DE Algorithms} \begin{block}{\Rpackage{DESeq}} <>= library(DESeq) rowDataDf <- as.data.frame(unlist(range(rowData(se)))) cond <- as.data.frame(colData(se)) featureData <- AnnotatedDataFrame(rowDataDf) deseq <- newCountDataSet(assay(se), cond = cond, featureData = featureData) @ \end{block} \begin{block}{\Rpackage{edgeR}} <>= library(edgeR) edger <- DGEList(assay(se), group = colData$genotype, genes = rowDataDf) @ %$ \end{block} \end{frame} % \begin{frame} % \frametitle{Regions of Interest over Multiple BAMs} % \begin{block}{} % <>= % sc2_exons <- unlist(sc2_tx, use.names = FALSE) % bamViews <- BamViews(bamPaths = bams, % bamRanges = sc2_exons, % bamSamples = sampleData) % @ % \end{block} % \end{frame} \begin{frame}[fragile] \frametitle{Find Transcript Hits for Each Read} \begin{block}{Using \Rfunction{findOverlaps}} <>= hits <- findOverlaps(reads_grl, sc2_cds_gene, ignore.strand = TRUE) values(reads_grl)$hits <- as(hits, "List") head(table(elementLengths(values(reads_grl)$hits))) @ \end{block} \end{frame} \section{Isoform-specific Expression} \label{sec:approach} \subsection{Approach} \begin{frame} \frametitle{Focus: \emph{Annotated} Transcript Isoforms} \begin{block}{} We are primarily interested in the expression of annotated isoforms and deviations from known structures. \end{block} \end{frame} \begin{frame} \frametitle{Questions about Isoform Expression} \begin{block}{} \begin{itemize} \item<2-> \alert<2>{Is an isoform expressed?} \item<3-> \alert<3>{What is the expression level of a particular isoform, and how can we compare it to that of other isoforms?} \item<4-> \alert<4>{Is the balance in isoform expression different across samples?} \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms} \includegraphics<2>[width=11cm]{isoform-expr-intro-data} \includegraphics<3>[width=11cm]{isoform-expr-intro-quantify} \includegraphics<4>[width=11cm]{isoform-expr-intro-balance} \end{frame} \begin{frame} \frametitle{Fragment Length Matters} \begin{alertblock}{Note} Differences in fragment length between regions and samples can bias counts in multiple ways, including: \begin{itemize} \item Coverage at ends of transcript is reduced; the affected region grows with fragment length. The bias is most significant for short genes and those with variable TSS/TSE. \item Any counts of transcript sub-regions (exons) or splice junctions are further affected, because the probability of overlap depends on the fragment length. \end{itemize} \end{alertblock} \end{frame} \begin{frame} \frametitle{Fragment Length Affects Capture of Cassette Exon} \framesubtitle{w/ Robert Gentleman} \includegraphics<1>[width=12cm]{isoforms-capture} \includegraphics<2>[width=12cm]{isoforms-capture-rule} \includegraphics<3>[width=12cm]{isoforms-capture-small} \includegraphics<4>[width=12cm]{isoforms-capture-medium} \includegraphics<5>[width=12cm]{isoforms-capture-large} \end{frame} \begin{frame} \frametitle{Example} \begin{itemize} \item Illumina paired-end 75nt RNA-seq reads from two matched human tissue samples: tumor and normal \item Strand inferred during alignment from splice directions \item Stored in a BAM file \item Subset to the ALDOA (aldolase) gene \end{itemize} \end{frame} \subsection{Looking at Fragment Length} %% findSpliceOverlaps basically does this from a BamFile. We need to %% cover readGappedAlignmentPairs though. For one thing, we want to %% play with the fragment length. \begin{frame}[fragile] \frametitle{Getting the Annotations for ALDOA} \begin{onlyenv}<1> \begin{block}{Finding the ALDOA Region} <>= aldoa_eg <- org.Hs.egSYMBOL2EG$ALDOA library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene aldoa_exons <- exons(txdb, vals = list(gene_id = aldoa_eg), columns = c("gene_id", "tx_id")) @ %$ \end{block} \begin{block}{Grouping the Exons by Transcript} <>= aldoa_range <- range(aldoa_exons) aldoa_exons_tx <- multisplit(aldoa_exons, values(aldoa_exons)$tx_id) @ %$ \end{block} \end{onlyenv} \begin{onlyenv}<2> \begin{block}{Getting the Coding Extents} <>= aldoa_cds <- cds(txdb, vals = list(gene_id = aldoa_eg), columns = c("gene_id", "tx_id")) aldoa_cds_tx <- multisplit(aldoa_cds, values(aldoa_cds)$tx_id) aldoa_exons_tx <- aldoa_exons_tx[names(aldoa_cds_tx)] @ % $ \end{block} \end{onlyenv} \end{frame} \begin{frame}[fragile] \frametitle{Selecting Only the CCDS Models} Just take my word for it: \begin{block}{} <>= aldoa_exons_tx <- aldoa_exons_tx[c(1, 3, 4, 6)] aldoa_cds_tx <- aldoa_cds_tx[c(1, 3, 4, 6)] @ %$ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Reading the Alignments} \begin{block}{Finding the BAMs} <>= extdatadir <- system.file("extdata", package = "RangesRNAseqTutorial") files <- tools::list_files_with_exts(extdatadir, "bam") names(files) <- tools::file_path_sans_ext(basename(files)) bamFiles <- BamFileList(files) @ \end{block} \begin{block}{Load the normal BAM} <>= bam <- bamFiles$normal @ <>= param <- ScanBamParam(tag = "XS", what = "isize", which = aldoa_range) ga <- readGappedAlignmentPairs(path(bam), param = param) @ %$ \end{block} We request the \textit{XS} tag, the strand as determined by the aligner from the splice directions, as well as the \texttt{isize} (TLEN) field from the BAM file, which is the fragment length. \end{frame} \begin{frame} \frametitle{Representing Paired-end Alignments} \begin{block}{The \Rclass{GappedAlignmentPairs} Class} The \Rclass{GappedAlignmentPairs} class is composed of two parallel \Rclass{GappedAlignments} objects: one for the first read of the pair, the other for the last. It provides the following API: \begin{itemize} \item \code{first(x)}, \code{last(x)}: gets the \Rclass{GappedAlignments} for the first or last reads of every pair \item \code{left(x)}, \code{right(x)}: gets the \Rclass{GappedAlignments} for the left or right member of every pair, in terms of genomic position \item \code{grglist(x)}: converts to a \Rclass{GRangesList}, with gaps for the introns \item \code{introns(x)}: gets the intronic gaps as a \Rclass{GRangesList} \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Getting the Fragment Length} \begin{itemize} \item The algorithm for computing isize/TLEN differs by aligner, and some aligners do not populate the field. \item GSNAP does exactly what we want: it takes the extent of the paired-end alignment, excluding clipped regions and intronic gaps (N in CIGAR). So we can simply use the absolute value of the \texttt{isize} field (the sign depends on the relative read positions): \begin{block}{} <>= fraglen <- abs(values(first(ga))$isize) @ %$ \end{block} Any introns in the inter-read gap are not accounted for (because we do not know the original transcript). \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Exercise: Computing Fragment Length} \begin{alertblock}{Problem} Use \Rclass{GappedAlignmentPairs} to compute the fragment length, without worrying about the introns. \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= fraglen_simple <- end(right(ga)) - start(left(ga)) + 1L @ \end{block} \end{onlyenv} \end{frame} \begin{frame} \frametitle{Density of Fragment Length} <>= calcFragLenCDF <- function(x) { tab <- tabulate(x, max(x)) DataFrame(fraglen = seq_len(max(x)), cumfrac = cumsum(tab) / sum(tab)) } plotFragLenCDF <- function(x, xlim, ...) { if (missing(xlim)) xlim <- c(x$fraglen[which(x$cumfrac > 0)[1]], max(x$fraglen)) p <- qplot(fraglen, cumfrac, data = as.data.frame(x), geom = "line", ylab = "Fraction of reads with fragment length <= X", xlab = "Fragment length", xlim = xlim, ...) ## if (!is.null(x$sample)) ## p <- p + geom_line(aes(group = sample)) p } ## fraglen_cdf <- calcFragLenCDF(fraglen) ## plotFragLenCDF(fraglen_cdf, xlim = c(0, 2000)) plotFragLenDensity <- function(fraglen, xlim, ...) { p <- qplot(fraglen, geom = "density", xlab = "Fragment length", xlim = xlim, ...) p } plotFragLenDensity(fraglen, xlim = c(0, 2000)) @ \end{frame} \subsection{Counting Overlaps} \begin{frame} \frametitle{Overview of Algorithm} \begin{columns}[l] \column{6cm} \begin{block}{Steps} \begin{enumerate} \item<2-> \alert<2>{Run RNA-seq pipeline, and load \textit{concordant unique} alignments} \item<3-> \alert<3>{Find reads and annotated transcripts with any exonic overlap} \item<4-> \alert<4>{Split the alignments into two bins: \textit{compatible} with annotated splicing, and \textit{incompatible}} \item<5-> \alert<5>{For the \textit{compatible}, find reads that map \textit{uniquely} to an isoform} \end{enumerate} \end{block} \column{5cm} \includegraphics<2-3>[width=5cm]{isoform-approach-1} \includegraphics<4>[width=5cm]{isoform-approach-2} \includegraphics<5>[width=5cm]{isoform-approach-3} \end{columns} \end{frame} \begin{frame} \frametitle{Compatibility and Uniqueness} \begin{block}{} \begin{itemize} \item<2-> \alert<2-3>{To be \textit{compatible} a read must align completely within the exons and the read gaps should exactly match the introns over the read extent} \item<4-> \alert<4->{To be \textit{uniquely mapped} a read must be compatible with only a single isoform} \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms-c} \includegraphics<2>[width=11cm]{isoform-expr-intro-reads-b} \includegraphics<3>[width=11cm]{isoform-expr-intro-misfits-4} \includegraphics<4>[width=11cm]{isoform-expr-intro-reads-b} \includegraphics<5>[width=11cm]{isoform-expr-intro-reads-assigned-b} \end{frame} \begin{frame}[fragile] \frametitle{Finding Overlaps Using Splicing} The \Rfunction{findSpliceOverlaps} function is an experimental, work-in-progress that annotates the return value of the basic \Rfunction{findOverlaps} including the following logical variables: \begin{description} \item[compatible] The read (pair) exactly matches the splicing pattern of the transcript, over the extent of the alignment. If this is ever \code{TRUE} for a read, all of its incompatible hits are discarded. \item[unique] The hit is compatible and is the only one for this read, i.e., it maps only to a single transcript. \item[coding] The hit is at least partially to a coding region. \end{description} \begin{block}{Calling \Rfunction{findSpliceOverlaps}} <>= hits <- findSpliceOverlaps(bamFiles$normal, aldoa_exons_tx, cds = aldoa_cds_tx, pairedEnd = TRUE) @ %$ <>= hits_normal <- hits hits_tumor <- findSpliceOverlaps(bamFiles$tumor, aldoa_exons_tx, cds = aldoa_cds_tx, pairedEnd = TRUE) @ %$ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Counting the Hits} Assume we are interested in the total count, the compatible count and the unique (compatible) count, restricting to coding regions: \begin{block}{} <>= countSpliceHits <- function(hits) { coding_hits <- hits[values(hits)$coding] compatible_hits <- coding_hits[values(coding_hits)$compatible] unique_hits <- coding_hits[values(coding_hits)$unique] cbind(coding = countSubjectHits(coding_hits), compatible = countSubjectHits(compatible_hits), unique = countSubjectHits(unique_hits)) } counts <- countSpliceHits(hits) @ % $ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Exercise: Summarizing Over Multiple Samples} \begin{onlyenv}<1> \begin{alertblock}{Challenge} Write a function called \Rfunction{summarizeSpliceOverlaps} that takes a \Rclass{BamFileList} (\Robject{bamFiles}), computes the unique counts and returns a \Rclass{SummarizedExperiment}. \textbf{Bonus}: compute and store the total and compatible counts, as well. \end{alertblock} \end{onlyenv} \begin{onlyenv}<2> \begin{block}{Solution} <>= summarizeSpliceOverlaps <- function(bams, tx, cds) { counts <- lapply(bams, function(bam) { hits <- findSpliceOverlaps(bam, tx, cds = cds) counts <- countSpliceHits(hits) split(counts, colnames(counts)[col(counts)]) }) assays <- do.call(mapply, c(cbind, counts, SIMPLIFY = FALSE)) colData <- DataFrame(tumorStatus = c("tumor", "normal")) rownames(colData) <- colData$tumorStatus SummarizedExperiment(assays, tx, colData) } se <- summarizeSpliceOverlaps(bamFiles, aldoa_exons_tx, aldoa_cds_tx) @ % $ \end{block} \end{onlyenv} \end{frame} \subsection{Interpreting the Results} %% Fisher test between top two expressers \begin{frame}[fragile] \frametitle{Comparing the Top Two Isoforms} We use a Fisher Test to check whether the balance is shifting between top two isoforms (by uniquely mapped reads), between tumor and normal. \begin{block}{} <>= uc <- assay(se, "unique") uc_ord <- order(rowSums(uc), decreasing = TRUE) uc_top <- uc[head(uc_ord, 2),] fisher.test(uc_top)$estimate @ %$ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Plotting the Isoform Coverage} \begin{enumerate} \item Load alignments, merging XS (strand of transcription) within pairs. \item Convert to ranges, resolving XS to strand value. \item Filter for reads that map to a single transcript. \end{enumerate} \end{frame} \begin{frame}[fragile] \frametitle{Loading the Alignments} \begin{onlyenv}<1> \begin{block}{Propagation of XS From Reads to Pairs} <>= propagateXS <- function(ga) { first_xs <- values(first(ga))$XS last_xs <- values(last(ga))$XS xs <- first_xs xs[is.na(xs)] <- last_xs[is.na(xs)] values(ga)$XS <- xs ga } @ %$ \end{block} \end{onlyenv} \begin{onlyenv}<2> \begin{block}{Load Alignments} <>= loadAlignments <- function(bam) { ga <- readGappedAlignmentPairs(path(bam), param = param) propagateXS(ga) } ga_normal <- loadAlignments(bamFiles$normal) ga_tumor <- loadAlignments(bamFiles$tumor) @ \end{block} \end{onlyenv} \end{frame} \begin{frame}[fragile] \frametitle{Converting Reads to Ranges} \begin{block}{Resolving Strand from XS} <>= resolveStrandFromXS <- function(reads, xs = values(reads)$XS) { strand <- ifelse(!is.na(xs), xs, "*") strand(reads) <- relist(Rle(strand, elementLengths(reads)), reads) reads } @ %$ \end{block} \begin{block}{Converting Reads to Ranges} <>= getReadRanges <- function(ga) { resolveStrandFromXS(grglist(ga)) } reads_normal <- getReadRanges(ga_normal) reads_tumor <- getReadRanges(ga_tumor) @ %$ \end{block} \end{frame} %% FIXME: should we hide some of this code? \begin{frame}[fragile] \frametitle{Filter For Uniquely Mapped Reads} \begin{block}{} <>= getUniques <- function(hits) { unique(queryHits(hits[values(hits)$unique & subjectHits(hits) %in% c(1, 4)])) } unique_normal <- getUniques(hits_normal) unique_tumor <- getUniques(hits_tumor) unique_reads_normal <- reads_normal[unique_normal] unique_reads_tumor <- reads_tumor[unique_tumor] unique_reads <- mstack(normal = unlist(unique_reads_normal), tumor = unlist(unique_reads_tumor)) @ %$ \end{block} \end{frame} \SweaveOpts{width=10, height=7} \begin{frame}[fragile] \frametitle{Plotting the Isoform Coverage} We leverage the \Rfunction{autoplot} function from \Rpackage{ggbio}: \begin{block}{} <>= read_track <- autoplot(unique_reads, stat = "coverage", facets = name ~ .) tx_track <- autoplot(aldoa_exons_tx, geom = "alignment", ylab = "") cov_tracks <- tracks(read_track, tx_track, heights = c(3, 1)) cov_tracks @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Coverage Plot} \includegraphics[width=11cm]{tutorial-ggbio-isoform-coverage} \end{frame} \begin{frame}[fragile] \frametitle{Zooming into the Plot} The \code{xlim} argument restricts the genomic interval: \begin{block}{} <>= xlim(cov_tracks) <- c(30075000, 30080000) cov_tracks @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Zoomed Coverage Plot} \includegraphics[width=11cm]{tutorial-ggbio-zoom} \end{frame} \begin{frame}[fragile] \frametitle{Exercise: Zoom Using the Coverage} \begin{alertblock}{Problem} Zoom to a similar region of interest by finding a window around the tallest peaks. Hint: \Rfunction{coverage} and \Rfunction{slice} would be useful here. \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= cov_chr16 <- coverage(unique_reads)$chr16 roi <- range(ranges(slice(cov_chr16, 1000))) roi <- roi + 500 xlim(cov_tracks) <- roi cov_tracks @ %$ \end{block} \end{onlyenv} \end{frame} \section{Transcript Structure: Splicing} % \begin{frame} % \frametitle{Overview of Algorithm} % \begin{columns}[l] % \column{6cm} % \begin{block}{Steps} % \begin{enumerate} % \item Run RNA-seq pipeline, and load % \textit{concordant unique} alignments % \item Find reads and isoforms with any exonic overlap % \item Split the alignments into two bins: % \textit{compatible} with % splicing, and \textit{incompatible} % \item \alert<1>{For the \textit{compatible}, find reads that map % \textit{uniquely} to an isoform} % \item \alert<2>{For the \textit{incompatible}, find those indicating % \textit{novel} junctions or splice sites} % \end{enumerate} % \end{block} % \column{5cm} % \includegraphics<1>[width=5cm]{isoform-approach-3} % \includegraphics<2>[width=5cm]{isoform-approach-4} % \end{columns} % \end{frame} %% Finding the splices \begin{frame} \frametitle{Questions about Isoform Structure} \begin{block}{} \begin{itemize} \item Are there any unannotated isoforms present? \item How do we use them to derive novel structures? Is assembly feasible? \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms-c} \includegraphics<2>[width=11cm]{isoform-expr-intro-reads-b} \includegraphics<3>[width=11cm]{isoform-expr-intro-misfits-4} \end{frame} \begin{frame}[fragile] \frametitle{Finding the Splices in the Alignments} We use the \Rfunction{introns} function to find the intronic gaps in the read alignments (N runs in the CIGAR): \begin{block}{} <>= splices <- resolveStrandFromXS(introns(ga_normal), values(ga_normal)$XS) @ %$ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Junction Counting} \framesubtitle{Simplest algorithm: form keys from \Rclass{GRanges} and hashing.} \begin{block}{Key generator} <>= gr2key <- function(x) { paste(seqnames(x), start(x), end(x), strand(x), sep = ":") } @ \end{block} \begin{block}{Key Parser} <>= key2gr <- function(x, ...) { key_mat <- matrix(unlist(strsplit(x, ":", fixed=TRUE)), nrow = 4) GRanges(key_mat[1,], IRanges(as.integer(key_mat[2,]), as.integer(key_mat[3,])), key_mat[4,], ...) } @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Junction Counting} \begin{block}{} <>= introns <- psetdiff(range(aldoa_exons_tx), aldoa_exons_tx) introns_flat <- unlist(introns, use.names = FALSE) tx_keys <- gr2key(introns_flat) @ <>= splices_flat <- unlist(splices, use.names = FALSE) splice_table <- table(gr2key(splices_flat)) splice_summary <- key2gr(names(splice_table), score = as.integer(splice_table), novel = !names(splice_table) %in% tx_keys, seqlengths = seqlengths(splices)) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Plotting the Isoform Coverage} \framesubtitle{Setup} \begin{block}{Get Uniquely Mapped Splices} <>= summarizeSplices <- function(ga) { splices <- resolveStrandFromXS(introns(ga), values(ga)$XS) <> splice_summary } @ <>= unique_splices_normal <- summarizeSplices(ga_normal[unique_normal]) unique_splices_tumor <- summarizeSplices(ga_tumor[unique_tumor]) unique_splices <- mstack(normal = unique_splices_normal, tumor = unique_splices_tumor) @ % $ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Plotting Splice Counts: Score as Height} \begin{block}{} <>= read_track <- autoplot(unique_splices, geom = "arch", aes(height = score), color = "deepskyblue3", ylab = "coverage", facets = name ~ .) + stat_coverage(unique_reads, facets = name ~ .) tracks(read_track, tx_track, heights = c(3, 1), xlim = roi) @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Coverage Plot, Splice Count as Height} \includegraphics[width=11cm]{tutorial-ggbio-splices} \end{frame} \begin{frame}[fragile] \frametitle{Plotting the Splice Counts: Score as Size} \begin{block}{} <>= read_track <- autoplot(unique_splices, geom = "arch", aes(size = score, height = width / 5), color = "deepskyblue3", ylab = "coverage", facets = name ~ .) + stat_coverage(unique_reads, facets = name ~ .) tracks(read_track, tx_track, heights = c(3, 1), xlim = roi) @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Coverage Plot, Splice Count as Size} \includegraphics[width=11cm]{tutorial-ggbio-splices-size} \end{frame} \begin{frame}[fragile] \frametitle{Plotting the Novel Junctions} \framesubtitle{Setup} \begin{block}{Get the Novel Splices} <>= splices_normal <- summarizeSplices(ga_normal) splices_tumor <- summarizeSplices(ga_tumor) all_splices <- mstack(normal = splices_normal, tumor = splices_tumor) novel_splices <- all_splices[values(all_splices)$novel & values(all_splices)$score >= 9] unique_novel_splices <- c(unique_splices, novel_splices) values(unique_novel_splices)$annotated <- !values(unique_novel_splices)$novel @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Plotting the Novel Junctions} This time, we specify the \code{color = novel} aesthetic: \begin{block}{} <>= novel_track <- autoplot(unique_novel_splices, geom = "arch", aes(size = score, height = width / 5, color = annotated), ylab = "coverage", facets = name ~ .) + stat_coverage(unique_reads, facets = name ~ .) tracks(novel_track, tx_track, heights = c(3, 1), xlim = roi) @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Coverage Plot, with Novel Junctions} \includegraphics[width=11cm]{tutorial-ggbio-novel-junctions} \end{frame} \begin{frame} \frametitle{Acknowledgements} \textbf{Ranges Packages} \\ Herve Pages \\ Patrick Aboyoun \\ Valerie Obenchain \bigskip \textbf{ggbio} \\ Tengfei Yin \\ Dianne Cook \bigskip Robert Gentleman \end{frame} % \section{Conclusion} % \begin{frame} % \frametitle{Further Analysis} % \begin{description} % \item[RNA-seq DE] \Rpackage{edgeR}, \Rpackage{DEseq}, \Rpackage{DEXseq} % \end{description} % \end{frame} \end{document}