\documentclass{beamer} \usepackage[normalem]{ulem} \usepackage{Sweave} \mode { \usetheme{Madrid} \useinnertheme{circles} \useoutertheme{infolines} \setbeamercovered{transparent} } \newcommand{\Rfunction}[1]{{\texttt{#1}}} \newcommand{\Rfunarg}[1]{{\texttt{#1}}} \newcommand{\Robject}[1]{{\texttt{#1}}} \newcommand{\Rpackage}[1]{{\textit{#1}}} \newcommand{\Rclass}[1]{{\textit{#1}}} \newcommand{\code}[1]{{\texttt{#1}}} \newcommand{\software}[1]{{\textit{#1}}} \title{Exploring Isoform-specific Expression} \subtitle{With R and Bioconductor} \author{Michael Lawrence} \institute{Genentech} \AtBeginSubsection[] { \begin{frame}{Outline} \tableofcontents[currentsection,currentsubsection] \end{frame} } \AtBeginSection[] { \begin{frame}{Outline} \tableofcontents[currentsection,currentsubsection] \end{frame} } \begin{document} <>= options(width=50) library(isoformExprTutorial) @ \frame{\titlepage} %% Intro: slides from isoform talk, up through the approach, %% + some description of the data \section{Introduction} \subsection{Questions} \label{sec:questions} %% Some sort of high-level, motivating questions \begin{frame} \frametitle{Questions of Interest} RNA-seq alignments permit us to ask questions about transcript: \begin{block}{} \begin{itemize} \item<2-5> \alert<2>{Expression levels (from counts over exons and junctions)} \item<3-5> \alert<3>{Structure (from the alignments)} \item<4> \alert<4>{Variants (from the sequences)} \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms-b} \includegraphics<2>[width=11cm]{isoform-expr-intro-data-b} \includegraphics<3>[width=11cm]{isoform-expr-intro-reads} \includegraphics<4->[width=11cm]{isoform-expr-intro-variants} \end{frame} \begin{frame} \frametitle{Questions about Isoform Expression} \begin{block}{} \begin{itemize} \item<2-> \alert<2>{Is an isoform expressed?} \item<3-> \alert<3>{What is the expression level of a particular isoform, and how can we compare it to that of other isoforms?} \item<4-> \alert<4>{Is the balance in isoform expression different across samples?} \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms} \includegraphics<2>[width=11cm]{isoform-expr-intro-data} \includegraphics<3>[width=11cm]{isoform-expr-intro-quantify} \includegraphics<4>[width=11cm]{isoform-expr-intro-balance} \end{frame} \begin{frame} \frametitle{Questions about Isoform Structure} \begin{block}{} \begin{itemize} \item Are there any unannotated isoforms present? \item How do we use them to derive novel structures? Is assembly feasible? \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms-c} \includegraphics<2>[width=11cm]{isoform-expr-intro-reads-b} \includegraphics<3>[width=11cm]{isoform-expr-intro-misfits-4} \end{frame} \subsection{Approach} \label{sec:approach} \begin{frame} \frametitle{Overview of Algorithm} \begin{columns}[l] \column{6cm} \begin{block}{Steps} \begin{enumerate} \item<2-> \alert<2>{Run RNA-seq pipeline, and load \textit{concordant unique} alignments} \item<3-> \alert<3>{Find reads and isoforms with any exonic overlap} \item<4-> \alert<4>{Split the alignments into two bins: \textit{compatible} with splicing, and \textit{incompatible}} \item<5-> \alert<5>{For the \textit{compatible}, find reads that map \textit{uniquely} to an isoform} \item<6-> \alert<6->{For the \textit{incompatible}, find those indicating \textit{novel} junctions or splice sites} \end{enumerate} \end{block} \column{5cm} \includegraphics<2-3>[width=5cm]{isoform-approach-1} \includegraphics<4>[width=5cm]{isoform-approach-2} \includegraphics<5>[width=5cm]{isoform-approach-3} \includegraphics<6>[width=5cm]{isoform-approach-4} \includegraphics<7>[width=5cm]{isoform-approach-5} \end{columns} \end{frame} \begin{frame} \frametitle{Compatibility and Uniqueness} \begin{block}{} \begin{itemize} \item<2-> \alert<2-3>{To be \textit{compatible} a read must align completely within the exons and the read gaps should exactly match the introns over the read extent} \item<4-> \alert<4->{To be \textit{uniquely mapped} a read must be compatible with only a single isoform} \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-isoforms-c} \includegraphics<2>[width=11cm]{isoform-expr-intro-reads-b} \includegraphics<3>[width=11cm]{isoform-expr-intro-misfits-4} \includegraphics<4>[width=11cm]{isoform-expr-intro-reads-b} \includegraphics<5>[width=11cm]{isoform-expr-intro-reads-assigned-b} \end{frame} \begin{frame} \frametitle{Novelty: Grabbing the Low-hanging Fruit} \begin{block}{Levels of novelty} \begin{itemize} \item<2-5> \alert<2>{Inconclusive} \item<3-5> \alert<3>{Novel combination of events} \item<4-6> \alert<4>{Novel junction} \item<5-6> \alert<5>{Novel splice site} \end{itemize} \end{block} \includegraphics<1>[width=11cm]{isoform-expr-intro-reads-b} \includegraphics<2>[width=11cm]{isoform-expr-intro-misfits-1} \includegraphics<3>[width=11cm]{isoform-expr-intro-misfits-2} \includegraphics<4>[width=11cm]{isoform-expr-intro-misfits-3} \includegraphics<5->[width=11cm]{isoform-expr-intro-misfits-4} \end{frame} % \subsection{Some Sample Results} % \begin{frame} % \frametitle{454 vs. Illumina Counts} % \includegraphics[width=8cm]{bar-numbers-tech-mat-sum} % \end{frame} % \begin{frame} % \frametitle{Unique Fraction} % \includegraphics[width=8cm]{scatter-unique_over_compat-tech} % \end{frame} % \begin{frame} % \frametitle{Incompatible Fraction} % \includegraphics[width=8cm]{scatter-incompatible_over_total-tech} % \end{frame} \section{Implementation} %% Implementation: \begin{frame} \frametitle{Example} \begin{itemize} \item Illumina paired-end 75nt RNA-seq reads from two matched human tissue samples: tumor and normal \item Strand inferred during alignment from splice directions \item Stored in a BAM file \item Subset to the ALDOA (aldolase) gene \end{itemize} \end{frame} \subsection{Loading and Preparing the Data} \begin{frame} \frametitle{Data Loading Overview} We need two types of data for the analysis: \begin{enumerate} \item Transcript annotations (gene models) \item Read alignments from the two samples \end{enumerate} For this demonstration, we will focus on the ALDOA gene. \end{frame} %% Loading the transcripts, preprocessing them \begin{frame}[fragile] \frametitle{Finding ALDOA} To load the data, we need the genomic interval of ALDOA. \begin{block}{} <>= aldoa_eg <- org.Hs.egSYMBOL2EG$ALDOA txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene aldoa_gr <- exons(txdb, vals = list(gene_id = aldoa_eg), columns = c("tx_id", "gene_id")) aldoa_range <- range(aldoa_gr) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Forming the Transcript Models} Simplest method is calling \Rfunction{exonsBy}, but its API does not support restrictive queries, or extra columns. Thus, we form our models directly from \Robject{aldoa\_gr}: \begin{block}{} <>= aldoa_vals <- values(aldoa_gr) values(aldoa_gr) <- NULL tx <- multisplit(aldoa_gr, aldoa_vals$tx_id) tx_to_val <- match(names(tx), unlist(aldoa_vals$tx_id)) values(tx)$gene_id <- rep(unlist(aldoa_vals$gene_id), elementLengths(aldoa_vals$tx_id))[tx_to_val] values(tx)$tx_id <- names(tx) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Exercise: \Rfunction{exonsBy}} \begin{alertblock}{Problem} As an alternative to last slide, use \Rfunction{exonsBy} and \Rfunction{subsetByOverlaps} to get the basic transcript structures for the ALDOA gene. \textit{Bonus: get the \Robject{gene\_id} my merging with \Rfunction{transcripts} return value.} \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= exons_grl <- exonsBy(txdb) ans <- subsetByOverlaps(exons_grl, aldoa_range) values(ans)$tx_id <- names(ans) tx_gr <- transcripts(txdb, columns = c("tx_id", "gene_id")) values(ans)$gene_id <- drop(values(tx_gr)$gene_id)[match(names(ans), values(tx_gr)$tx_id)] @ \end{block} \end{onlyenv} \end{frame} \begin{frame}[fragile] \frametitle{Merging the Gene Symbols} Would be trivial in our case, but here is a general demonstration: \begin{block}{} <>= gene_id_keys <- values(tx)$gene_id[!is.na(values(tx)$gene_id)] tx_gene_sym <- rep.int(NA, length(tx)) tx_gene_sym[!is.na(values(tx)$gene_id)] <- unlist(mget(gene_id_keys, org.Hs.egSYMBOL, ifnotfound = NA), use.names = FALSE) values(tx)$gene_sym <- tx_gene_sym @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Selecting Only the CCDS Models} Just take my word for it: \begin{block}{} <>= tx <- tx[c(2, 4, 5, 7)] @ \end{block} \end{frame} %% Reading the GappedAlignments, with XS tag \begin{frame}[fragile] \frametitle{Reading the Alignments} We obtain the normal BAM file for demonstration purposes: \begin{block}{} <>= extdatadir <- system.file("extdata", package = "isoformExprTutorial") files <- tools::list_files_with_exts(extdatadir, "bam") names(files) <- tools::file_path_sans_ext(basename(files)) bamFiles <- Rsamtools::BamFileList(files) bam <- bamFiles$normal @ \end{block} And read it into a \Rclass{GappedAlignments} object: \begin{block}{} <>= param <- ScanBamParam(tag = "XS", which = aldoa_range) ga <- readGappedAlignments(path(bam), use.names = TRUE, param = param) @ % \end{block} We request the \textit{XS} tag, the strand as determined by the aligner from the splice directions. \end{frame} \begin{frame}[fragile] \frametitle{Ranges from the Alignments} The \Rfunction{grglist} function returns the alignment ranges as a \Rclass{GRangesList}. \begin{block}{} <>= reads <- grglist(ga) metadata(reads)$bamfile <- bam @ \end{block} We store the BAM path in the metadata to track provenance. \end{frame} \begin{frame}[fragile] \frametitle{Exercise: \Rfunction{metadata}} \begin{alertblock}{Problem} Store the \Rclass{ScanBamParam} object in the metadata. \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= metadata(reads)$param <- param @ \end{block} \end{onlyenv} \end{frame} %% Finding the splices \begin{frame}[fragile] \frametitle{Splices} <>= elementGaps <- function(x) { x_flat <- unlist(x, use.names = FALSE) egaps <- gaps(ranges(x)) first_segment <- start(PartitioningByWidth(x)) sn <- seqnames(x_flat)[first_segment][togroup(egaps)] strand <- strand(x_flat)[first_segment][togroup(egaps)] relist(GRanges(sn, unlist(egaps, use.names = FALSE), strand, seqlengths = seqlengths(x)), egaps) } @ We use a custom function, \Rfunction{elementGaps}, to find the gaps within each element of the \Rclass{GRangesList}: \begin{block}{} <>= splices <- elementGaps(reads) values(splices)$XS <- values(reads)$XS @ \end{block} All metadata needs to be carried over explicitly. \end{frame} \begin{frame}[fragile] \frametitle{\Rfunction{elementGaps}: Simple Implementation} \begin{block}{} <>= elementGaps <- function(reads) { psetdiff(unlist(range(reads), use.names=FALSE), reads) } @ \end{block} Problem: the call to \code{range(reads)} is painfully slow. \end{frame} \begin{frame}[fragile] \frametitle{\Rfunction{elementGaps}: \textit{Optimization} via partitioning} \begin{block}{} <>= <> @ \end{block} \begin{alertblock}{Assumption} Strand and chromosome is the same within the elements, i.e., no trans-splicing, genomic rearrangements, etc. \end{alertblock} \end{frame} \begin{frame}[fragile] \frametitle{Exercise: Partitionings} \begin{alertblock}{Problem} Use a \Rclass{PartitioningByWidth} to find the start of the first exon in each transcript. \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= tx_part <- PartitioningByWidth(tx) tx_flat <- unlist(tx, use.names = FALSE) start(tx_flat)[start(tx_part)] @ \end{block} \end{onlyenv} \end{frame} %% Pairing the reads (and splices) \begin{frame}[fragile] \frametitle{Pairing the Reads} We depend on each end of a pair sharing the same name: \begin{block}{} <>= pairs <- split(unlist(reads, use.names=FALSE), factor(names(reads)[togroup(reads)], unique(names(reads)))) metadata(pairs) <- metadata(reads) @ \end{block} \textit{XS} information is shared within pairs: \begin{block}{} <>= xs <- values(reads)$XS has_xs <- !is.na(xs) pair_xs <- setNames(rep.int(NA, length(pairs)), names(pairs)) pair_xs[names(reads)[has_xs]] <- xs[has_xs] values(pairs)$XS <- unname(pair_xs) @ \end{block} \end{frame} %% Resolving strand from the XS tag \begin{frame}[fragile] \frametitle{Resolving the Strand from \textit{XS}} The mapping is direct, except \code{NA} maps to \code{*}. \begin{block}{} <>= xs <- values(pairs)$XS strand <- ifelse(!is.na(xs) & xs != "?", xs, "*") strand(pairs) <- relist(Rle(strand, elementLengths(pairs)), pairs) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Pairing and Stranding the Splices} We have factored the preceding code into two functions. <>= pairReadRanges <- function(reads) { <> <> pairs } @ <>= strandFromXS <- function(pairs) { <> pairs } @ \begin{block}{} <>= splices <- pairReadRanges(splices) splices <- strandFromXS(splices) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Batch Processing Both Tumor and Normal} \begin{block}{} <>= readReadRanges <- function(bam) { <> <> <> pairs <- pairReadRanges(reads) pairs <- strandFromXS(pairs) <> values(pairs)$splices <- splices pairs } @ <>= normal <- readReadRanges(bamFiles$normal) tumor <- readReadRanges(bamFiles$tumor) @ \end{block} \end{frame} \subsection{Finding and Annotating the Overlaps} \begin{frame} \frametitle{Overview of Algorithm} \begin{columns}[l] \column{6cm} \begin{block}{Steps} \begin{enumerate} \item \alert<1>{Run RNA-seq pipeline, and load \textit{concordant unique} alignments} \item \alert<2>{Find reads and isoforms with any exonic overlap} \item Split the alignments into two bins: \textit{compatible} with splicing, and \textit{incompatible} \item For the \textit{compatible}, find reads that map \textit{uniquely} to an isoform \item For the \textit{incompatible}, find those indicating \textit{novel} junctions or splice sites \end{enumerate} \end{block} \column{5cm} \includegraphics[width=5cm]{isoform-approach-1} \end{columns} \end{frame} %% findOverlaps \begin{frame}[fragile] \frametitle{Finding Overlaps} \begin{block}{} <>= hits <- findOverlaps(pairs, tx) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Extracting the Hits} \begin{block}{} <>= hit_pairs <- ranges(pairs)[queryHits(hits)] hit_splices <- ranges(splices)[queryHits(hits)] hit_tx <- ranges(tx)[subjectHits(hits)] @ \end{block} \begin{alertblock}{} Memory inefficient, but permits vectorized operations \end{alertblock} \end{frame} \begin{frame} \frametitle{Overview of Algorithm} \begin{columns}[l] \column{6cm} \begin{block}{Steps} \begin{enumerate} \item Run RNA-seq pipeline, and load \textit{concordant unique} alignments \item \alert<1>{Find reads and isoforms with any exonic overlap} \item \alert<2>{Split the alignments into two bins: \textit{compatible} with splicing, and \textit{incompatible}} \item For the \textit{compatible}, find reads that map \textit{uniquely} to an isoform \item For the \textit{incompatible}, find those indicating \textit{novel} junctions or splice sites \end{enumerate} \end{block} \column{5cm} \includegraphics<1>[width=5cm]{isoform-approach-1} \includegraphics<2>[width=5cm]{isoform-approach-2} \end{columns} \end{frame} %% checking for compatibility \begin{frame}[fragile] \frametitle{Compatibility Checking} \begin{block}{} <>= read_within <- elementLengths(setdiff(hit_pairs, hit_tx)) == 0L tx_within <- elementLengths(intersect(hit_tx, hit_splices)) == 0L compatible <- read_within & tx_within @ \end{block} \includegraphics[width=11cm]{isoform-expr-intro-misfits-4} \end{frame} \begin{frame} \frametitle{Overview of Algorithm} \begin{columns}[l] \column{6cm} \begin{block}{Steps} \begin{enumerate} \item Run RNA-seq pipeline, and load \textit{concordant unique} alignments \item Find reads and isoforms with any exonic overlap \item \alert<1>{Split the alignments into two bins: \textit{compatible} with splicing, and \textit{incompatible}} \item \alert<2>{For the \textit{compatible}, find reads that map \textit{uniquely} to an isoform} \item For the \textit{incompatible}, find those indicating \textit{novel} junctions or splice sites \end{enumerate} \end{block} \column{5cm} \includegraphics<1>[width=5cm]{isoform-approach-2} \includegraphics<2>[width=5cm]{isoform-approach-3} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{Uniquely Mapped Reads} We find the compatible hits where the read is represented only once: \begin{block}{} <>= compat_hits <- hits[compatible] reads_unique <- tabulate(queryHits(compat_hits), queryLength(compat_hits)) == 1L unique <- logical(length(hits)) unique[compatible] <- reads_unique[queryHits(compat_hits)] @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Annotating the Overlaps} Like any other \Rclass{Vector}, \Rclass{Hits} can have element metadata: \begin{block}{} <>= strand_specific <- all(strand(pairs) != "*")[queryHits(hits)] values(hits) <- DataFrame(strand_specific, compatible, unique) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Exercise: Add Spliced Indicator} \begin{alertblock}{Problem} Add a column to the \Robject{hits} element metadata indicating whether the read had a splice (ignoring XS tag and assuming all reads are paired). \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= values(hits)$spliced <- (elementLengths(pairs) > 2)[queryHits(hits)] @ \end{block} \end{onlyenv} \end{frame} %% checking for novel events -- just junctions? \begin{frame} \frametitle{Overview of Algorithm} \begin{columns}[l] \column{6cm} \begin{block}{Steps} \begin{enumerate} \item Run RNA-seq pipeline, and load \textit{concordant unique} alignments \item Find reads and isoforms with any exonic overlap \item Split the alignments into two bins: \textit{compatible} with splicing, and \textit{incompatible} \item \alert<1>{For the \textit{compatible}, find reads that map \textit{uniquely} to an isoform} \item \alert<2>{For the \textit{incompatible}, find those indicating \textit{novel} junctions or splice sites} \end{enumerate} \end{block} \column{5cm} \includegraphics<1>[width=5cm]{isoform-approach-3} \includegraphics<2>[width=5cm]{isoform-approach-4} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{Junction Counting} \framesubtitle{Simplest algorithm: form keys from \Rclass{GRanges} and hashing.} \begin{block}{Key generator} <>= gr2key <- function(x) { paste(seqnames(x), start(x), end(x), strand(x), sep = ":") } @ \end{block} \begin{block}{Key Parser} <>= key2gr <- function(x, ...) { key_mat <- matrix(unlist(strsplit(x, ":", fixed=TRUE)), nrow = 4) GRanges(key_mat[1,], IRanges(as.integer(key_mat[2,]), as.integer(key_mat[3,])), key_mat[4,], ...) } @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Junction Counting} \begin{block}{} <>= introns <- elementGaps(tx) introns_flat <- unlist(introns, use.names = FALSE) tx_keys <- gr2key(introns_flat) @ <>= splices_flat <- unlist(splices, use.names = FALSE) splice_table <- table(gr2key(splices_flat)) splice_summary <- key2gr(names(splice_table), score = as.integer(splice_table), novel = !names(splice_table) %in% tx_keys, seqlengths = seqlengths(splices)) @ \end{block} \end{frame} %% Aggregate over the isoforms \begin{frame}[fragile] \frametitle{Aggregating over the Transcripts} \begin{block}{} <>= countByTx <- function(x) { tabulate(subjectHits(hits)[x], subjectLength(hits)) } compatible_strand <- countByTx(with(values(hits), compatible & strand_specific)) counts <- DataFrame(compatible_strand, lapply(values(hits)[-1], countByTx)) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Batch Processing the Samples} <>= findIsoformOverlaps <- function(pairs) { splices <- values(pairs)$splices <> <> <> <> <> hits } @ <>= countIsoformHits <- function(hits) { <> counts } @ <>= summarizeSplices <- function(reads) { splices <- values(reads)$splices <> splice_summary } @ \begin{block}{} <>= normal_hits <- findIsoformOverlaps(normal) normal_counts <- countIsoformHits(normal_hits) normal_splices <- summarizeSplices(normal) tumor_hits <- findIsoformOverlaps(tumor) tumor_counts <- countIsoformHits(tumor_hits) tumor_splices <- summarizeSplices(tumor) @ \end{block} \end{frame} \subsection{Interpreting the Results} %% Building a SummarizedExperiment \begin{frame}[fragile] \frametitle{Combining the Samples} We reshape the data into a \Rclass{SummarizedExperiment}: \begin{block}{} <>= assays <- mapply(cbind, normal_counts, tumor_counts, SIMPLIFY = FALSE) colData <- DataFrame(tumorStatus = c("tumor", "normal")) rownames(colData) <- colData$tumorStatus se <- SummarizedExperiment(assays, tx, colData) @ \end{block} \end{frame} %% Fisher test between top two expressers \begin{frame}[fragile] \frametitle{Comparing the Top Two Isoforms} We use a Fisher Test to check whether the balance is shifting between top two isoforms (by uniquely mapped reads), between tumor and normal. \begin{block}{} <>= uc <- assay(se, "unique") uc_ord <- order(rowSums(uc), decreasing = TRUE) uc_top <- uc[head(uc_ord, 2),] fisher.test(uc_top)$estimate @ \end{block} \end{frame} %% TODO: Generate a coverage+arc plot of those isoforms using ggbio \begin{frame}[fragile] \frametitle{Plotting the Isoform Coverage} \framesubtitle{Setup} \begin{block}{Get Uniquely Mapped Reads} <>= getUniqueReads <- function(reads, hits) { sel <- values(hits)$unique & subjectHits(hits) %in% c(1, 4) reads[unique(queryHits(hits)[sel])] } normal_uniq <- getUniqueReads(normal, normal_hits) tumor_uniq <- getUniqueReads(tumor, tumor_hits) both_uniq <- mstack(normal = unlist(normal_uniq), tumor = unlist(tumor_uniq)) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Plotting the Isoform Coverage} \framesubtitle{Setup} \begin{block}{Get Uniquely Mapped Splices} <>= normal_uniq_splices <- summarizeSplices(normal_uniq) tumor_uniq_splices <- summarizeSplices(tumor_uniq) uniq_splices <- mstack(normal = normal_uniq_splices, tumor = tumor_uniq_splices) @ \end{block} \end{frame} \SweaveOpts{width=10, height=7} \begin{frame}[fragile] \frametitle{Plotting the Isoform Coverage} We leverage the \Rfunction{autoplot} function from \Rpackage{ggbio}: \begin{block}{} <>= read_track <- autoplot(uniq_splices, geom = "arch", aes(size = score, height = width / 5), color = "deepskyblue3", ylab = "coverage", facets = name ~ .) + stat_coverage(both_uniq, facets = name ~ .) tx_16 <- keepSeqlevels(tx, "chr16") tx_track <- autoplot(tx_16, geom = "alignment", ylab = "") tracks(read_track, tx_track, heights = c(3, 1)) @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Coverage Plot} \includegraphics[width=11cm]{tutorial-ggbio-isoform-coverage} \end{frame} \begin{frame}[fragile] \frametitle{Zooming into the Plot} The \code{xlim} argument restricts the genomic interval: \begin{block}{} <>= tracks(read_track, tx_track, heights = c(3, 1), xlim = c(30075000, 30080000)) @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Zoomed Coverage Plot} \includegraphics[width=11cm]{tutorial-ggbio-zoom} \end{frame} \begin{frame}[fragile] \frametitle{Exercise: Zoom Using the Coverage} \begin{alertblock}{Problem} Zoom to a similar region of interest by finding a window around the tallest peaks. Hint: \Rfunction{coverage} and \Rfunction{slice} would be useful here. \end{alertblock} \begin{onlyenv}<2> \begin{block}{Solution} <>= cov_chr16 <- coverage(both_uniq)$chr16 roi <- range(ranges(slice(cov_chr16, 1000))) roi <- roi + 500 tracks(read_track, tx_track, heights = c(3, 1), xlim = roi) @ \end{block} \end{onlyenv} \end{frame} %% TODO: somehow show the novel junctions with ggbio, i.e., as a range %% with color or a number. \begin{frame}[fragile] \frametitle{Plotting the Novel Junctions} \framesubtitle{Setup} \begin{block}{Get the Novel Splices} <>= all_splices <- mstack(normal = normal_splices, tumor = tumor_splices) novel_splices <- all_splices[values(all_splices)$novel & values(all_splices)$score == 9] uniq_novel_splices <- c(uniq_splices, novel_splices) @ \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Plotting the Novel Junctions} This time, we specify the \code{color = novel} aesthetic: \begin{block}{} <>= novel_track <- autoplot(uniq_novel_splices, geom = "arch", aes(size = score, height = width / 5, color = novel), ylab = "coverage", facets = name ~ .) + stat_coverage(both_uniq, facets = name ~ .) tracks(novel_track, tx_track, heights = c(3, 1), xlim = roi) @ \end{block} \end{frame} \begin{frame}[plain] \frametitle{Coverage Plot, with Novel Junctions} \includegraphics[width=11cm]{tutorial-ggbio-novel-junctions} \end{frame} \section{Conclusion} \begin{frame} \frametitle{Take Home Tips} \begin{itemize} \item Use the right data structure for the right job \item Take advantage of metadata facilities \item Long ragged arrays: partitioning faster than looping over lists \end{itemize} \end{frame} \end{document}