% \VignetteEngine{knitr::knitr} % \VignetteIndexEntry{03. Sequence Analysis -- slides} \documentclass[xcolor=dvipsnames]{beamer} \usepackage{BioconductorSlides} \hypersetup{colorlinks,linkcolor=,urlcolor=Blue} \title{\Bioconductor{} for Sequence Analysis} \author{Martin T.\ Morgan\footnote{\url{mtmorgan@fhcrc.org}}} \date{27--28 February 2014} \begin{document} \maketitle \begin{frame}[fragile]{Sequencing: Work flows} \begin{columns} \column{.5\textwidth} \begin{enumerate} \item Experimental design \item `Wet lab' sample prep \item Sequencing \begin{itemize} \item 100's of millions of reads \item 30--150 nucleotides \item Single and paired-end \item Bar codes, lanes \& flow cells \end{itemize} \item Alignment \item Analysis: DNA, RNA, epigenetics, integrative, microbiome, \ldots \end{enumerate} \column{.5\textwidth} \includegraphics[width=\textwidth,height=!]{figures/Solexa-bridge-pcr.jpg} \par Bentley et al., 2008, Nature 456: \href{http://www.ncbi.nlm.nih.gov/pubmed/18987734}{53--9} \end{columns} \end{frame} \begin{frame}{Experimental design and wet lab} \begin{itemize} \item RNA-seq \begin{itemize} \item Known gene / transcript differential expression \item Novel transcript discovery \item Single- versus paired-end \end{itemize} \item ChIP-seq \item Variants \begin{itemize} \item Germline vs.\ somatic \item Exome vs.\ whole genome \item SNP vs.\ indel vs.\ structural \end{itemize} \item Copy number \begin{itemize} \item Low vs.\ high coverage \end{itemize} \end{itemize} \end{frame} \begin{frame}{RNA-seq: single versus paired end} \begin{itemize} \item Most analysis now paired-end \item Reads within exons \item A single end spanning exons: `junction reads' \item Reads spanning exons \end{itemize} \end{frame} {\scriptsize\begin{verbatim} @ERR127302.1703 HWI-EAS350_0441:1:1:1460:19184#0/1 CCTGAGTGAAGCTGATCTTGATCTACGAAGAGAGATAGATCTTGATCGTCGAGGAGATGCTGACCTTGACCT + HHGHHGHHHHHHHHDGG>CE?=896=: @ERR127302.1704 HWI-EAS350_0441:1:1:1460:16861#0/1 
GCGGTATGCTGGAAGGTGCTCGAATGGAGAGCGCCAGCGCCCCGGCGCTGAGCCGCAGCCTCAGGTCCGCCC + DE?DD>ED4>EEE>DE8EEEDE8B?EB<@3;BA79?,881B?@73;1?######################## @ERR127302.1705 HWI-EAS350_0441:1:1:1460:13054#0/1 AAAACACCCTGCAATCTTTCAGACAGGATGTTGACAATGCGTCTCTGGCACGTCTTGACCTTGAACGCAAAG + EEDEE>AD>BBGGB8E8EEEGBGGGGBGGGGG3G>E3*?BE??BBC8GB8??:??GGDGDDD>D>BGGD8EG,<6D@<@G@>AB@B?8AA>CE@D8@B=?CC>AG @ERR127302.1707 HWI-EAS350_0441:1:1:1461:6983#0/1 CGACGCTGACACCGGAACGGCAGCAGCAGCAGGACGATTAAGACAAGGAGGATGGCTCCACAGACGCTCATG + GEEGEGE@GGGGGGEGGGGGBB>G3?33?8*;;79?<9@?DD8@DDEE888;-BB?.A############## @ERR127302.1708 HWI-EAS350_0441:1:1:1461:10827#0/1 AAAGAAGGTCCTTGCAATAGACTGCCTCTGCTTGAGAACTTATGATGTAATTATTGCATGCTGCTAATATAC + GGGGGDDEBFGGGGGBE,DAGDDGGGEEEGACEBEFDEEFEDH:@.7@49;88G8>G>DDG@D>D@G@GE>@DDBDDGDH:@.7@49;88G8>G>DDG@D>D@G@GE>@DDBDDG ? @ A B ## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 ## C D E F G H I ## 34 35 36 37 38 39 40 \end{verbatim} \end{kframe} \end{knitrout} \end{frame} \begin{frame}{Quality assessment} Assessment, e.g., \href{http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}{fastqc} \begin{itemize} \item Read length, duplication \item Nucleotide use: per cycle, GC content, N content, \ldots \item Quality: per cycle, per read \item Consistency across samples, without obvious treatment-specific associations \end{itemize} Remediation, e.g., \href{http://www.usadellab.org/cms/?page=trimmomatic}{trimmomatic} \begin{itemize} \item Crop, e.g., leading / trailing artifacts of sequencing protocol \item Trim based on quality \end{itemize} \end{frame} \begin{frame}{Alignment} Bowtie / Tophat / Cufflinks \begin{itemize} \item \href{http://bowtie-bio.sourceforge.net/bowtie2/index.shtml}{Bowtie2} -- alignment \item \href{http://tophat.cbcb.umd.edu/}{Tophat} -- splice junction mapper \item \href{http://cufflinks.cbcb.umd.edu/}{Cufflinks} / \Biocpkg{cummeRbund} -- isoform assembly \& quantification \end{itemize} Other aligners \begin{itemize} \item 
\href{http://snap.cs.berkeley.edu/}{SNAP} -- fast and accurate \item \href{http://subread.sourceforge.net/}{subread} / \Biocpkg{Rsubread} -- memory efficient \item \href{http://research-pub.gene.com/gmap/}{GSNAP / GMAP} / \Biocpkg{gmapR} -- flexible and high quality alignments \end{itemize} \end{frame} \begin{frame}[fragile]{BAM files} \begin{itemize} \item Visualization with, e.g., \href{http://www.broadinstitute.org/igv/}{IGV} \item SAM / BAM (and other) \href{https://github.com/samtools/hts-specs}{specifications} \end{itemize} \begin{verbatim} ERR127302.25553011 403 chr14 19413639 1 72M = 19413589 -122 CAAAGAATTGATTGAATTCATCAGGGCTAAAATCTCCAAAAATATACTGCGG... !#&&"%&$&%%&&%&"%***&'(')')")')%('#++++++*)'&%+*+*++... AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:7C64 YT:Z:UU NH:i:3 CC:Z:= CP:i:20145991 HI:i:0 \end{verbatim} \end{frame} \begin{frame} \begin{tabular}{lll} Field & Name & Value \\\hline\noalign{\smallskip} 1 & QNAME & Query (read) NAME \\ 2 & FLAG & Bitwise FLAG, e.g., strand of alignment \\ 3 & RNAME & Reference sequence NAME \\ 4 & POS & 1-based leftmost POSition of sequence \\ 5 & MAPQ & MAPping Quality (Phred-scaled) \\ 6 & CIGAR & Extended CIGAR string \\ 7 & MRNM & Mate Reference sequence NaMe \\ 8 & MPOS & 1-based Mate POSition \\ 9 & ISIZE & Inferred insert SIZE \\ 10 & SEQ & Query SEQuence on the reference strand \\ 11 & QUAL & Query QUALity \\ 12$+$ & OPT & OPTional fields, format TAG:VTYPE:VALUE \\\hline \end{tabular} \end{frame} \end{document}