\documentclass{article} \usepackage{url} \usepackage{hyperref} \usepackage{breakurl} \usepackage{amsmath} \usepackage{amssymb} %\VignetteIndexEntry{The DMRcate package user's guide} %\VignetteEngine{knitr::knitr} \begin{document} \title{The \texttt{DMRcate} package user's guide} \author{Peters TJ} \maketitle \renewcommand{\abstractname}{Summary} \begin{abstract} \texttt{DMRcate} extracts the most differentially methylated regions (DMRs) and variably methylated regions (VMRs) from both Whole Genome Bisulphite Sequencing (WGBS) and Illumina\textregistered Infinium BeadChip Array samples via kernel smoothing. \end{abstract} <>= if (!require("BiocManager")) install.packages("BiocManager") BiocManager::install("DMRcate") @ Load \texttt{DMRcate} into the workspace: <>= library(DMRcate) @ \section*{Illumina\textregistered Array Workflow} For this vignette, we will demonstrate DMRcate's array utility using data from \texttt{ExperimentHub}, namely Illumina HumanMethylationEPIC data from the data packages \texttt{FlowSorted.Blood.EPIC}. Specifically, we are interested in the methylation differences between CD4+ and CD8+ T cells. <>= library(ExperimentHub) eh <- ExperimentHub() FlowSorted.Blood.EPIC <- eh[["EH1136"]] tcell <- FlowSorted.Blood.EPIC[,colData(FlowSorted.Blood.EPIC)$CD4T==100 | colData(FlowSorted.Blood.EPIC)$CD8T==100] @ Firstly we have to filter out any probes where any sample has a failed position. Then we will normalise using \texttt{minfi::preprocessFunnorm}. After this, we extract the \emph{M}-values from the GenomicRatioSet. <>== detP <- detectionP(tcell) remove <- apply(detP, 1, function (x) any(x > 0.01)) tcell <- tcell[!remove,] tcell <- preprocessFunnorm(tcell) tcellms <- getM(tcell) @ M-values (logit-transform of beta) are preferable to beta values for significance testing via \texttt{limma} because of increased sensitivity, but we will transform this to a beta matrix for visualisation purposes later on. Some of the methylation measurements on the array may be confounded by proximity to SNPs, and cross-hybridisation to other areas of the genome\cite{Pidsley, Chena}. In particular, probes that are 0, 1, or 2 nucleotides from the methylcytosine of interest show a markedly different distribution to those farther away, in healthy tissue (Figure 1). \begin{figure}[htbp!] \caption{Beta distribution of 450K probes from publically available data from blood samples of healthy individuals \cite{Heyn} by their proximity to a SNP. ``All SNP probes'' refers to the 153 113 probes listed by Illumina\textregistered\ whose values may potentially be confounded by a SNP. } \centering \includegraphics[width=\textwidth]{heynSNP.pdf} \end{figure} It is with this in mind that we filter out probes 2 nucleotides or closer to a SNP that have a minor allele frequency greater than 0.05, and the approximately 48,000 \cite{Pidsley, Chena} cross-reactive probes on either 450K and/or EPIC, so as to reduce confounding. Here we use a combination of \textit{in silico} analyses from \cite{Pidsley, Chena}. About 60,000 are removed from our M-matrix of approximately 864,000: <>= nrow(tcellms) tcellms.noSNPs <- rmSNPandCH(tcellms, dist=2, mafcut=0.05) nrow(tcellms.noSNPs) @ Here we have 6 CD8+ T cell assays, and 7 CD4+ T cell assays; we want to call DMRs between these groups. One of the CD4+ assays is a technical replicate, so we will average these two replicates like so: <>== tcell$Replicate tcell$Replicate[tcell$Replicate==""] <- tcell$Sample_Name[tcell$Replicate==""] tcellms.noSNPs <- limma::avearrays(tcellms.noSNPs, tcell$Replicate) tcell <- tcell[,!duplicated(tcell$Replicate)] tcell <- tcell[rownames(tcellms.noSNPs),] colnames(tcellms.noSNPs) <- colnames(tcell) assays(tcell)[["M"]] <- tcellms.noSNPs assays(tcell)[["Beta"]] <- ilogit2(tcellms.noSNPs) @ Next we want to annotate our matrix of M-values with relevant information. We also use the backbone of the \texttt{limma} pipeline for differential array analysis. We want to compare within patients across tissue samples, so we set up our variables for a standard limma pipeline, and set \texttt{coef=2} in \texttt{cpg.annotate} since this corresponds to the phenotype comparison in \texttt{design}. \texttt{cpg.annotate()} takes either a data matrix with Illumina probe IDs, or an already prepared GenomicRatioSet from \texttt{minfi}. <>= type <- factor(tcell$CellType) design <- model.matrix(~type) myannotation <- cpg.annotate("array", tcell, arraytype = "EPIC", analysis.type="differential", design=design, coef=2) @ <>= myannotation @ Now we can find our most differentially methylated regions with \texttt{dmrcate()}. For each chromosome, two smoothed estimates are computed: one weighted with per-CpG \textit{t}-statistics and one not, for a null comparison. The two estimates are compared via a Satterthwaite approximation\cite{Satterthwaite}, and a significance test is calculated at all hg19 coordinates that an input probe maps to. After fdr-correction, regions are then agglomerated from groups of post-smoothed significant probes where the distance to the next consecutive probe is less than \texttt{lambda} nucleotides. <>= dmrcoutput <- dmrcate(myannotation, lambda=1000, C=2) dmrcoutput @ We can convert our DMR list to a GRanges object, which uses the \texttt{genome} argument to annotate overlapping gene loci. <>= results.ranges <- extractRanges(dmrcoutput, genome = "hg19") results.ranges @ DMRs are ranked by Fisher's multiple comparison statistic, but \texttt{Stouffer} scores and the harmonic mean of the individual component FDRs (\texttt{HMFDR}) are also given in this object as alternative options for ranking DMR significance. We can then pass this GRanges object to \texttt{DMR.plot()}, which uses the \texttt{Gviz} package as a backend for contextualising each DMR. <>= groups <- c(CD8T="magenta", CD4T="forestgreen") cols <- groups[as.character(type)] cols DMR.plot(ranges=results.ranges, dmr=1, CpGs=getBeta(tcell), what="Beta", arraytype = "EPIC", phen.col=cols, genome="hg19") @ Consonant with the expected biology, our top DMR shows the CD8+ T cells hypomethylated across parts of the CD8A locus. The two distinct hypomethylated sections have been merged because they are less than 1000 bp apart - specified by \texttt{lambda} in the call to \texttt{dmrcate()}. To call these as separate DMRs, make \texttt{lambda} smaller. Lastly, we would like to do a gene ontology test on our DMRs; this is made possible by the \texttt{goregion()} function in the \texttt{missMethyl} package. We will take the top 100 DMRs for this enrichment test. <>= library(missMethyl) enrichment_GO <- goregion(results.ranges[1:100], all.cpg = rownames(tcell), collection = "GO", array.type = "EPIC") enrichment_GO <- enrichment_GO[order(enrichment_GO$P.DE),] head(as.matrix(enrichment_GO), 10) @ From this enrichment test we can see the most enriched terms are germane to the contrast at hand, including lymphocyte activation and differentiation, T cell activation and MHC protein binding. \section*{Bisulfite sequencing workflow} Bisulfite sequencing assays are fundamentally different to arrays, because methylation is represented as a pair of methylated and unmethylated reads per sample, instead of a single beta value. Although we could simply take the logit-proportion of methylated reads per CpG, this removes the effect of varying read depth across the genome. For example, a sampling depth of 30 methylated reads and 10 unmethylated reads is a much more precise estimate of the methylation level of a given CpG site than 3 methylated and 1 unmethylated. Hence, we take advantage of the fact that the overall effect can be expressed as an interaction between the coefficient of interest and a two-level factor representing methylated and unmethylated reads \cite{Chenb}. The example shown here will be performed on a BSseq object containing bisulfite sequencing of regulatory T cells from various tissues as part of the \texttt{tissueTreg} package\cite{Delacher}, imported using ExperimentHub. First, we will import the data: <>= bis_1072 <- eh[["EH1072"]] bis_1072 colnames(bis_1072) @ The data contains 15 samples: 3 (unmatched) replicates of mouse Tregs from fat, liver, skin and lymph node, plus a group of 3 CD4+ conventional lymph node T cells (Tcon). We will annotate the BSseq object to reflect this phenotypic information: <>= pData(bis_1072) <- data.frame(replicate=gsub(".*-", "", colnames(bis_1072)), tissue=substr(colnames(bis_1072), 1, nchar(colnames(bis_1072))-3), row.names=colnames(bis_1072)) colData(bis_1072)$tissue <- gsub("-", "_", colData(bis_1072)$tissue) as.data.frame(colData(bis_1072)) @ For standardisation purposes (and for \texttt{DMR.plot} to recognise the genome) we will change the chromosome naming convention to UCSC: <>= bis_1072 <- renameSeqlevels(bis_1072, mapSeqlevels(seqlevels(bis_1072), "UCSC")) @ For demonstration purposes, we will retain CpGs on chromosome 19 only: <>= bis_1072 <- bis_1072[seqnames(bis_1072)=="chr19",] bis_1072 @ Now we can prepare the model to be fit for \texttt{sequencing.annotate()}. The arguments are equivalent to \texttt{cpg.annotate()} but for a couple of exceptions: \begin{itemize} \item There is an extra argument \texttt{all.cov} giving an option whether to retain only CpGs where \textit{all} samples have non-zero coverage, or whether to retain CpGs with only partial sample representation. \item The design matrix should be constructed to reflect the 2-factor structure of methylated and unmethylated reads. Fortunately, \texttt{edgeR::modelMatrixMeth()} can take a regular design matrix and transform is into the appropriate structure ready for model fitting. \end{itemize} <>= tissue <- factor(pData(bis_1072)$tissue) tissue <- relevel(tissue, "Liver_Treg") #Regular matrix design design <- model.matrix(~tissue) colnames(design) <- gsub("tissue", "", colnames(design)) colnames(design)[1] <- "Intercept" rownames(design) <- colnames(bis_1072) design #Methylation matrix design methdesign <- edgeR::modelMatrixMeth(design) methdesign @ Just like for \texttt{cpg.annotate()}, we can specify a contrast matrix to find our comparisons of interest. <>= cont.mat <- limma::makeContrasts(treg_vs_tcon=Lymph_N_Treg-Lymph_N_Tcon, fat_vs_ln=Fat_Treg-Lymph_N_Treg, skin_vs_ln=Skin_Treg-Lymph_N_Treg, fat_vs_skin=Fat_Treg-Skin_Treg, levels=methdesign) cont.mat @ Say we want to find DMRs between the regulatory and conventional T cells from the lymph node. First we would fit the model, where \texttt{sequencing.annotate()} transforms counts into log2CPMs (via \texttt{limma::voom()}) and uses \texttt{limma} under the hood to generate per-CpG \textit{t}-statistics, indexing the FDR at 0.05: <>= seq_annot <- sequencing.annotate(bis_1072, methdesign, all.cov = TRUE, contrasts = TRUE, cont.matrix = cont.mat, coef = "treg_vs_tcon", fdr=0.05) seq_annot @ And then, just like before, we can call DMRs with \texttt{dmrcate()}: <>= dmrcate.res <- dmrcate(seq_annot, C=2, min.cpgs = 5) dmrcate.res treg_vs_tcon.ranges <- extractRanges(dmrcate.res, genome="mm10") treg_vs_tcon.ranges @ Looks like the top DMR is associated with the \textit{Jak2} locus and hypomethylated in the Treg cells (since \texttt{meandiff < 0}). We can plot it like so: <>= cols <- as.character(plyr::mapvalues(tissue, unique(tissue), c("darkorange", "maroon", "blue", "black", "magenta"))) names(cols) <- tissue DMR.plot(treg_vs_tcon.ranges, dmr = 1, CpGs=bis_1072[,tissue %in% c("Lymph_N_Tcon", "Lymph_N_Treg")], phen.col = cols[tissue %in% c("Lymph_N_Tcon", "Lymph_N_Treg")], genome="mm10") @ Now, let's find DMRs between fat and skin Tregs. <>= seq_annot <- sequencing.annotate(bis_1072, methdesign, all.cov = TRUE, contrasts = TRUE, cont.matrix = cont.mat, coef = "fat_vs_skin", fdr=0.05) @ Because this comparison is a bit more subtle, there are very few significantly differential CpGs at this threshold. So we can use \texttt{changeFDR()} to relax the FDR to 0.25, taking into account that there is an increased risk of false positives. <>== seq_annot <- changeFDR(seq_annot, 0.25) @ <>= dmrcate.res <- dmrcate(seq_annot, C=2, min.cpgs = 5) fat_vs_skin.ranges <- extractRanges(dmrcate.res, genome="mm10") @ Now let's plot the top DMR with not only fat and skin, but with all samples: <>= cols DMR.plot(fat_vs_skin.ranges, dmr = 1, CpGs=bis_1072, phen.col = cols, genome="mm10") @ Here we can see the methylation of skin cells over this section of \textit{Gcnt1} is hypomethylated not only relative to fat, but to the other tissues as well. As an alternative to \texttt{limma}, there is also the option of taking CpG-level differential statistics using \texttt{DSS::DMLtest()} or \texttt{DSS::DMLtest.multiFactor()}. There is no need to pass arguments such as \texttt{design}, \texttt{coef}, etc. to \texttt{sequencing.annotate()} in this case since we do this outside of the function. \texttt{fdr}, however, must be specified. For example: <>= library(DSS) DMLfit <- DMLfit.multiFactor(bis_1072, design=data.frame(tissue=tissue), formula=~tissue) DSS_treg.vs.tcon <- DMLtest.multiFactor(DMLfit, Contrast=matrix(c(0, 0, -1, 1, 0))) #Make sure to filter out all sites where the test statistic is NA DSS_treg.vs.tcon <- DSS_treg.vs.tcon[!is.na(DSS_treg.vs.tcon$stat),] seq_annot <- sequencing.annotate(obj=DSS_treg.vs.tcon, fdr=0.05) seq_annot dmrcate.res <- dmrcate(seq_annot, C=2, min.cpgs = 5) DSS.treg_vs_tcon.ranges <- extractRanges(dmrcate.res, genome="mm10") findOverlaps(treg_vs_tcon.ranges, DSS.treg_vs_tcon.ranges) @ All of the 9 DMRs found using results from \texttt{limma} are also found using \texttt{DSS::DMLtest.multiFactor()}, with an extra 21 DMRs found by the latter at the same FDR. This suggests that DMLtest.multiFactor() is a little more permissive in calling differential methylation. <>= sessionInfo() @ \begin{thebibliography}{99} \bibitem{Pidsley} Pidsley R, Zotenko E, Peters TJ, Lawrence MG, Risbridger GP, Molloy P, Van Dijk S, Muhlhausler B, Stirzaker C, Clark SJ. Critical evaluation of the Illumina MethylationEPIC BeadChip microarray for whole-genome DNA methylation profiling. \emph{Genome Biology}. 2016 17(1), 208. \bibitem{Chena} Chen YA, Lemire M, Choufani S, Butcher DT, Grafodatskaya D, Zanke BW, Gallinger S, Hudson TJ, Weksberg R. Discovery of cross-reactive probes and polymorphic CpGs in the Illumina Infinium HumanMethylation450 microarray. \emph{Epigenetics}. 2013 Jan 11;8(2). \bibitem{Heyn} Heyn H, Li N, Ferreira HJ, Moran S, Pisano DG, Gomez A, Esteller M. Distinct DNA methylomes of newborns and centenarians. \emph{Proceedings of the National Academy of Sciences}. 2012 \textbf{109}(26), 10522-7. \bibitem{Satterthwaite} Satterthwaite FE. An Approximate Distribution of Estimates of Variance Components., \emph{Biometrics Bulletin}. 1946 \textbf{2}: 110-114 \bibitem{Chenb} Chen Y, Pal B, Visvader JE, Smyth GK. Differential methylation analysis of reduced representation bisulfite sequencing experiments using edgeR. \emph{F1000Research}. 2017 \textbf{6}, 2055. \bibitem{Delacher} Delacher M, Imbusch CD, Weichenhan D, Breiling A, Hotz-Wagenblatt A, Trager U, ... Feuerer M. (2017). Genome-wide DNA-methylation landscape defines specialization of regulatory T cells in tissues. \emph{Nature Immunology}. 2017 \textbf{18}(10), 1160-1172. \bibitem{Feng} Feng H, Conneely KN, Wu H. A Bayesian hierarchical model to detect differentially methylated loci from single nucleotide resolution sequencing data. \emph{Nucleic Acids Research}. 2014 \textbf{42}(8), e69. \bibitem{Park} Park Y, and Wu H. Differential methylation analysis for BS-seq data under general experimental design. \emph{Bioinformatics}. 2016 \textbf{32}(10), 1446-1453. \end{thebibliography} \end{document}