## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------ BiocStyle::latex2() ## ----knitr, echo=FALSE, results="hide"----------------------------------- library("knitr") opts_chunk$set( tidy=FALSE, dev="png", fig.show="hide", fig.width=4, fig.height=4.5, fig.pos="tbh", cache=TRUE, message=FALSE) ## ----loadDESeq2, echo=FALSE---------------------------------------------- library("DESeq2") ## ----options, results="hide", echo=FALSE--------------------------------- options(digits=3, prompt=" ", continue=" ") ## ----quick, eval=FALSE--------------------------------------------------- ## dds <- DESeqDataSet(se, design = ~ batch + condition) ## dds <- DESeq(dds) ## res <- results(dds, contrast=c("condition","trt","con")) ## ----loadSumExp---------------------------------------------------------- library("airway") data("airway") se <- airway ## ----sumExpInput--------------------------------------------------------- library("DESeq2") ddsSE <- DESeqDataSet(se, design = ~ cell + dex) ddsSE ## ----loadPasilla--------------------------------------------------------- library("pasilla") pasCts <- system.file("extdata", "pasilla_gene_counts.tsv", package="pasilla", mustWork=TRUE) pasAnno <- system.file("extdata", "pasilla_sample_annotation.csv", package="pasilla", mustWork=TRUE) countData <- as.matrix(read.csv(pasCts,sep="\t",row.names="gene_id")) colData <- read.csv(pasAnno, row.names=1) colData <- colData[,c("condition","type")] ## ----showPasilla--------------------------------------------------------- head(countData) head(colData) ## ----reorderPasila------------------------------------------------------- rownames(colData) <- sub("fb","",rownames(colData)) all(rownames(colData) %in% colnames(countData)) countData <- countData[, rownames(colData)] all(rownames(colData) == colnames(countData)) ## ----matrixInput--------------------------------------------------------- dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ condition) dds ## ----addFeatureData------------------------------------------------------ featureData <- data.frame(gene=rownames(countData)) (mcols(dds) <- DataFrame(mcols(dds), featureData)) ## ----tximport------------------------------------------------------------ library("tximport") library("readr") library("tximportData") dir <- system.file("extdata", package="tximportData") samples <- read.table(file.path(dir,"samples.txt"), header=TRUE) files <- file.path(dir,"salmon", samples$run, "quant.sf") names(files) <- paste0("sample",1:6) tx2gene <- read.csv(file.path(dir, "tx2gene.csv")) txi <- tximport(files, type="salmon", tx2gene=tx2gene, reader=read_tsv) ## ----txi2dds------------------------------------------------------------- coldata <- data.frame(condition=factor(rep(c("A","B"),each=3))) rownames(coldata) <- colnames(txi$counts) ddsTxi <- DESeqDataSetFromTximport(txi, colData=coldata, design=~ condition) ## ----htseqDirI, eval=FALSE----------------------------------------------- ## directory <- "/path/to/your/files/" ## ----htseqDirII---------------------------------------------------------- directory <- system.file("extdata", package="pasilla", mustWork=TRUE) ## ----htseqInput---------------------------------------------------------- sampleFiles <- grep("treated",list.files(directory),value=TRUE) sampleCondition <- sub("(.*treated).*","\\1",sampleFiles) sampleTable <- data.frame(sampleName = sampleFiles, fileName = sampleFiles, condition = sampleCondition) ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = directory, design= ~ condition) ddsHTSeq ## ----prefilter----------------------------------------------------------- dds <- dds[ rowSums(counts(dds)) > 1, ] ## ----factorlvl----------------------------------------------------------- dds$condition <- factor(dds$condition, levels=c("untreated","treated")) ## ----relevel------------------------------------------------------------- dds$condition <- relevel(dds$condition, ref="untreated") ## ----droplevels---------------------------------------------------------- dds$condition <- droplevels(dds$condition) ## ----deseq--------------------------------------------------------------- dds <- DESeq(dds) res <- results(dds) res ## ----parallel, eval=FALSE------------------------------------------------ ## library("BiocParallel") ## register(MulticoreParam(4)) ## ----resOrder------------------------------------------------------------ resOrdered <- res[order(res$padj),] ## ----sumRes-------------------------------------------------------------- summary(res) ## ----sumRes01------------------------------------------------------------ sum(res$padj < 0.1, na.rm=TRUE) ## ----resAlpha05---------------------------------------------------------- res05 <- results(dds, alpha=0.05) summary(res05) sum(res05$padj < 0.05, na.rm=TRUE) ## ----IHW----------------------------------------------------------------- library("IHW") resIHW <- results(dds, filterFun=ihw) summary(resIHW) sum(resIHW$padj < 0.1, na.rm=TRUE) metadata(resIHW)$ihwResult ## ----MA, fig.width=4.5, fig.height=4.5----------------------------------- plotMA(res, main="DESeq2", ylim=c(-2,2)) ## ----MAidentify, eval=FALSE---------------------------------------------- ## idx <- identify(res$baseMean, res$log2FoldChange) ## rownames(res)[idx] ## ----resMLE-------------------------------------------------------------- resMLE <- results(dds, addMLE=TRUE) head(resMLE, 4) ## ----MANoPrior, fig.width=4.5, fig.height=4.5---------------------------- plotMA(resMLE, MLE=TRUE, main="unshrunken LFC", ylim=c(-2,2)) ## ----plotCounts, dev="pdf", fig.width=4.5, fig.height=5------------------ plotCounts(dds, gene=which.min(res$padj), intgroup="condition") ## ----plotCountsAdv, dev="pdf", fig.width=3.5, fig.height=3.5------------- d <- plotCounts(dds, gene=which.min(res$padj), intgroup="condition", returnData=TRUE) library("ggplot2") ggplot(d, aes(x=condition, y=count)) + geom_point(position=position_jitter(w=0.1,h=0)) + scale_y_log10(breaks=c(25,100,400)) ## ----metadata------------------------------------------------------------ mcols(res)$description ## ----export, eval=FALSE-------------------------------------------------- ## write.csv(as.data.frame(resOrdered), ## file="condition_treated_results.csv") ## ----subset-------------------------------------------------------------- resSig <- subset(resOrdered, padj < 0.1) resSig ## ----multifactor--------------------------------------------------------- colData(dds) ## ----copyMultifactor----------------------------------------------------- ddsMF <- dds ## ----replaceDesign------------------------------------------------------- design(ddsMF) <- formula(~ type + condition) ddsMF <- DESeq(ddsMF) ## ----multiResults-------------------------------------------------------- resMF <- results(ddsMF) head(resMF) ## ----multiTypeResults---------------------------------------------------- resMFType <- results(ddsMF, contrast=c("type", "single-read", "paired-end")) head(resMFType) ## ----rlogAndVST---------------------------------------------------------- rld <- rlog(dds, blind=FALSE) vsd <- varianceStabilizingTransformation(dds, blind=FALSE) vsd.fast <- vst(dds, blind=FALSE) head(assay(rld), 3) ## ----vsd1, echo=FALSE, fig.width=4.5, fig.height=4.5, fig.show="asis", fig.small=TRUE, fig.pos="!bt", fig.cap="VST and log2. Graphs of the variance stabilizing transformation for sample 1, in blue, and of the transformation $f(n) = \\log_2(n/s_1)$, in black. $n$ are the counts and $s_1$ is the size factor for the first sample.\\label{figure/vsd1-1}"---- px <- counts(dds)[,1] / sizeFactors(dds)[1] ord <- order(px) ord <- ord[px[ord] < 150] ord <- ord[seq(1, length(ord), length=50)] last <- ord[length(ord)] vstcol <- c("blue", "black") matplot(px[ord], cbind(assay(vsd)[, 1], log2(px))[ord, ], type="l", lty=1, col=vstcol, xlab="n", ylab="f(n)") legend("bottomright", legend = c( expression("variance stabilizing transformation"), expression(log[2](n/s[1]))), fill=vstcol) ## ----meansd, fig.width=4, fig.height=3, fig.show="asis", fig.wide=TRUE, fig.pos="tb", out.width=".32\\linewidth", fig.cap="Per-gene standard deviation (taken across samples), against the rank of the mean. {\\bfhelvet(a)} for the shifted logarithm $\\log_2(n+1)$, the regularized log transformation {\\bfhelvet(b)} and the variance stabilizing transformation {\\bfhelvet(c)}.\\label{fig:meansd}", fig.subcap=""---- library("vsn") notAllZero <- (rowSums(counts(dds))>0) meanSdPlot(log2(counts(dds,normalized=TRUE)[notAllZero,] + 1)) meanSdPlot(assay(rld[notAllZero,])) meanSdPlot(assay(vsd[notAllZero,])) ## ----heatmap, dev="pdf", fig.width=5, fig.height=7----------------------- library("pheatmap") select <- order(rowMeans(counts(dds,normalized=TRUE)), decreasing=TRUE)[1:20] nt <- normTransform(dds) # defaults to log2(x+1) log2.norm.counts <- assay(nt)[select,] df <- as.data.frame(colData(dds)[,c("condition","type")]) pheatmap(log2.norm.counts, cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) ## ----sampleClust--------------------------------------------------------- sampleDists <- dist(t(assay(rld))) ## ----figHeatmapSamples, dev="pdf", fig.width=7, fig.height=7, fig.show="asis", fig.small=TRUE, fig.pos="tb", fig.cap="Sample-to-sample distances. Heatmap showing the Euclidean distances between the samples as calculated from the regularized log transformation.\\label{figure/figHeatmapSamples-1}"---- library("RColorBrewer") sampleDistMatrix <- as.matrix(sampleDists) rownames(sampleDistMatrix) <- paste(rld$condition, rld$type, sep="-") colnames(sampleDistMatrix) <- NULL colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(sampleDistMatrix, clustering_distance_rows=sampleDists, clustering_distance_cols=sampleDists, col=colors) ## ----figPCA, dev="pdf", fig.width=5, fig.height=3------------------------ plotPCA(rld, intgroup=c("condition", "type")) ## ----figPCA2, dev="pdf", fig.width=5, fig.height=3----------------------- data <- plotPCA(rld, intgroup=c("condition", "type"), returnData=TRUE) percentVar <- round(100 * attr(data, "percentVar")) ggplot(data, aes(PC1, PC2, color=condition, shape=type)) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + coord_fixed() ## ----WaldTest, eval=FALSE------------------------------------------------ ## dds <- estimateSizeFactors(dds) ## dds <- estimateDispersions(dds) ## dds <- nbinomWaldTest(dds) ## ----simpleContrast, eval=FALSE------------------------------------------ ## results(dds, contrast=c("condition","C","B")) ## ----combineFactors, eval=FALSE------------------------------------------ ## dds$group <- factor(paste0(dds$genotype, dds$condition)) ## design(dds) <- ~ group ## dds <- DESeq(dds) ## resultsNames(dds) ## results(dds, contrast=c("group", "IB", "IA")) ## ----interFig, dev="pdf", fig.width=4, fig.height=3, echo=FALSE, results="hide"---- npg <- 20 mu <- 2^c(8,10,9,11,10,12) cond <- rep(rep(c("A","B"),each=npg),3) geno <- rep(c("I","II","III"),each=2*npg) table(cond, geno) counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01) d <- data.frame(log2c=log2(counts+1), cond, geno) library(ggplot2) plotit <- function(d, title) { ggplot(d, aes(x=cond, y=log2c, group=geno)) + geom_jitter(size=1.5, position = position_jitter(width=.15)) + facet_wrap(~ geno) + stat_summary(fun.y=mean, geom="line", colour="red", size=0.8) + xlab("condition") + ylab("log2(counts+1)") + ggtitle(title) } plotit(d, "Gene 1") + ylim(7,13) lm(log2c ~ cond + geno + geno:cond, data=d) ## ----interFig2, dev="pdf", fig.width=4, fig.height=3, echo=FALSE, results="hide"---- mu[4] <- 2^12 mu[6] <- 2^8 counts <- rnbinom(6*npg, mu=rep(mu,each=npg), size=1/.01) d2 <- data.frame(log2c=log2(counts + 1), cond, geno) plotit(d2, "Gene 2") + ylim(7,13) lm(log2c ~ cond + geno + geno:cond, data=d2) ## ----simpleLRT, eval=FALSE----------------------------------------------- ## dds <- DESeq(dds, test="LRT", reduced=~1) ## res <- results(dds) ## ----simpleLRT2, eval=FALSE---------------------------------------------- ## dds <- DESeq(dds, test="LRT", reduced=~batch) ## res <- results(dds) ## ----boxplotCooks, fig.show="asis", fig.small=TRUE, fig.cap="Boxplot of Cook's distances. Here we can look to see if one sample has much higher Cook's distances than the other samples. In this case, the samples all have comparable range of Cook's distances.\\label{figure/boxplotCooks-1}"---- par(mar=c(8,5,2,2)) boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) ## ----dispFit, fig.show="asis", fig.small=TRUE, fig.cap="Dispersion plot. The dispersion estimate plot shows the gene-wise estimates (black), the fitted values (red), and the final maximum \\textit{a posteriori} estimates used in testing (blue).\\label{figure/dispFit-1}"---- plotDispEsts(dds) ## ----dispFitCustom------------------------------------------------------- ddsCustom <- dds useForMedian <- mcols(ddsCustom)$dispGeneEst > 1e-7 medianDisp <- median(mcols(ddsCustom)$dispGeneEst[useForMedian], na.rm=TRUE) dispersionFunction(ddsCustom) <- function(mu) medianDisp ddsCustom <- estimateDispersionsMAP(ddsCustom) ## ----filtByMean, dev="pdf", fig.show="asis", fig.small=TRUE, fig.cap="Independent filtering. The \\Rfunction{results} function maximizes the number of rejections (adjusted $p$ value less than a significance level), over the quantiles of a filter statistic (the mean of normalized counts). The threshold chosen (vertical line) is the lowest quantile of the filter for which the number of rejections is within 1 residual standard deviation to the peak of a curve fit to the number of rejections over the filter quantiles.\\label{figure/filtByMean-1}"---- metadata(res)$alpha metadata(res)$filterThreshold plot(metadata(res)$filterNumRej, type="b", ylab="number of rejections", xlab="quantiles of filter") lines(metadata(res)$lo.fit, col="red") abline(v=metadata(res)$filterTheta) ## ----noFilt-------------------------------------------------------------- resNoFilt <- results(dds, independentFiltering=FALSE) addmargins(table(filtering=(res$padj < .1), noFiltering=(resNoFilt$padj < .1))) ## ----ddsNoPrior---------------------------------------------------------- ddsNoPrior <- DESeq(dds, betaPrior=FALSE) ## ----lfcThresh, fig.show="asis", fig.cap='MA-plots of tests of log2 fold change with respect to a threshold value. Going left to right across rows, the tests are for \\Robject{altHypothesis = "greaterAbs"}, \\Robject{"lessAbs"}, \\Robject{"greater"}, and \\Robject{"less"}.\\label{figure/lfcThresh-1}'---- par(mfrow=c(2,2),mar=c(2,2,1,1)) yl <- c(-2.5,2.5) resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs") resLA <- results(ddsNoPrior, lfcThreshold=.5, altHypothesis="lessAbs") resG <- results(dds, lfcThreshold=.5, altHypothesis="greater") resL <- results(dds, lfcThreshold=.5, altHypothesis="less") plotMA(resGA, ylim=yl) abline(h=c(-.5,.5),col="dodgerblue",lwd=2) plotMA(resLA, ylim=yl) abline(h=c(-.5,.5),col="dodgerblue",lwd=2) plotMA(resG, ylim=yl) abline(h=.5,col="dodgerblue",lwd=2) plotMA(resL, ylim=yl) abline(h=-.5,col="dodgerblue",lwd=2) ## ----mcols--------------------------------------------------------------- mcols(dds,use.names=TRUE)[1:4,1:4] # here using substr() only for display purposes substr(names(mcols(dds)),1,10) mcols(mcols(dds), use.names=TRUE)[1:4,] ## ----muAndCooks---------------------------------------------------------- head(assays(dds)[["mu"]]) head(assays(dds)[["cooks"]]) ## ----dispersions--------------------------------------------------------- head(dispersions(dds)) # which is the same as head(mcols(dds)$dispersion) ## ----sizefactors--------------------------------------------------------- sizeFactors(dds) ## ----coef---------------------------------------------------------------- head(coef(dds)) ## ----betaPriorVar-------------------------------------------------------- attr(dds, "betaPriorVar") ## ----dispPriorVar-------------------------------------------------------- dispersionFunction(dds) attr(dispersionFunction(dds), "dispPriorVar") ## ----versionNum---------------------------------------------------------- metadata(dds)[["version"]] ## ----normFactors, eval=FALSE--------------------------------------------- ## normFactors <- normFactors / exp(rowMeans(log(normFactors))) ## normalizationFactors(dds) <- normFactors ## ----offsetTransform, eval=FALSE----------------------------------------- ## cqnOffset <- cqnObject$glm.offset ## cqnNormFactors <- exp(cqnOffset) ## EDASeqNormFactors <- exp(-1 * EDASeqOffset) ## ----lineardep, echo=FALSE----------------------------------------------- data.frame(batch=factor(c(1,1,2,2)), condition=factor(c("A","A","B","B"))) ## ----lineardep2, echo=FALSE---------------------------------------------- data.frame(batch=factor(c(1,1,1,1,2,2)), condition=factor(c("A","A","B","B","C","C"))) ## ----lineardep3, echo=FALSE---------------------------------------------- data.frame(batch=factor(c(1,1,1,2,2,2)), condition=factor(c("A","B","C","A","B","C"))) ## ----groupeffect--------------------------------------------------------- (coldata <- data.frame(grp=factor(rep(c("X","Y"),each=4)), ind=factor(rep(1:4,each=2)), cnd=factor(rep(c("A","B"),4)))) ## ----groupeffect2-------------------------------------------------------- coldata$ind.n <- factor(rep(rep(1:2,each=2),2)) coldata ## ----groupeffect3-------------------------------------------------------- model.matrix(~ grp + grp:ind.n + grp:cnd, coldata) ## ----groupeffect4, eval=FALSE-------------------------------------------- ## results(dds, contrast=list("grpY.cndB","grpX.cndB")) ## ----missingcombo-------------------------------------------------------- group <- factor(rep(1:3,each=6)) condition <- factor(rep(rep(c("A","B","C"),each=2),3)) (d <- data.frame(group, condition)[-c(17,18),]) ## ----missingcombo2------------------------------------------------------- m1 <- model.matrix(~ condition*group, d) colnames(m1) unname(m1) ## ----missingcombo3------------------------------------------------------- m1 <- m1[,-9] unname(m1) ## ----cooksPlot, fig.show="asis", fig.small=TRUE, fig.cap="Cook's distance. Plot of the maximum Cook's distance per gene over the rank of the Wald statistics for the condition. The two regions with small Cook's distances are genes with a single count in one sample. The horizontal line is the default cutoff used for 7 samples and 3 estimated parameters.\\label{figure/cooksPlot-1}"---- W <- res$stat maxCooks <- apply(assays(dds)[["cooks"]],1,max) idx <- !is.na(W) plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic", ylab="maximum Cook's distance per gene", ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3)) m <- ncol(dds) p <- 3 abline(h=qf(.99, p, m - p)) ## ----indFilt, fig.show="asis", fig.small=TRUE, fig.cap="Mean counts as a filter statistic. The mean of normalized counts provides an independent statistic for filtering the tests. It is independent because the information about the variables in the design formula is not used. By filtering out genes which fall on the left side of the plot, the majority of the low $p$ values are kept.\\label{figure/indFilt-1}"---- plot(res$baseMean+1, -log10(res$pvalue), log="x", xlab="mean of normalized counts", ylab=expression(-log[10](pvalue)), ylim=c(0,30), cex=.4, col=rgb(0,0,0,.3)) ## ----histindepfilt, dev="pdf", fig.width=7, fig.height=5----------------- use <- res$baseMean > metadata(res)$filterThreshold h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE) h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE) colori <- c(`do not pass`="khaki", `pass`="powderblue") ## ----fighistindepfilt, fig.show="asis", fig.small=TRUE, fig.cap="Histogram of p values for all tests. The area shaded in blue indicates the subset of those that pass the filtering, the area in khaki those that do not pass.\\label{figure/fighistindepfilt-1}"---- barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "", ylab="frequency") text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA) legend("topright", fill=rev(colori), legend=rev(names(colori))) ## ----vanillaDESeq, eval=FALSE-------------------------------------------- ## dds <- DESeq(dds, minReplicatesForReplace=Inf) ## res <- results(dds, cooksCutoff=FALSE, independentFiltering=FALSE) ## ----varGroup, echo=FALSE, fig.width=5, fig.height=5, fig.show="asis", fig.small=TRUE, fig.cap="Extreme range of within-group variability. Typically, it is recommended to run \\Rfunction{DESeq} across samples from all groups, for datasets with multiple groups. However, this simulated dataset shows a case where it would be preferable to compare groups A and B by creating a smaller dataset without the C samples. Group C has much higher within-group variability, which would inflate the per-gene dispersion estimate for groups A and B as well.\\label{figure/varGroup-1}"---- set.seed(3) dds1 <- makeExampleDESeqDataSet(n=1000,m=12,betaSD=.3,dispMeanRel=function(x) 0.01) dds2 <- makeExampleDESeqDataSet(n=1000,m=12, betaSD=.3, interceptMean=mcols(dds1)$trueIntercept, interceptSD=0, dispMeanRel=function(x) 0.2) dds2 <- dds2[,7:12] dds2$condition <- rep("C",6) mcols(dds2) <- NULL dds <- cbind(dds1, dds2) rld <- rlog(dds, blind=FALSE, fitType="mean") plotPCA(rld) ## ----overShrink, echo=FALSE, fig.width=5, fig.height=5, fig.show="asis", fig.small=TRUE, fig.cap="Example of a dataset with where the log fold change prior should be turned off. Here we show a simulated MA-plot, where nearly all of the log fold changes are falling near the x-axis, with three genes that have very large log fold changes (note the y-axis is from -10 to 10 on the log2 scale). This would indicate a dataset where the log fold change prior would ``overshrink'' the large fold changes, and so \\Robject{betaPrior=FALSE} should be used.\\label{figure/overShrink-1}"---- plot(c(10^rnorm(1000, 3, 2),300,2000,5000), c(rnorm(1000, 0, .15), -5.5, -8.5, 7.5), ylim=c(-10,10), log="x", cex=.4, xlab="mean of normalized counts", ylab="log2 fold change") abline(h=0, col=rgb(1,0,0,.7)) ## ----convertNA, eval=FALSE----------------------------------------------- ## res$padj <- ifelse(is.na(res$padj), 1, res$padj) ## ----sessInfo, results="asis", echo=FALSE-------------------------------- toLatex(sessionInfo()) ## ----resetOptions, results="hide", echo=FALSE---------------------------- options(prompt="> ", continue="+ ")