## ----style, echo=FALSE, results='asis'-----------------------------------
BiocStyle::latex()


## ----options, include=FALSE----------------------------------------------
options(digits=3, width=80)
opts_chunk$set(echo=TRUE,tidy=FALSE,include=TRUE,
               fig.path='DESeq2_parathyroid-',fig.show='hide',dev='png')


## ----libraries-----------------------------------------------------------
library( "DESeq2" )
library( "parathyroidSE" )


## ----loadEcs-------------------------------------------------------------
data( "parathyroidGenesSE" )


## ----countTable----------------------------------------------------------
head( assay( parathyroidGenesSE ) )


## ----nrowSE--------------------------------------------------------------
nrow(parathyroidGenesSE)


## ----colData-------------------------------------------------------------
colData( parathyroidGenesSE )


## ----rowData-------------------------------------------------------------
rowData( parathyroidGenesSE )


## ----columnData----------------------------------------------------------
as.data.frame( colData(parathyroidGenesSE)[,c("sample","patient","treatment","time")] )


## ----poormanssum---------------------------------------------------------
a <- assay(parathyroidGenesSE)
countdata0 <- cbind( a[,1:8], a[,9]+a[,10], a[,11], 
   a[,12]+a[,13], a[,14:22], a[,23]+a[,24], a[,25], a[,26]+a[,27] )


## ----split---------------------------------------------------------------
allColSamples <- colData(parathyroidGenesSE)$sample
sp <- split( seq(along=allColSamples), allColSamples )


## ----addSamples----------------------------------------------------------
countdata <- sapply(sp, function(columns) 
   rowSums( assay(parathyroidGenesSE)[,columns,drop=FALSE] ) )
head(countdata)
all( countdata == countdata0 )


## ----subsetMetaData------------------------------------------------------
coldata <- colData(parathyroidGenesSE)[sapply(sp, `[`, 1),]
rownames(coldata) <- coldata$sample
coldata


## ----colDataSubsetCols---------------------------------------------------
coldata <- coldata[ , c( "patient", "treatment", "time" ) ]
head( coldata )


## ----rowdata-------------------------------------------------------------
rowdata <- rowData(parathyroidGenesSE)
rowdata


## ----makeddsfull---------------------------------------------------------
ddsFull <- DESeqDataSetFromMatrix(
  countData = countdata,
  colData = coldata,
  design = ~ patient + treatment,
  rowData = rowdata)
ddsFull  


## ----testCountSum, echo=FALSE--------------------------------------------
stopifnot(sum(assay(parathyroidGenesSE)) == sum(counts(ddsFull)))


## ----subset--------------------------------------------------------------
dds <- ddsFull[ , colData(ddsFull)$treatment %in% c("Control","DPN") & 
                  colData(ddsFull)$time == "48h" ]


## ----refactor------------------------------------------------------------
dds$patient <- factor(dds$patient)
dds$treatment <- factor(dds$treatment)


## ----relevel-------------------------------------------------------------
dds$treatment <- relevel( dds$treatment, "Control" )


## ----removeZeros---------------------------------------------------------
dds <- dds[rowSums(counts(dds)) > 0,]


## ----multifactorColData--------------------------------------------------
colData(dds)


## ----runDESeq, cache=TRUE------------------------------------------------
dds <- DESeq(dds)


## ----extractResults------------------------------------------------------
res <- results(dds)
head(res)


## ----resCols-------------------------------------------------------------
mcols(res)


## ----checkBaseMean-------------------------------------------------------
all.equal(res$baseMean, rowMeans(counts(dds)))
all.equal(res$baseMean, rowMeans(counts(dds,normalized=TRUE)))


## ----rawpvalue-----------------------------------------------------------
sum( res$pvalue < 0.05, na.rm=TRUE )


## ----anyNAs--------------------------------------------------------------
table( is.na(res$pvalue) )


## ----adjpvalue-----------------------------------------------------------
sum( res$padj < 0.1, na.rm=TRUE )


## ----strongGenes---------------------------------------------------------
resSig <- res[ which(res$padj < 0.1 ), ]
head( resSig[ order( resSig$log2FoldChange ), ] )


## ----strongGenesUp-------------------------------------------------------
tail( resSig[ order( resSig$log2FoldChange ), ] )


## ----proportionSigGenes--------------------------------------------------
table(sign(resSig$log2FoldChange))


## ----indepFilt-----------------------------------------------------------
plot(attr(res,"filterNumRej"),type="b",
        ylab="number of rejections")


## ----noFilt--------------------------------------------------------------
sum(res$padj < 0.1, na.rm=TRUE)
resNoFilt <- results(dds, independentFiltering=FALSE)
sum(resNoFilt$padj < 0.1, na.rm=TRUE)


## ----plotMA1-------------------------------------------------------------
plotMA(dds, ylim = c( -1.5, 1.5 ) )


## ----plotMApadjchange,results='hide'-------------------------------------
plotMA(dds, pvalues=resNoFilt$padj, ylim = c( -1.5, 1.5) )


## ----dispPlot------------------------------------------------------------
plotDispEsts( dds )


## ----pvalHist------------------------------------------------------------
hist( res$pvalue, breaks=100     )


## ----loadOrg-------------------------------------------------------------
library( "org.Hs.eg.db" )


## ----keyType-------------------------------------------------------------
columns(org.Hs.eg.db)


## ----convertIDs----------------------------------------------------------
convertIDs <- function( ids, fromKey, toKey, db, ifMultiple=c( "putNA", "useFirst" ) ) {
   stopifnot( inherits( db, "AnnotationDb" ) )
   ifMultiple <- match.arg( ifMultiple )
   suppressWarnings( selRes <- AnnotationDbi::select( 
      db, keys=ids, keytype=fromKey, cols=c(fromKey,toKey) ) )
   if( ifMultiple == "putNA" ) {
      duplicatedIds <- selRes[ duplicated( selRes[,1] ), 1 ]   
      selRes <- selRes[ ! selRes[,1] %in% duplicatedIds, ] }
   return( selRes[ match( ids, selRes[,1] ), 2 ] )
}


## ----addSymbols----------------------------------------------------------
res$symbol <- convertIDs( row.names(res), "ENSEMBL", "SYMBOL", org.Hs.eg.db )
head(res)


## ----writeCSV------------------------------------------------------------
write.csv( as.data.frame(res), file="results.csv" )


## ----loadReactome--------------------------------------------------------
library( "reactome.db" )


## ----addEntrez-----------------------------------------------------------
res$entrez <- convertIDs( row.names(res), "ENSEMBL", "ENTREZID", org.Hs.eg.db )


## ----subsetTores2--------------------------------------------------------
res2 <- res[ res$entrez %in% keys( reactome.db, "ENTREZID" ) & 
   !is.na( res$pvalue) , ]
head(res2)


## ----queryReactome-------------------------------------------------------
suppressWarnings( reactomeTable <- AnnotationDbi::select( reactome.db, keys=res2$entrez, 
   keytype="ENTREZID", columns=c("ENTREZID","REACTOMEID") ) )
head(reactomeTable)


## ----CreateIncidenceMatrix-----------------------------------------------
incm <- do.call( rbind, with(reactomeTable, tapply( 
   ENTREZID, factor(REACTOMEID), function(x) res2$entrez %in% x ) ))
colnames(incm) <- res2$entrez

str(incm)


## ----PruneIncm-----------------------------------------------------------
incm <- incm[ rowSums(incm) >= 5, ]


## ----testFun-------------------------------------------------------------
testCategory <- function( reactomeID ) {
  isMember <- incm[ reactomeID, ]
  data.frame( 
     reactomeID = reactomeID,
     numGenes = sum( isMember ),
     avgLFC = mean( res2$log2FoldChange[isMember] ),
     strength = sum( res2$log2FoldChange[isMember] ) / sqrt(sum(isMember)),
     pvalue = t.test( res2$log2FoldChange[ isMember ] )$p.value,
     reactomeName = reactomePATHID2NAME[[reactomeID]] ) }


## ----testTestFun---------------------------------------------------------
testCategory("109581")


## ----runGSEA,cache=TRUE--------------------------------------------------
reactomeResult <- do.call( rbind, lapply( rownames(incm), testCategory ) )


## ----padjGSEA------------------------------------------------------------
reactomeResult$padjust <- p.adjust( reactomeResult$pvalue, "BH" )


## ----GSEAresult----------------------------------------------------------
reactomeResultSignif <- reactomeResult[ reactomeResult$padjust < 0.05, ]
reactomeResultSignif[ order(-abs(reactomeResultSignif$strength)), ]


## ----topDEGene-----------------------------------------------------------
deGeneID <- "ENSG00000099194"
res[deGeneID,]
deGene <- range(rowData(dds[deGeneID,])[[1]])
names(deGene) <- deGeneID
deGene


## ----defineRange---------------------------------------------------------
as.character(seqnames(deGene))
ucscChrom <- paste0("chr",as.character(seqnames(deGene)))
ucscRanges <- ranges(flank(deGene,width=10e6,both=TRUE))
subsetRange <- GRanges(ucscChrom, ucscRanges)
subsetRange


## ----downloadERBS, eval=FALSE--------------------------------------------
## ##
## ## Please do not run this code if you do not have an internet connection,
## ## alternatively use the local file import in the next code chunk.
## ##
## 
## # library( "rtracklayer" )
## # trackName <- "wgEncodeRegTfbsClusteredV2"
## # tableName <- "wgEncodeRegTfbsClusteredV2"
## # trFactor <- "ERalpha_a"
## # mySession <- browserSession()
## # ucscTable <- getTable(ucscTableQuery(mySession, track=trackName,
## #                                      range=subsetRange, table=tableName,
## #                                      name=trFactor))


## ----localERBS-----------------------------------------------------------
ucscTableFile <- "/share/home/teacher/EMBO_HTS/DESeq_lab/localUcscTable.csv"
ucscTable <- read.csv(ucscTableFile, stringsAsFactors=FALSE)


## ----ERBS2Peaks----------------------------------------------------------
peaks <- with(ucscTable, GRanges(chrom, IRanges(chromStart, chromEnd), 
                                 score=score))
seqlevels(peaks) <- gsub("chr(.+)","\\1",seqlevels(peaks))
seqlevels(peaks) <- seqlevels(deGene)


## ----nearestPeakDo, cache=TRUE, echo=FALSE-------------------------------
# suppress warning which exists for BioC 2.12 of changes to distance()
suppressWarnings({d2nearest <- distanceToNearest(deGene, peaks)})


## ----nearestPeakShow, eval=FALSE-----------------------------------------
## d2nearest <- distanceToNearest(deGene, peaks)


## ----distanceShow, eval=FALSE--------------------------------------------
## distance(deGene, peaks)


## ----distanceDo, echo=FALSE----------------------------------------------
suppressWarnings({distance(deGene, peaks)})


## ----distanceToNearest---------------------------------------------------
d2nearest


## ----nearestPeakOut------------------------------------------------------
deGene
peaks[subjectHits(d2nearest)]


## ----plotPeaksAndGene, fig.width=6, fig.height=4-------------------------
plotRange <- start(deGene) + 1e6 * c(-1,1)
peakNearest <- ( seq_along(peaks) == subjectHits(d2nearest) )
plot(x=start(peaks), y=ifelse(peakNearest,.3,.2),
     ylim=c(0,1), xlim=plotRange, pch='p',
     col=ifelse(peakNearest,"red","grey60"),
     yaxt="n", ylab="",
     xlab=paste("2 Mb on chromosome",as.character(seqnames(deGene))))
points(x=start(deGene),y=.8,pch='g')


## ----calculateSumForCaption, echo=FALSE----------------------------------
plotGRange <- GRanges(seqnames(peaks[1]),IRanges(plotRange[1],plotRange[2]))
numPeaksInPlotRange <- sum(peaks %over% plotGRange)


## ----peakDists-----------------------------------------------------------
  peakDists <- diff(sort(start(peaks)))
  summary(peakDists)
  mean(peakDists)


## ----rld, cache=TRUE-----------------------------------------------------
rld <- rlogTransformation(dds)
head( assay(rld) )


## ----rldPlot,fig.width=10,fig.height=5-----------------------------------
par( mfrow = c( 1, 2 ) )
plot( log2( 1+counts(dds, normalized=TRUE)[, 1:2] ), col="#00000020", pch=20, cex=0.3 )
plot( assay(rld)[, 1:2], col="#00000020", pch=20, cex=0.3 )


## ----euclDist------------------------------------------------------------
sampleDists <- dist( t( assay(rld) ) )
sampleDists


## ----sampleDistHeatmap---------------------------------------------------
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( colData(rld)$treatment, 
   colData(rld)$patient, sep="-" )
   
library( "gplots" )   
heatmap.2( sampleDistMatrix, trace="none" )


## ----betterheatmap,results='hide'----------------------------------------
library("RColorBrewer")  
colours = colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
heatmap.2( sampleDistMatrix, trace="none", col=colours)


## ----samplePCA-----------------------------------------------------------
print( plotPCA( rld, intgroup = c( "patient", "treatment") ) )


## ----topVarGenes---------------------------------------------------------
library( "genefilter" )
topVarGenes <- head( order( rowVars( assay(rld) ), decreasing=TRUE ), 35 )


## ----geneHeatmap, fig.width=9, fig.height=9------------------------------
heatmap.2( assay(rld)[ topVarGenes, ], scale="row", 
     trace="none", dendrogram="column", 
     col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))


## ----sessInfo, echo=FALSE------------------------------------------------
sessionInfo()


## ----relocateAnswers, echo=FALSE-----------------------------------------
relocateAnswers = function(con="DESeq2_parathyroid.tex"){
  txt = readLines(con=con)
  i1 = grep("\\begin{answer}", txt, fixed=TRUE)
  i2 = grep("\\end{answer}",  txt, fixed=TRUE)
  if(!((length(i1)==length(i2))&&all(i2>i1))) stop("\\begin{answer}/\\end{answer} macros are unbalanced.")
  i = unlist(mapply(`:`, i1, i2))
  res = txt[-i]
  sol = txt[i]
  dest = grep("\\section{Solutions}", res, fixed=TRUE)
  if(length(dest)!=1) stop("\\section{Solutions} not found exactly once.")
  res = c(res[1:dest], sol, res[(dest+1):length(res)])
  writeLines(res, con=con)
}