################################################### ### chunk number 1: libraries ################################################### library(lumi) library(vsn) library(limma) library(lumiBarnes) set.seed(0xbadbeef) ################################################### ### chunk number 2: load data ################################################### ## Load the Barnes data set data("lumiBarnes") ## Only selected the dilution samples selChip = !is.na(lumiBarnes$pctBlood) x.lumi = lumiBarnes[, selChip] ## summary of the data x.lumi ################################################### ### chunk number 3: background removal ################################################### ## Since the Barnes data was not background removed, we will do background adjustment first. ## The background estimation will be based on the control probe information. x.lumi = lumiB(x.lumi, method='bgAdjust', probs=0) ################################################### ### chunk number 4: scatterPlotTechReplicate ################################################### selPair <- exprs(x.lumi[,c(1,7)]) vstSelPair <- exprs(lumiT(x.lumi[, c(1,7)])) ## randomly select 10000 probes for fast plotting selInd <- sample(1:nrow(selPair), 10000) oldpar <- par(mfrow=c(2,2)) plot(selPair[selInd,1], selPair[selInd,2], main='At raw scale') plot(log2(selPair[selInd,1]), log2(selPair[selInd,2]), main='After Log2 transform') plot(log2(selPair[selInd,1] + 100), log2(selPair[selInd,2] + 100), main='After Log2 transform with 100 offset') plot(vstSelPair[selInd,1], vstSelPair[selInd,2], main='After VST transform') par(oldpar) ################################################### ### chunk number 5: quantile normalize simulation ################################################### # Demonstrate the limitation of quantile noramlization # Let us simulate three arrays in a matrix: each row is a gene; each column is an array. # Say, they are biological replicates, similar to each other, # but still have critical differences. The biological variation is # necessary to keep. # x <- as.matrix(data.frame( x1= seq (1, 10000) + rnorm (10000,0,0.3), x2= seq (1, 10000) + rnorm (10000,0,0.3), x3= seq (1, 10000) + rnorm (10000,0,0.3) )) print(x[1:5,]) # # Now, after quantile normalization, it seems that some of the # variations are gone. x1 <- normalize.quantiles(x) print(x1[1:5,]) # # To confirm this, we take a look at the variance of all genes, # before and after normalization. v <- apply(x, 1, var) summary(v) v1 <- apply(x1, 1, var) summary(v1) ################################################### ### chunk number 6: RSN normalize simulation ################################################### x.lumiN <- lumiN(x) print(x.lumiN[1:5,]) v.lumiN <- apply(x.lumiN, 1, var) summary(v.lumiN) ################################################### ### chunk number 7: Filtering Probes ################################################### ## Estimate the detection count of lumiBarnes data presentCount <- detectionCall(x.lumi) ## If using old version (before 1.3.2) lumiBarnes library # presentCount <- detectionCall(x.lumi, 0.99) dataMatrix <- exprs(x.lumi) # ## check the correlation between 6 pairs of technique replicates cc.bad <- apply(dataMatrix[presentCount == 0,], 1, function(x) cor(x[1:6], x[7:12])) cc.good <- apply(dataMatrix[presentCount == ncol(dataMatrix),], 1, function(x) cor(x[1:6], x[7:12])) ################################################### ### chunk number 8: figFiltering ################################################### oldpar <- par(mfrow=c(1,2)) hist(cc.bad, xlab='Correlation coefficients between technique replicates', main='Histogram of probes with no sample "Present"') hist(cc.good, xlab='Correlation coefficients between technique replicates', main='Histogram of probes with all samples "Present"') par(oldpar) ################################################### ### chunk number 9: nuID conversion ################################################### ## examples of nuID nuID <- featureNames(x.lumi)[1] print(nuID) ## convert to nucleotide sequence probeSeq <- id2seq(nuID) print(probeSeq) ## convert back to nuIDs nuID.new <- sapply(probeSeq, seq2id) print(nuID.new) ## check a random sequence is.nuID(nuID) ## check a random sequence is.nuID('adfqeqe') ################################################### ### chunk number 10: Preprocessing the data for evaluation ################################################### ## load the library library("lumi") library("vsn") library("limma") library("lumiBarnes") set.seed(0xbadbeef) # ## Load the Barnes data set data("lumiBarnes") selChip = !is.na(lumiBarnes$pctBlood) x.lumi = lumiBarnes[, selChip] # # Background removal x.lumi = lumiB(x.lumi, method='bgAdjust', probs=0) # ## VST transform x.lumi.vst <- lumiT(x.lumi) # ## Quantile normalization x.lumi.vst.quantile <- lumiN(x.lumi.vst, method='quantile') # ## We can also use lumiExpresso to combine multiple preprocessing steps ## log2 transform and Quantile normalization x.lumi.log.quantile <- lumiExpresso(x.lumi, varianceStabilize.param=list(method='log2'), normalize.param=list(method='quantile')) # ## VSN normalization: use lts.quantile=0.5 since in the blood/placenta ## comparison more genes are differentially expressed than what is ## expected by the default of 0.9. x.lumi.vsn <- lumiExpresso(x.lumi, variance.stabilize=FALSE, normalize.param=list(method='vsn', lts.quantile=0.5)) # ## combine them as a list normDataList <- list('VST-Quantile'=exprs(x.lumi.vst.quantile), 'Log2-Quantile'=exprs(x.lumi.log.quantile), 'VSN'=exprs(x.lumi.vsn)) ################################################### ### chunk number 11: chip correlation after processing ################################################### ## replicate index repl1 <- 1:6; repl2 <- 7:12 ## Check the correlation between technique replicates chipCorList = matrix(as.numeric(NA), nrow=length(repl1), ncol=length(normDataList)) colnames(chipCorList)= names(normDataList) for (i in seq(along=normDataList)) for (j in seq(along=repl1)) chipCorList[j,i] = cor(normDataList[[i]][, c(repl1[j], repl2[j])])[1,2] ################################################### ### chunk number 12: boxplotChipCorrelation ################################################### labels <- colnames(chipCorList) ## set the margin of the plot mar <- c(max(nchar(labels))/2 + 4.5, 5, 5, 3) oldpar = par(xaxt='n', mar=mar) boxplot(chipCorList ~ col(chipCorList), xlab='', ylab='Correlation between technique replicate chips', col='skyblue') par(xaxt='s') axis(1, at=1:ncol(chipCorList), labels=labels, tick=TRUE, las=2) par(oldpar) ################################################### ### chunk number 13: figMeanSd ################################################### ## select the technique replicates selChip <- c(repl1[1],repl2[1]) oldpar <- par(mfrow=c(length(normDataList),1)) for (i in 1:length(normDataList)) { meanSdPlot(normDataList[[i]][, selChip], ylab='Standard deviation', main=names(normDataList)[i], ylim=c(0,1)) } par(oldpar) ################################################### ### chunk number 14: differential gene identification using limma ################################################### sampleInfo <- pData(phenoData(x.lumi)) sampleType <- paste(sampleInfo[,'pctBlood'], sampleInfo[,'pctPlacenta'], sep=':') sampleType <- paste('c', sampleType, sep='') presentCount <- detectionCall(x.lumi) ## If using old version (before 1.3.2) lumiBarnes library # presentCount <- detectionCall(x.lumi, 0.99) ## Comparing index compareInd <- c(repl1[1:2], repl2[1:2]) # compareInd <- c(1,6,9,14) ## additional example compareType <- sampleType[compareInd] fitList <- NULL for (i in 1:length(normDataList)) { selDataMatrix <- normDataList[[i]] selDataMatrix <- selDataMatrix[presentCount > 0, ] selProbe <- rownames(selDataMatrix) compareMatrix <- selDataMatrix[, compareInd] design <- model.matrix(~ 0 + as.factor(compareType)) colnames(design) <- c('A', 'B') fit1 <- lmFit(compareMatrix, design) contMatrix <- makeContrasts('A-B'=A - B, levels=design) fit2 <- contrasts.fit(fit1, contMatrix) fit <- eBayes(fit2) fitList <- c(fitList, list(fit)) } names(fitList) <- names(normDataList) ################################################### ### chunk number 15: correlation of most significant genes ################################################### ## Check the correlation of the top differentiated genes based on the limma results ## rank the genes based on the p-values of limma result topNumList <- c(30, seq(35, 500, by=30)) corTh <- 0.8 corrList <- NULL highCorrNumMatrix <- NULL for (i in 1:length(fitList)) { probeList <- rownames(fitList[[i]]$p.value) fc.i <- fitList[[i]]$coef[,1] ordProbe.i <- probeList[order(abs(fitList[[i]]$p.value[,1]), decreasing=F)] selDataMatrix <- normDataList[[i]][ordProbe.i, ] modelProfile1 <- c(100, 95, 75, 50, 25, 0, 100, 95, 75, 50, 25, 0) profileMatrix <- selDataMatrix corr1 <- apply(profileMatrix, 1, cor, y=modelProfile1) names(corr1) <- ordProbe.i matchNum.j <- NULL for (topNum.j in topNumList) { topProbe.j <- ordProbe.i[1:topNum.j] matchNum.j <- c(matchNum.j, length(which(abs(corr1[topProbe.j]) > corTh))) } highCorrNumMatrix <- cbind(highCorrNumMatrix, matchNum.j) corrList <- c(corrList, list(c(list(corr1)))) } rownames(highCorrNumMatrix) <- topNumList colnames(highCorrNumMatrix) <- names(corrList) <- names(fitList) ################################################### ### chunk number 16: figCorrelationConcordance ################################################### pcol <- seq(along= normDataList) plty <- c(1,2,4) # seq(along= normDataList) plwd <- 1.5 matplot(topNumList, (100 * highCorrNumMatrix/(topNumList %*% t(rep(1,ncol(highCorrNumMatrix))))), type='l', xlab='Number of genes selected as differentially expressed', ylab='Expression and dilution profiles concordance (%)', lty=plty, col=pcol, lwd=plwd, ylim=c(0,100)) legend(x=350, y=50, legend=colnames(highCorrNumMatrix), lty=plty, col=pcol, lwd=plwd) ################################################### ### chunk number 17: sessionInfo ################################################### toLatex(sessionInfo())