###################################################
### chunk number 1: libraries
###################################################
library(lumi)
library(vsn)
library(limma)
library(lumiBarnes)
set.seed(0xbadbeef)


###################################################
### chunk number 2: load data
###################################################
## Load the Barnes data set
data("lumiBarnes")
## Only selected the dilution samples
selChip = !is.na(lumiBarnes$pctBlood)
x.lumi = lumiBarnes[, selChip]
## summary of the data
x.lumi


###################################################
### chunk number 3: background removal
###################################################
## Since the Barnes data was not background removed, we will do background adjustment first.
## The background estimation will be based on the control probe information. 
x.lumi = lumiB(x.lumi, method='bgAdjust', probs=0)


###################################################
### chunk number 4: scatterPlotTechReplicate
###################################################
selPair <- exprs(x.lumi[,c(1,7)])
vstSelPair <- exprs(lumiT(x.lumi[, c(1,7)]))
## randomly select 10000 probes for fast plotting
selInd <- sample(1:nrow(selPair), 10000)
oldpar <- par(mfrow=c(2,2))
plot(selPair[selInd,1], selPair[selInd,2], main='At raw scale')
plot(log2(selPair[selInd,1]), log2(selPair[selInd,2]), main='After Log2 transform')
plot(log2(selPair[selInd,1] + 100), log2(selPair[selInd,2] + 100), main='After Log2 transform with 100 offset')
plot(vstSelPair[selInd,1], vstSelPair[selInd,2], main='After VST transform')
par(oldpar)


###################################################
### chunk number 5: quantile normalize simulation
###################################################
# Demonstrate the limitation of quantile noramlization
# Let us simulate three arrays in a matrix: each row is a gene; each column is an array.
#  Say, they are biological replicates, similar to each other,
#  but still have critical differences. The biological variation is 
#  necessary to keep.
# 
 x <- as.matrix(data.frame(
     x1= seq (1, 10000) + rnorm (10000,0,0.3),
     x2= seq (1, 10000) + rnorm (10000,0,0.3),
     x3= seq (1, 10000) + rnorm (10000,0,0.3)
))
print(x[1:5,])
#
# Now, after quantile normalization, it seems that some of the
# variations are gone.
x1 <- normalize.quantiles(x)
print(x1[1:5,])
# 
# To confirm this, we take a look at the variance of all genes,
# before and after normalization.
v <- apply(x, 1, var)
summary(v)
v1 <- apply(x1, 1, var)
summary(v1)


###################################################
### chunk number 6: RSN normalize simulation
###################################################
x.lumiN <- lumiN(x)
print(x.lumiN[1:5,])
v.lumiN <- apply(x.lumiN, 1, var)
summary(v.lumiN)


###################################################
### chunk number 7: Filtering Probes
###################################################
## Estimate the detection count of lumiBarnes data
presentCount <- detectionCall(x.lumi)
## If using old version (before 1.3.2) lumiBarnes library
# presentCount <- detectionCall(x.lumi, 0.99)
dataMatrix <- exprs(x.lumi)
#
## check the correlation between 6 pairs of technique replicates
cc.bad <- apply(dataMatrix[presentCount == 0,], 1, function(x) cor(x[1:6], x[7:12]))
cc.good <- apply(dataMatrix[presentCount == ncol(dataMatrix),], 1, function(x) cor(x[1:6], x[7:12]))


###################################################
### chunk number 8: figFiltering
###################################################
oldpar <- par(mfrow=c(1,2))
hist(cc.bad, xlab='Correlation coefficients between technique replicates', main='Histogram of probes with no sample "Present"')
hist(cc.good, xlab='Correlation coefficients between technique replicates', main='Histogram of probes with all samples "Present"')
par(oldpar)


###################################################
### chunk number 9: nuID conversion
###################################################
## examples of nuID
nuID <- featureNames(x.lumi)[1]
print(nuID)

## convert to nucleotide sequence
probeSeq <- id2seq(nuID)
print(probeSeq)

## convert back to nuIDs
nuID.new <- sapply(probeSeq, seq2id)
print(nuID.new)

## check a random sequence
is.nuID(nuID)

## check a random sequence
is.nuID('adfqeqe')


###################################################
### chunk number 10: Preprocessing the data for evaluation
###################################################
## load the library
library("lumi")
library("vsn")
library("limma")
library("lumiBarnes")
set.seed(0xbadbeef)
#
## Load the Barnes data set
data("lumiBarnes")
selChip = !is.na(lumiBarnes$pctBlood)
x.lumi = lumiBarnes[, selChip]
#
# Background removal
x.lumi = lumiB(x.lumi, method='bgAdjust', probs=0)
#
## VST transform 
x.lumi.vst <- lumiT(x.lumi)
#
## Quantile normalization
x.lumi.vst.quantile <- lumiN(x.lumi.vst, method='quantile')
#
## We can also use lumiExpresso to combine multiple preprocessing steps
## log2 transform and Quantile normalization
x.lumi.log.quantile <- lumiExpresso(x.lumi, varianceStabilize.param=list(method='log2'), normalize.param=list(method='quantile'))
#
## VSN normalization: use lts.quantile=0.5 since in the blood/placenta
##  comparison more genes are differentially expressed than what is
##   expected by the default of 0.9.
x.lumi.vsn <- lumiExpresso(x.lumi, variance.stabilize=FALSE, normalize.param=list(method='vsn', lts.quantile=0.5))
#
## combine them as a list
normDataList <- list('VST-Quantile'=exprs(x.lumi.vst.quantile), 
                     'Log2-Quantile'=exprs(x.lumi.log.quantile),
	            'VSN'=exprs(x.lumi.vsn))


###################################################
### chunk number 11: chip correlation after processing
###################################################
## replicate index
repl1 <- 1:6; repl2 <- 7:12
## Check the correlation between technique replicates
chipCorList = matrix(as.numeric(NA), nrow=length(repl1), ncol=length(normDataList))
colnames(chipCorList)= names(normDataList)
for (i in seq(along=normDataList))
  for (j in seq(along=repl1))
    chipCorList[j,i] = cor(normDataList[[i]][, c(repl1[j], repl2[j])])[1,2]


###################################################
### chunk number 12: boxplotChipCorrelation
###################################################
labels <- colnames(chipCorList)
## set the margin of the plot
mar <- c(max(nchar(labels))/2 + 4.5, 5, 5, 3)
oldpar = par(xaxt='n', mar=mar)
boxplot(chipCorList ~ col(chipCorList),  xlab='', 
        ylab='Correlation between technique replicate chips',
        col='skyblue')
par(xaxt='s')
axis(1, at=1:ncol(chipCorList), labels=labels, tick=TRUE, las=2)
par(oldpar)


###################################################
### chunk number 13: figMeanSd
###################################################
## select the technique replicates
selChip <-  c(repl1[1],repl2[1]) 
oldpar <- par(mfrow=c(length(normDataList),1))
for (i in 1:length(normDataList)) {
  meanSdPlot(normDataList[[i]][, selChip], ylab='Standard deviation',
      main=names(normDataList)[i], ylim=c(0,1)) 
}
par(oldpar)


###################################################
### chunk number 14: differential gene identification using limma
###################################################
sampleInfo <- pData(phenoData(x.lumi))
sampleType <- paste(sampleInfo[,'pctBlood'], sampleInfo[,'pctPlacenta'], sep=':')
sampleType <- paste('c', sampleType, sep='')
presentCount <- detectionCall(x.lumi)
## If using old version (before 1.3.2) lumiBarnes library
# presentCount <- detectionCall(x.lumi, 0.99)
## Comparing index
compareInd <- c(repl1[1:2], repl2[1:2])  	
# compareInd <- c(1,6,9,14)	## additional example
compareType <- sampleType[compareInd]
fitList <- NULL
for (i in 1:length(normDataList)) {
	selDataMatrix <- normDataList[[i]]
	selDataMatrix <- selDataMatrix[presentCount > 0, ]
	selProbe <- rownames(selDataMatrix)
	compareMatrix <- selDataMatrix[, compareInd]
	
	design <- model.matrix(~ 0 + as.factor(compareType))
	colnames(design) <- c('A', 'B')
	fit1 <- lmFit(compareMatrix, design)
	contMatrix <- makeContrasts('A-B'=A - B, levels=design)
	fit2 <- contrasts.fit(fit1, contMatrix)
	fit <- eBayes(fit2)
	fitList <- c(fitList, list(fit))
}
names(fitList) <- names(normDataList)


###################################################
### chunk number 15: correlation of most significant genes
###################################################
## Check the correlation of the top differentiated genes based on the limma results 
## rank the genes based on the p-values of limma result
topNumList <- c(30, seq(35, 500, by=30))
corTh <- 0.8
corrList <- NULL
highCorrNumMatrix <- NULL
for (i in 1:length(fitList)) {
	probeList <- rownames(fitList[[i]]$p.value)
	fc.i <- fitList[[i]]$coef[,1]
	ordProbe.i <- probeList[order(abs(fitList[[i]]$p.value[,1]), decreasing=F)]

	selDataMatrix <- normDataList[[i]][ordProbe.i, ]

	modelProfile1 <- c(100, 95, 75, 50, 25, 0, 100, 95, 75, 50, 25, 0)
	profileMatrix <- selDataMatrix
	corr1 <- apply(profileMatrix, 1, cor, y=modelProfile1)
	names(corr1) <- ordProbe.i
	matchNum.j <- NULL
	for (topNum.j in topNumList) {
		topProbe.j <- ordProbe.i[1:topNum.j]
		matchNum.j <- c(matchNum.j, length(which(abs(corr1[topProbe.j]) > corTh)))
	}
	highCorrNumMatrix <- cbind(highCorrNumMatrix, matchNum.j)
	corrList <- c(corrList, list(c(list(corr1))))
}
rownames(highCorrNumMatrix) <- topNumList
colnames(highCorrNumMatrix) <- names(corrList) <- names(fitList)


###################################################
### chunk number 16: figCorrelationConcordance
###################################################
pcol <- seq(along= normDataList) 
plty <-  c(1,2,4)	  # seq(along= normDataList)
plwd <- 1.5
matplot(topNumList, (100 * highCorrNumMatrix/(topNumList %*% t(rep(1,ncol(highCorrNumMatrix))))),
 	type='l', xlab='Number of genes selected as differentially expressed', 
	ylab='Expression and dilution profiles concordance (%)', 
        lty=plty, col=pcol, lwd=plwd, ylim=c(0,100))
legend(x=350, y=50, legend=colnames(highCorrNumMatrix), lty=plty, col=pcol, lwd=plwd)


###################################################
### chunk number 17: sessionInfo
###################################################
toLatex(sessionInfo())