## ----style, echo=FALSE, results='hide', message=FALSE, cache=FALSE---------
library(BiocStyle)
library(knitr)
opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, cache=TRUE)
opts_chunk$set(fig.asp=1)

## ---- cache=FALSE, echo=FALSE, results="hide"------------------------------
simpleSingleCell:::.compile("reads") # 416B
simpleSingleCell:::.compile("batch") # pancreas

## --------------------------------------------------------------------------
library(SingleCellExperiment)
sce.pancreas <- readRDS("pancreas_data.rds") 

# Same code as in pancreas MNN correction workflow.
library(scran)
m.out <- findMarkers(sce.pancreas, sce.pancreas$Cluster, 
    block=sce.pancreas$batch, direction="up", assay.type="original") 
demo <- m.out[["1"]] 
as.data.frame(demo[demo$Top <= 5,1:3])

## --------------------------------------------------------------------------
# Setting up the design matrix (we remove intercept for full rank
# in the final design matrix with the cluster-specific terms).
design <- model.matrix(~sce.pancreas$batch)
design <- design[,-1,drop=FALSE]

m.alt <- findMarkers(sce.pancreas, sce.pancreas$Cluster, 
    design=design, direction="up", assay.type="original")
demo <- m.alt[["1"]]
as.data.frame(demo[demo$Top <= 5,1:3])

## --------------------------------------------------------------------------
sce.416b <- readRDS("416B_data.rds")
o.out <- overlapExprs(sce.416b, group=sce.416b$cluster, block=sce.416b$Plate)
head(o.out[["1"]]) # top DEGs for cluster 1 against the others.

## ----viol-de-wilcox, fig.cap="Distribution of log-normalized expression values for the top 10 DE genes involving cluster 1 with the Wilcoxon rank sum test, stratified by cluster assignment and coloured by the plate of origin for each cell."----
library(scater)
plotExpression(sce.416b, x="cluster", colour_by="Plate",
    features=head(rownames(o.out[[1]])))

## --------------------------------------------------------------------------
library(limma)
design <- model.matrix(~0 + cluster + Plate, data=colData(sce.416b))
colnames(design)

keep <- calcAverage(sce.416b) > 1 # filter to remove very low-abundance genes.
summary(keep)

y <- convertTo(sce.416b, subset.row=keep)
v <- voom(y, design)
fit <- lmFit(v, design)

## --------------------------------------------------------------------------
clust.terms <- head(colnames(design), length(unique(sce.416b$cluster)))
all.results <- all.pairs <- list()
counter <- 1L

for (x in seq_along(clust.terms)) {
    for (y in seq_len(x-1L)) {
        con <- integer(ncol(design))
        con[x] <- 1
        con[y] <- -1
        fit2 <- contrasts.fit(fit, con)
        fit2 <- treat(fit2, robust=TRUE, lfc=0.5)

        res <- topTreat(fit2, n=Inf, sort.by="none")
        all.results[[counter]] <- res
        all.pairs[[counter]] <- c(clust.terms[x], clust.terms[y])
        counter <- counter+1L

        # Also filling the reverse comparison.
        res$logFC <- -res$logFC
        all.results[[counter]] <- res
        all.pairs[[counter]] <- c(clust.terms[y], clust.terms[x])
        counter <- counter+1L
    }
}

## --------------------------------------------------------------------------
all.pairs <- do.call(rbind, all.pairs)
combined <- combineMarkers(all.results, all.pairs, pval.field="P.Value")
as.data.frame(head(combined[["cluster1"]][,1:3]))

## ----pval-dist, fig.cap="Distribution of $p$-values from a DE analysis between two clusters in a simulation with no true subpopulation structure."----
set.seed(0)
y <- matrix(rnorm(100000), ncol=200)
clusters <- kmeans(t(y), centers=2)$cluster
out <- findMarkers(y, clusters)
hist(out[[1]]$p.value, col="grey80", xlab="p-value")

## --------------------------------------------------------------------------
combos <- with(colData(sce.416b), paste(cluster, Plate, sep="."))
(num <- table(combos))

## --------------------------------------------------------------------------
library(scater)
summed <- sumCountsAcrossCells(counts(sce.416b), combos)
head(summed)

## --------------------------------------------------------------------------
library(edgeR)
y <- DGEList(summed)
y <- y[aveLogCPM(y) > 1 & !isSpike(sce.416b),]
y <- calcNormFactors(y)

## --------------------------------------------------------------------------
sum.terms <- strsplit(colnames(y), split="\\.")
sum.clust <- unlist(lapply(sum.terms, "[[", i=1))
sum.plate <- unlist(lapply(sum.terms, "[[", i=2))
design <- model.matrix(~0 + sum.clust + sum.plate)

## --------------------------------------------------------------------------
y <- estimateDisp(y, design)
summary(y$trended.dispersion)    
fit <- glmQLFit(y, design, robust=TRUE)
summary(fit$df.prior)    

## --------------------------------------------------------------------------
res <- glmQLFTest(fit, contrast=c(1, -1, 0, 0, 0, 0))
summary(decideTests(res))
(top <- topTags(res))

## ----violde, fig.height=10, fig.asp=1.5, fig.cap="Distribution of log-normalized expression values for the top 10 DE genes between clusters 1 and 2 from the summation strategy, stratified by cluster assignment and coloured by the plate of origin for each cell."----
sub.sce <- sce.416b[,sce.416b$cluster %in% c("1", "2")]
plotExpression(sub.sce, x="cluster", colour_by="Plate",
    features=rownames(top)[1:10])