## ----style, echo=FALSE, results='hide', message=FALSE, cache=FALSE--------- library(BiocStyle) library(knitr) opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, cache=TRUE) opts_chunk$set(fig.asp=1) ## ---- cache=FALSE, echo=FALSE, results="hide"------------------------------ simpleSingleCell:::.compile("reads") # 416B simpleSingleCell:::.compile("batch") # pancreas ## -------------------------------------------------------------------------- library(SingleCellExperiment) sce.pancreas <- readRDS("pancreas_data.rds") # Same code as in pancreas MNN correction workflow. library(scran) m.out <- findMarkers(sce.pancreas, sce.pancreas$Cluster, block=sce.pancreas$batch, direction="up", assay.type="original") demo <- m.out[["1"]] as.data.frame(demo[demo$Top <= 5,1:3]) ## -------------------------------------------------------------------------- # Setting up the design matrix (we remove intercept for full rank # in the final design matrix with the cluster-specific terms). design <- model.matrix(~sce.pancreas$batch) design <- design[,-1,drop=FALSE] m.alt <- findMarkers(sce.pancreas, sce.pancreas$Cluster, design=design, direction="up", assay.type="original") demo <- m.alt[["1"]] as.data.frame(demo[demo$Top <= 5,1:3]) ## -------------------------------------------------------------------------- sce.416b <- readRDS("416B_data.rds") o.out <- overlapExprs(sce.416b, group=sce.416b$cluster, block=sce.416b$Plate) head(o.out[["1"]]) # top DEGs for cluster 1 against the others. ## ----viol-de-wilcox, fig.cap="Distribution of log-normalized expression values for the top 10 DE genes involving cluster 1 with the Wilcoxon rank sum test, stratified by cluster assignment and coloured by the plate of origin for each cell."---- library(scater) plotExpression(sce.416b, x="cluster", colour_by="Plate", features=head(rownames(o.out[[1]]))) ## -------------------------------------------------------------------------- library(limma) design <- model.matrix(~0 + cluster + Plate, data=colData(sce.416b)) colnames(design) keep <- calcAverage(sce.416b) > 1 # filter to remove very low-abundance genes. summary(keep) y <- convertTo(sce.416b, subset.row=keep) v <- voom(y, design) fit <- lmFit(v, design) ## -------------------------------------------------------------------------- clust.terms <- head(colnames(design), length(unique(sce.416b$cluster))) all.results <- all.pairs <- list() counter <- 1L for (x in seq_along(clust.terms)) { for (y in seq_len(x-1L)) { con <- integer(ncol(design)) con[x] <- 1 con[y] <- -1 fit2 <- contrasts.fit(fit, con) fit2 <- treat(fit2, robust=TRUE, lfc=0.5) res <- topTreat(fit2, n=Inf, sort.by="none") all.results[[counter]] <- res all.pairs[[counter]] <- c(clust.terms[x], clust.terms[y]) counter <- counter+1L # Also filling the reverse comparison. res$logFC <- -res$logFC all.results[[counter]] <- res all.pairs[[counter]] <- c(clust.terms[y], clust.terms[x]) counter <- counter+1L } } ## -------------------------------------------------------------------------- all.pairs <- do.call(rbind, all.pairs) combined <- combineMarkers(all.results, all.pairs, pval.field="P.Value") as.data.frame(head(combined[["cluster1"]][,1:3])) ## ----pval-dist, fig.cap="Distribution of $p$-values from a DE analysis between two clusters in a simulation with no true subpopulation structure."---- set.seed(0) y <- matrix(rnorm(100000), ncol=200) clusters <- kmeans(t(y), centers=2)$cluster out <- findMarkers(y, clusters) hist(out[[1]]$p.value, col="grey80", xlab="p-value") ## -------------------------------------------------------------------------- combos <- with(colData(sce.416b), paste(cluster, Plate, sep=".")) (num <- table(combos)) ## -------------------------------------------------------------------------- library(scater) summed <- sumCountsAcrossCells(counts(sce.416b), combos) head(summed) ## -------------------------------------------------------------------------- library(edgeR) y <- DGEList(summed) y <- y[aveLogCPM(y) > 1 & !isSpike(sce.416b),] y <- calcNormFactors(y) ## -------------------------------------------------------------------------- sum.terms <- strsplit(colnames(y), split="\\.") sum.clust <- unlist(lapply(sum.terms, "[[", i=1)) sum.plate <- unlist(lapply(sum.terms, "[[", i=2)) design <- model.matrix(~0 + sum.clust + sum.plate) ## -------------------------------------------------------------------------- y <- estimateDisp(y, design) summary(y$trended.dispersion) fit <- glmQLFit(y, design, robust=TRUE) summary(fit$df.prior) ## -------------------------------------------------------------------------- res <- glmQLFTest(fit, contrast=c(1, -1, 0, 0, 0, 0)) summary(decideTests(res)) (top <- topTags(res)) ## ----violde, fig.height=10, fig.asp=1.5, fig.cap="Distribution of log-normalized expression values for the top 10 DE genes between clusters 1 and 2 from the summation strategy, stratified by cluster assignment and coloured by the plate of origin for each cell."---- sub.sce <- sce.416b[,sce.416b$cluster %in% c("1", "2")] plotExpression(sub.sce, x="cluster", colour_by="Plate", features=rownames(top)[1:10])