## ----style, echo=FALSE, results='hide', message=FALSE---------------------- library(BiocStyle) library(knitr) opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) opts_chunk$set(fig.asp=1) ## ---- echo=FALSE, results='hide'------------------------------------------- all.urls <- c("ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81076/suppl/GSE81076%5FD2%5F3%5F7%5F10%5F17%2Etxt%2Egz", "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE85nnn/GSE85241/suppl/GSE85241%5Fcellsystems%5Fdataset%5F4donors%5Fupdated%2Ecsv%2Egz", "https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-5061/files/E-MTAB-5061.processed.1.zip", "https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-5061/E-MTAB-5061.sdrf.txt") all.basenames <- basename(all.urls) all.basenames[1] <- "GSE81076_D2_3_7_10_17.txt.gz" all.basenames[2] <- "GSE85241_cellsystems_dataset_4donors_updated.csv" all.modes <- c("wb", "w", "wb", "w") for (x in seq_along(all.urls)) { if (!file.exists(all.basenames[x])) { download.file(all.urls[x], all.basenames[x], mode=all.modes[x]) } } ## -------------------------------------------------------------------------- gse81076.df <- read.table("GSE81076_D2_3_7_10_17.txt.gz", sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) dim(gse81076.df) ## -------------------------------------------------------------------------- donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse81076.df)) table(donor.names) plate.id <- sub("^D[0-9]+(.*)_.*", "\\1", colnames(gse81076.df)) table(plate.id) ## -------------------------------------------------------------------------- gene.symb <- gsub("__chr.*$", "", rownames(gse81076.df)) is.spike <- grepl("^ERCC-", gene.symb) table(is.spike) library(org.Hs.eg.db) gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL") gene.ids[is.spike] <- gene.symb[is.spike] keep <- !is.na(gene.ids) & !duplicated(gene.ids) gse81076.df <- gse81076.df[keep,] rownames(gse81076.df) <- gene.ids[keep] summary(keep) ## -------------------------------------------------------------------------- library(SingleCellExperiment) sce.gse81076 <- SingleCellExperiment(list(counts=as.matrix(gse81076.df)), colData=DataFrame(Donor=donor.names, Plate=plate.id), rowData=DataFrame(Symbol=gene.symb[keep])) isSpike(sce.gse81076, "ERCC") <- grepl("^ERCC-", rownames(gse81076.df)) sce.gse81076 ## -------------------------------------------------------------------------- library(scater) sce.gse81076 <- calculateQCMetrics(sce.gse81076, compact=TRUE) QC <- sce.gse81076$scater_qc low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3) low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3) high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3) data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), HighSpike=sum(high.spike, na.rm=TRUE)) ## -------------------------------------------------------------------------- discard <- low.lib | low.genes | high.spike sce.gse81076 <- sce.gse81076[,!discard] summary(discard) ## -------------------------------------------------------------------------- library(scran) clusters <- quickCluster(sce.gse81076, min.mean=0.1) table(clusters) sce.gse81076 <- computeSumFactors(sce.gse81076, min.mean=0.1, clusters=clusters) summary(sizeFactors(sce.gse81076)) ## -------------------------------------------------------------------------- sce.gse81076 <- computeSpikeFactors(sce.gse81076, general.use=FALSE) summary(sizeFactors(sce.gse81076, "ERCC")) ## -------------------------------------------------------------------------- sce.gse81076 <- normalize(sce.gse81076) ## ----var-gse81076, fig.cap="Variance of normalized log-expression values for each gene in the GSE81076 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."---- block <- paste0(sce.gse81076$Plate, "_", sce.gse81076$Donor) fit <- trendVar(sce.gse81076, block=block, parametric=TRUE) dec <- decomposeVar(sce.gse81076, fit) plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance of log-expression", pch=16) OBis.spike <- isSpike(sce.gse81076) points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16) curve(fit$trend(x), col="dodgerblue", add=TRUE) ## -------------------------------------------------------------------------- dec.gse81076 <- dec dec.gse81076$Symbol <- rowData(sce.gse81076)$Symbol dec.gse81076 <- dec.gse81076[order(dec.gse81076$bio, decreasing=TRUE),] head(dec.gse81076) ## ---- echo=FALSE, results="hide"------------------------------------------- rm(gse81076.df) gc() ## -------------------------------------------------------------------------- gse85241.df <- read.table("GSE85241_cellsystems_dataset_4donors_updated.csv", sep='\t', h=TRUE, row.names=1, stringsAsFactors=FALSE) dim(gse85241.df) ## -------------------------------------------------------------------------- donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse85241.df)) table(donor.names) plate.id <- sub("^D[0-9]+\\.([0-9]+)_.*", "\\1", colnames(gse85241.df)) table(plate.id) ## -------------------------------------------------------------------------- gene.symb <- gsub("__chr.*$", "", rownames(gse85241.df)) is.spike <- grepl("^ERCC-", gene.symb) table(is.spike) library(org.Hs.eg.db) gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL") gene.ids[is.spike] <- gene.symb[is.spike] keep <- !is.na(gene.ids) & !duplicated(gene.ids) gse85241.df <- gse85241.df[keep,] rownames(gse85241.df) <- gene.ids[keep] summary(keep) ## -------------------------------------------------------------------------- sce.gse85241 <- SingleCellExperiment(list(counts=as.matrix(gse85241.df)), colData=DataFrame(Donor=donor.names, Plate=plate.id), rowData=DataFrame(Symbol=gene.symb[keep])) isSpike(sce.gse85241, "ERCC") <- grepl("^ERCC-", rownames(gse85241.df)) sce.gse85241 ## -------------------------------------------------------------------------- sce.gse85241 <- calculateQCMetrics(sce.gse85241, compact=TRUE) QC <- sce.gse85241$scater_qc low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3) low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3) high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3) data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), HighSpike=sum(high.spike, na.rm=TRUE)) ## -------------------------------------------------------------------------- discard <- low.lib | low.genes | high.spike sce.gse85241 <- sce.gse85241[,!discard] summary(discard) ## -------------------------------------------------------------------------- clusters <- quickCluster(sce.gse85241, min.mean=0.1, method="igraph") table(clusters) sce.gse85241 <- computeSumFactors(sce.gse85241, min.mean=0.1, clusters=clusters) summary(sizeFactors(sce.gse85241)) sce.gse85241 <- computeSpikeFactors(sce.gse85241, general.use=FALSE) summary(sizeFactors(sce.gse85241, "ERCC")) sce.gse85241 <- normalize(sce.gse85241) ## ----var-gse85241, fig.cap="Variance of normalized log-expression values for each gene in the GSE85241 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."---- block <- paste0(sce.gse85241$Plate, "_", sce.gse85241$Donor) fit <- trendVar(sce.gse85241, block=block, parametric=TRUE) dec <- decomposeVar(sce.gse85241, fit) plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance of log-expression", pch=16) is.spike <- isSpike(sce.gse85241) points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16) curve(fit$trend(x), col="dodgerblue", add=TRUE) ## -------------------------------------------------------------------------- dec.gse85241 <- dec dec.gse85241$Symbol <- rowData(sce.gse85241)$Symbol dec.gse85241 <- dec.gse85241[order(dec.gse85241$bio, decreasing=TRUE),] head(dec.gse85241) ## ---- echo=FALSE, results="hide"------------------------------------------- rm(gse85241.df) gc() ## -------------------------------------------------------------------------- unzip("E-MTAB-5061.processed.1.zip") # Figuring out the number of libraries (-1 for the '#sample'). header <- read.table("pancreas_refseq_rpkms_counts_3514sc.txt", nrow=1, sep="\t", comment.char="", stringsAsFactors=FALSE) ncells <- ncol(header) - 1L # Loading only the gene names and the counts. col.types <- vector("list", ncells*2 + 2) col.types[1:2] <- "character" col.types[2+ncells + seq_len(ncells)] <- "integer" e5601.df <- read.table("pancreas_refseq_rpkms_counts_3514sc.txt", sep="\t", colClasses=col.types) # Disentangling the gene names and the counts. gene.data <- e5601.df[,1:2] e5601.df <- e5601.df[,-(1:2)] colnames(e5601.df) <- as.character(header[1,-1]) dim(e5601.df) ## -------------------------------------------------------------------------- is.spike <- grepl("^ERCC-", gene.data[,2]) table(is.spike) library(org.Hs.eg.db) gene.ids <- mapIds(org.Hs.eg.db, keys=gene.data[,1], keytype="SYMBOL", column="ENSEMBL") gene.ids[is.spike] <- gene.data[is.spike,2] keep <- !is.na(gene.ids) & !duplicated(gene.ids) e5601.df <- e5601.df[keep,] rownames(e5601.df) <- gene.ids[keep] summary(keep) ## -------------------------------------------------------------------------- metadata <- read.table("E-MTAB-5061.sdrf.txt", header=TRUE, sep="\t", check.names=FALSE, stringsAsFactors=FALSE) m <- match(colnames(e5601.df), metadata[["Assay Name"]]) stopifnot(all(!is.na(m))) metadata <- metadata[m,] donor.id <- metadata[["Characteristics[individual]"]] table(donor.id) ## -------------------------------------------------------------------------- sce.e5601 <- SingleCellExperiment(list(counts=as.matrix(e5601.df)), colData=DataFrame(Donor=donor.id), rowData=DataFrame(Symbol=gene.data[keep,1])) isSpike(sce.e5601, "ERCC") <- grepl("^ERCC-", rownames(e5601.df)) sce.e5601 ## -------------------------------------------------------------------------- sce.e5601 <- calculateQCMetrics(sce.e5601, compact=TRUE) QC <- sce.e5601$scater_qc low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3) low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3) high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3) low.spike <- isOutlier(QC$feature_control_ERCC$log10_total_counts, type="lower", nmad=2) data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), HighSpike=sum(high.spike, na.rm=TRUE), LowSpike=sum(low.spike)) ## -------------------------------------------------------------------------- discard <- low.lib | low.genes | high.spike | low.spike sce.e5601 <- sce.e5601[,!discard] summary(discard) ## -------------------------------------------------------------------------- clusters <- quickCluster(sce.e5601, min.mean=1, method="igraph") table(clusters) sce.e5601 <- computeSumFactors(sce.e5601, min.mean=1, clusters=clusters) summary(sizeFactors(sce.e5601)) sce.e5601 <- computeSpikeFactors(sce.e5601, general.use=FALSE) summary(sizeFactors(sce.e5601, "ERCC")) sce.e5601 <- normalize(sce.e5601) ## ----var-e5601, fig.cap="Variance of normalized log-expression values for each gene in the E-MTAB-5601 dataset, plotted against the mean log-expression. Each plot corresponds to a donor, where the blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red).", fig.width=6, fig.asp=2.5---- donors <- sort(unique(sce.e5601$Donor)) is.spike <- isSpike(sce.e5601) par(mfrow=c(ceiling(length(donors)/2), 2), mar=c(4.1, 4.1, 2.1, 0.1)) collected <- list() for (x in unique(sce.e5601$Donor)) { current <- sce.e5601[,sce.e5601$Donor==x] if (ncol(current)<2L) { next } current <- normalize(current) fit <- trendVar(current, parametric=TRUE) dec <- decomposeVar(current, fit) plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance of log-expression", pch=16, main=x) points(fit$mean, fit$var, col="red", pch=16) curve(fit$trend(x), col="dodgerblue", add=TRUE) collected[[x]] <- dec } ## -------------------------------------------------------------------------- dec.e5601 <- do.call(combineVar, collected) dec.e5601$Symbol <- rowData(sce.e5601)$Symbol dec.e5601 <- dec.e5601[order(dec.e5601$bio, decreasing=TRUE),] head(dec.e5601) ## ---- echo=FALSE, results="hide"------------------------------------------- rm(e5601.df) gc() ## -------------------------------------------------------------------------- top.e5601 <- rownames(dec.e5601)[seq_len(1000)] top.gse85241 <- rownames(dec.gse85241)[seq_len(1000)] top.gse81076 <- rownames(dec.gse81076)[seq_len(1000)] chosen <- Reduce(intersect, list(top.e5601, top.gse85241, top.gse81076)) # Adding some gene symbols for interpretation. symb <- mapIds(org.Hs.eg.db, keys=chosen, keytype="ENSEMBL", column="SYMBOL") DataFrame(ID=chosen, Symbol=symb) ## -------------------------------------------------------------------------- # Identifying genes that are annotated in all batches. in.all <- Reduce(intersect, list(rownames(dec.e5601), rownames(dec.gse85241), rownames(dec.gse81076))) # Setting weighted=FALSE so each batch contributes equally. combined <- combineVar(dec.e5601[in.all,], dec.gse85241[in.all,], dec.gse81076[in.all,], weighted=FALSE) chosen2 <- rownames(combined)[head(order(combined$bio, decreasing=TRUE), 1000)] ## -------------------------------------------------------------------------- original <- list(logcounts(sce.e5601)[chosen,], logcounts(sce.gse81076)[chosen,], logcounts(sce.gse85241)[chosen,]) corrected <- do.call(mnnCorrect, c(original, list(k=20, sigma=0.1))) str(corrected$corrected) ## -------------------------------------------------------------------------- corrected$pairs ## -------------------------------------------------------------------------- omat <- do.call(cbind, original) mat <- do.call(cbind, corrected$corrected) colnames(mat) <- NULL sce <- SingleCellExperiment(list(original=omat, corrected=mat)) colData(sce)$Batch <- rep(c("e5601", "gse81076", "gse85241"), lapply(corrected$corrected, ncol)) sce ## ----tsne-batch, fig.width=10, fig.asp=0.6, fig.cap="t-SNE plots of the pancreas datasets, before and after MNN correction. Each point represents a cell and is coloured by the batch of origin."---- osce <- runTSNE(sce, exprs_values="original", rand_seed=100) ot <- plotTSNE(osce, colour_by="Batch") + ggtitle("Original") csce <- runTSNE(sce, exprs_values="corrected", rand_seed=100) ct <- plotTSNE(csce, colour_by="Batch") + ggtitle("Corrected") multiplot(ot, ct, cols=2) ## ----tsne-markers, fig.width=10, fig.height=10, fig.cap="t-SNE plots after MNN correction, where each point represents a cell and is coloured by its corrected expression of key marker genes for known cell types in the pancreas."---- ct.gcg <- plotTSNE(csce, by_exprs_values="corrected", colour_by="ENSG00000115263") + ggtitle("Alpha cells (GCG)") ct.ins <- plotTSNE(csce, by_exprs_values="corrected", colour_by="ENSG00000254647") + ggtitle("Beta cells (INS)") ct.sst <- plotTSNE(csce, by_exprs_values="corrected", colour_by="ENSG00000157005") + ggtitle("Delta cells (SST)") ct.ppy <- plotTSNE(csce, by_exprs_values="corrected", colour_by="ENSG00000108849") + ggtitle("PP cells (PPY)") multiplot(ct.gcg, ct.ins, ct.sst, ct.ppy, cols=2)