Chapter 3 Unfiltered human PBMCs (10X Genomics)

3.1 Introduction

Here, we describe a brief analysis of the peripheral blood mononuclear cell (PBMC) dataset from 10X Genomics (Zheng et al. 2017). The data are publicly available from the 10X Genomics website, from which we download the raw gene/barcode count matrices, i.e., before cell calling from the CellRanger pipeline.

3.2 Data loading

# Obtain the tarball of raw (pre-cell-calling) count matrices through
# DropletTestFiles and unpack it into a "pbmc4k" subdirectory of this
# session's temporary directory.
library(DropletTestFiles)
raw.path <- getTestFile("tenx-2.1.0-pbmc4k/1.0.0/raw.tar.gz")
out.path <- file.path(tempdir(), "pbmc4k")
untar(raw.path, exdir=out.path)

# Read the unpacked GRCh38 matrix directory into `sce.pbmc`.
library(DropletUtils)
fname <- file.path(out.path, "raw_gene_bc_matrices/GRCh38")
# NOTE(review): col.names=TRUE presumably names the columns by cell barcode --
# confirm against ?read10xCounts.
sce.pbmc <- read10xCounts(fname, col.names=TRUE)
library(scater)
# Rebuild the row names from the gene IDs and gene symbols held in rowData,
# so that rows can be referred to by a unique, human-readable name.
rownames(sce.pbmc) <- uniquifyFeatureNames(
    rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol)

# Look up the sequence name (SEQNAME) of every gene from its Ensembl gene ID.
# `location` is used during quality control to pick out the genes on "MT".
library(EnsDb.Hsapiens.v86)
location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID, 
    column="SEQNAME", keytype="GENEID")

3.3 Quality control

We perform cell detection using the emptyDrops() algorithm, as discussed in Advanced Section 7.2.

# Cell calling: keep only the barcodes that emptyDrops() reports with an
# FDR of at most 0.001. The RNG seed is fixed immediately beforehand,
# presumably because emptyDrops() involves random sampling -- confirm in its
# documentation.
set.seed(100)
e.out <- emptyDrops(counts(sce.pbmc))
# which() discards NA entries, so barcodes with an NA FDR are dropped as well.
sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)]
# Copy of the called cells taken before the mitochondrial filter; the QC
# plots are drawn from this copy so that discarded cells remain visible.
unfiltered <- sce.pbmc

We use a relaxed QC strategy and only remove cells with large mitochondrial proportions, using this metric as a proxy for cell damage. This reduces the risk of removing cell types with low RNA content, especially in a heterogeneous PBMC population with many different cell types.

# Compute per-cell QC metrics, declaring the genes located on "MT" as the
# "Mito" feature subset.
stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=which(location=="MT")))
# Flag cells whose mitochondrial percentage is an outlier on the high side only.
high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
# Drop the flagged cells, then report how many cells were kept (FALSE) and
# how many were removed (TRUE).
sce.pbmc <- sce.pbmc[,!high.mito]
summary(high.mito)
##    Mode   FALSE    TRUE 
## logical    3985     315
# Attach the QC metrics and the discard flag to the pre-filter copy, so that
# retained and discarded cells can be plotted together.
colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- high.mito

# Lay out the three per-cell QC metrics in a two-column grid, colouring every
# cell by whether the mitochondrial filter discarded it. Total count and
# number of detected features are shown on a log10 y-axis; the mitochondrial
# percentage is left on its natural scale.
gridExtra::grid.arrange(
    grobs=list(
        plotColData(unfiltered, y="sum", colour_by="discard") +
            scale_y_log10() +
            ggtitle("Total count"),
        plotColData(unfiltered, y="detected", colour_by="discard") +
            scale_y_log10() +
            ggtitle("Detected features"),
        plotColData(unfiltered, y="subsets_Mito_percent", colour_by="discard") +
            ggtitle("Mito percent")
    ),
    ncol=2
)
Distribution of various QC metrics in the PBMC dataset after cell calling. Each point is a cell and is colored according to whether it was discarded by the mitochondrial filter.

Figure 3.1: Distribution of various QC metrics in the PBMC dataset after cell calling. Each point is a cell and is colored according to whether it was discarded by the mitochondrial filter.

# Plot each cell's mitochondrial percentage against its total count
# (log10 x-axis), coloured by the discard flag.
plotColData(unfiltered, x="sum", y="subsets_Mito_percent",
    colour_by="discard") + scale_x_log10()
Proportion of mitochondrial reads in each cell of the PBMC dataset compared to its total count.

Figure 3.2: Proportion of mitochondrial reads in each cell of the PBMC dataset compared to its total count.

3.4 Normalization

# Normalization: cluster the cells, compute size factors using those clusters,
# then add log-normalized expression values and summarize the size factors.
library(scran)
# Seed fixed ahead of quickCluster() for reproducibility.
set.seed(1000)
clusters <- quickCluster(sce.pbmc)
sce.pbmc <- computeSumFactors(sce.pbmc, cluster=clusters)
sce.pbmc <- logNormCounts(sce.pbmc)
summary(sizeFactors(sce.pbmc))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.007   0.712   0.875   1.000   1.099  12.254
# Compare the size factors stored in sce.pbmc (y-axis) against plain
# library size factors (x-axis), with both axes on a log scale.
plot(librarySizeFactors(sce.pbmc), sizeFactors(sce.pbmc), pch=16,
    xlab="Library size factors", ylab="Deconvolution factors", log="xy")
Relationship between the library size factors and the deconvolution size factors in the PBMC dataset.

Figure 3.3: Relationship between the library size factors and the deconvolution size factors in the PBMC dataset.

3.5 Variance modelling

# Model the per-gene variance with modelGeneVarByPoisson() and select the
# highly variable genes. The seed is fixed first for reproducibility.
set.seed(1001)
dec.pbmc <- modelGeneVarByPoisson(sce.pbmc)
# NOTE(review): prop=0.1 presumably requests the top 10% of genes -- confirm
# against ?getTopHVGs.
top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)
# Scatter of total variance against mean log-expression, one point per gene.
plot(dec.pbmc$mean, dec.pbmc$total, pch=16, cex=0.5,
    xlab="Mean of log-expression", ylab="Variance of log-expression")
# The metadata of dec.pbmc carries the fitted trend as a function (`trend`);
# overlay it on the scatter plot.
curfit <- metadata(dec.pbmc)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
Per-gene variance as a function of the mean for the log-expression values in the PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to simulated Poisson counts.

Figure 3.4: Per-gene variance as a function of the mean for the log-expression values in the PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to simulated Poisson counts.

3.6 Dimensionality reduction

# PCA restricted to the selected HVGs, with the variance model passed as
# `technical`; the result is later read back as the "PCA" reduced dimension.
# Each of the three steps gets its own seed so that it is reproducible
# independently of the others.
set.seed(10000)
sce.pbmc <- denoisePCA(sce.pbmc, subset.row=top.pbmc, technical=dec.pbmc)

# t-SNE computed from the "PCA" coordinates.
set.seed(100000)
sce.pbmc <- runTSNE(sce.pbmc, dimred="PCA")

# UMAP computed from the "PCA" coordinates.
set.seed(1000000)
sce.pbmc <- runUMAP(sce.pbmc, dimred="PCA")

We verify that a reasonable number of PCs is retained.

# Number of columns in the "PCA" reduced dimension, i.e. the number of PCs kept.
ncol(reducedDim(sce.pbmc, "PCA"))
## [1] 9

3.7 Clustering

# Graph-based clustering: build a nearest-neighbour graph (k=10) from the
# "PCA" coordinates, run walktrap community detection on it, store the
# membership as the column labels, and tabulate the cluster sizes.
g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
colLabels(sce.pbmc) <- factor(clust)
table(colLabels(sce.pbmc))
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 
## 205 731 617  56 541 352 125  46 819  47 153  61 129  87  16
# t-SNE plot with each cell coloured by its assigned cluster label.
plotTSNE(sce.pbmc, colour_by="label")
Obligatory \(t\)-SNE plot of the PBMC dataset, where each point represents a cell and is colored according to the assigned cluster.

Figure 3.5: Obligatory \(t\)-SNE plot of the PBMC dataset, where each point represents a cell and is colored according to the assigned cluster.

3.8 Interpretation

# Test for marker genes; the result is indexed by cluster name (e.g. "2").
# NOTE(review): no grouping is passed, so this presumably falls back on
# colLabels(sce.pbmc), and direction="up" presumably restricts the tests to
# up-regulated genes -- confirm both against ?findMarkers.
markers <- findMarkers(sce.pbmc, pval.type="some", direction="up")

We examine the markers for cluster 2 in more detail. High expression of CD14, CD68 and MNDA combined with low expression of FCGR3A (CD16) suggests that this cluster contains monocytes, compared to macrophages in cluster 14 (Figure 3.6).

# Show the first 30 rows and the first three columns (p.value, FDR,
# summary.logFC) of the marker table for cluster 2.
marker.set <- markers[["2"]]
as.data.frame(marker.set[1:30,1:3])
##                  p.value        FDR summary.logFC
## MNDA           0.000e+00  0.000e+00        2.4270
## CSTA           0.000e+00 8.108e-321        2.2749
## FCN1          5.675e-266 6.374e-262        2.7085
## RP11-1143G9.4 4.422e-252 3.725e-248        2.6287
## VCAN          9.765e-235 6.581e-231        1.8445
## MS4A6A        2.287e-209 1.284e-205        1.5333
## FGL2          1.077e-208 5.183e-205        1.4499
## S100A12       3.976e-207 1.674e-203        2.4102
## LGALS2        1.732e-194 6.482e-191        2.0107
## CFD           1.207e-193 4.067e-190        1.4583
## AIF1          1.362e-180 4.173e-177        2.6862
## CD14          4.650e-170 1.306e-166        1.3215
## CLEC7A        3.055e-169 7.917e-166        1.0966
## TYMP          4.932e-166 1.187e-162        2.0425
## CD68          1.008e-161 2.264e-158        1.1025
## S100A8        2.499e-158 5.262e-155        4.5407
## SERPINA1      1.262e-157 2.502e-154        1.5040
## TNFSF13B      3.069e-151 5.745e-148        1.0353
## KLF4          1.351e-150 2.395e-147        1.2414
## AP1S2         3.613e-149 6.087e-146        1.8689
## CFP           8.387e-144 1.346e-140        1.1019
## S100A9        1.301e-141 1.993e-138        4.5307
## NAMPT         1.074e-138 1.573e-135        1.1066
## IFI30         2.558e-133 3.591e-130        0.9717
## MPEG1         9.448e-132 1.273e-128        0.9856
## CYBB          5.226e-129 6.773e-126        1.1825
## LGALS3        2.868e-128 3.580e-125        0.9434
## LYZ           1.108e-123 1.334e-120        5.0812
## CPVL          3.905e-123 4.537e-120        0.8642
## CD36          6.119e-123 6.873e-120        0.9696
# Expression of the four marker genes discussed in the text, split and
# coloured by cluster label.
plotExpression(sce.pbmc, features=c("CD14", "CD68",
    "MNDA", "FCGR3A"), x="label", colour_by="label")
Distribution of expression values for monocyte and macrophage markers across clusters in the PBMC dataset.

Figure 3.6: Distribution of expression values for monocyte and macrophage markers across clusters in the PBMC dataset.

Session Info

R Under development (unstable) (2024-10-21 r87258)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.1 LTS

Matrix products: default
BLAS:   /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB              LC_COLLATE=C              
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] scran_1.35.0                EnsDb.Hsapiens.v86_2.99.0  
 [3] ensembldb_2.31.0            AnnotationFilter_1.31.0    
 [5] GenomicFeatures_1.59.1      AnnotationDbi_1.69.0       
 [7] scater_1.35.0               ggplot2_3.5.1              
 [9] scuttle_1.17.0              DropletUtils_1.27.0        
[11] SingleCellExperiment_1.29.1 SummarizedExperiment_1.37.0
[13] Biobase_2.67.0              GenomicRanges_1.59.0       
[15] GenomeInfoDb_1.43.0         IRanges_2.41.0             
[17] S4Vectors_0.45.1            BiocGenerics_0.53.1        
[19] generics_0.1.3              MatrixGenerics_1.19.0      
[21] matrixStats_1.4.1           DropletTestFiles_1.17.0    
[23] BiocStyle_2.35.0            rebook_1.17.0              

loaded via a namespace (and not attached):
  [1] jsonlite_1.8.9            CodeDepends_0.6.6        
  [3] magrittr_2.0.3            ggbeeswarm_0.7.2         
  [5] farver_2.1.2              rmarkdown_2.29           
  [7] BiocIO_1.17.0             zlibbioc_1.53.0          
  [9] vctrs_0.6.5               memoise_2.0.1            
 [11] Rsamtools_2.23.0          DelayedMatrixStats_1.29.0
 [13] RCurl_1.98-1.16           htmltools_0.5.8.1        
 [15] S4Arrays_1.7.1            AnnotationHub_3.15.0     
 [17] curl_6.0.0                BiocNeighbors_2.1.0      
 [19] Rhdf5lib_1.29.0           SparseArray_1.7.1        
 [21] rhdf5_2.51.0              sass_0.4.9               
 [23] bslib_0.8.0               cachem_1.1.0             
 [25] GenomicAlignments_1.43.0  igraph_2.1.1             
 [27] mime_0.12                 lifecycle_1.0.4          
 [29] pkgconfig_2.0.3           rsvd_1.0.5               
 [31] Matrix_1.7-1              R6_2.5.1                 
 [33] fastmap_1.2.0             GenomeInfoDbData_1.2.13  
 [35] digest_0.6.37             colorspace_2.1-1         
 [37] dqrng_0.4.1               irlba_2.3.5.1            
 [39] ExperimentHub_2.15.0      RSQLite_2.3.7            
 [41] beachmat_2.23.0           labeling_0.4.3           
 [43] filelock_1.0.3            fansi_1.0.6              
 [45] httr_1.4.7                abind_1.4-8              
 [47] compiler_4.5.0            bit64_4.5.2              
 [49] withr_3.0.2               BiocParallel_1.41.0      
 [51] viridis_0.6.5             DBI_1.2.3                
 [53] HDF5Array_1.35.1          R.utils_2.12.3           
 [55] rappdirs_0.3.3            DelayedArray_0.33.1      
 [57] bluster_1.17.0            rjson_0.2.23             
 [59] tools_4.5.0               vipor_0.4.7              
 [61] beeswarm_0.4.0            R.oo_1.27.0              
 [63] glue_1.8.0                restfulr_0.0.15          
 [65] rhdf5filters_1.19.0       grid_4.5.0               
 [67] Rtsne_0.17                cluster_2.1.6            
 [69] gtable_0.3.6              R.methodsS3_1.8.2        
 [71] metapod_1.15.0            BiocSingular_1.23.0      
 [73] ScaledMatrix_1.15.0       utf8_1.2.4               
 [75] XVector_0.47.0            ggrepel_0.9.6            
 [77] BiocVersion_3.21.1        pillar_1.9.0             
 [79] limma_3.63.2              dplyr_1.1.4              
 [81] BiocFileCache_2.15.0      lattice_0.22-6           
 [83] FNN_1.1.4.1               rtracklayer_1.67.0       
 [85] bit_4.5.0                 tidyselect_1.2.1         
 [87] locfit_1.5-9.10           Biostrings_2.75.1        
 [89] knitr_1.49                gridExtra_2.3            
 [91] bookdown_0.41             ProtGenerics_1.39.0      
 [93] edgeR_4.5.0               xfun_0.49                
 [95] statmod_1.5.0             UCSC.utils_1.3.0         
 [97] lazyeval_0.2.2            yaml_2.3.10              
 [99] evaluate_1.0.1            codetools_0.2-20         
[101] tibble_3.2.1              BiocManager_1.30.25      
[103] graph_1.85.0              cli_3.6.3                
[105] uwot_0.2.2                munsell_0.5.1            
[107] jquerylib_0.1.4           Rcpp_1.0.13-1            
[109] dir.expiry_1.15.0         dbplyr_2.5.0             
[111] png_0.1-8                 XML_3.99-0.17            
[113] parallel_4.5.0            blob_1.2.4               
[115] sparseMatrixStats_1.19.0  bitops_1.0-9             
[117] viridisLite_0.4.2         scales_1.3.0             
[119] purrr_1.0.2               crayon_1.5.3             
[121] rlang_1.1.4               cowplot_1.1.3            
[123] KEGGREST_1.47.0          

References

Zheng, G. X., J. M. Terry, P. Belgrader, P. Ryvkin, Z. W. Bent, R. Wilson, S. B. Ziraldo, et al. 2017. “Massively parallel digital transcriptional profiling of single cells.” Nat Commun 8 (January): 14049.