## ------------------------------------------------------------------------ library(ccfindR) ## ---- echo=T------------------------------------------------------------- # A toy matrix for count data set.seed(1) mat <- matrix(rpois(n = 80, lambda = 2), nrow = 4, ncol = 20) ABC <- LETTERS[1:4] abc <- letters[1:20] rownames(mat) <- ABC colnames(mat) <- abc ## ---- echo=T------------------------------------------------------------- # create scNMFSet object sc <- scNMFSet(count = mat) ## ---- echo=T------------------------------------------------------------- # create scNMFSet object sc <- scNMFSet(assays = list(counts = mat)) ## ------------------------------------------------------------------------ # set row and column names suppressMessages(library(S4Vectors)) genes <- as(ABC, 'DataFrame') rownames(genes) <- ABC cells <- as(abc, 'DataFrame') rownames(cells) <- abc sc <- scNMFSet(count = mat, rowData = genes, colData = cells) sc ## ------------------------------------------------------------------------ # read sparse matrix dir <- system.file('extdata', package = 'ccfindR') mat <- Matrix::readMM(paste0(dir,'/matrix.mtx')) sc <- scNMFSet(count = mat, rowData = as(1:nrow(mat), 'DataFrame'), colData = as(1:ncol(mat), 'DataFrame')) sc ## ------------------------------------------------------------------------ # read 10x files sc <- read_10x(dir = dir, count = 'matrix.mtx', genes = 'genes.tsv', barcodes = 'barcodes.tsv') sc ## ------------------------------------------------------------------------ # slots and subsetting counts(sc)[1:7,1:3] head(rowData(sc)) head(colData(sc)) sc2 <- sc[1:20,1:70] # subsetting of object sc2 <- remove_zeros(sc2) # remove empty rows/columns sc2 ## ---- cache=T, fig.width=4, fig.height=4,fig.cap='Quality control filtering of cells. Histogram of UMI counts is shown. Cells can be selected (red) by setting lower and upper thresholds of the UMI count.'---- sc <- filter_cells(sc, umi.min = 10^2.6, umi.max = 10^3.4) ## ---- cache=T, fig.width=4, fig.height=4, fig.cap='Selection of genes for clustering. The scatter plot shows distributions of expression variance to mean ratio (VMR) and the number of cells expressed. Minimum VMR and a range of cell number can be set to select genes (red). Symbols in orange are marker genes provided as input, selected irrespective of expression variance.'---- markers <- c('CD4','CD8A','CD8B','CD19','CD3G','CD3D', 'CD3Z','CD14') sc0 <- filter_genes(sc, markers = markers, vmr.min = 1.5, min.cells.expressed = 50, rescue.genes = FALSE) ## ---- cache=T, echo=T, fig.width=4, fig.height=4, fig.cap='Additional selection of genes with modes at nonzero counts. Symbols in blue represent genes rescued.'---- sc_rescue <- filter_genes(sc, markers = markers, vmr.min = 1.5, min.cells.expressed = 50, rescue.genes = TRUE, progress.bar = FALSE) ## ---- cache=T------------------------------------------------------------ rownames(sc_rescue) <- rowData(sc_rescue)[,2] sc <- sc_rescue ## ---- cache=T------------------------------------------------------------ set.seed(1) sc <- factorize(sc, ranks = 3, nrun = 1, ncnn.step = 1, criterion='connectivity', verbose = 3) ## ---- cache=T------------------------------------------------------------ sc <- factorize(sc, ranks = 3, nrun = 5, verbose = 2) ## ---- cache=T------------------------------------------------------------ sc <- factorize(sc, ranks = seq(3,7), nrun = 5, verbose = 1, progress.bar = FALSE) ## ------------------------------------------------------------------------ measure(sc) ## ---- fig.width=6.5, fig.height=3, fig.cap='Factorization quality measures as functions of the rank. Dispersion measures the degree of bimodality in consistency matrix. Cophenetic correlation measures the degree of agreement between consistency matrix and hierarchical clustering.'---- plot(sc) ## ---- cache=T------------------------------------------------------------ sb <- sc_rescue set.seed(2) sb <- vb_factorize(sb, ranks =3, verbose = 3, Tol = 2e-4, hyper.update.n0 = 5) ## ---- cache=T------------------------------------------------------------ sb <- vb_factorize(sb, ranks = seq(2,7), nrun = 5, verbose = 1, Tol = 1e-4, progress.bar = FALSE) ## ------------------------------------------------------------------------ measure(sb) ## ---- fig.width=4, fig.height=4, fig.cap='Dependence of log evidence with rank.'---- plot(sb) ## ------------------------------------------------------------------------ ranks(sb) head(basis(sb)[ranks(sb)==5][[1]]) # basis matrix W for rank 5 ## ---- cache=T, fig.width = 4, fig.height=6, fig.cap='Heatmap of basis matrix elements. Marker genes selected in rows, other than those provided as input, are based on the degree to which each features strongly in a particular cluster only and not in the rest. Columns represent the clusters.'---- gene_map(sb, markers = markers, rank = 5, max.per.cluster = 4, gene.name = rowData(sb)[,2], cexRow = 0.7) ## ------------------------------------------------------------------------ cell_type <- c('CD8+_T','NK','Monocytes','B_cell','CD4+_T') colnames(basis(sb)[ranks(sb) == 5][[1]]) <- cell_type rownames(coeff(sb)[ranks(sb) == 5][[1]]) <- cell_type ## ---- fig.width = 4, fig.height=4, fig.cap = 'Heatmap of cluster coefficient matrix elements. Rows indicate clusters and columns the cells.'---- cell_map(sb, rank = 5) ## ---- fig.width = 6.5, fig.height = 4, fig.cap='tSNE-based visualization of coefficient matrix elements of cells with colors indicating predicted cluster assignment. The bar plot shows the cell counts of each cluster.'---- visualize_clusters(sb, rank = 5, cex = 0.7) ## ---- fig.width = 4, fig.height = 4, fig.cap = 'Hierarchical tree of clusters derived from varying ranks. The rank increases from 2 to 5 horizontally and nodes are labeled by cluster IDs which bifurcated in each rank.'---- tree <- build_tree(sb, rmax = 5) tree <- rename_tips(tree, rank = 5, tip.labels = cell_type) plot_tree(tree, cex = 0.8, show.node.label = TRUE)