## ------------------------------------------------------------------------
library(ccfindR)

## ---- echo=T-------------------------------------------------------------
# A toy matrix for count data
set.seed(1)
mat <- matrix(rpois(n = 80, lambda = 2), nrow = 4, ncol = 20)
ABC <- LETTERS[1:4]
abc <- letters[1:20]
rownames(mat) <- ABC
colnames(mat) <- abc

## ---- echo=T-------------------------------------------------------------
# create scNMFSet object
sc <- scNMFSet(count = mat)

## ---- echo=T-------------------------------------------------------------
# create scNMFSet object
sc <- scNMFSet(assays = list(counts = mat))

## ------------------------------------------------------------------------
# set row and column names
suppressMessages(library(S4Vectors))
genes <- as(ABC, 'DataFrame')
rownames(genes) <- ABC
cells <- as(abc, 'DataFrame')
rownames(cells) <- abc
sc <- scNMFSet(count = mat, rowData = genes, colData = cells)
sc

## ------------------------------------------------------------------------
# read sparse matrix
dir <- system.file('extdata', package = 'ccfindR')
mat <- Matrix::readMM(paste0(dir,'/matrix.mtx'))
sc <- scNMFSet(count = mat, rowData = as(1:nrow(mat), 'DataFrame'),
               colData = as(1:ncol(mat), 'DataFrame'))
sc

## ------------------------------------------------------------------------
# read 10x files
sc <- read_10x(dir = dir, count = 'matrix.mtx', genes = 'genes.tsv',
               barcodes = 'barcodes.tsv')
sc

## ------------------------------------------------------------------------
# slots and subsetting
counts(sc)[1:7,1:3]
head(rowData(sc))
head(colData(sc))
sc2 <- sc[1:20,1:70]       # subsetting of object
sc2 <- remove_zeros(sc2)   # remove empty rows/columns
sc2

## ---- cache=T, fig.width=4, fig.height=4,fig.cap='Quality control filtering of cells. Histogram of UMI counts is shown. Cells can be selected (red) by setting lower and upper thresholds of the UMI count.'----
sc <- filter_cells(sc, umi.min = 10^2.6, umi.max = 10^3.4)

## ---- cache=T, fig.width=4, fig.height=4, fig.cap='Selection of genes for clustering. The scatter plot shows distributions of expression variance to mean ratio (VMR) and the number of cells expressed. Minimum VMR and a range of cell number can be set to select genes (red). Symbols in orange are marker genes provided as input, selected irrespective of expression variance.'----
markers <- c('CD4','CD8A','CD8B','CD19','CD3G','CD3D',
             'CD3Z','CD14')
sc0 <- filter_genes(sc, markers = markers, vmr.min = 1.5, 
            min.cells.expressed = 50, rescue.genes = FALSE)

## ---- cache=T, echo=T, fig.width=4, fig.height=4, fig.cap='Additional selection of genes with modes at nonzero counts. Symbols in blue represent genes rescued.'----
sc_rescue <- filter_genes(sc, markers = markers, vmr.min = 1.5, min.cells.expressed = 50,
                          rescue.genes = TRUE, progress.bar = FALSE)

## ---- cache=T------------------------------------------------------------
rownames(sc_rescue) <- rowData(sc_rescue)[,2]
sc <- sc_rescue

## ---- cache=T------------------------------------------------------------
set.seed(1)
sc <- factorize(sc, ranks = 3, nrun = 1, ncnn.step = 1, 
                criterion='connectivity', verbose = 3)

## ---- cache=T------------------------------------------------------------
sc <- factorize(sc, ranks = 3, nrun = 5, verbose = 2)

## ---- cache=T------------------------------------------------------------
sc <- factorize(sc, ranks = seq(3,7), nrun = 5, verbose = 1, progress.bar = FALSE)

## ------------------------------------------------------------------------
measure(sc)

## ---- fig.width=6.5, fig.height=3, fig.cap='Factorization quality measures as functions of the rank. Dispersion measures the degree of bimodality in consistency matrix. Cophenetic correlation measures the degree of agreement between consistency matrix and hierarchical clustering.'----
plot(sc)

## ---- cache=T------------------------------------------------------------
sb <- sc_rescue
set.seed(2)
sb <- vb_factorize(sb, ranks =3, verbose = 3, Tol = 2e-4, hyper.update.n0 = 5)

## ---- cache=T------------------------------------------------------------
sb <- vb_factorize(sb, ranks = seq(2,7), nrun = 5, verbose = 1, Tol = 1e-4, progress.bar = FALSE)

## ------------------------------------------------------------------------
measure(sb)

## ---- fig.width=4, fig.height=4, fig.cap='Dependence of log evidence with rank.'----
plot(sb)

## ------------------------------------------------------------------------
ranks(sb)

head(basis(sb)[ranks(sb)==5][[1]]) # basis matrix W for rank 5

## ---- cache=T, fig.width = 4, fig.height=6, fig.cap='Heatmap of basis matrix elements. Marker genes selected in rows, other than those provided as input, are based on the degree to which each features strongly in a particular cluster only and not in the rest. Columns represent the clusters.'----
gene_map(sb, markers = markers, rank = 5, max.per.cluster = 4, gene.name = rowData(sb)[,2],
         cexRow = 0.7)

## ------------------------------------------------------------------------
cell_type <- c('CD8+_T','NK','Monocytes','B_cell','CD4+_T')
colnames(basis(sb)[ranks(sb) == 5][[1]]) <- cell_type
rownames(coeff(sb)[ranks(sb) == 5][[1]]) <- cell_type

## ---- fig.width = 4, fig.height=4, fig.cap = 'Heatmap of cluster coefficient matrix elements. Rows indicate clusters and columns the cells.'----
cell_map(sb, rank = 5)

## ---- fig.width = 6.5, fig.height = 4, fig.cap='tSNE-based visualization of coefficient matrix elements of cells with colors indicating predicted cluster assignment. The bar plot shows the cell counts of each cluster.'----
visualize_clusters(sb, rank = 5, cex = 0.7)

## ---- fig.width = 4, fig.height = 4, fig.cap = 'Hierarchical tree of clusters derived from varying ranks. The rank increases from 2 to 5 horizontally and nodes are labeled by cluster IDs which bifurcated in each rank.'----
tree <- build_tree(sb, rmax = 5)
tree <- rename_tips(tree, rank = 5, tip.labels = cell_type)
plot_tree(tree, cex = 0.8, show.node.label = TRUE)