---
title: "Comparison with base R"
author: "Stefano Mangiola"
date: "`r Sys.Date()`"
package: tidybulk
output:
BiocStyle::html_document:
toc_float: true
vignette: >
%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{Comparison with base R}
%\usepackage[UTF-8]{inputenc}
---
[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing)
```{r, echo=FALSE, include=FALSE, }
library(knitr)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
message = FALSE, cache.lazy = FALSE)
library(dplyr)
library(tidyr)
library(tibble)
library(magrittr)
library(ggplot2)
library(ggrepel)
library(tidybulk)
library(tidySummarizedExperiment)
my_theme =
theme_bw() +
theme(
panel.border = element_blank(),
axis.line = element_line(),
panel.grid.major = element_line(size = 0.2),
panel.grid.minor = element_line(size = 0.1),
text = element_text(size=12),
legend.position="bottom",
aspect.ratio=1,
strip.background = element_blank(),
axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10))
)
tibble_counts = tidybulk::counts_SE %>% tidybulk() %>% as_tibble()
```
In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.
## Create `tidybulk` tibble.
```{r}
tt = se_mini
```
## Aggregate duplicated `transcripts`
Tidy transcriptomics
```{r aggregate, cache=TRUE, message=FALSE, warning=FALSE, results='hide', class.source='yellow'}
rowData(tt)$gene_name = rownames(tt)
tt.aggr = tt %>% aggregate_duplicates(.transcript = gene_name)
```
Base R
```{r aggregate long, eval=FALSE}
temp = data.frame(
symbol = dge_list$genes$symbol,
dge_list$counts
)
dge_list.nr <- by(temp, temp$symbol,
function(df)
if(length(df[1,1])>0)
matrixStats:::colSums(as.matrix(df[,-1]))
)
dge_list.nr <- do.call("rbind", dge_list.nr)
colnames(dge_list.nr) <- colnames(dge_list)
```
## Scale `counts`
Tidy transcriptomics
```{r normalise, cache=TRUE}
tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance()
```
Base R
```{r normalise long, eval=FALSE}
library(edgeR)
dgList <- DGEList(count_m=x,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
[...]
dgList <- calcNormFactors(dgList, method="TMM")
norm_counts.table <- cpm(dgList)
```
## Filter `variable transcripts`
We may want to identify and filter variable transcripts.
Tidy transcriptomics
```{r filter variable, cache=TRUE}
tt.norm.variable = tt.norm %>% keep_variable()
```
Base R
```{r filter variable long, eval=FALSE}
library(edgeR)
x = norm_counts.table
s <- rowMeans((x-rowMeans(x))^2)
o <- order(s,decreasing=TRUE)
x <- x[o[1L:top],,drop=FALSE]
norm_counts.table = norm_counts.table[rownames(x)]
norm_counts.table$cell_type = tibble_counts[
match(
tibble_counts$sample,
rownames(norm_counts.table)
),
"Cell type"
]
```
## Reduce `dimensions`
Tidy transcriptomics
```{r mds, cache=TRUE}
tt.norm.MDS =
tt.norm %>%
reduce_dimensions(method="MDS", .dims = 2)
```
Base R
```{r, eval = FALSE}
library(limma)
count_m_log = log(count_m + 1)
cmds = limma::plotMDS(ndim = .dims, plot = FALSE)
cmds = cmds %$%
cmdscale.out %>%
setNames(sprintf("Dim%s", 1:6))
cmds$cell_type = tibble_counts[
match(tibble_counts$sample, rownames(cmds)),
"Cell type"
]
```
### PCA
Tidy transcriptomics
```{r pca, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.PCA =
tt.norm %>%
reduce_dimensions(method="PCA", .dims = 2)
```
Base R
```{r,eval=FALSE}
count_m_log = log(count_m + 1)
pc = count_m_log %>% prcomp(scale = TRUE)
variance = pc$sdev^2
variance = (variance / sum(variance))[1:6]
pc$cell_type = counts[
match(counts$sample, rownames(pc)),
"Cell type"
]
```
### tSNE
Tidy transcriptomics
```{r tsne, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.tSNE =
breast_tcga_mini_SE %>%
tidybulk( sample, ens, count_scaled) %>%
identify_abundant() %>%
reduce_dimensions(
method = "tSNE",
perplexity=10,
pca_scale =TRUE
)
```
Base R
```{r, eval=FALSE}
count_m_log = log(count_m + 1)
tsne = Rtsne::Rtsne(
t(count_m_log),
perplexity=10,
pca_scale =TRUE
)$Y
tsne$cell_type = tibble_counts[
match(tibble_counts$sample, rownames(tsne)),
"Cell type"
]
```
## Rotate `dimensions`
Tidy transcriptomics
```{r rotate, cache=TRUE}
tt.norm.MDS.rotated =
tt.norm.MDS %>%
rotate_dimensions(`Dim1`, `Dim2`, rotation_degrees = 45, action="get")
```
Base R
```{r, eval=FALSE}
rotation = function(m, d) {
r = d * pi / 180
((bind_rows(
c(`1` = cos(r), `2` = -sin(r)),
c(`1` = sin(r), `2` = cos(r))
) %>% as_matrix) %*% m)
}
mds_r = pca %>% rotation(rotation_degrees)
mds_r$cell_type = counts[
match(counts$sample, rownames(mds_r)),
"Cell type"
]
```
## Test `differential abundance`
Tidy transcriptomics
```{r de, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.de =
tt %>%
test_differential_abundance( ~ condition, action="get")
tt.de
```
Base R
```{r, eval=FALSE}
library(edgeR)
dgList <- DGEList(counts=counts_m,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
design <- model.matrix(~group)
dgList <- estimateDisp(dgList,design)
fit <- glmQLFit(dgList,design)
qlf <- glmQLFTest(fit,coef=2)
topTags(qlf, n=Inf)
```
## Adjust `counts`
Tidy transcriptomics
```{r adjust, cache=TRUE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.adj =
tt.norm %>% adjust_abundance( ~ condition + time)
```
Base R
```{r, eval=FALSE}
library(sva)
count_m_log = log(count_m + 1)
design =
model.matrix(
object = ~ condition + time,
data = annotation
)
count_m_log.sva =
ComBat(
batch = design[,2],
mod = design,
...
)
count_m_log.sva = ceiling(exp(count_m_log.sva) -1)
count_m_log.sva$cell_type = counts[
match(counts$sample, rownames(count_m_log.sva)),
"Cell type"
]
```
## Deconvolve `Cell type composition`
Tidy transcriptomics
```{r cibersort, cache=TRUE, eval=FALSE}
tt.cibersort =
tt %>%
deconvolve_cellularity(action="get", cores=1)
```
Base R
```{r, eval=FALSE}
source(‘CIBERSORT.R’)
count_m %>% write.table("mixture_file.txt")
results <- CIBERSORT(
"sig_matrix_file.txt",
"mixture_file.txt",
perm=100, QN=TRUE
)
results$cell_type = tibble_counts[
match(tibble_counts$sample, rownames(results)),
"Cell type"
]
```
## Cluster `samples`
### k-means
Tidy transcriptomics
```{r cluster, cache=TRUE}
tt.norm.cluster = tt.norm.MDS %>%
cluster_elements(method="kmeans", centers = 2, action="get" )
```
Base R
```{r, eval=FALSE}
count_m_log = log(count_m + 1)
k = kmeans(count_m_log, iter.max = 1000, ...)
cluster = k$cluster
cluster$cell_type = tibble_counts[
match(tibble_counts$sample, rownames(cluster)),
c("Cell type", "Dim1", "Dim2")
]
```
### SNN
Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.
Tidy transcriptomics
```{r SNN, eval=FALSE, message=FALSE, warning=FALSE, results='hide'}
tt.norm.SNN =
tt.norm.tSNE %>%
cluster_elements(method = "SNN")
```
Base R
```{r, eval=FALSE}
library(Seurat)
snn = CreateSeuratObject(count_m)
snn = ScaleData(
snn, display.progress = TRUE,
num.cores=4, do.par = TRUE
)
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = RunPCA(snn, npcs = 30)
snn = FindNeighbors(snn)
snn = FindClusters(snn, method = "igraph", ...)
snn = snn[["seurat_clusters"]]
snn$cell_type = tibble_counts[
match(tibble_counts$sample, rownames(snn)),
c("Cell type", "Dim1", "Dim2")
]
```
## Drop `redundant` transcripts
Tidy transcriptomics
```{r drop, cache=TRUE}
tt.norm.non_redundant =
tt.norm.MDS %>%
remove_redundancy( method = "correlation" )
```
Base R
```{r, eval=FALSE}
library(widyr)
.data.correlated =
pairwise_cor(
counts,
sample,
transcript,
rc,
sort = TRUE,
diag = FALSE,
upper = FALSE
) %>%
filter(correlation > correlation_threshold) %>%
distinct(item1) %>%
rename(!!.element := item1)
# Return non redundant data frame
counts %>% anti_join(.data.correlated) %>%
spread(sample, rc, - transcript) %>%
left_join(annotation)
```
## Draw `heatmap`
tidytranscriptomics
```{r heat, eval=FALSE}
tt.norm.MDS %>%
# filter lowly abundant
keep_abundant() %>%
# extract 500 most variable genes
keep_variable( .abundance = count_scaled, top = 500) %>%
# create heatmap
heatmap(sample, transcript, count_scaled, transform = log1p) %>%
add_tile(`Cell type`)
```
Base R
```{r, eval=FALSE}
# Example taken from airway dataset from BioC2020 workshop.
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$`Cell type`)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
```
## Draw `density plot`
tidytranscriptomics
```{r density, eval=FALSE}
# Example taken from airway dataset from BioC2020 workshop.
airway %>%
tidybulk() %>%
identify_abundant() %>%
scale_abundance() %>%
pivot_longer(cols = starts_with("counts"), names_to = "source", values_to = "abundance") %>%
filter(!lowly_abundant) %>%
ggplot(aes(x=abundance + 1, color=sample)) +
geom_density() +
facet_wrap(~source) +
scale_x_log10()
```
Base R
```{r, eval=FALSE}
# Example taken from airway dataset from BioC2020 workshop.
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$dex)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")
```
## Appendix
```{r}
sessionInfo()
```