# Install the helper packages BiocManager and devtools, each only when it is
# not already available
invisible(lapply(c("BiocManager", "devtools"), function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}))
# Ensure `repos` includes both the CRAN and the Bioconductor repositories
options(repos = BiocManager::repositories())
# Install ELViS from its GitHub repository
devtools::install_github("https://github.com/hyochoi/ELViS.git")
Load required libraries.
library(ELViS)
#> Warning: replacing previous import 'BiocGenerics::setequal' by
#> 'dplyr::setequal' when loading 'ELViS'
library(ggplot2)
library(glue)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ComplexHeatmap)
#> Loading required package: grid
#> ========================================
#> ComplexHeatmap version 2.23.0
#> Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
#> Github page: https://github.com/jokergoo/ComplexHeatmap
#> Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
#>
#> If you use it in published research, please cite either one:
#> - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
#> - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
#> genomic data. Bioinformatics 2016.
#>
#>
#> The new InteractiveComplexHeatmap package can directly export static
#> complex heatmaps into an interactive Shiny app with zero effort. Have a try!
#>
#> This message can be suppressed by:
#> suppressPackageStartupMessages(library(ComplexHeatmap))
#> ========================================
# Make theme_bw() the default ggplot2 theme for the plots created below
theme_set(theme_bw())
Prepare BAM file name vector.
# Directory used for this analysis: the R session's temporary directory
analysis_dir <- tempdir()
dir.create(analysis_dir, showWarnings = FALSE)

package_name <- "ELViS"

# Load the toy example metadata
data(toy_example, package = package_name)

# Get the list of BAM file paths from the package's "extdata" directory.
# The pattern is anchored on the ".bam" extension; a bare "bam$" would also
# match any file whose name merely ends in the letters "bam".
ext_path <- system.file("extdata", package = package_name)
bam_files <- list.files(ext_path, full.names = TRUE, pattern = "\\.bam$")
Generate a base-resolution read depth matrix from a list of BAM files. The parallel package is used to read the BAM files quickly.
# Choose the number of cores from the operating system name:
# 1 on Windows, 2 everywhere else
os_name <- Sys.info()["sysname"]
N_cores <- if (os_name == "Windows") 1L else 2L
# Name of the reference viral sequence the reads were aligned to
target_virus_name <- "gi|333031|lcl|HPV16REF.1|"

# Temporary file directory passed to get_depth_matrix().
# showWarnings = FALSE keeps a re-run quiet when the directory is still
# present from an earlier, interrupted attempt.
tmpdir <- "./tmpdir"
dir.create(tmpdir, recursive = TRUE, showWarnings = FALSE)

# Generate the read depth matrix and report the elapsed time
system.time({
  mtrx_samtools_reticulate__example <-
    get_depth_matrix(
      bam_files = bam_files,
      target_virus_name = target_virus_name,
      mode = "samtools_basilisk",
      N_cores = N_cores,
      min_mapq = 30,
      tmpdir = tmpdir,
      condaenv = "env_samtools",
      condaenv_samtools_version = "1.21"
    )
})
#> + /var/cache/basilisk/1.19.0/0/bin/conda create --yes --prefix /var/cache/basilisk/1.19.0/ELViS/0.99.1/env_samtools 'python=3.10.14' --quiet -c conda-forge -c bioconda --override-channels
#> + /var/cache/basilisk/1.19.0/0/bin/conda install --yes --prefix /var/cache/basilisk/1.19.0/ELViS/0.99.1/env_samtools 'python=3.10.14' -c conda-forge -c bioconda --override-channels
#> + /var/cache/basilisk/1.19.0/0/bin/conda install --yes --prefix /var/cache/basilisk/1.19.0/ELViS/0.99.1/env_samtools -c conda-forge -c bioconda 'python=3.10.14' 'samtools=1.21' --override-channels
#> user system elapsed
#> 68.652 5.685 75.420
# Clean up: delete the temporary directory together with its contents
unlink(tmpdir, recursive = TRUE)
Determine the sample filtering threshold using a histogram and filter out low-depth samples.
# Load the precalculated depth matrix
data(mtrx_samtools_reticulate)
# Threshold used for the histograms and for sample filtering
th <- 50
# Histogram with an adjustable threshold and a custom summary function (max)
depth_hist(mtrx_samtools_reticulate, th = th, smry_fun = max)
#> Warning in scale_x_continuous(trans = "log10"): log-10 transformation
#> introduced infinite values.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram with quantile() as the summary function. The `probs` argument
# is spelled out in full instead of relying on partial matching of `prob`.
# NOTE(review): assumes depth_hist() forwards extra arguments to smry_fun.
depth_hist(mtrx_samtools_reticulate, th = th, smry_fun = quantile, probs = 0.75)
#> Warning in scale_x_continuous(trans = "log10"): log-10 transformation
#> introduced infinite values.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Filtered matrix
base_resol_depth <- filt_samples(mtrx_samtools_reticulate, th = th, smry_fun = max)
print(base_resol_depth[1:4, 1:4])
#> Control_100X_58.bam Control_100X_61.bam Control_100X_64.bam
#> [1,] 55 57 60
#> [2,] 56 59 62
#> [3,] 57 61 62
#> [4,] 57 61 63
#> Control_100X_83.bam
#> [1,] 49
#> [2,] 49
#> [3,] 52
#> [4,] 53
Running ELViS using the filtered read depth matrix (`base_resol_depth`).
Prepare plotting data
# ELViS run result
data(ELViS_toy_run_result)
result <- ELViS_toy_run_result

# Directory where figures will be saved.
# showWarnings = FALSE avoids a warning when this is re-run in the same
# session and the directory already exists.
figure_dir <- glue("{analysis_dir}/figures")
dir.create(figure_dir, showWarnings = FALSE)

# GFF3 file of the virus of interest. The sequence (chromosome) name should
# match the one in the reference genome FASTA file.
gff3_fn <- system.file("extdata", "HPV16REF_PaVE.gff", package = package_name)
Raw read depth profile line plots.
# Plot the raw depth profile (plot_target = "x").
# The call ends without a trailing comma so that no empty argument is passed.
gg_lst_x <- plot_pileUp_multisample(
  result = result,
  X_raw = base_resol_depth,
  plot_target = "x",
  gff3 = gff3_fn,
  baseline = 1,
  exclude_genes = c("E6*", "E1^E4", "E8^E2")
)
#> Import genomic features from the file as a GRanges object ...
#> Warning in .local(con, format, text, ...): gff-version directive indicates
#> version is 3.1.26, not 3
#> OK
#> Prepare the 'metadata' data frame ... OK
#> Make the TxDb object ...
#> Warning in .extract_transcripts_from_GRanges(tx_IDX, gr, mcols0$type, mcols0$ID, : the transcript names ("tx_name" column in the TxDb object) imported from the
#> "Name" attribute are not unique
#> Warning in .makeTxDb_normarg_chrominfo(chrominfo): genome version information
#> is not available for this TxDb object
#> OK
# Save to a PDF file; set SKIP = FALSE if you want to save as pdf
SKIP <- TRUE
if (!SKIP) {
  pdf(glue("{figure_dir}/Raw_Depth_CNV_call.pdf"), height = 4, width = 6)
  # Values are not auto-printed inside an `if` block, so print explicitly;
  # otherwise nothing is drawn on the PDF device.
  print(gg_lst_x)
  dev.off()
}
# An example of a raw read depth line plot
print(gg_lst_x[[1]])
You can adjust baseline after examining depth profile plots.
# Set the longest segment as the new baseline
new_baseline <- get_new_baseline(result, mode = "longest")

# Plot the raw depth profile with the new baseline.
# The call ends without a trailing comma so that no empty argument is passed.
gg_lst_x <- plot_pileUp_multisample(
  result = result,
  X_raw = base_resol_depth,
  plot_target = "x",
  gff3 = gff3_fn,
  baseline = new_baseline,
  exclude_genes = c("E6*", "E1^E4", "E8^E2")
)

# Save to a PDF file; set SKIP = FALSE if you want to save as pdf
SKIP <- TRUE
if (!SKIP) {
  # Write into figure_dir rather than a relative "figures/" folder, which
  # may not exist in the current working directory.
  pdf(
    glue("{figure_dir}/Raw_Depth_new_baseline_CNV_call.pdf"),
    height = 4, width = 6
  )
  # Values are not auto-printed inside an `if` block, so print explicitly;
  # otherwise nothing is drawn on the PDF device.
  print(gg_lst_x)
  dev.off()
}
# An example of a raw read depth line plot with the new baseline
gg_lst_x[[1]]
Normalized read depth profile line plots.
# Plot the normalized depth profile (plot_target = "y").
# The call ends without a trailing comma so that no empty argument is passed.
gg_lst_y <- plot_pileUp_multisample(
  result = result,
  X_raw = base_resol_depth,
  plot_target = "y",
  gff3 = gff3_fn,
  baseline = new_baseline,
  exclude_genes = c("E6*", "E1^E4", "E8^E2")
)

# Save to a PDF file; set SKIP = FALSE if you want to save as pdf
SKIP <- TRUE
if (!SKIP) {
  # Write into figure_dir rather than a relative "figures/" folder, which
  # may not exist in the current working directory.
  pdf(
    glue("{figure_dir}/Normalized_Depth_CNV_call.pdf"),
    height = 4, width = 6
  )
  # Values are not auto-printed inside an `if` block, so print explicitly;
  # otherwise nothing is drawn on the PDF device.
  print(gg_lst_y)
  dev.off()
}
# An example of a normalized read depth line plot with the new baseline
gg_lst_y[[1]]
Robust Z-score profile line plots.
# Plot the robust Z-score profile (plot_target = "z")
gg_lst_z <- plot_pileUp_multisample(
  result = result,
  X_raw = base_resol_depth,
  plot_target = "z",
  gff3 = gff3_fn,
  baseline = new_baseline,
  exclude_genes = c("E6*", "E1^E4", "E8^E2")
)

# Save to a PDF file; set SKIP = FALSE if you want to save as pdf
SKIP <- TRUE
if (!SKIP) {
  # Write into figure_dir rather than a relative "figures/" folder, which
  # may not exist in the current working directory.
  pdf(
    glue("{figure_dir}/Robust-Z-score_CNV_call.pdf"),
    height = 4, width = 6
  )
  # Values are not auto-printed inside an `if` block, so print explicitly;
  # otherwise nothing is drawn on the PDF device.
  print(gg_lst_z)
  dev.off()
}
# An example of a Z-score line plot with the new baseline
gg_lst_z[[1]]
Generating heatmaps with integrative clustering.
Calculation of viral loads. Get the total number of aligned bases using tools such as Picard. Here we use randomly generated numbers instead.
data(total_aligned_base__host_and_virus)
# Viral load per sample: column sum of the depth matrix, scaled by 10^6 and
# divided by the total aligned bases (host and virus). colSums() replaces the
# apply()/sum() loop and does not overflow on integer input.
viral_load <- (10^6) * colSums(base_resol_depth) / total_aligned_base__host_and_virus
# Distribution of the overall viral load (log10 scale)
viral_load %>% log10 %>% hist
Generate heatmaps with integrative clustering using data transformed in various ways.
# Gene names passed as `exclude_genes` to the heatmap functions
exclude_genes <- c("E6*", "E1^E4", "E8^E2")

# Build the integrative heatmap from the raw depth matrix and the ELViS result
integ_ht_result <- integrative_heatmap(
  X_raw = base_resol_depth,
  result = result,
  gff3_fn = gff3_fn,
  exclude_genes = exclude_genes,
  # baseline = new_baseline,
  baseline = 1,
  # col_z = col_z,
  total_aligned_base__host_and_virus = total_aligned_base__host_and_virus
)
#> Import genomic features from the file as a GRanges object ...
#> Warning in .local(con, format, text, ...): gff-version directive indicates
#> version is 3.1.26, not 3
#> OK
#> Prepare the 'metadata' data frame ... OK
#> Make the TxDb object ...
#> Warning in .extract_transcripts_from_GRanges(tx_IDX, gr, mcols0$type, mcols0$ID, : the transcript names ("tx_name" column in the TxDb object) imported from the
#> "Name" attribute are not unique
#> Warning in .makeTxDb_normarg_chrominfo(chrominfo): genome version information
#> is not available for this TxDb object
#> OK
#> Import genomic features from the file as a GRanges object ...
#> Warning in .local(con, format, text, ...): gff-version directive indicates
#> version is 3.1.26, not 3
#> OK
#> Prepare the 'metadata' data frame ... OK
#> Make the TxDb object ...
#> Warning in .extract_transcripts_from_GRanges(tx_IDX, gr, mcols0$type, mcols0$ID, : the transcript names ("tx_name" column in the TxDb object) imported from the
#> "Name" attribute are not unique
#> Warning in .extract_transcripts_from_GRanges(tx_IDX, gr, mcols0$type, mcols0$ID, : genome version information is not available for this TxDb object
#> OK
#> `use_raster` is automatically set to TRUE for a matrix with more than
#> 2000 rows. You can control `use_raster` argument by explicitly setting
#> TRUE/FALSE to it.
#>
#> Set `ht_opt$message = FALSE` to turn off this message.
#> Warning: The input is a data frame-like object, convert it to a matrix.
#> `use_raster` is automatically set to TRUE for a matrix with more than
#> 2000 rows. You can control `use_raster` argument by explicitly setting
#> TRUE/FALSE to it.
#>
#> Set `ht_opt$message = FALSE` to turn off this message.
#> `use_raster` is automatically set to TRUE for a matrix with more than
#> 2000 rows. You can control `use_raster` argument by explicitly setting
#> TRUE/FALSE to it.
#>
#> Set `ht_opt$message = FALSE` to turn off this message.
#> `use_raster` is automatically set to TRUE for a matrix with more than
#> 2000 rows. You can control `use_raster` argument by explicitly setting
#> TRUE/FALSE to it.
#>
#> Set `ht_opt$message = FALSE` to turn off this message.
#> `use_raster` is automatically set to TRUE for a matrix with more than
#> 2000 rows. You can control `use_raster` argument by explicitly setting
#> TRUE/FALSE to it.
#>
#> Set `ht_opt$message = FALSE` to turn off this message.
#> Warning: The input is a data frame-like object, convert it to a matrix.
#> `use_raster` is automatically set to TRUE for a matrix with more than
#> 2000 rows. You can control `use_raster` argument by explicitly setting
#> TRUE/FALSE to it.
#>
#> Set `ht_opt$message = FALSE` to turn off this message.
#> Warning: The heatmap has not been initialized. You might have different results
#> if you repeatedly execute this function, e.g. when row_km/column_km was
#> set. It is more suggested to do as `ht = draw(ht); column_order(ht)`.
# top annotation
# Top annotation: point annotation of the log2 viral load, with the
# annotation name on the left side and unrotated
top_ant <- HeatmapAnnotation(
  `Log2 Overall\nViral Load` = anno_points(log2(viral_load)),
  annotation_name_side = "left",
  annotation_name_rot = 0
)
Generate heatmaps showing the maximum number of intact copies (the minimum copy number of the overlapping copy segments) and the ratio relative to a certain gene (`gene_ref`).
# Gene passed as `gene_ref` to gene_cn_heatmaps()
gene_ref <- "E7"

# Build the gene copy-number heatmaps using the new baseline
gene_cn <- gene_cn_heatmaps(
  X_raw = base_resol_depth,
  result = result,
  gff3_fn = gff3_fn,
  baseline = new_baseline,
  # baseline = 1,
  gene_ref = gene_ref,
  exclude_genes = exclude_genes
)
#> Import genomic features from the file as a GRanges object ...
#> Warning in .local(con, format, text, ...): gff-version directive indicates
#> version is 3.1.26, not 3
#> OK
#> Prepare the 'metadata' data frame ... OK
#> Make the TxDb object ...
#> Warning in .extract_transcripts_from_GRanges(tx_IDX, gr, mcols0$type, mcols0$ID, : the transcript names ("tx_name" column in the TxDb object) imported from the
#> "Name" attribute are not unique
#> Warning in .makeTxDb_normarg_chrominfo(chrominfo): genome version information
#> is not available for this TxDb object
#> OK
Generate final heatmap in a single panel.
# Stack the top annotation, the integrative heatmap and the two gene
# copy-number heatmaps with %v%, then draw them as a single panel
draw(
  top_ant %v%
    integ_ht_result$Heatmap %v%
    gene_cn$Heatmaps$intact_gene_cn %v%
    gene_cn$Heatmaps$rel_dosage
)
# Report the R session details
sessionInfo()
#> R Under development (unstable) (2024-10-21 r87258)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] grid stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] ComplexHeatmap_2.23.0 dplyr_1.1.4 glue_1.8.0
#> [4] ggplot2_3.5.1 ELViS_0.99.1
#>
#> loaded via a namespace (and not attached):
#> [1] DBI_1.2.3 bitops_1.0-9
#> [3] RBGL_1.83.0 httr2_1.0.7
#> [5] biomaRt_2.63.0 rlang_1.1.4
#> [7] magrittr_2.0.3 clue_0.3-66
#> [9] GetoptLong_1.0.5 matrixStats_1.4.1
#> [11] compiler_4.5.0 RSQLite_2.3.9
#> [13] GenomicFeatures_1.59.1 dir.expiry_1.15.0
#> [15] txdbmaker_1.3.1 png_0.1-8
#> [17] vctrs_0.6.5 stringr_1.5.1
#> [19] pkgconfig_2.0.3 shape_1.4.6.1
#> [21] crayon_1.5.3 fastmap_1.2.0
#> [23] magick_2.8.5 dbplyr_2.5.0
#> [25] XVector_0.47.0 labeling_0.4.3
#> [27] utf8_1.2.4 Rsamtools_2.23.1
#> [29] rmarkdown_2.29 graph_1.85.0
#> [31] UCSC.utils_1.3.0 bit_4.5.0.1
#> [33] xfun_0.49 zlibbioc_1.53.0
#> [35] cachem_1.1.0 GenomeInfoDb_1.43.2
#> [37] jsonlite_1.8.9 progress_1.2.3
#> [39] blob_1.2.4 DelayedArray_0.33.3
#> [41] BiocParallel_1.41.0 uuid_1.2-1
#> [43] prettyunits_1.2.0 parallel_4.5.0
#> [45] cluster_2.1.6 R6_2.5.1
#> [47] bslib_0.8.0 stringi_1.8.4
#> [49] RColorBrewer_1.1-3 reticulate_1.40.0
#> [51] rtracklayer_1.67.0 GenomicRanges_1.59.1
#> [53] jquerylib_0.1.4 SummarizedExperiment_1.37.0
#> [55] Rcpp_1.0.13-1 iterators_1.0.14
#> [57] knitr_1.49 zoo_1.8-12
#> [59] IRanges_2.41.2 Matrix_1.7-1
#> [61] igraph_2.1.1 tidyselect_1.2.1
#> [63] abind_1.4-8 yaml_2.3.10
#> [65] doParallel_1.0.17 codetools_0.2-20
#> [67] curl_6.0.1 lattice_0.22-6
#> [69] tibble_3.2.1 withr_3.0.2
#> [71] Biobase_2.67.0 basilisk.utils_1.19.0
#> [73] KEGGREST_1.47.0 evaluate_1.0.1
#> [75] segclust2d_0.3.3 BiocFileCache_2.15.0
#> [77] xml2_1.3.6 circlize_0.4.16
#> [79] Biostrings_2.75.1 pillar_1.9.0
#> [81] filelock_1.0.3 MatrixGenerics_1.19.0
#> [83] foreach_1.5.2 stats4_4.5.0
#> [85] generics_0.1.3 RCurl_1.98-1.16
#> [87] hms_1.1.3 S4Vectors_0.45.2
#> [89] munsell_0.5.1 scales_1.3.0
#> [91] tools_4.5.0 BiocIO_1.17.1
#> [93] data.table_1.16.2 GenomicAlignments_1.43.0
#> [95] XML_3.99-0.17 Cairo_1.6-2
#> [97] AnnotationDbi_1.69.0 colorspace_2.1-1
#> [99] GenomeInfoDbData_1.2.13 patchwork_1.3.0
#> [101] basilisk_1.19.0 restfulr_0.0.15
#> [103] cli_3.6.3 rappdirs_0.3.3
#> [105] fansi_1.0.6 viridisLite_0.4.2
#> [107] S4Arrays_1.7.1 gtable_0.3.6
#> [109] sass_0.4.9 digest_0.6.37
#> [111] BiocGenerics_0.53.3 SparseArray_1.7.2
#> [113] rjson_0.2.23 farver_2.1.2
#> [115] memoise_2.0.1 htmltools_0.5.8.1
#> [117] lifecycle_1.0.4 httr_1.4.7
#> [119] GlobalOptions_0.1.2 bit64_4.5.2