Contents

1 Introduction

BamScale provides multithreaded sequential BAM traversal for R/Bioconductor workflows. The package is designed for users who already rely on Rsamtools, GenomicAlignments, and BiocParallel, but need faster BAM parsing without moving to a separate non-Bioconductor workflow model.

The package targets three common classes of BAM access:

The motivation for inclusion in Bioconductor is therefore straightforward: BamScale accelerates a core data-access bottleneck while preserving familiar Bioconductor inputs, filtering semantics, and output types.

2 Installation

At present, BamScale depends on the ompBAM engine and is intended for use with an OpenMP-capable toolchain.

if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}

BiocManager::install("BamScale")

3 Basic usage

library(BamScale)

bam <- ompBAM::example_BAM("Unsorted")

3.1 Metadata-oriented BAM access

The most common BAM-access pattern is extraction of alignment metadata such as read name, flag, reference name, position, mapping quality, and CIGAR string.

x <- bam_read(
    file = bam,
    what = c("qname", "flag", "rname", "pos", "mapq", "cigar"),
    as = "data.frame",
    threads = 2
)

head(x)
#>                                       qname flag rname      pos mapq
#> 1  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309  163    19   572614  255
#> 2  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309   83    19   579499  255
#> 3 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309  163     6 44252112  255
#> 4 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309   83     6 44252124  255
#> 5 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309   99    12 46185884  255
#> 6 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309  147    12 46185968  255
#>           cigar
#> 1 1S88M6798N61M
#> 2      1S148M1S
#> 3      60S86M4S
#> 4      1S146M3S
#> 5        1S149M
#> 6      7S142M1S

3.2 Filtering with ScanBamParam

BamScale accepts Rsamtools::ScanBamParam, allowing existing filtering logic to be reused directly.

param <- Rsamtools::ScanBamParam(
    what = c("qname", "flag", "mapq"),
    mapqFilter = 20L,
    flag = Rsamtools::scanBamFlag(isUnmappedQuery = FALSE)
)

filtered <- bam_read(
    file = bam,
    param = param,
    as = "data.frame",
    threads = 2
)

head(filtered)
#>                                       qname flag mapq
#> 1  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309  163  255
#> 2  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309   83  255
#> 3 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309  163  255
#> 4 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309   83  255
#> 5 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309   99  255
#> 6 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309  147  255

3.3 Alignment-object output

When downstream workflows expect GenomicAlignments objects, bam_read() can return those directly.

ga <- bam_read(
    file = bam,
    what = c("qname", "flag", "rname", "pos", "cigar", "strand"),
    as = "GAlignments",
    threads = 2
)

ga
#> GAlignments object with 10000 alignments and 2 metadata columns:
#>           seqnames strand              cigar    qwidth     start       end
#>              <Rle>  <Rle>        <character> <integer> <integer> <integer>
#>       [1]       19      +      1S88M6798N61M       150    572614    579560
#>       [2]       19      -           1S148M1S       150    579499    579646
#>       [3]        6      +           60S86M4S       150  44252112  44252197
#>       [4]        6      -           1S146M3S       150  44252124  44252269
#>       [5]       12      +             1S149M       150  46185884  46186032
#>       ...      ...    ...                ...       ...       ...       ...
#>    [9996]        2      -         16S122M12S       150 186680457 186680578
#>    [9997]       15      +           4S143M3S       150  90501177  90501319
#>    [9998]       15      -        26M2I56M66S       150  90501301  90501382
#>    [9999]        4      + 3M336N95M752N51M1S       150 121801488 121802724
#>   [10000]        4      -      1S117M121N32M       150 121802677 121802946
#>               width     njunc |                  qname      flag
#>           <integer> <integer> |            <character> <integer>
#>       [1]      6947         1 | ST-E00600:137:H77Y3C..       163
#>       [2]       148         0 | ST-E00600:137:H77Y3C..        83
#>       [3]        86         0 | ST-E00600:137:H77Y3C..       163
#>       [4]       146         0 | ST-E00600:137:H77Y3C..        83
#>       [5]       149         0 | ST-E00600:137:H77Y3C..        99
#>       ...       ...       ... .                    ...       ...
#>    [9996]       122         0 | ST-E00600:137:H77Y3C..        83
#>    [9997]       143         0 | ST-E00600:137:H77Y3C..       163
#>    [9998]        82         0 | ST-E00600:137:H77Y3C..        83
#>    [9999]      1237         2 | ST-E00600:137:H77Y3C..       163
#>   [10000]       270         1 | ST-E00600:137:H77Y3C..        83
#>   -------
#>   seqinfo: 25 sequences from an unspecified genome; no seqlengths

3.4 Sequence and quality extraction

For seq and qual, BamScale supports both a compatibility-preserving mode and an optimized compact mode.

seqqual_compatible <- bam_read(
    file = bam,
    what = c("qname", "seq", "qual"),
    as = "data.frame",
    seqqual_mode = "compatible",
    threads = 2
)

head(seqqual_compatible)
#>                                       qname
#> 1  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309
#> 2  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309
#> 3 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309
#> 4 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309
#> 5 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309
#> 6 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309
#>                                                                                                                                                      seq
#> 1 NGACCGGCGAGGAATAGGAATCATGGCGGCTGCGCTGTTCGTGCTGCTGGGATTCGCGCTGCTGGGCACCCACGGAGCCTCCGGGGCTGCCGGCACAGTCTTCACTACCGTAGAAGACCTTGGCTCGAAGATACTCCTCACCTGCTCCTT
#> 2 TGCCGGCACAGTCTTCACTACCGTAGAAGACCTTGGCTCCAAGATACTCCTCACCTGCTCCTTGAATGACAGCGCCACAGAGGTCACAGGGCACCGCTGGCTGAAGGCGGGCGTGGTGCTGAAGGAGGACGCGCTGCCAGGCCAGAAAAN
#> 3 NGGAGCTTCGAGGTGGTATATATGACCGAGCCCATTAATGAGTACTGTGTGCAGCAGCGTAAGCAATTTGATGGGAAGAGCGTGGTCTCAGTTAGCAAGGAGGGTCTGGAGCTGCCTGAGGATGAGGAGGAGCTGAAGAAGATGGAGGAG
#> 4 TGGGAAGAGCCTGGTCTCAGTTACCAAGGAGGGTCTGGAGCTGCCTGAGGATGAGTAGGAGAAGAAGAAGATGGAAGAGAGCAAGGTAAAGTTTGAGAACCTCTGCAAGCTCATGAAAGAAATCTTAGATAAGAAGGTCGAGATGGTGAN
#> 5 NGCATCTCTACCCCTACTGTCCAGTAGGTGGGATGTGGCTGGGCTGGACAGTCCAGATTATACGAACTGGCAACTCTGAACAAACACCCTCCCTGGAAACAATATTATATTTGATGGTTAGATTCTTTAGCAAACCTATTACATTATTCG
#> 6 AACAAAAACCCTCCCTAGCAACAATATTATATTTGATCGTTAGATTCTTTAGCAAACCTATTACCTTATTCGATGTCAGCTAACCTCTTGGTTTGATCATCTTTTCCAGCTGCTCTAGGGGCCTAACCACCTTGGATAACCTGGTCTCTN
#>                                                                                                                                                     qual
#> 1 #AA-AAFJFJJJJJJJJJAAJJJJJJJJJJJJJF<-FJF-AAFJJJJFFJJFFF<FFJ-<<AFAFF7-AFJFJJJJJ77FJAAJJ7AA777AJJAJFJ-A<AA<AAF7FFA-FAFA<-77<FJA--)77A7AA7--7AF<)7777<))--
#> 2 -FAAA)-AJFF7JJFFAF-FJAF-<A-7--JJJFA-F-A7-<-F7-FFJFFJAFJFJJAJFJ<FFJJJFAF7JJFFJFFJ7<-<AA-<AF-JJA<7<777A7<JA77-7JJJJJFJJJFJJJFFJJFJJJJJJJJJF7-JJJF-AFAAA#
#> 3 #AA-AFFJFJJAAFFJAJJJ<AJJJFJJJJJA<<F7-F-<-<FFAFFJFAJJF<<FJ----<<-7<J-FF-AJJJF<J<JF-AFAA7-7AJJJF-7777A-7AA---777---7FJ--AFA7-7AAJFAA)7))7---A<7<--7---A7
#> 4 AJA-FFJJJFF<7FF-AA<7F<AF<-7FJFJJJJJFF-<7-JFJJJJFFAJFJ<A-JF-JJJAFFFFJJJJFJ7<--F-A-<7JF<-FF<<---<-77<-A-FJJFJJFJFJJJJJJJJJJJ<JJJJJJJJJJJJJJ<7JJJF<FFAAA#
#> 5 #AAFFF-FJJJA-AFJJJJJJJJJJJJJJJJJJJJJJJJFJF7-7FJJJFJJJ<<F-FF-FJJFJFJ-F<JA7-7-7AF-JJJFJJJFJJJ7FJJJ<FJJJJJJJJJJJJFJJJJJ7FFJ-7-7-A-AFFF-F--7F7AJJJJJ-<<FJJ
#> 6 7A7-7--<A-)<)AA7-7-A<<7-FF<7<F<7-7A-F-JJAAFJJJJJJA<JF<A7JJJFAA-J<JJFJJA-7<JJJJFFJJAJJA7<<<7<FJJAJJJFJJFJJJA-JJFJJFJJJJJJJJFJJJJJJ<F-JJJJJFFJJFJJFFAAA#

Compact mode returns lower-level raw vectors for throughput-oriented workflows. These values can be decoded back to ordinary strings explicitly when needed.

seqqual_compact <- bam_read(
    file = bam,
    what = c("qname", "qwidth", "seq", "qual"),
    as = "data.frame",
    seqqual_mode = "compact",
    threads = 2
)

head(seqqual_compact)
#>                                       qname qwidth          seq         qual
#> 1  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309    150 f4, 12, .... 02, 20, ....
#> 2  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309    150 84, 22, .... 0c, 25, ....
#> 3 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309    150 f4, 41, .... 02, 20, ....
#> 4 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309    150 84, 44, .... 20, 29, ....
#> 5 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309    150 f4, 21, .... 02, 20, ....
#> 6 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309    150 11, 21, .... 16, 20, ....

seqqual_decoded <- decode_seqqual_compact(seqqual_compact)
head(seqqual_decoded)
#>                                       qname qwidth
#> 1  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309    150
#> 2  ST-E00600:137:H77Y3CCXY:1:1101:6837:1309    150
#> 3 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309    150
#> 4 ST-E00600:137:H77Y3CCXY:1:1101:10450:1309    150
#> 5 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309    150
#> 6 ST-E00600:137:H77Y3CCXY:1:1101:15077:1309    150
#>                                                                                                                                                      seq
#> 1 NGACCGGCGAGGAATAGGAATCATGGCGGCTGCGCTGTTCGTGCTGCTGGGATTCGCGCTGCTGGGCACCCACGGAGCCTCCGGGGCTGCCGGCACAGTCTTCACTACCGTAGAAGACCTTGGCTCGAAGATACTCCTCACCTGCTCCTT
#> 2 TGCCGGCACAGTCTTCACTACCGTAGAAGACCTTGGCTCCAAGATACTCCTCACCTGCTCCTTGAATGACAGCGCCACAGAGGTCACAGGGCACCGCTGGCTGAAGGCGGGCGTGGTGCTGAAGGAGGACGCGCTGCCAGGCCAGAAAAN
#> 3 NGGAGCTTCGAGGTGGTATATATGACCGAGCCCATTAATGAGTACTGTGTGCAGCAGCGTAAGCAATTTGATGGGAAGAGCGTGGTCTCAGTTAGCAAGGAGGGTCTGGAGCTGCCTGAGGATGAGGAGGAGCTGAAGAAGATGGAGGAG
#> 4 TGGGAAGAGCCTGGTCTCAGTTACCAAGGAGGGTCTGGAGCTGCCTGAGGATGAGTAGGAGAAGAAGAAGATGGAAGAGAGCAAGGTAAAGTTTGAGAACCTCTGCAAGCTCATGAAAGAAATCTTAGATAAGAAGGTCGAGATGGTGAN
#> 5 NGCATCTCTACCCCTACTGTCCAGTAGGTGGGATGTGGCTGGGCTGGACAGTCCAGATTATACGAACTGGCAACTCTGAACAAACACCCTCCCTGGAAACAATATTATATTTGATGGTTAGATTCTTTAGCAAACCTATTACATTATTCG
#> 6 AACAAAAACCCTCCCTAGCAACAATATTATATTTGATCGTTAGATTCTTTAGCAAACCTATTACCTTATTCGATGTCAGCTAACCTCTTGGTTTGATCATCTTTTCCAGCTGCTCTAGGGGCCTAACCACCTTGGATAACCTGGTCTCTN
#>                                                                                                                                                     qual
#> 1 #AA-AAFJFJJJJJJJJJAAJJJJJJJJJJJJJF<-FJF-AAFJJJJFFJJFFF<FFJ-<<AFAFF7-AFJFJJJJJ77FJAAJJ7AA777AJJAJFJ-A<AA<AAF7FFA-FAFA<-77<FJA--)77A7AA7--7AF<)7777<))--
#> 2 -FAAA)-AJFF7JJFFAF-FJAF-<A-7--JJJFA-F-A7-<-F7-FFJFFJAFJFJJAJFJ<FFJJJFAF7JJFFJFFJ7<-<AA-<AF-JJA<7<777A7<JA77-7JJJJJFJJJFJJJFFJJFJJJJJJJJJF7-JJJF-AFAAA#
#> 3 #AA-AFFJFJJAAFFJAJJJ<AJJJFJJJJJA<<F7-F-<-<FFAFFJFAJJF<<FJ----<<-7<J-FF-AJJJF<J<JF-AFAA7-7AJJJF-7777A-7AA---777---7FJ--AFA7-7AAJFAA)7))7---A<7<--7---A7
#> 4 AJA-FFJJJFF<7FF-AA<7F<AF<-7FJFJJJJJFF-<7-JFJJJJFFAJFJ<A-JF-JJJAFFFFJJJJFJ7<--F-A-<7JF<-FF<<---<-77<-A-FJJFJJFJFJJJJJJJJJJJ<JJJJJJJJJJJJJJ<7JJJF<FFAAA#
#> 5 #AAFFF-FJJJA-AFJJJJJJJJJJJJJJJJJJJJJJJJFJF7-7FJJJFJJJ<<F-FF-FJJFJFJ-F<JA7-7-7AF-JJJFJJJFJJJ7FJJJ<FJJJJJJJJJJJJFJJJJJ7FFJ-7-7-A-AFFF-F--7F7AJJJJJ-<<FJJ
#> 6 7A7-7--<A-)<)AA7-7-A<<7-FF<7<F<7-7A-F-JJAAFJJJJJJA<JF<A7JJJFAA-J<JJFJJA-7<JJJJFFJJAJJA7<<<7<FJJAJJJFJJFJJJA-JJFJJFJJJJJJJJFJJJJJJ<F-JJJJJFFJJFJJFFAAA#

3.5 Fast count summaries

bam_count() provides chromosome-level count summaries using the same BAM filtering model.

counts <- bam_count(bam, threads = 2)
counts
#>    seqname seqlength count
#> 1        1 248956422  1308
#> 2       10 133797422   308
#> 3       11 135086622   600
#> 4       12 133275309   648
#> 5       13 114364328   162
#> 6       14 107043718   230
#> 7       15 101991189   294
#> 8       16  90338345   334
#> 9       17  83257441   570
#> 10      18  80373285    78
#> 11      19  58617616   600
#> 12       2 242193529   586
#> 13      20  64444167   168
#> 14      21  46709983    76
#> 15      22  50818468   200
#> 16       3 198295559   454
#> 17       4 190214555   252
#> 18       5 181538259   516
#> 19       6 170805979   824
#> 20       7 159345973   412
#> 21       8 145138636   452
#> 22       9 138394717   372
#> 23      MT     16569   288
#> 24       X 156040895   254
#> 25       Y  57227415    14
#> 26       *        NA     0

4 Relationship to existing Bioconductor tools

BamScale is intended to complement, not replace, existing Bioconductor packages.

The main difference is therefore performance-oriented: BamScale accelerates the traversal step itself while staying close to existing Bioconductor usage patterns.

5 Session information

sessionInfo()
#> R version 4.6.0 RC (2026-04-17 r89917)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] BamScale_0.99.9  BiocStyle_2.40.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] Matrix_1.7-5                jsonlite_2.0.0             
#>  [3] compiler_4.6.0              BiocManager_1.30.27        
#>  [5] crayon_1.5.3                Rcpp_1.1.1-1.1             
#>  [7] SummarizedExperiment_1.42.0 Biobase_2.72.0             
#>  [9] Rsamtools_2.28.0            GenomicRanges_1.64.0       
#> [11] bitops_1.0-9                Biostrings_2.80.0          
#> [13] GenomicAlignments_1.48.0    parallel_4.6.0             
#> [15] jquerylib_0.1.4             IRanges_2.46.0             
#> [17] Seqinfo_1.2.0               BiocParallel_1.46.0        
#> [19] yaml_2.3.12                 fastmap_1.2.0              
#> [21] lattice_0.22-9              R6_2.6.1                   
#> [23] XVector_0.52.0              S4Arrays_1.12.0            
#> [25] generics_0.1.4              knitr_1.51                 
#> [27] BiocGenerics_0.58.0         DelayedArray_0.38.1        
#> [29] bookdown_0.46               MatrixGenerics_1.24.0      
#> [31] bslib_0.10.0                rlang_1.2.0                
#> [33] cachem_1.1.0                ompBAM_1.16.0              
#> [35] xfun_0.57                   sass_0.4.10                
#> [37] otel_0.2.0                  SparseArray_1.12.2         
#> [39] cli_3.6.6                   grid_4.6.0                 
#> [41] digest_0.6.39               lifecycle_1.0.5            
#> [43] S4Vectors_0.50.0            evaluate_1.0.5             
#> [45] cigarillo_1.2.0             codetools_0.2-20           
#> [47] abind_1.4-8                 stats4_4.6.0               
#> [49] rmarkdown_2.31              matrixStats_1.5.0          
#> [51] tools_4.6.0                 htmltools_0.5.9