DNA, amino acid, and other biological sequences. See earlier example in B.1 Introduction to Bioconductor.
GRanges()
: genomic coordinates to represent annotations (exons, genes, regulatory marks, …) and data (called peaks, variants, aligned reads)
GRangesList()
: genomic coordinates grouped into list elements (e.g., paired-end reads; exons grouped by transcript)
start()
/ end()
/ width()
length()
, subset, etc.mcols()
Seqinfo
, including seqlevels
and seqlengths
shift()
, narrow()
, flank()
, promoters()
, resize()
, restrict()
, trim()
?"intra-range-methods"
range()
, reduce()
, gaps()
, disjoin()
coverage()
(!)?"inter-range-methods"
findOverlaps()
, countOverlaps()
, …, %over%
, %within%
, %outside%
; union()
, intersect()
, setdiff()
, punion()
, pintersect()
, psetdiff()
library(GenomicRanges)
gr <- GRanges("A", IRanges(c(10, 20, 22), width=5), "+")
shift(gr, 1) # intra-range
## GRanges object with 3 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [11, 15] +
## [2] A [21, 25] +
## [3] A [23, 27] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
range(gr) # inter-range
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [10, 26] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
reduce(gr) # inter-range
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [10, 14] +
## [2] A [20, 26] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
snps <- GRanges("A", IRanges(c(11, 17, 24), width=1))
findOverlaps(snps, gr) # between-range
## Hits object with 3 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 3 2
## [3] 3 3
## -------
## queryLength: 3 / subjectLength: 3
setdiff(range(gr), gr) # 'introns'
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] A [15, 19] +
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Representation of aligned reads. See exercises below.
colData()
data frame for desciption of samplesrowRanges()
GRanges / GRangeList or data frame for description of featuresexptData()
to describe the entire objectassays()
can be any matrix-like object, including very large on-disk representations such as HDF5Arraylibrary(SummarizedExperiment)
library(airway)
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
colData(airway)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
airway[, airway$dex %in% "trt"]
## class: RangedSummarizedExperiment
## dim: 64102 4
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
chr14 <- as(seqinfo(airway), "GRanges")["14"]
airway[airway %over% chr14,]
## class: RangedSummarizedExperiment
## dim: 2244 8
## metadata(1): ''
## assays(1): counts
## rownames(2244): ENSG00000006432 ENSG00000009830 ... ENSG00000273259
## ENSG00000273307
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
?select
?exonsBy
page to retrieve all exons grouped by gene or transcript.org.*: identifier mappings
select()
, columns()
, keys()
mapIds()
library(org.Hs.eg.db)
org <- org.Hs.eg.db
select(org, "BRCA1", c("ENSEMBL", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENSEMBL GENENAME
## 1 BRCA1 ENSG00000012048 BRCA1, DNA repair associated
TxDb.*: gene models
exons()
, transcripts()
, genes()
, promoters()
, …exonsBy()
, transcriptsBy()
select()
, etc.library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
promoters(txdb)
## GRanges object with 82960 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr1 [ 9874, 12073] + | 1 uc001aaa.3
## [2] chr1 [ 9874, 12073] + | 2 uc010nxq.1
## [3] chr1 [ 9874, 12073] + | 3 uc010nxr.1
## [4] chr1 [ 67091, 69290] + | 4 uc001aal.1
## [5] chr1 [319084, 321283] + | 5 uc001aaq.2
## ... ... ... ... . ... ...
## [82956] chrUn_gl000237 [ 2487, 4686] - | 82956 uc011mgu.1
## [82957] chrUn_gl000241 [36676, 38875] - | 82957 uc011mgv.2
## [82958] chrUn_gl000243 [ 9501, 11700] + | 82958 uc011mgw.1
## [82959] chrUn_gl000243 [11608, 13807] + | 82959 uc022brq.1
## [82960] chrUn_gl000247 [ 5617, 7816] - | 82960 uc022brr.1
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
library(AnnotationHub)
hub = AnnotationHub()
## snapshotDate(): 2017-04-25
hub
## AnnotationHub with 39332 records
## # snapshotDate(): 2017-04-25
## # $dataprovider: BroadInstitute, Ensembl, UCSC, Haemcode, Inparanoid8, ft...
## # $species: Homo sapiens, Mus musculus, Bos taurus, Pan troglodytes, Dani...
## # $rdataclass: GRanges, BigWigFile, FaFile, TwoBitFile, ChainFile, Rle, I...
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH2"]]'
##
## title
## AH2 | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel.fa
## AH3 | Ailuropoda_melanoleuca.ailMel1.69.dna_rm.toplevel.fa
## AH4 | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel.fa
## AH5 | Ailuropoda_melanoleuca.ailMel1.69.ncrna.fa
## AH6 | Ailuropoda_melanoleuca.ailMel1.69.pep.all.fa
## ... ...
## AH54627 | Xiphophorus_maculatus.Xipmac4.4.2.cdna.all.2bit
## AH54628 | Xiphophorus_maculatus.Xipmac4.4.2.dna.toplevel.2bit
## AH54629 | Xiphophorus_maculatus.Xipmac4.4.2.dna_rm.toplevel.2bit
## AH54630 | Xiphophorus_maculatus.Xipmac4.4.2.dna_sm.toplevel.2bit
## AH54631 | Xiphophorus_maculatus.Xipmac4.4.2.ncrna.2bit
query(hub, c("ensembl", "81.gtf"))
## AnnotationHub with 69 records
## # snapshotDate(): 2017-04-25
## # $dataprovider: Ensembl
## # $species: Ailuropoda melanoleuca, Anas platyrhynchos, Anolis carolinens...
## # $rdataclass: GRanges
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH47937"]]'
##
## title
## AH47937 | Ailuropoda_melanoleuca.ailMel1.81.gtf
## AH47938 | Anas_platyrhynchos.BGI_duck_1.0.81.gtf
## AH47939 | Anolis_carolinensis.AnoCar2.0.81.gtf
## AH47940 | Astyanax_mexicanus.AstMex102.81.gtf
## AH47941 | Bos_taurus.UMD3.1.81.gtf
## ... ...
## AH48001 | Tupaia_belangeri.TREESHREW.81.gtf
## AH48002 | Tursiops_truncatus.turTru1.81.gtf
## AH48003 | Vicugna_pacos.vicPac1.81.gtf
## AH48004 | Xenopus_tropicalis.JGI_4.2.81.gtf
## AH48005 | Xiphophorus_maculatus.Xipmac4.4.2.81.gtf
hub[["AH48004"]]
## loading from cache '/home/mtmorgan//.AnnotationHub/54310'
## using guess work to populate seqinfo
## GRanges object with 581787 ranges and 19 metadata columns:
## seqnames ranges strand | source type score
## <Rle> <IRanges> <Rle> | <factor> <factor> <numeric>
## [1] GL172637.1 [ 34, 148] - | ensembl gene <NA>
## [2] GL172637.1 [ 34, 148] - | ensembl transcript <NA>
## [3] GL172637.1 [ 34, 148] - | ensembl exon <NA>
## [4] GL172637.1 [606, 720] - | ensembl gene <NA>
## [5] GL172637.1 [606, 720] - | ensembl transcript <NA>
## ... ... ... ... . ... ... ...
## [581783] GL180121.1 [ 865, 867] + | ensembl start_codon <NA>
## [581784] GL180121.1 [ 992, 1334] + | ensembl exon <NA>
## [581785] GL180121.1 [ 992, 1334] + | ensembl CDS <NA>
## [581786] GL180121.1 [1817, 1835] + | ensembl exon <NA>
## [581787] GL180121.1 [1817, 1835] + | ensembl CDS <NA>
## phase gene_id gene_version gene_name gene_source
## <integer> <character> <numeric> <character> <character>
## [1] <NA> ENSXETG00000030486 1 U5 ensembl
## [2] <NA> ENSXETG00000030486 1 U5 ensembl
## [3] <NA> ENSXETG00000030486 1 U5 ensembl
## [4] <NA> ENSXETG00000031766 1 U5 ensembl
## [5] <NA> ENSXETG00000031766 1 U5 ensembl
## ... ... ... ... ... ...
## [581783] 0 ENSXETG00000033193 1 <NA> ensembl
## [581784] <NA> ENSXETG00000033193 1 <NA> ensembl
## [581785] 2 ENSXETG00000033193 1 <NA> ensembl
## [581786] <NA> ENSXETG00000033193 1 <NA> ensembl
## [581787] 1 ENSXETG00000033193 1 <NA> ensembl
## gene_biotype transcript_id transcript_version
## <character> <character> <numeric>
## [1] snRNA <NA> <NA>
## [2] snRNA ENSXETT00000065882 1
## [3] snRNA ENSXETT00000065882 1
## [4] snRNA <NA> <NA>
## [5] snRNA ENSXETT00000061796 1
## ... ... ... ...
## [581783] protein_coding ENSXETT00000053735 2
## [581784] protein_coding ENSXETT00000053735 2
## [581785] protein_coding ENSXETT00000053735 2
## [581786] protein_coding ENSXETT00000053735 2
## [581787] protein_coding ENSXETT00000053735 2
## transcript_name transcript_source transcript_biotype exon_number
## <character> <character> <character> <numeric>
## [1] <NA> <NA> <NA> <NA>
## [2] U5-201 ensembl snRNA <NA>
## [3] U5-201 ensembl snRNA 1
## [4] <NA> <NA> <NA> <NA>
## [5] U5-201 ensembl snRNA <NA>
## ... ... ... ... ...
## [581783] <NA> ensembl protein_coding 1
## [581784] <NA> ensembl protein_coding 2
## [581785] <NA> ensembl protein_coding 2
## [581786] <NA> ensembl protein_coding 3
## [581787] <NA> ensembl protein_coding 3
## exon_id exon_version protein_id
## <character> <numeric> <character>
## [1] <NA> <NA> <NA>
## [2] <NA> <NA> <NA>
## [3] ENSXETE00000393193 1 <NA>
## [4] <NA> <NA> <NA>
## [5] <NA> <NA> <NA>
## ... ... ... ...
## [581783] <NA> <NA> <NA>
## [581784] ENSXETE00000303775 2 <NA>
## [581785] <NA> <NA> ENSXETP00000053735
## [581786] ENSXETE00000416553 1 <NA>
## [581787] <NA> <NA> ENSXETP00000053735
## protein_version
## <numeric>
## [1] <NA>
## [2] <NA>
## [3] <NA>
## [4] <NA>
## [5] <NA>
## ... ...
## [581783] <NA>
## [581784] <NA>
## [581785] 2
## [581786] <NA>
## [581787] 2
## -------
## seqinfo: 2375 sequences from JGI_4 genome; no seqlengths
Genome annotations: BED, WIG, GTF, etc. files. E.g., GTF:
Component coordinates
7 protein_coding gene 27221129 27224842 . - . ...
...
7 protein_coding transcript 27221134 27224835 . - . ...
7 protein_coding exon 27224055 27224835 . - . ...
7 protein_coding CDS 27224055 27224763 . - 0 ...
7 protein_coding start_codon 27224761 27224763 . - 0 ...
7 protein_coding exon 27221134 27222647 . - . ...
7 protein_coding CDS 27222418 27222647 . - 2 ...
7 protein_coding stop_codon 27222415 27222417 . - 0 ...
7 protein_coding UTR 27224764 27224835 . - . ...
7 protein_coding UTR 27221134 27222414 . - . ...
Annotations
gene_id "ENSG00000005073"; gene_name "HOXA11"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
...
... transcript_id "ENST00000006015"; transcript_name "HOXA11-001"; transcript_source "ensembl_havana"; tag "CCDS"; ccds_id "CCDS5411";
... exon_number "1"; exon_id "ENSE00001147062";
... exon_number "1"; protein_id "ENSP00000006015";
... exon_number "1";
... exon_number "2"; exon_id "ENSE00002099557";
... exon_number "2"; protein_id "ENSP00000006015";
... exon_number "2";
...
import()
: import various formats to GRanges
and similar instancesexport()
: transform from GRanges
and similar types to BED, GTF, …Sequenced reads: FASTQ files
@ERR127302.1703 HWI-EAS350_0441:1:1:1460:19184#0/1
CCTGAGTGAAGCTGATCTTGATCTACGAAGAGAGATAGATCTTGATCGTCGAGGAGATGCTGACCTTGACCT
+
HHGHHGHHHHHHHHDGG<GDGGE@GDGGD<?B8??ADAD<BE@EE8EGDGA3CB85*,77@>>CE?=896=:
@ERR127302.1704 HWI-EAS350_0441:1:1:1460:16861#0/1
GCGGTATGCTGGAAGGTGCTCGAATGGAGAGCGCCAGCGCCCCGGCGCTGAGCCGCAGCCTCAGGTCCGCCC
+
DE?DD>ED4>EEE>DE8EEEDE8B?EB<@3;BA79?,881B?@73;1?########################
readFastq()
: inputFastqStreamer()
: iterate through FASTQ filesFastqSampler()
: sample from FASTQ files, e.g., for quality assessmentAligned reads: BAM files
Header
@HD VN:1.0 SO:coordinate
@SQ SN:chr1 LN:249250621
@SQ SN:chr10 LN:135534747
@SQ SN:chr11 LN:135006516
...
@SQ SN:chrY LN:59373566
@PG ID:TopHat VN:2.0.8b CL:/home/hpages/tophat-2.0.8b.Linux_x86_64/tophat --mate-inner-dist 150 --solexa-quals --max-multihits 5 --no-discordant --no-mixed --coverage-search --microexon-search --library-type fr-unstranded --num-threads 2 --output-dir tophat2_out/ERR127306 /home/hpages/bowtie2-2.1.0/indexes/hg19 fastq/ERR127306_1.fastq fastq/ERR127306_2.fastq
Alignments: ID, flag, alignment and mate
ERR127306.7941162 403 chr14 19653689 3 72M = 19652348 -1413 ...
ERR127306.22648137 145 chr14 19653692 1 72M = 19650044 -3720 ...
ERR127306.933914 339 chr14 19653707 1 66M120N6M = 19653686 -213 ...
Alignments: sequence and quality
... GAATTGATCAGTCTCATCTGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCC *'%%%%%#&&%''#'&%%%)&&%%$%%'%%'&*****$))$)'')'%)))&)%%%%$'%%%%&"))'')%))
... TTGATCAGTCTCATCTGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAG '**)****)*'*&*********('&)****&***(**')))())%)))&)))*')&***********)****
... TGAGAGTAACTTTGTACCCATCACTGATTCCTTCTGAGACTGCCTCCACTTCCCCAGCAGCCTCTGGTTTCT '******&%)&)))&")')'')'*((******&)&'')'))$))'')&))$)**&&****************
Alignments: Tags
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:72 YT:Z:UU NH:i:2 CC:Z:chr22 CP:i:16189276 HI:i:0
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:72 YT:Z:UU NH:i:3 CC:Z:= CP:i:19921600 HI:i:0
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:4 MD:Z:72 YT:Z:UU XS:A:+ NH:i:3 CC:Z:= CP:i:19921465 HI:i:0
... AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:4 MD:Z:72 YT:Z:UU XS:A:+ NH:i:2 CC:Z:chr22 CP:i:16189138 HI:i:0
readGAlignments()
: Single-end readsreadGAlignmentPairs()
, readGAlignmentsList()
: paired end readsScanBamParam()
: restrict inputBamFile(, yieldSize=)
: iterationreduceByYield()
Header
##fileformat=VCFv4.2
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
##phasing=partial
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
...
##FILTER=<ID=q10,Description="Quality below 10">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
...
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
Location
#CHROM POS ID REF ALT QUAL FILTER ...
20 14370 rs6054257 G A 29 PASS ...
20 17330 . T A 3 q10 ...
20 1110696 rs6040355 A G,T 67 PASS ...
Variant INFO
#CHROM POS ... INFO ...
20 14370 ... NS=3;DP=14;AF=0.5;DB;H2 ...
20 17330 ... NS=3;DP=11;AF=0.017 ...
20 1110696 ... NS=2;DP=10;AF=0.333,0.667;AA=T;DB ...
Genotype FORMAT and samples
... POS ... FORMAT NA00001 NA00002 NA00003
... 14370 ... GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
... 17330 ... GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
... 1110696 ... GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
readVcf()
: VCF inputScanVcfParam()
: restrict input to necessary fields / rangesVcfFile()
: indexing and iterating through large VCF fileslocateVariants()
: annotate in relation to genes, etc; see also ensemblVEP, VariantFilteringfilterVcf()
: flexible filteringThe RNAseqData.HNRNPC.bam.chr14 package is an example of an experiment data package. It contains a subset of BAM files used in a gene knock-down experiment, as described in ?RNAseqData.HNRNPC.bam.chr14
. Load the package and get the path to the BAM files.
library(RNAseqData.HNRNPC.bam.chr14)
fls = RNAseqData.HNRNPC.bam.chr14_BAMFILES
basename(fls)
## [1] "ERR127306_chr14.bam" "ERR127307_chr14.bam" "ERR127308_chr14.bam"
## [4] "ERR127309_chr14.bam" "ERR127302_chr14.bam" "ERR127303_chr14.bam"
## [7] "ERR127304_chr14.bam" "ERR127305_chr14.bam"
Create BamFileList()
, basically telling R that these are paths to BAM files rather than, say, text files from a spreadsheet.
library(GenomicAlignments)
bfls = BamFileList(fls)
bfl = bfls[[1]]
Input and explore the alignments. See ?readGAlignments
and ?GAlignments
for details on how to manipulate these objects.
ga = readGAlignments(bfl)
ga
## GAlignments object with 800484 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] chr14 + 72M 72 19069583 19069654
## [2] chr14 + 72M 72 19363738 19363809
## [3] chr14 - 72M 72 19363755 19363826
## [4] chr14 + 72M 72 19369799 19369870
## [5] chr14 - 72M 72 19369828 19369899
## ... ... ... ... ... ... ...
## [800480] chr14 - 72M 72 106989780 106989851
## [800481] chr14 + 72M 72 106994763 106994834
## [800482] chr14 - 72M 72 106994819 106994890
## [800483] chr14 + 72M 72 107003080 107003151
## [800484] chr14 - 72M 72 107003171 107003242
## width njunc
## <integer> <integer>
## [1] 72 0
## [2] 72 0
## [3] 72 0
## [4] 72 0
## [5] 72 0
## ... ... ...
## [800480] 72 0
## [800481] 72 0
## [800482] 72 0
## [800483] 72 0
## [800484] 72 0
## -------
## seqinfo: 93 sequences from an unspecified genome
table(strand(ga))
##
## + - *
## 400242 400242 0
Many of the reads have cigar “72M”. What does this mean? Can you create a subset of reads that do not have this cigar? Interpret some of the non-72M cigars. Any hint about what these cigars represent?
tail(sort(table(cigar(ga))))
##
## 18M123N54M 36M123N36M 64M316N8M 38M670N34M 35M123N37M 72M
## 225 228 261 264 272 603939
ga[cigar(ga) != "72M"]
## GAlignments object with 196545 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end
## <Rle> <Rle> <character> <integer> <integer> <integer>
## [1] chr14 - 64M1I7M 72 19411677 19411747
## [2] chr14 + 55M2117N17M 72 19650072 19652260
## [3] chr14 - 43M2117N29M 72 19650084 19652272
## [4] chr14 - 40M2117N32M 72 19650087 19652275
## [5] chr14 + 38M2117N34M 72 19650089 19652277
## ... ... ... ... ... ... ...
## [196541] chr14 - 51M1D21M 72 106950429 106950501
## [196542] chr14 + 31M1I40M 72 106960410 106960480
## [196543] chr14 + 52M1D20M 72 106965156 106965228
## [196544] chr14 - 13M1D59M 72 106965195 106965267
## [196545] chr14 - 6M1D66M 72 106965202 106965274
## width njunc
## <integer> <integer>
## [1] 71 0
## [2] 2189 1
## [3] 2189 1
## [4] 2189 1
## [5] 2189 1
## ... ... ...
## [196541] 73 0
## [196542] 71 0
## [196543] 73 0
## [196544] 73 0
## [196545] 73 0
## -------
## seqinfo: 93 sequences from an unspecified genome
Use the function summarizeJunctions()
to identify genomic regions that are spanned by reads with complicated cigars. Can you use the argument with.revmap=TRUE
to extract the reads supporting a particular (e.g., first) junction?
summarizeJunctions(ga)
## GRanges object with 4635 ranges and 3 metadata columns:
## seqnames ranges strand | score plus_score
## <Rle> <IRanges> <Rle> | <integer> <integer>
## [1] chr14 [19650127, 19652243] * | 4 2
## [2] chr14 [19650127, 19653624] * | 1 1
## [3] chr14 [19652355, 19653624] * | 8 7
## [4] chr14 [19652355, 19653657] * | 1 1
## [5] chr14 [19653773, 19653892] * | 9 5
## ... ... ... ... . ... ...
## [4631] chr14 [106912703, 106922227] * | 1 0
## [4632] chr14 [106938165, 106938301] * | 10 2
## [4633] chr14 [106938645, 106944774] * | 24 7
## [4634] chr14 [106944969, 106950170] * | 7 6
## [4635] chr14 [106950323, 106960260] * | 1 1
## minus_score
## <integer>
## [1] 2
## [2] 0
## [3] 1
## [4] 0
## [5] 4
## ... ...
## [4631] 1
## [4632] 8
## [4633] 17
## [4634] 1
## [4635] 0
## -------
## seqinfo: 93 sequences from an unspecified genome
junctions <- summarizeJunctions(ga, with.revmap=TRUE)
ga[ junctions$revmap[[1]] ]
## GAlignments object with 4 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end width
## <Rle> <Rle> <character> <integer> <integer> <integer> <integer>
## [1] chr14 + 55M2117N17M 72 19650072 19652260 2189
## [2] chr14 - 43M2117N29M 72 19650084 19652272 2189
## [3] chr14 - 40M2117N32M 72 19650087 19652275 2189
## [4] chr14 + 38M2117N34M 72 19650089 19652277 2189
## njunc
## <integer>
## [1] 1
## [2] 1
## [3] 1
## [4] 1
## -------
## seqinfo: 93 sequences from an unspecified genome
It is possible to do other actions on BAM files, e.g., calculating the ‘coverage’ (reads overlapping each base).
coverage(bfl)$chr14
## integer-Rle of length 107349540 with 493510 runs
## Lengths: 19069582 72 294083 ... 19 72 346298
## Values : 0 1 0 ... 0 1 0
The airway experiment data package summarizes an RNA-seq experiment investigating human smooth-muscle airway cell lines treated with dexamethasone. Load the library and data set.
library(airway)
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
airway
is an example of the SummarizedExperiment class. Explore its assay()
(the matrix of counts of reads overlapping genomic regions of interest in each sample), colData()
(a description of each sample), and rowRanges()
(a description of each region of interest; here each region is an ENSEMBL gene).
x <- assay(airway)
class(x)
## [1] "matrix"
dim(x)
## [1] 64102 8
head(x)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003 679 448 873 408 1138
## ENSG00000000005 0 0 0 0 0
## ENSG00000000419 467 515 621 365 587
## ENSG00000000457 260 211 263 164 245
## ENSG00000000460 60 55 40 35 78
## ENSG00000000938 0 0 2 0 1
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003 1047 770 572
## ENSG00000000005 0 0 0
## ENSG00000000419 799 417 508
## ENSG00000000457 331 233 229
## ENSG00000000460 63 76 60
## ENSG00000000938 0 0 0
colData(airway)
## DataFrame with 8 rows and 9 columns
## SampleName cell dex albut Run avgLength
## <factor> <factor> <factor> <factor> <factor> <integer>
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98
## Experiment Sample BioSample
## <factor> <factor> <factor>
## SRR1039508 SRX384345 SRS508568 SAMN02422669
## SRR1039509 SRX384346 SRS508567 SAMN02422675
## SRR1039512 SRX384349 SRS508571 SAMN02422678
## SRR1039513 SRX384350 SRS508572 SAMN02422670
## SRR1039516 SRX384353 SRS508575 SAMN02422682
## SRR1039517 SRX384354 SRS508576 SAMN02422673
## SRR1039520 SRX384357 SRS508579 SAMN02422683
## SRR1039521 SRX384358 SRS508580 SAMN02422677
rowRanges(airway)
## GRangesList object of length 64102:
## $ENSG00000000003
## GRanges object with 17 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] X [99883667, 99884983] - | 667145 ENSE00001459322
## [2] X [99885756, 99885863] - | 667146 ENSE00000868868
## [3] X [99887482, 99887565] - | 667147 ENSE00000401072
## [4] X [99887538, 99887565] - | 667148 ENSE00001849132
## [5] X [99888402, 99888536] - | 667149 ENSE00003554016
## ... ... ... ... . ... ...
## [13] X [99890555, 99890743] - | 667156 ENSE00003512331
## [14] X [99891188, 99891686] - | 667158 ENSE00001886883
## [15] X [99891605, 99891803] - | 667159 ENSE00001855382
## [16] X [99891790, 99892101] - | 667160 ENSE00001863395
## [17] X [99894942, 99894988] - | 667161 ENSE00001828996
##
## ...
## <64101 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
It’s easy to subset a SummarizedExperiment on rows, columns and assays, e.g., retaining just those samples in the trt
level of the dex
factor. Accessing elements of the column data is common, so there is a short-cut.
cidx <- colData(airway)$dex %in% "trt"
airway[, cidx]
## class: RangedSummarizedExperiment
## dim: 64102 4
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
## shortcut
airway[, airway$dex %in% "trt"]
## class: RangedSummarizedExperiment
## dim: 64102 4
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
It’s also easy to perform range-based operations on SummarizedExperiment
objects, e.g., querying for range of chromosome 14 and then subsetting to contain only genes on this chromosome. Range operations on rows are very common, so there are shortcuts here, too.
chr14 <- as(seqinfo(rowRanges(airway)), "GRanges")["14"]
ridx <- rowRanges(airway) %over% chr14
airway[ridx,]
## class: RangedSummarizedExperiment
## dim: 2244 8
## metadata(1): ''
## assays(1): counts
## rownames(2244): ENSG00000006432 ENSG00000009830 ... ENSG00000273259
## ENSG00000273307
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
## shortcut
chr14 <- as(seqinfo(airway), "GRanges")["14"]
airway[airway %over% chr14,]
## class: RangedSummarizedExperiment
## dim: 2244 8
## metadata(1): ''
## assays(1): counts
## rownames(2244): ENSG00000006432 ENSG00000009830 ... ENSG00000273259
## ENSG00000273307
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
Use the assay()
and rowSums()
function to remove all rows from the airway
object that have 0 reads overlapping all samples. Summarize the library size (column sums of assay()
) and plot a histogram of the distribution of reads per feature of interest.
Load the org package for Homo sapiens.
library(org.Hs.eg.db)
Use select()
to annotate the HNRNPC gene symbol with its Entrez identifier and less formal gene name. Create a map between SYMBOL and ENTREZID using mapIds()
.
select(org.Hs.eg.db, "HNRNPC", c("ENTREZID", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENTREZID GENENAME
## 1 HNRNPC 3183 heterogeneous nuclear ribonucleoprotein C (C1/C2)
sym2eg <- mapIds(org.Hs.eg.db, "HNRNPC", "ENTREZID", "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
Load the TxDb package for the UCSC hg19 knownGene track
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
Extract coordinates of genes, and of exons grouped by gene for the HNRNPC gene.
gns <- genes(txdb)
exonsBy(txdb, "gene")[sym2eg]
## GRangesList object of length 1:
## $3183
## GRanges object with 19 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr14 [21677296, 21679465] - | 184100 <NA>
## [2] chr14 [21678927, 21679725] - | 184101 <NA>
## [3] chr14 [21679565, 21679672] - | 184102 <NA>
## [4] chr14 [21679565, 21679725] - | 184103 <NA>
## [5] chr14 [21679969, 21680062] - | 184104 <NA>
## ... ... ... ... . ... ...
## [15] chr14 [21702237, 21702388] - | 184114 <NA>
## [16] chr14 [21730760, 21730927] - | 184115 <NA>
## [17] chr14 [21731470, 21731495] - | 184116 <NA>
## [18] chr14 [21731826, 21731988] - | 184117 <NA>
## [19] chr14 [21737457, 21737638] - | 184118 <NA>
##
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Use the gene coordinates to query the BAM file for a specific genomic region; see ?ScanBamParam()
for other ways of restricting data input.
library(Rsamtools)
param <- ScanBamParam(which=gns[sym2eg])
readGAlignments(bfl, param=param)
## GAlignments object with 5422 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end width
## <Rle> <Rle> <character> <integer> <integer> <integer> <integer>
## [1] chr14 + 72M 72 21677347 21677418 72
## [2] chr14 + 72M 72 21677352 21677423 72
## [3] chr14 + 72M 72 21677354 21677425 72
## [4] chr14 + 72M 72 21677355 21677426 72
## [5] chr14 + 72M 72 21677373 21677444 72
## ... ... ... ... ... ... ... ...
## [5418] chr14 - 72M 72 21737512 21737583 72
## [5419] chr14 - 72M 72 21737520 21737591 72
## [5420] chr14 - 72M 72 21737520 21737591 72
## [5421] chr14 - 72M 72 21737521 21737592 72
## [5422] chr14 - 72M 72 21737534 21737605 72
## njunc
## <integer>
## [1] 0
## [2] 0
## [3] 0
## [4] 0
## [5] 0
## ... ...
## [5418] 0
## [5419] 0
## [5420] 0
## [5421] 0
## [5422] 0
## -------
## seqinfo: 93 sequences from an unspecified genome
The Roadmap Epigenomics Project generated genome-wide maps of regulatory marks across a number of cell lines.
Retrieve the Epigenome Roadmap table from AnnotationHub…
library(AnnotationHub)
hub <- AnnotationHub()
## snapshotDate(): 2017-04-25
query(hub, c("epigenome", "metadata"))
## AnnotationHub with 1 record
## # snapshotDate(): 2017-04-25
## # names(): AH41830
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: data.frame
## # $rdatadateadded: 2015-05-11
## # $title: EID_metadata.tab
## # $description: Metadata for EpigenomeRoadMap Project
## # $taxonomyid: 9606
## # $genome: hg19
## # $sourcetype: tab
## # $sourceurl: http://egg2.wustl.edu/roadmap/data/byFileType/metadata/EID_...
## # $sourcesize: 18035
## # $tags: c("EpigenomeRoadMap", "Metadata")
## # retrieve record with 'object[["AH41830"]]'
meta <- hub[["AH41830"]]
## loading from cache '/home/mtmorgan//.AnnotationHub/47270'
Explore the metadata to identify a cell line of interest to you; see also the metadata spreadsheet version of the data made available by the Epigenome roadmap project.
table(meta$ANATOMY)
##
## ADRENAL BLOOD BONE BRAIN
## 1 27 1 13
## BREAST CERVIX ESC ESC_DERIVED
## 3 1 8 9
## FAT GI_COLON GI_DUODENUM GI_ESOPHAGUS
## 3 3 2 1
## GI_INTESTINE GI_RECTUM GI_STOMACH HEART
## 3 3 4 4
## IPSC KIDNEY LIVER LUNG
## 5 1 2 5
## MUSCLE MUSCLE_LEG OVARY PANCREAS
## 7 1 1 2
## PLACENTA SKIN SPLEEN STROMAL_CONNECTIVE
## 2 8 1 2
## THYMUS VASCULAR
## 2 2
meta[meta$ANATOMY == "LIVER",]
## EID GROUP COLOR MNEMONIC
## 64 E066 Other #999999 LIV.ADLT
## 116 E118 ENCODE2012 #000000 LIV.HEPG2.CNCR
## STD_NAME EDACC_NAME
## 64 Liver Adult_Liver
## 116 HepG2 Hepatocellular Carcinoma Cell Line HepG2_Hepatocellular_Carcinoma
## ANATOMY TYPE AGE SEX SOLID_LIQUID ETHNICITY
## 64 LIVER PrimaryTissue Unknown Mixed SOLID Unknown
## 116 LIVER CellLine Male
## SINGLEDONOR_COMPOSITE
## 64 C
## 116 SD
Use the ‘EID’ to query for and retrieve the ‘mnemonic’ file summarizing chromatin state
query(hub, c("E118", "mnemonic"))
## AnnotationHub with 1 record
## # snapshotDate(): 2017-04-25
## # names(): AH46971
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: GRanges
## # $rdatadateadded: 2015-05-14
## # $title: E118_15_coreMarks_mnemonics.bed.gz
## # $description: 15 state chromatin segmentations from EpigenomeRoadMap Pr...
## # $taxonomyid: 9606
## # $genome: hg19
## # $sourcetype: BED
## # $sourceurl: http://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegme...
## # $sourcesize: 3231313
## # $tags: c("EpigenomeRoadMap", "chromhmmSegmentations",
## # "ChmmModels", "coreMarks", "E118", "ENCODE2012", "LIV.HEPG2.CNCR",
## # "HepG2 Hepatocellular Carcinoma Cell Line")
## # retrieve record with 'object[["AH46971"]]'
E118 <- hub[["AH46971"]]
## require("rtracklayer")
## loading from cache '/home/mtmorgan//.AnnotationHub/52411'
E118
## GRanges object with 561497 ranges and 4 metadata columns:
## seqnames ranges strand | abbr
## <Rle> <IRanges> <Rle> | <character>
## [1] chr10 [ 1, 113200] * | 15_Quies
## [2] chr10 [113201, 119600] * | 14_ReprPCWk
## [3] chr10 [119601, 120000] * | 10_TssBiv
## [4] chr10 [120001, 120200] * | 1_TssA
## [5] chr10 [120201, 120400] * | 2_TssAFlnk
## ... ... ... ... . ...
## [561493] chrY [58907201, 58967400] * | 15_Quies
## [561494] chrY [58967401, 58972000] * | 9_Het
## [561495] chrY [58972001, 58997400] * | 8_ZNF/Rpts
## [561496] chrY [58997401, 59033600] * | 9_Het
## [561497] chrY [59033601, 59373400] * | 15_Quies
## name color_name color_code
## <character> <character> <character>
## [1] Quiescent/Low White #FFFFFF
## [2] Weak Repressed PolyComb Gainsboro #C0C0C0
## [3] Bivalent/Poised TSS IndianRed #CD5C5C
## [4] Active TSS Red #FF0000
## [5] Flanking Active TSS Orange Red #FF4500
## ... ... ... ...
## [561493] Quiescent/Low White #FFFFFF
## [561494] Heterochromatin PaleTurquoise #8A91D0
## [561495] ZNF genes & repeats Medium Aquamarine #66CDAA
## [561496] Heterochromatin PaleTurquoise #8A91D0
## [561497] Quiescent/Low White #FFFFFF
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Explore the object, e.g., tabulating the different chromatin state classifications (in the name
column). Subset the object to return, e.g., just those regions marked as ‘Heterochromatin’
table(E118$name)
##
## Active TSS Bivalent Enhancer
## 20010 23155
## Bivalent/Poised TSS Enhancers
## 13214 110260
## Flanking Active TSS Flanking Bivalent TSS/Enh
## 45115 15844
## Genic enhancers Heterochromatin
## 14995 31193
## Quiescent/Low Repressed PolyComb
## 61759 44013
## Strong transcription Transcr. at gene 5' and 3'
## 32522 2515
## Weak Repressed PolyComb Weak transcription
## 60867 83738
## ZNF genes & repeats
## 2297
E118[E118$name %in% "Heterochromatin"]
## GRanges object with 31193 ranges and 4 metadata columns:
## seqnames ranges strand | abbr name
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr10 [ 140201, 143800] * | 9_Het Heterochromatin
## [2] chr10 [ 806201, 807800] * | 9_Het Heterochromatin
## [3] chr10 [ 842001, 843800] * | 9_Het Heterochromatin
## [4] chr10 [1024601, 1027200] * | 9_Het Heterochromatin
## [5] chr10 [1191601, 1192600] * | 9_Het Heterochromatin
## ... ... ... ... . ... ...
## [31189] chrY [58883001, 58885400] * | 9_Het Heterochromatin
## [31190] chrY [58890001, 58891000] * | 9_Het Heterochromatin
## [31191] chrY [58906401, 58907200] * | 9_Het Heterochromatin
## [31192] chrY [58967401, 58972000] * | 9_Het Heterochromatin
## [31193] chrY [58997401, 59033600] * | 9_Het Heterochromatin
## color_name color_code
## <character> <character>
## [1] PaleTurquoise #8A91D0
## [2] PaleTurquoise #8A91D0
## [3] PaleTurquoise #8A91D0
## [4] PaleTurquoise #8A91D0
## [5] PaleTurquoise #8A91D0
## ... ... ...
## [31189] PaleTurquoise #8A91D0
## [31190] PaleTurquoise #8A91D0
## [31191] PaleTurquoise #8A91D0
## [31192] PaleTurquoise #8A91D0
## [31193] PaleTurquoise #8A91D0
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Can you, using a TxDb package and the genes()
and subsetByOverlaps()
functions, determine how many genes overlap heterochromatic states, or the genes nearest()
each enhancer?
Visit the biomart website and figure out how to browse data to retrieve, e.g., genes on chromosomes 21 and 22. You’ll need to browse to the ensembl mart, Homo spaiens data set, establish filters for chromosomes 21 and 22, and then specify that you’d like the Ensembl gene id attribute returned.
Now do the same process in biomaRt:
library(biomaRt)
head(listMarts(), 3) ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <- ## fully specified mart
useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3) ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3) ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")
## assemble and query the mart
res <- getBM(attributes = myAttributes, filters = myFilter,
values = myValues, mart = ensembl)