--- title: Reproducing Variant Annotation Results output: BiocStyle::html_document --- ```{r style, echo = FALSE, results = 'asis'} BiocStyle::markdown() ``` ```{r, echo=FALSE, results="hide"} # Ensure that any errors cause the Vignette build to fail. library(knitr) opts_chunk$set(error=FALSE) ``` ```{r, echo = FALSE} apiKey <- Sys.getenv("GOOGLE_API_KEY") if (nchar(apiKey) == 0) { warning(paste("To build this vignette, please setup the environment variable", "GOOGLE_API_KEY with the public API key from your Google", "Developer Console before loading the GoogleGenomics package,", "or run GoogleGenomics::authenticate.")) knitr::knit_exit() } ``` ## Load Data Below we compare the results of annotating variants via `r Biocpkg("VariantAnnotation")` specifically repeating a subset of the steps in vignette [Introduction to VariantAnnotation](http://bioconductor.org/packages/release/bioc/vignettes/VariantAnnotation/inst/doc/VariantAnnotation.pdf). We compare using data from 1,000 Genomes Phase 1 Variants: * as parsed from a VCF file * retrieved from the Google Genomics API ### VCF Data First we read in the data from the VCF file: ```{r message=FALSE} library(VariantAnnotation) ``` ```{r} fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") vcf <- readVcf(fl, "hg19") vcf <- renameSeqlevels(vcf, c("22"="chr22")) vcf ``` The file `chr22.vcf.gz` within package VariantAnnotation holds data for 5 of the 1,092 individuals in 1,000 Genomes, starting at position 50300078 and ending at position 50999964. `HG00096 HG00097 HG00099 HG00100 HG00101` ### Google Genomics Data Important data differences to note: * VCF data uses 1-based coordinates but data from the GA4GH APIs is 0-based. * There are two variants in the Google Genomics copy of 1,000 Genomes phase 1 variants that are not in `chr22.vcf.gz`. They are the only two variants within the genomic range with `ALT == `. ```{r message=FALSE} library(GoogleGenomics) # This vignette is authenticated on package load from the env variable GOOGLE_API_KEY. # When running interactively, call the authenticate method. # ?authenticate ``` ```{r} # We're just getting the first few variants so that this runs quickly. # If we wanted to get them all, we sould set end=50999964. granges <- getVariants(variantSetId="10473108253681171589", chromosome="22", start=50300077, end=50303000, converter=variantsToGRanges) ``` ### Compare the Loaded Data Ensure that the data retrieved by each matches: ```{r} vcf <- vcf[1:length(granges)] # Truncate the VCF data so that it is the same # set as what was retrieved from the API. ``` ```{r message=FALSE} library(testthat) ``` ```{r} expect_equal(start(granges), start(vcf)) expect_equal(end(granges), end(vcf)) expect_equal(as.character(granges$REF), as.character(ref(vcf))) expect_equal(as.character(unlist(granges$ALT)), as.character(unlist(alt(vcf)))) expect_equal(granges$QUAL, qual(vcf)) expect_equal(granges$FILTER, filt(vcf)) ``` ## Compare the Annotation Results Now locate the protein coding variants in each: ```{r message=FALSE} library(TxDb.Hsapiens.UCSC.hg19.knownGene) ``` ```{r} txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene rd <- rowRanges(vcf) vcf_locations <- locateVariants(rd, txdb, CodingVariants()) vcf_locations granges_locations <- locateVariants(granges, txdb, CodingVariants()) granges_locations expect_equal(granges_locations, vcf_locations) ``` And predict the effect of the protein coding variants: ```{r message=FALSE} library(BSgenome.Hsapiens.UCSC.hg19) ``` ```{r} vcf_coding <- predictCoding(vcf, txdb, seqSource=Hsapiens) vcf_coding granges_coding <- predictCoding(rep(granges, elementNROWS(granges$ALT)), txdb, seqSource=Hsapiens, varAllele=unlist(granges$ALT, use.names=FALSE)) granges_coding expect_equal(as.matrix(granges_coding$REFCODON), as.matrix(vcf_coding$REFCODON)) expect_equal(as.matrix(granges_coding$VARCODON), as.matrix(vcf_coding$VARCODON)) expect_equal(granges_coding$GENEID, vcf_coding$GENEID) expect_equal(granges_coding$CONSEQUENCE, vcf_coding$CONSEQUENCE) ``` ## Provenance ```{r} sessionInfo() ```