## ----echo=FALSE, results="hide", warning=FALSE--------------------------------
suppressPackageStartupMessages({
    library('variants')
})

## ----eval=FALSE---------------------------------------------------------------
#  library(VariantAnnotation)
#  library(org.Hs.eg.db)
#  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#  library(BSgenome.Hsapiens.UCSC.hg19)
#  library(PolyPhen.Hsapiens.dbSNP131)

## ----eval=FALSE---------------------------------------------------------------
#  if (!"BiocManager" %in% rownames(installed.packages()))
#       install.packages("BiocManager")
#  BiocManager::install("mypackage")

## -----------------------------------------------------------------------------
file <- system.file("vcf", "NA06985_17.vcf.gz", package = "variants")

## -----------------------------------------------------------------------------
hdr <- scanVcfHeader(file)

info(hdr)

geno(hdr)

## -----------------------------------------------------------------------------
meta(hdr)

## -----------------------------------------------------------------------------
## get entrez ids from gene symbols
genesym <- c("TRPV1", "TRPV2", "TRPV3")
geneid <- select(
    org.Hs.eg.db, keys=genesym, keytype="SYMBOL", columns="ENTREZID"
)
geneid

## -----------------------------------------------------------------------------
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb

## -----------------------------------------------------------------------------
txdb <- keepSeqlevels(txdb, "chr17")

## -----------------------------------------------------------------------------
txbygene <- transcriptsBy(txdb, "gene")

## -----------------------------------------------------------------------------
gnrng <- unlist(range(txbygene[geneid$ENTREZID]), use.names=FALSE)
names(gnrng) <- geneid$SYMBOL

## -----------------------------------------------------------------------------
seqinfo(gnrng)
seqlevels(gnrng) <- sub("chr", "", seqlevels(gnrng))
genome(gnrng) <- "B37"
seqinfo(gnrng)
## seqlevelsStyle(gnrng) <- "NCBI"
param <- ScanVcfParam(which = gnrng, info = "DP", geno = c("GT", "cPd"))
param

## Extract the TRPV ranges from the VCF file
vcf <- readVcf(file, param = param)
## Inspect the VCF object with the 'fixed', 'info' and 'geno' accessors
vcf

head(fixed(vcf))

geno(vcf)

## -----------------------------------------------------------------------------
seqinfo(vcf)
genome(vcf) <- "hg19"
seqlevels(vcf) <- sub("([[:digit:]]+)", "chr\\1", seqlevels(vcf))
seqinfo(vcf)
## seqlevelsStyle(vcf) <- "UCSC"
vcf <- keepSeqlevels(vcf, "chr17")

## ----eval=FALSE---------------------------------------------------------------
#  ## Use the 'region' argument to define the region
#  ## of interest. See ?locateVariants for details.
#  cds <- locateVariants(vcf, txdb, CodingVariants())
#  five <- locateVariants(vcf, txdb, FiveUTRVariants())
#  splice <- locateVariants(vcf, txdb, SpliceSiteVariants())
#  intron <- locateVariants(vcf, txdb, IntronVariants())

## -----------------------------------------------------------------------------
all <- locateVariants(vcf, txdb, AllVariants())

## -----------------------------------------------------------------------------
## Did any variants match more than one gene?
geneByQuery <- sapply(split(all$GENEID, all$QUERYID), unique)
table(lengths(geneByQuery) > 1)

## Summarize the number of variants by gene:
queryByGene <- sapply(split(all$QUERYID, all$GENEID), unique)
table(lengths(queryByGene) > 1)

sapply(queryByGene, length)

## Summarize variant location by gene:
sapply(names(queryByGene), function(nm) {
	d <- all[all$GENEID %in% nm, c("QUERYID", "LOCATION")]
	table(d$LOCATION[duplicated(d) == FALSE])
})

## -----------------------------------------------------------------------------
aa <- predictCoding(vcf, txdb, Hsapiens)

## -----------------------------------------------------------------------------
## Did any variants match more than one gene?
aageneByQuery <- split(aa$GENEID, aa$QUERYID)
table(lengths(sapply(aageneByQuery, unique)) > 1)

## Summarize the number of variants by gene:
aaaqueryByGene <- split(aa$QUERYID, aa$GENEID, drop=TRUE)
sapply(aaaqueryByGene, length)

## Summarize variant consequence by gene:
sapply(names(aaaqueryByGene), function(nm) {
    d <- aa[aa$GENEID %in% nm, c("QUERYID","CONSEQUENCE")]
    table(d$CONSEQUENCE[duplicated(d) == FALSE])
})

## ----eval=FALSE---------------------------------------------------------------
#  browseVignettes(package="VariantAnnotation")

## ----eval=FALSE---------------------------------------------------------------
#  help.start()

## -----------------------------------------------------------------------------
sessionInfo()