\name{locateVariants} \alias{locateVariants} \alias{locateVariants,Ranges,TranscriptDb,ANY-method} \alias{locateVariants,Ranges,GRangesList,ANY-method} \alias{locateVariants,GRanges,TranscriptDb,ANY-method} \alias{locateVariants,GRanges,GRangesList,ANY-method} \alias{locateVariants,VCF,TranscriptDb,ANY-method} \alias{locateVariants,VCF,GRangesList,ANY-method} \alias{locateVariants,GRanges,TranscriptDb,CodingVariants-method} \alias{locateVariants,GRanges,GRangesList,CodingVariants-method} \alias{locateVariants,GRanges,TranscriptDb,IntronVariants-method} \alias{locateVariants,GRanges,GRangesList,IntronVariants-method} \alias{locateVariants,GRanges,TranscriptDb,FiveUTRVariants-method} \alias{locateVariants,GRanges,GRangesList,FiveUTRVariants-method} \alias{locateVariants,GRanges,TranscriptDb,ThreeUTRVariants-method} \alias{locateVariants,GRanges,GRangesList,ThreeUTRVariants-method} \alias{locateVariants,GRanges,TranscriptDb,IntergenicVariants-method} \alias{locateVariants,GRanges,GRangesList,IntergenicVariants-method} \alias{locateVariants,GRanges,TranscriptDb,SpliceSiteVariants-method} \alias{locateVariants,GRanges,GRangesList,SpliceSiteVariants-method} \alias{locateVariants,GRanges,TranscriptDb,AllVariants-method} \alias{locateVariants,GRanges,GRangesList,AllVariants-method} \title{Locate variants} \description{Variant location with respect to gene function} \usage{ locateVariants(query, subject, region, ...) \S4method{locateVariants}{VCF,TranscriptDb,ANY}(query, subject, region, ..., cache = new.env(parent=emptyenv())) \S4method{locateVariants}{Ranges,TranscriptDb,ANY}(query, subject, region, ..., cache = new.env(parent=emptyenv())) \S4method{locateVariants}{GRanges,TranscriptDb,ANY}(query, subject, region, ..., cache = new.env(parent=emptyenv())) \S4method{locateVariants}{GRanges,GRangesList,CodingVariants}(query, subject, region, ...) \S4method{locateVariants}{GRanges,GRangesList,IntronVariants}(query, subject, region, ...) \S4method{locateVariants}{GRanges,GRangesList,FiveUTRVariants}(query, subject, region, ...) \S4method{locateVariants}{GRanges,GRangesList,ThreeUTRVariants}(query, subject, region, ...) \S4method{locateVariants}{GRanges,GRangesList,IntergenicVariants}(query, subject, region, ...) \S4method{locateVariants}{GRanges,GRangesList,SpliceSiteVariants}(query, subject, region, ...) \S4method{locateVariants}{GRanges,GRangesList,AllVariants}(query, subject, region, ...) } \arguments{ \item{query}{A \link[IRanges]{Ranges}, \link[GenomicRanges]{GRanges} or \linkS4class{VCF} object containing the variants. Metadata columns are allowed but are ignored. } \item{subject}{A \link[GenomicFeatures]{TranscriptDb} object that serves as the annotation reference. } \item{region}{An instance of one of \code{CodingVariants}, \code{IntronVariants}, \code{FiveUTRVariants}, \code{ThreeUTRVariants}, \code{IntergenicVariants}, \code{SpliceSiteVariants}, \code{AllVariants}. These objects are instantiated with no arguments, e.g., CodingVariants() will create an object of \code{CodingVariants}. } \item{\dots}{Additional arguments passed to methods } \item{cache}{An \code{environment} into which required components of \code{subject} are loaded. Provide, and re-use, a cache to speed repeated queries to the same \code{subject} across different \code{query} instances. } } \details{ \describe{ \item{Range representation :}{ The ranges in \code{query} should reflect the position(s) of the reference allele. For snps the range will be of width 1. For range insertions or deletions the reference allele could be a sequence such as GGTG in which case the width of the range should be 4. } \item{Location :}{ Possible locations are \sQuote{coding}, \sQuote{intron}, \sQuote{threeUTR}, \sQuote{fiveUTR}, \sQuote{intergenic}, or \sQuote{spliceSite}. Overlap operations for \sQuote{coding}, \sQuote{intron}, \sQuote{threeUTR}, and \sQuote{fiveUTR} require variants to fall completely within the defined region to be classified as such. To be classified as a \sQuote{spliceSite} the variant must overlap with any portion of the first 2 or last 2 nucleotides in an intron. \sQuote{intergenic} variants are those that do not fall within a transcript associated with a gene. If available, gene IDs for the flanking genes are give as the \code{precedesID} and \code{followsID}. } \item{Subject as GRangesList :}{ The \code{subject} can be a \code{TranscriptDb} or \code{GRangesList} object. When using a \code{GRangesList} the type of data required is driven by the \code{VariantType} class in the \code{region} argument. Below is a description of the appropriate \code{GRangesList} \code{subject} for each type of \code{region}. \describe{ \item{CodingVariants :}{coding regions by transcript} \item{IntronVariants :}{intron regions by transcript} \item{IntergenicVariants :}{transcript regions by gene} \item{SpliceSiteVariants :}{intron regions by transcript} \item{FiveUTRVariants :}{five prime UTR regions by transcript} \item{ThreeUTRVariants :}{three prime UTR regions by transcript} \item{AllVariants :}{no GRangeList method available} } } \item{Using the cache :}{ When processing multiple VCF files performance is enhanced by specifying an enviornment as the \code{cache} argument. This cache is used to store and reuse extracted components of the subject (TxDb) required by the function. The first call to the function (i.e., processing the first VCF file in a list of many) populates the cache; repeated calls to \code{locateVariants} will access these objects from the cache vs re-extracting the same information. } } } \value{ A \link[IRanges]{DataFrame} with a row for each variant-transcript match. Columns are \code{location}, \code{queryID}, \code{txID}, \code{geneID} and \code{cdsID}. \describe{ \item{\code{location}}{ Possible locations are \sQuote{coding}, \sQuote{intron}, \sQuote{threeUTR}, \sQuote{fiveUTR}, \sQuote{intergenic}, and \sQuote{spliceSite}. To be classified as \sQuote{coding}, \sQuote{intron}, \sQuote{threeUTR} or \sQuote{fiveUTR} the variant must fall completely within the region. \sQuote{intergenic} variants do not fall within a transcript. The \sQuote{geneID} for these position are \code{NA} and instead the \sQuote{precedesID} and \sQuote{followsID} for the flanking genes are given. A \sQuote{spliceSite} variant overlaps any portion of the first 2 or last 2 nucleotides of an intron. } \item{\code{queryID}}{ The \code{queryID} column provides a map back to the row in the original \code{query}. If the \code{query} was a \code{VCF} object this index corresponds to the row in the \code{GRanges} in the \code{rowData} slot. } \item{\code{txID}}{ The transcript id taken from the \code{TranscriptDb} object. } \item{\code{cdsID}}{ The coding sequence id taken from the \code{TranscriptDb} object. } \item{\code{geneID}}{ The gene id taken from the \code{TranscriptDb} object. } \item{\code{precedesID}}{ The id of the gene the query precedes. Only present for \sQuote{intergenic} variants. } \item{\code{followsID}}{ The id of the gene the query follows. Only present for \sQuote{intergenic} variants. } All ID values will be \sQuote{NA} for variants with a location of \code{transcript_region} or \code{NA}. } } \author{Valerie Obenchain } \seealso{ \code{\link{readVcf}}, \code{\link{predictCoding}} } \examples{ library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene ## -------------------------------- ## TranscriptDb object as subject ## -------------------------------- ## Read variants from a VCF file fl <- system.file("extdata", "structural.vcf", package="VariantAnnotation") vcf <- readVcf(fl, "hg19") ## Variant seqlevels do not match the TxDb seqlevels rd <- rowData(vcf) head(seqlevels(rd)) head(seqlevels(txdb)) intersect(seqlevels(rd), seqlevels(txdb)) ## Use the renameSeqlevels helper to rename seqlevels newnames <- paste("chr", seqlevels(vcf), sep="") names(newnames) <- seqlevels(vcf) vcf_adj <- renameSeqlevels(vcf, newnames) rd_adj <- renameSeqlevels(rd, newnames) ## Confirm intersect(seqlevels(vcf_adj), seqlevels(txdb)) intersect(seqlevels(rd_adj), seqlevels(txdb)) ## VCF object as the query loc_coding <- locateVariants(vcf_adj, txdb, CodingVariants()) ## GRanges as the query loc_intron <- locateVariants(rd_adj, txdb, IntronVariants()) loc_splice <- locateVariants(rd_adj, txdb, SpliceSiteVariants()) ## -------------------------------- ## GRangesList object as subject ## -------------------------------- ## Note the results do not include geneID. This information ## is not available when a GRangesList is used in place of ## a TranscriptDb. cdsbytx <- cdsBy(txdb) locateVariants(rd_adj, cdsbytx, CodingVariants()) intbytx <- intronsByTranscript(txdb) locateVariants(rd_adj, intbytx, IntronVariants()) ## -------------------------------- ## Using the cache ## -------------------------------- ## myenv <- new.env() ## files <- list(vcf1, vcf2, vcf3) ## lapply(files, ## function(fl) ## { ## vcf <- readVcf(fl, "hg19") ## ## modify seqlevels to match TxDb ## oldnms <- seqlevels(vcf) ## newnms <- paste("chr", oldnms, sep="") ## names(newnms) <- oldnms ## vcf_mod <- renameSeqlevels(vcf, newnms) ## locateVariants(vcf_mod, txdb, AllVariants(), cache=myenv) ## } ## ) } \keyword{methods}