## ----library_install, message=FALSE, cache=FALSE, eval=FALSE--------------- # source("http://bioconductor.org/biocLite.R") # biocLite("GenomicScores") ## ----library_upload, message=FALSE, cache=FALSE---------------------------- library(GenomicScores) ## ----retrieve1, message=FALSE, cache=FALSE--------------------------------- library(phastCons100way.UCSC.hg19) library(GenomicRanges) gsco <- phastCons100way.UCSC.hg19 ## -------------------------------------------------------------------------- scores(gsco, GRanges(seqnames="chr7", IRanges(start=117232380, width=1))) ## ---- echo=FALSE, eval=FALSE----------------------------------------------- # gsco ## ---- echo=FALSE----------------------------------------------------------- avgs <- readRDS(system.file("extdata", "avgs.rds", package="GenomicScores")) ## ----retrieve2, message=FALSE, cache=FALSE, eval=FALSE--------------------- # availableGScores() ## ---- echo=FALSE----------------------------------------------------------- avgs ## ----retrieve3, message=FALSE, cache=FALSE, eval=FALSE--------------------- # gsco <- getGScores("phastCons100way.UCSC.hg19") ## ----retrieve4, message=FALSE, cache=FALSE--------------------------------- scores(gsco, GRanges(seqnames="chr7", IRanges(start=117232380, width=1))) ## -------------------------------------------------------------------------- library(VariantAnnotation) library(TxDb.Hsapiens.UCSC.hg19.knownGene) ## -------------------------------------------------------------------------- fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") vcf <- readVcf(fl, "hg19") seqlevelsStyle(vcf) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene seqlevelsStyle(txdb) ## -------------------------------------------------------------------------- seqlevelsStyle(vcf) <- seqlevelsStyle(txdb) rd <- rowRanges(vcf) rd[1:3] ## -------------------------------------------------------------------------- loc <- locateVariants(rd, txdb, AllVariants()) loc[1:3] table(loc$LOCATION) ## -------------------------------------------------------------------------- loc$PHASTCONS <- scores(gsco, loc, scores.only=TRUE) loc[1:3] ## ----plot1, fig.cap = "Distribution of phastCons conservation scores in variants across different annotated regions. Diamonds indicate mean values.", echo = FALSE, fig.height=5, fig.wide = TRUE, echo=TRUE---- x <- split(loc$PHASTCONS, loc$LOCATION) mask <- elementNROWS(x) > 0 boxplot(x[mask], ylab="phastCons score") points(1:length(x[mask])+0.25, sapply(x[mask], mean, na.rm=TRUE), pch=23, bg="black") ## ---- echo=FALSE, eval=FALSE----------------------------------------------- # qfun(gsco) # dqfun(gsco) ## ----showpositions, message=FALSE, cache=FALSE----------------------------- origpcscoCDS <- readRDS(system.file("extdata", "origphastCons100wayhg19CDS.rds", package="GenomicScores")) origpcscoCDS length(unique(origpcscoCDS$score)) ## -------------------------------------------------------------------------- numDecimals <- function(x) { spl <- strsplit(as.character(x+1), "\\.") spl <- sapply(spl, "[", 2) spl[is.na(spl)] <- "" nchar(spl) } nd1 <- numDecimals(origpcscoCDS$score) table(nd1) ## ----showpositions2, message=FALSE, cache=FALSE---------------------------- origpcsco3UTRs <- readRDS(system.file("extdata", "origphastCons100wayhg193UTR.rds", package="GenomicScores")) origpcsco3UTRs length(table(origpcsco3UTRs$score)) nd2 <- numDecimals(origpcsco3UTRs$score) table(nd2) ## -------------------------------------------------------------------------- pkgpcscoCDS <- scores(gsco, origpcscoCDS, scores.only=TRUE) pkgpcsco3UTRs <- scores(gsco, origpcsco3UTRs, scores.only=TRUE) ## ----plot2, fig.cap = "Original and lossy-compressed phastCons scores. Top panels: comparison of the distribution of values. Bottom panels: comparison of the cumulative distribution", echo = FALSE, fig.height=12, fig.wide = TRUE---- par(mfrow=c(2, 2)) plot(origpcscoCDS$score, jitter(pkgpcscoCDS), pch=19, cex=1, xaxt="n", yaxt="n", xlab="Original phastCons scores (CDS)", ylab="Compressed phastCons scores (CDS)") axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") abline(0, 1) plot(origpcsco3UTRs$score, jitter(pkgpcsco3UTRs), pch=19, cex=1, xaxt="n", yaxt="n", xlab="Original phastCons scores (3' UTR)", ylab="Compressed phastCons scores (3' UTR)") axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") abline(0, 1) ForigCDS <- ecdf(origpcscoCDS$score) FpkgCDS <- ecdf(pkgpcscoCDS) plot(sort(origpcscoCDS$score), ForigCDS(sort(origpcscoCDS$score)), xaxt="n", yaxt="n", pch=".", cex=4, xlab="phastCons scores (CDS)", ylab="F(x)", ylim=c(0, 1)) axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") points(sort(pkgpcscoCDS), FpkgCDS(sort(pkgpcscoCDS)), pch=19, cex=1) legend("topleft", c("Original score", "Compressed score"), pch=c(46, 19), pt.cex=c(4, 1), inset=0.01, bg="white") Forig3UTRs <- ecdf(origpcsco3UTRs$score) Fpkg3UTRs <- ecdf(pkgpcsco3UTRs) plot(sort(origpcsco3UTRs$score), Forig3UTRs(sort(origpcsco3UTRs$score)), xaxt="n", yaxt="n", pch=".", cex=4, xlab="phastCons scores (3'UTR)", ylab="F(x)", ylim=c(0, 1)) axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") points(sort(pkgpcsco3UTRs), Fpkg3UTRs(sort(pkgpcsco3UTRs)), pch=19, cex=1) legend("topleft", c("Original score", "Compressed score"), pch=c(46, 19), pt.cex=c(4, 1), inset=0.01, bg="white") ## ----session_info, cache=FALSE--------------------------------------------- sessionInfo()