### R code from vignette source 'SearchForResearch.Rnw' ################################################### ### code chunk number 1: SearchForResearch.Rnw:51-53 ################################################### options(continue=" ") options(width=80) ################################################### ### code chunk number 2: startup ################################################### library(DECIPHER) ################################################### ### code chunk number 3: expr1 ################################################### # specify the path to your file of pattern (query) sequences: fas1 <- "<>" # OR use the example protein sequences: fas1 <- system.file("extdata", "PlanctobacteriaNamedGenes.fas.gz", package="DECIPHER") # read the sequences into memory pattern <- readAAStringSet(fas1) pattern ################################################### ### code chunk number 4: expr2 ################################################### # specify the path to your file of subject (target) sequences: fas2 <- "<>" # OR use the example subject genome: fas2 <- system.file("extdata", "Chlamydia_trachomatis_NC_000117.fas.gz", package="DECIPHER") # read the sequences into memory genome <- readDNAStringSet(fas2) genome genome <- c(genome, reverseComplement(genome)) # two strands subject <- subseq(rep(genome, each=3), rep(1:3, 2)) # six frames subject <- suppressWarnings(translate(subject)) # 6-frame translation subject ################################################### ### code chunk number 5: expr3 ################################################### index <- IndexSeqs(subject, sensitivity=0.99, percentIdentity=70, patternLength=300, processors=1) index ################################################### ### code chunk number 6: expr4 ################################################### hits <- SearchIndex(pattern, index, perPatternLimit=100, processors=1) dim(hits) head(hits) ################################################### ### code chunk number 7: expr5 ################################################### getOption("SweaveHooks")[["fig"]]() layout(matrix(1:4, nrow=2)) hist(hits$Score, breaks=100, xlab="Score", main="Distribution of scores") plot(NA, xlim=c(0, max(width(subject))), ylim=c(1, 6), xlab="Genome position", ylab="Genome frame", main="Location of k-mer matches") segments(sapply(hits$Position, `[`, i=3), # third row hits$Subject, sapply(hits$Position, `[`, i=4), # fourth row hits$Subject) plot(hits$Score, sapply(hits$Position, function(x) sum(x[2,] - x[1,] + 1)), xlab="Score", ylab="Sum of k-mer matches", main="Matches versus score", log="xy") plot(table(tabulate(hits$Pattern, nbins=length(pattern))), xlab="Hits per pattern sequence", ylab="Frequency", main="Number of hits per query") ################################################### ### code chunk number 8: expr6 ################################################### w <- which.max(tabulate(hits$Pattern)) hits[hits$Pattern == w,] names(pattern)[w] ################################################### ### code chunk number 9: expr7 ################################################### aligned <- AlignPairs(pattern=pattern, subject=subject, pairs=hits, processors=1) head(aligned) ################################################### ### code chunk number 10: expr8 ################################################### getOption("SweaveHooks")[["fig"]]() PID1 <- aligned$Matches/(aligned$Matches + aligned$Mismatches) PID2 <- aligned$Matches/aligned$AlignmentLength layout(matrix(1:4, ncol=2)) plot(hits$Score, PID2, xlab="Hit score", ylab="Matches / (Aligned length)") plot(hits$Score, aligned$Score, xlab="Hit score", ylab="Aligned score") plot(aligned$Score, PID1, xlab="Aligned score", ylab="Matches / (Matches + Mismatches)") plot(PID1, PID2, xlab="Matches / (Matches + Mismatches)", ylab="Matches / (Aligned length)") ################################################### ### code chunk number 11: expr9 ################################################### alignments <- AlignPairs(pattern=pattern, subject=subject, pairs=hits, type="both", processors=1) patterns <- alignments[[2]] subjects <- alignments[[3]] c(patterns[1], subjects[1]) # view the first pairwise alignment ################################################### ### code chunk number 12: expr10 ################################################### revhits <- SearchIndex(reverse(pattern), # reverse the query index, # keep the same target database minScore=10, # set low to get many hits processors=1) dim(revhits) ################################################### ### code chunk number 13: expr11 ################################################### getOption("SweaveHooks")[["fig"]]() X <- 10:100 # score bins Y <- tabulate(.bincode(revhits$Score, X), length(X) - 1) Y <- Y/length(pattern) # average per query w <- which(Y > 0) # needed to fit in log-space plot(X[w], Y[w], log="y", xlab="Score", ylab="Average false positives per query") fit <- function(rate) # integrate from bin start to end sum(abs((log((exp(-X[w]*rate) - exp(-X[w + 1]*rate))*length(subject)) - log(Y[w])))) o <- optimize(fit, c(0.01, 2)) # optimize rate lines(X[-length(X)], (exp(-X[-length(X)]*o$minimum) - exp(-(X[-1])*o$minimum))*length(subject)) rate <- o$minimum print(rate) ################################################### ### code chunk number 14: expr12 ################################################### # convert each Score to an E-value Evalue <- exp(-rate*hits$Score)*length(subject)*length(pattern) # determine minimum Score for up to 1 false positive hit expected log(1/length(subject)/length(pattern))/-rate ################################################### ### code chunk number 15: expr13 ################################################### # determine minimum Score for 0.05 (total) false positive hits expected threshold <- log(0.05/length(subject)/length(pattern))/-rate hits <- hits[hits$Score > threshold,] dim(hits) ################################################### ### code chunk number 16: expr14 ################################################### # include the target sequences to increase search sensitivity hits <- SearchIndex(pattern, index, subject, # optional parameter perPatternLimit=100, processors=1) dim(hits) head(hits) ################################################### ### code chunk number 17: expr15 ################################################### index <- IndexSeqs(genome, K=11, maskRepeats=FALSE, maskLCRs=FALSE, processors=1) index ################################################### ### code chunk number 18: expr16 ################################################### fas3 <- system.file("extdata", "Simulated_ONT_Long_Reads.fas.gz", package="DECIPHER") reads <- readDNAStringSet(fas3) head(names(reads)) ################################################### ### code chunk number 19: expr17 ################################################### maps <- SearchIndex(reads, index, perPatternLimit=2, # two hits per read perSubjectLimit=0, # unlimited maskRepeats=FALSE, maskLCRs=FALSE, processors=1) dim(maps) head(maps) # when > 1 hit, subtract 2nd highest score from top hit o <- order(maps$Pattern, maps$Score, decreasing=TRUE) w <- which(duplicated(maps$Pattern[o])) maps$Score[o[w - 1]] <- maps$Score[o[w - 1]] - maps$Score[o[w]] maps <- maps[-o[w],] # remove 2nd best hits nrow(maps) == length(reads) ################################################### ### code chunk number 20: expr18 ################################################### pos <- sapply(maps$Position, function(x) x[3] - x[1] + 1) offset <- abs(pos - as.integer(names(reads))) table(offset) maps$Score[which.max(offset)] ################################################### ### code chunk number 21: sessinfo ################################################### toLatex(sessionInfo(), locale=FALSE)