############################################################################## ### ### Exercise 1 ### ========== ### Before you can load a Bioconductor package, you need to install it with: ### source("http://bioconductor.org/biocLite.R") ### biocLite("Biostrings") ### The biocLite() script will take care of installing any other BioC or CRAN ### package that the package you are trying to install depends on. library(Biostrings) library(BSgenome) available.genomes() library(BSgenome.Dmelanogaster.UCSC.dm3) Dmelanogaster Dmelanogaster$chr2L ############################################################################## ### ### Exercise 2 ### ========== library(BSgenome.Celegans.UCSC.ce2) Celegans$chrI class(Celegans$chrI) # a DNAString object length(Celegans$chrI) alphabetFrequency(Celegans$chrI) s1 <- subseq(Celegans$chrI, end=20) # extracts the first 20 nucleotides ### See ?subseq for more information. ### Note that subseq() doesn't copy the extracted sequence data in memory ### so subseq() is very fast and very efficient even when extracting a ### substring made of millions of nucleotides (e.g. from Human chr1). reverseComplement(s1) ############################################################################## ### ### Exercise 3 ### ========== library(drosophila2probe) drosophila2probe class(drosophila2probe) colnames(drosophila2probe) drosophila2probe$sequence[1:5] dict0 <- DNAStringSet(drosophila2probe) dict0 length(dict0) width(dict0)[1:10] # 'width(dict0)' is a very long integer vector! dict0[-2] rev(dict0) dict0[[1]] DNAStringSet(dict0, end=-3) # remove last 2 nucleotides DNAStringSet(dict0, start=-10) # keep the last 10 nucleotides af0 <- alphabetFrequency(dict0) dim(af0) head(af0) alphabetFrequency(dict0, collapse=TRUE) alphabetFrequency(reverseComplement(dict0), collapse=TRUE) sum(af0[,"C"] + af0[,"G"] >= 0.8 * width(dict0)) # only 1 probe ap0 <- alphabetFrequency(dict0, collapse=TRUE, as.prob=TRUE, baseOnly=TRUE) ap0["C"] + ap0["G"] # GC content for the entire microarray ############################################################################## ### ### Exercise 4 ### ========== library(BSgenome.Celegans.UCSC.ce2) chrI <- Celegans$chrI vI <- Views(chrI, start=c(5, 300, 20), width=50) vI subject(vI) start(vI) end(vI) gaps(vI) alphabetFrequency(vI) DNAStringSet(vI) ############################################################################## ### ### Exercise 5 ### ========== library(BSgenome.Dmelanogaster.UCSC.dm3) Dmelanogaster$chr2L unmasked(Dmelanogaster$chr2L) ############################################################################## ### ### Exercise 6 ### ========== chrX <- Dmelanogaster$chrX chrX # See the 1st value in the maskedratio col for the % # of the sequence that is masked by the AGAPS mask. # => 0.4% of this assembly of Fly chromosome X is made # of assembly gaps. alphabetFrequency(chrX, baseOnly=TRUE) # all the Ns are masked alphabetFrequency(unmasked(chrX), as.prob=TRUE)["N"] as(chrX , "XStringViews") gaps(as(chrX , "XStringViews")) ### Lengths of the assembly gaps: width(gaps(as(chrX , "XStringViews"))) ############################################################################## ### ### Exercise 7 ### ========== library(BSgenome) available.genomes() library(BSgenome.Celegans.UCSC.ce2) Celegans seqlengths(Celegans) # from the examples in ?Celegans ############################################################################## ### ### Exercise 8 ### ========== library(BSgenome.Celegans.UCSC.ce2) chrI <- Celegans$chrI pattern <- DNAString("TTACCGATTTCA") matchPattern(pattern, chrI) # hits in the plus strand matchPattern(reverseComplement(pattern), chrI) # hits in the minus strand ### Look at the examples in ?reverseComplement for a discussion about why ### reverse complementing the pattern and not the subject is generally better. matchPattern(pattern, chrI, max.mismatch=1) matchPattern(pattern, chrI, max.mismatch=2, with.indels=TRUE) ############################################################################## ### ### Exercise 9 ### ========== library(BSgenome.Dmelanogaster.UCSC.dm3) upstream1000 <- Dmelanogaster$upstream1000 pattern <- DNAString("TTACCGATTTCA") mi <- vmatchPattern(pattern, upstream1000) mi ### Extract the indices of the upstream1000 elements that received ### at least 1 hit: upstream1000_idx <- which(countIndex(mi) != 0) upstream1000[upstream1000_idx] ############################################################################## ### ### Exercise 10 ### =========== library(BSgenome.Celegans.UCSC.ce2) chrI <- Celegans$chrI pattern <- DNAString("GAACTTTGCCAC") matchPattern(pattern, chrI) # no match pattern2 <- DNAString("GAACNNNGCCAC") matchPattern(pattern2, chrI) # no match matchPattern(pattern2, chrI, fixed=FALSE) # 27 matches! ############################################################################## ### ### Exercise 11 ### =========== library(drosophila2probe) dict0 <- DNAStringSet(drosophila2probe) pdict0 <- PDict(dict0) pdict0 library(BSgenome.Dmelanogaster.UCSC.dm3) chr2L <- unmasked(Dmelanogaster$chr2L) mi <- matchPDict(pdict0, chr2L) mi ci <- countIndex(mi) ### Probes with the higher number of hits: dict0_idx <- which(ci == max(ci)) dict0[dict0_idx] Views(chr2L, mi[[dict0_idx[1]]]) # showing hits for probe number dict0_idx[1] matchPattern(dict0[[dict0_idx[1]]], chr2L) # same result ### Hits in the minus strand: rcdict0 <- reverseComplement(dict0) rcpdict0 <- PDict(rcdict0) rcmi <- matchPDict(rcpdict0, chr2L) rcci <- countIndex(rcmi) ### Total count of hits: totalci <- ci + rcci ### With up to 2 mismatching nucleotides in the last 12 nucleotides of ### each probe: pdict2 <- PDict(dict0, tb.end=-13) pdict2 mi2 <- matchPDict(pdict2, chr2L, max.mismatch=2) ci2 <- countIndex(mi2) dict0_idx2 <- which(ci2 == max(ci2)) dict0[dict0_idx2] v <- Views(chr2L, mi2[[dict0_idx2[1]]]) consensusMatrix(v) # no mismatches v <- Views(chr2L, mi2[[dict0_idx2[2]]]) consensusMatrix(v) # 1 mismatch at the second last base v[1:15] # see hit number 13 ### Note that some probes do have matches with 2 mismatches: Views(chr2L, mi2[[114702]])