############################################################################## ### ### Exercise 1 ### ========== ### Before you can load a Bioconductor package, you need to install it with: ### source("http://bioconductor.org/biocLite.R") ### biocLite("Biostrings") ### The biocLite() script will take care of installing any other BioC or CRAN ### package that the package you are trying to install depends on. library(Biostrings) library(BSgenome) available.genomes() library(BSgenome.Dmelanogaster.UCSC.dm3) ############################################################################## ### ### Exercise 2 ### ========== seq <- DNAString("aagtac") nchar(seq) alphabetFrequency(seq) reverseComplement(seq) subseq(seq, start=2, end=4) ### The same substring can be extracted with subseq(seq, start=2, width=3) ### or with subseq(seq, end=4, width=3) ### or with subseq(seq, end=-3, width=3) ### See ?subseq for more information. ### Note that subseq() doesn't copy the extracted sequence data in memory ### so subseq() is very fast and very efficient even when extracting a ### substring made of millions of nucleotides (e.g. from Human chr1). ############################################################################## ### ### Exercise 3 ### ========== x0 <- DNAStringSet(c("aagtac", "ccc", "gtt")) width(x0) length(x0) x0[-2] ### 2 equivalent ways to invert the order of x0'elements: ### 1. by subsetting: x0[3:1] ### 2. with rev(): rev(x0) x0[[1]] ### You could do this: DNAStringSet(x0, end=width(x0)-2) ### but this is much better: DNAStringSet(x0, end=-3) alphabetFrequency(x0) ### Note that you can collapse the matrix returned by alphabetFrequency() ### with: alphabetFrequency(x0, collapse=TRUE) ### Also you can look at the frequencies of the base letters only: alphabetFrequency(x0, baseOnly=TRUE) alphabetFrequency(x0, baseOnly=TRUE, collapse=TRUE) reverseComplement(x0) ############################################################################## ### ### Exercise 4 ### ========== library(hgu95av2probe) ### Only one single object is defined in a "probe" package, it has the name ### of the package: hgu95av2probe class(hgu95av2probe) dim(hgu95av2probe) ### The "variables" of a data frame are its columns: colnames(hgu95av2probe) ### In a data frame, all the variables always have the same length which is ### the number of rows of the data frame. We access a data frame variable with ### the $ operator: length(hgu95av2probe$sequence) ### See the first 10 elements of the "sequence" variable: hgu95av2probe$sequence[1:10] nchar(hgu95av2probe$sequence[1:10]) ### Yes they look like 25-mers which is what we expect for an Affymetrix chip. ### Let's store the "sequence" variable into a DNAStringSet object: x1 <- DNAStringSet(hgu95av2probe$sequence) x1 ### First we get the matrix of nucleotide frequencies: af1 <- alphabetFrequency(x1, baseOnly=TRUE, freq=TRUE) colnames(af1) ### Then we get the vector of CG content GCcontent <- af1[ , "C"] + af1[ , "G"] ### or equivalently: GCcontent2 <- rowSums(af1[ , c("C", "G")]) ### And finally we count (with sum()) the number of elements in GCcontent that ### are >= 0.8: sum(GCcontent >= 0.8) ### And for the entire microarray: af1global <- alphabetFrequency(x1, baseOnly=TRUE, collapse=TRUE, freq=TRUE) sum(af1global[c("C","G")]) # 48% ############################################################################## ### ### Exercise 5 ### ========== subj <- DNAString("taacacgacctga") v <- views(subj, start=c(4, 7, 1), end=c(9, 10, 2)) v subject(v) start(v) end(v) gaps(v) alphabetFrequency(v) DNAStringSet(v) ############################################################################## ### ### Exercise 6 ### ========== library(BSgenome.Celegans.UCSC.ce2) Celegans Celegans$chrI Celegans$chrII alphabetFrequency(Celegans$chrI, baseOnly=TRUE) alphabetFrequency(Celegans$chrII, baseOnly=TRUE) ### yes, chrII contains IUPAC extended letters library(BSgenome.Dmelanogaster.UCSC.dm3) Dmelanogaster alphabetFrequency(Dmelanogaster$chr2L, baseOnly=TRUE) ## yes, chr2L contains IUPAC extended letters chr2Lhead <- subseq(Dmelanogaster$chr2L, start=1, end=5000000) slices <- views(chr2Lhead, start=20:5000000 - 19, end=20:5000000) slices ### Look at the examples in ?chartr for how to simulate a bisulfite ### transformation of a DNA sequence. ############################################################################## ### ### Exercise 7 ### ========== subject <- Dmelanogaster$chr2L pattern1 <- DNAString("ACCACTGAA") hits1 <- matchPattern(pattern1, subject) hits1 pattern2 <- DNAString("ACCACTGAAATG") hits2 <- matchPattern(pattern2, subject) hits2 ### and to search the minus strand: rcpattern1 <- reverseComplement(pattern1) rchits1 <- matchPattern(rcpattern1, subject) rchits1 rcpattern2 <- reverseComplement(pattern2) rchits2 <- matchPattern(rcpattern2, subject) rchits2 ### Look at the examples in ?reverseComplement for a discussion about why ### reverse complementing the pattern and not the subject is generally better. ############################################################################## ### ### Exercise 8 ### ========== subject <- Dmelanogaster$chr2L matchPattern("GAACTTTGCCACTC", subject) ### We can replace the 6th letter by an N (in the IUPAC extended alphabet, N ### is a wildcard that matches any nucleotide, see the IUPAC_CODE_MAP ### predefined vector for the mapping between extended letters and what they ### match): pattern <- DNAString("GAACTNTGCCACTC") matchPattern(pattern, subject) ### Also we cannot use 'fixed=TRUE' if we want the N to be interpreted as ### an ambiguity: matchPattern(pattern, subject, fixed=FALSE) ### In fact, we don't want the Ns in the subject to be interpreted as ### ambiguities, only in the pattern. This is achieved with: matchPattern(pattern, subject, fixed="subject") ############################################################################## ### ### Exercise 9 ### ========== library(BSgenome.Hsapiens.UCSC.hg18) chr1 <- Hsapiens$chr1 chr1 maskedratio(chr1) active(masks(chr1))[1] <- TRUE # activate mask of assembly gaps chr1 maskedratio(chr1) alphabetFrequency(chr1) alphabetFrequency(unmasked(chr1)) active(masks(chr1)) <- TRUE # activate all the masks Ebox <- "CANNTG" hits_allmasks <- matchPattern(Ebox, chr1, fixed=FALSE) hits_allmasks hits_nomasks <- matchPattern(Ebox, unmasked(chr1), fixed="subject") hits_nomasks ### So by masking, we keep only E-boxes that are not in repeat regions. ############################################################################## ### ### Exercise 10 ### =========== chr2 <- Hsapiens$chr2 active(masks(chr2))[1] <- TRUE # activate mask of assembly gaps alphabetFrequency(chr2) # we see 2855 intra-contig Ns! v2 <- as(chr2 , "XStringViews") v2 # we don't immediately see the intra-contig Ns but this doesn't mean that # they are not here alphabetFrequency(v2) # yes they are here, in the 6th contig (6th view) ### A mask inversion is needed in order to get the inter-view gaps (or ### inter-contig gaps, or assembly gaps). That's what gaps() does: gaps(v2) ############################################################################## ### ### Exercise 11 ### =========== library(BSgenome.Dmelanogaster.UCSC.dm3) chrX <- Dmelanogaster$chrX ### A first attempt: cpals_FchrX <- findComplementedPalindromes(chrX, min.armlength=30, max.looplength=10) cpals_FchrX ### The search will be cleaner if we mask the assembly gaps: active(masks(chrX))[1] <- TRUE cpals_FchrXcleaner <- findComplementedPalindromes(chrX, min.armlength=30, max.looplength=10) cpals_FchrXcleaner ### Now with Human chromosome 2: cpals_Hchr2 <- findComplementedPalindromes(chr2, min.armlength=30, max.looplength=10) cpals_Hchr2 af <- alphabetFrequency(cpals_Hchr2) ii <- which(af[ , "A"] != 0 & af[ , "C"] != 0 & af[ , "G"] != 0 & af[ , "T"] != 0) cpals_Hchr2[ii] # 105 palindromic regions contain the 4 bases ############################################################################## ### ### Exercise 12 ### =========== library(BSgenome.Hsapiens.UCSC.hg18) library(hgu95av2probe) dict0 <- DNAStringSet(hgu95av2probe$sequence) ### Preprocess: pdict0 <- PDict(dict0) ### Find the hits: mindex <- matchPDict(pdict0, Hsapiens$chr1) ### Extract the number of hits for each probe: nhits <- countIndex(mindex) ### Probe(s) with the highest number of hits: max(nhits) # highest number of hits per probe is 11293 dict0[nhits == max(nhits)] # see probe(s) with highest number of hits as.data.frame(hgu95av2probe[nhits == max(nhits), ]) # get more information # about this/these probe(s) ### Note that by using 'which.max(nhits)' instead of 'nhits == max(nhits)' above, ### you could have missed the other probes that have the same highest number of ### hits in case there were more than one. See ?which.max for the details. which.max(nhits) # probe 201534 probe201534_hits <- mindex[[which.max(nhits)]] views(Hsapiens$chr1, start=start(probe201534_hits), end=end(probe201534_hits)) ############################################################################## ### ### Exercise 13 ### =========== library(BSgenome.Hsapiens.UCSC.hg18) library(SNPlocs.Hsapiens.dbSNP.20071016) hg18snp <- injectSNPs(Hsapiens, 'SNPlocs.Hsapiens.dbSNP.20071016') hg18snp chr1snps <- hg18snp$chr1 alphabetFrequency(chr1snps) # lots of IUPAC extended letters (1 extended # letter per SNP!) alphabetFrequency(Hsapiens$chr1) # no IUPAC extended letters (beside the Ns) ### Our first attempt to count the hits for the hgu95av2 probes fails because ### there are "too many IUPAC ambiguity letters in 'subject'": #nhits <- countPDict(pdict0, chr1snps, fixed="pattern") # first attempt fails! ### We need to mask the assembly gaps for this to work properly: active(masks(chr1snps))[1] <- TRUE nhits <- countPDict(pdict0, chr1snps, fixed="pattern") max(nhits) # highest number of hits per probe is now 11492 instead of 11293 # (generally speaking, all probes have now more hits) which.max(nhits) # probe 201534 again probe201534 <- dict0[[which.max(nhits)]] probe201534_hits <- matchPattern(probe201534, chr1snps, fixed="pattern") consmat(probe201534_hits) # a simple confirmation that the hits contain SNPs ii <- which(probe201534_hits != probe201534) probe201534_hits[ii] # probe 201534 has 635 hits that contain at least 1 # known SNP