##############################################################################
###
### Exercise 1
### ==========

### Before you can load a Bioconductor package, you need to install it with:
###   source("http://bioconductor.org/biocLite.R")
###   biocLite("Biostrings")
### The biocLite() script will take care of installing any other BioC or CRAN
### package that the package you are trying to install depends on.
library(Biostrings)

library(BSgenome)

available.genomes()

library(BSgenome.Dmelanogaster.UCSC.dm3)

Dmelanogaster

Dmelanogaster$chr2L


##############################################################################
###
### Exercise 2
### ==========

library(BSgenome.Celegans.UCSC.ce2)

Celegans$chrI

class(Celegans$chrI)  # a DNAString object

length(Celegans$chrI)

alphabetFrequency(Celegans$chrI)

s1 <- subseq(Celegans$chrI, end=20)  # extracts the first 20 nucleotides

### See ?subseq for more information.

### Note that subseq() doesn't copy the extracted sequence data in memory
### so subseq() is very fast and very efficient even when extracting a
### substring made of millions of nucleotides (e.g. from Human chr1).

reverseComplement(s1)


##############################################################################
###
### Exercise 3
### ==========

library(drosophila2probe)

drosophila2probe

class(drosophila2probe)

colnames(drosophila2probe)

drosophila2probe$sequence[1:5]

dict0 <- DNAStringSet(drosophila2probe)

dict0

length(dict0)

width(dict0)[1:10]  # 'width(dict0)' is a very long integer vector!

dict0[-2]

rev(dict0)

dict0[[1]]

DNAStringSet(dict0, end=-3)  # remove last 2 nucleotides

DNAStringSet(dict0, start=-10)  # keep the last 10 nucleotides

af0 <- alphabetFrequency(dict0)
dim(af0)
head(af0)

alphabetFrequency(dict0, collapse=TRUE)

alphabetFrequency(reverseComplement(dict0), collapse=TRUE)

sum(af0[,"C"] + af0[,"G"] >= 0.8 * width(dict0))  # only 1 probe

ap0 <- alphabetFrequency(dict0, collapse=TRUE, as.prob=TRUE, baseOnly=TRUE)
ap0["C"] + ap0["G"]  # GC content for the entire microarray


##############################################################################
###
### Exercise 4
### ==========

library(BSgenome.Celegans.UCSC.ce2)

chrI <- Celegans$chrI

vI <- Views(chrI, start=c(5, 300, 20), width=50)
vI

subject(vI)
start(vI)
end(vI)
gaps(vI)

alphabetFrequency(vI)

DNAStringSet(vI)


##############################################################################
###
### Exercise 5
### ==========

library(BSgenome.Dmelanogaster.UCSC.dm3)

Dmelanogaster$chr2L

unmasked(Dmelanogaster$chr2L)


##############################################################################
###
### Exercise 6
### ==========

chrX <- Dmelanogaster$chrX

chrX  # See the 1st value in the maskedratio col for the %
      # of the sequence that is masked by the AGAPS mask.
      # => 0.4% of this assembly of Fly chromosome X is made
      # of assembly gaps.

alphabetFrequency(chrX, baseOnly=TRUE)  # all the Ns are masked
alphabetFrequency(unmasked(chrX), as.prob=TRUE)["N"]

as(chrX , "XStringViews")
gaps(as(chrX , "XStringViews"))

### Lengths of the assembly gaps:
width(gaps(as(chrX , "XStringViews")))


##############################################################################
###
### Exercise 7
### ==========

library(BSgenome)

available.genomes()

library(BSgenome.Celegans.UCSC.ce2)

Celegans
seqlengths(Celegans)  # from the examples in ?Celegans


##############################################################################
###
### Exercise 8
### ==========

library(BSgenome.Celegans.UCSC.ce2)

chrI <- Celegans$chrI

pattern <- DNAString("TTACCGATTTCA")

matchPattern(pattern, chrI)  # hits in the plus strand
matchPattern(reverseComplement(pattern), chrI)  # hits in the minus strand

### Look at the examples in ?reverseComplement for a discussion about why
### reverse complementing the pattern and not the subject is generally better.

matchPattern(pattern, chrI, max.mismatch=1)

matchPattern(pattern, chrI, max.mismatch=2, with.indels=TRUE)


##############################################################################
###
### Exercise 9
### ==========

library(BSgenome.Dmelanogaster.UCSC.dm3)

upstream1000 <- Dmelanogaster$upstream1000

pattern <- DNAString("TTACCGATTTCA")

mi <- vmatchPattern(pattern, upstream1000)
mi

### Extract the indices of the upstream1000 elements that received
### at least 1 hit:
upstream1000_idx <- which(countIndex(mi) != 0)

upstream1000[upstream1000_idx]


##############################################################################
###
### Exercise 10
### ===========

library(BSgenome.Celegans.UCSC.ce2)

chrI <- Celegans$chrI

pattern <- DNAString("GAACTTTGCCAC")

matchPattern(pattern, chrI)  # no match

pattern2 <- DNAString("GAACNNNGCCAC")

matchPattern(pattern2, chrI)  # no match

matchPattern(pattern2, chrI, fixed=FALSE)  # 27 matches!


##############################################################################
###
### Exercise 11
### ===========

library(drosophila2probe)

dict0 <- DNAStringSet(drosophila2probe)

pdict0 <- PDict(dict0)
pdict0

library(BSgenome.Dmelanogaster.UCSC.dm3)

chr2L <- unmasked(Dmelanogaster$chr2L)

mi <- matchPDict(pdict0, chr2L)
mi

ci <- countIndex(mi)

### Probes with the higher number of hits:
dict0_idx <- which(ci == max(ci))
dict0[dict0_idx]

Views(chr2L, mi[[dict0_idx[1]]])  # showing hits for probe number dict0_idx[1]

matchPattern(dict0[[dict0_idx[1]]], chr2L)  # same result

### Hits in the minus strand:
rcdict0 <- reverseComplement(dict0)
rcpdict0 <- PDict(rcdict0)

rcmi <- matchPDict(rcpdict0, chr2L)
rcci <- countIndex(rcmi)

### Total count of hits:
totalci <- ci + rcci

### With up to 2 mismatching nucleotides in the last 12 nucleotides of
### each probe:

pdict2 <- PDict(dict0, tb.end=-13)
pdict2

mi2 <- matchPDict(pdict2, chr2L, max.mismatch=2)
ci2 <- countIndex(mi2)
dict0_idx2 <- which(ci2 == max(ci2))
dict0[dict0_idx2]
v <- Views(chr2L, mi2[[dict0_idx2[1]]])
consensusMatrix(v)  # no mismatches

v <- Views(chr2L, mi2[[dict0_idx2[2]]]) 
consensusMatrix(v)  # 1 mismatch at the second last base
v[1:15]  # see hit number 13

### Note that some probes do have matches with 2 mismatches:
Views(chr2L, mi2[[114702]])