## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(collapse = TRUE)

## ----preliminaries, message=FALSE, echo=FALSE---------------------------------
library("ShortRead")

## ----sample, eval=FALSE-------------------------------------------------------
# sampler <- FastqSampler('E-MTAB-1147/fastq/ERR127302_1.fastq.gz', 20000)
# set.seed(123); ERR127302_1 <- yield(sampler)
# sampler <- FastqSampler('E-MTAB-1147/fastq/ERR127302_2.fastq.gz', 20000)
# set.seed(123); ERR127302_2 <- yield(sampler)

## ----stream, eval=FALSE-------------------------------------------------------
# strm <- FastqStreamer("a.fastq.gz")
# repeat {
#     fq <- yield(strm)
#     if (length(fq) == 0)
#         break
#     ## process chunk
# }

## ----sampler, eval=FALSE------------------------------------------------------
# sampler <- FastqSampler("a.fastq.gz")
# fq <- yield(sampler)

## ----countFastq---------------------------------------------------------------
fl <- system.file(package="ShortRead", "extdata", "E-MTAB-1147",
                  "ERR127302_1_subset.fastq.gz")
countFastq(fl)

## ----readFastq----------------------------------------------------------------
fq <- readFastq(fl)

## ----ShortReadQ---------------------------------------------------------------
fq
fq[1:5]
head(sread(fq), 3)
head(quality(fq), 3)

## ----encoding-----------------------------------------------------------------
encoding(quality(fq))

## ----qa-files1, eval=FALSE----------------------------------------------------
# fls <- dir("/path/to", "*fastq$", full=TRUE)

## ----qa-qa, eval=FALSE--------------------------------------------------------
# qaSummary <- qa(fls, type="fastq")

## ----qa-view, eval=FALSE------------------------------------------------------
# browseURL(report(qaSummary))

## ----qa-files2, echo=FALSE----------------------------------------------------
load("qa_E-MTAB-1147.Rda")

## ----qa-elements--------------------------------------------------------------
qaSummary

## ----qa-readCounts------------------------------------------------------------
head(qaSummary[["readCounts"]])
head(qaSummary[["baseCalls"]])

## ----filter-scheme------------------------------------------------------------
myFilterAndTrim <-
    function(fl, destination=sprintf("%s_subset", fl))
{
    ## open input stream
    stream <- open(FastqStreamer(fl))
    on.exit(close(stream))
    repeat {
        ## input chunk
        fq <- yield(stream)
        if (length(fq) == 0)
            break
        ## trim and filter, e.g., reads cannot contain 'N'...
        fq <- fq[nFilter()(fq)]  # see ?srFilter for pre-defined filters
        ## trim as soon as 2 of 5 nucleotides has quality encoding less
        ## than "4" (phred score 20)
        fq <- trimTailw(fq, 2, "4", 2)
        ## drop reads that are less than 36nt
        fq <- fq[width(fq) >= 36]
        ## append to destination
        writeFastq(fq, destination, "a")
    }
}

## ----export-------------------------------------------------------------------
## location of file
exptPath <- system.file("extdata", package="ShortRead")
sp <- SolexaPath(exptPath)
pattern <- "s_2_export.txt"
fl <- file.path(analysisPath(sp), pattern)
strsplit(readLines(fl, n=1), "\t")
length(readLines(fl))

## ----colClasses---------------------------------------------------------------
colClasses <- rep(list(NULL), 21)
colClasses[9:10] <- c("DNAString", "BString")
names(colClasses)[9:10] <- c("read", "quality")

## ----readXStringColumns-------------------------------------------------------
cols <- readXStringColumns(analysisPath(sp), pattern, colClasses)
cols

## ----size---------------------------------------------------------------------
object.size(cols$read)
object.size(as.character(cols$read))

## ----tables-------------------------------------------------------------------
tbls <- tables(fq)
names(tbls)
tbls$top[1:5]
head(tbls$distribution)

## ----srdistance---------------------------------------------------------------
dist <- srdistance(sread(fq), names(tbls$top)[1])[[1]]
table(dist)[1:10]

## ----aln-not-near-------------------------------------------------------------
fqSubset <- fq[dist>4]

## ----polya--------------------------------------------------------------------
countA <- alphabetFrequency(sread(fq))[,"A"]
fqNoPolyA <- fq[countA < 30]

## ----sessionInfo--------------------------------------------------------------
sessionInfo()