--- title: "An Ultra-Fast All-in-One FASTQ preprocessor" author: "Wei Wang " date: "`r format(Sys.Date(), '%m/%d/%Y')`" package: Rfastp output: BiocStyle::html_document: number_sections: yes toc: true vignette: > %\VignetteIndexEntry{Rfastp} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} %\usepackage[utf8]{inputenc} bibliography: - fastp.bib --- ```{r setup, echo=FALSE, results="hide", include = FALSE} knitr::opts_chunk$set(tidy=FALSE, cache=FALSE, #dev="png", message=FALSE, error=FALSE, warning=TRUE) options(width=100) ``` # Introduction The Rfastp package provides an interface to the all-in-one preprocessing for FastQ files toolkit [fastp](https://github.com/OpenGene/fastp)[@10.1093/bioinformatics/bty560]. # Installation Use the `BiocManager` package to download and install the package from Bioconductor as follows: ```{r getPackage, eval=FALSE} if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rfastp") ``` If required, the latest development version of the package can also be installed from GitHub. ```{r, eval = FALSE} BiocManager::install("remotes") BiocManager::install("RockefellerUniversity/Rfastp") ``` Once the package is installed, load it into your R session: ```{r} library(Rfastp) ``` # FastQ Quality Control with rfastp The package contains three example fastq files, corresponding to a single-end fastq file, a pair of paired-end fastq files. ```{r} se_read1 <- system.file("extdata","Fox3_Std_small.fq.gz",package="Rfastp") pe_read1 <- system.file("extdata","reads1.fastq.gz",package="Rfastp") pe_read2 <- system.file("extdata","reads2.fastq.gz",package="Rfastp") outputPrefix <- tempfile(tmpdir = tempdir()) ``` ## a normal QC run for single-end fastq file. Rfastp support multiple threads, set threads number by parameter `thread`. ```{r} se_json_report <- rfastp(read1 = se_read1, outputFastq = paste0(outputPrefix, "_se"), thread = 4) ``` ## a normal QC run for paired-end fastq files. ```{r} pe_json_report <- rfastp(read1 = pe_read1, read2 = pe_read2, outputFastq = paste0(outputPrefix, "_pe")) ``` ## merge paired-end fastq files after QC. ```{r} pe_merge_json_report <- rfastp(read1 = pe_read1, read2 = pe_read2, merge = TRUE, outputFastq = paste0(outputPrefix, '_unpaired'), mergeOut = paste0(outputPrefix, "_merged.fastq.gz")) ``` ## UMI processing ### a normal UMI processing for 10X Single-Cell library. ```{r} umi_json_report <- rfastp(read1 = pe_read1, read2 = pe_read2, outputFastq = paste0(outputPrefix, '_umi1'), umi = TRUE, umiLoc = "read1", umiLength = 16) ``` ### Set a customized UMI prefix and location in sequence name. the following example will add prefix string before the UMI sequence in the sequence name. An "_" will be added between the prefix string and UMI sequence. The UMI sequences will be inserted into the sequence name before the first space. ```{r umi} umi_json_report <- rfastp(read1 = pe_read1, read2 = pe_read2, outputFastq = paste0(outputPrefix, '_umi2'), umi = TRUE, umiLoc = "read1", umiLength = 16, umiPrefix = "#", umiNoConnection = TRUE, umiIgnoreSeqNameSpace = TRUE) ``` ## A QC example with customized cutoffs and adapter sequence. Trim poor quality bases at 3' end base by base with quality higher than 5; trim poor quality bases at 5' end by a 29bp window with mean quality higher than 20; disable the polyG trimming, specify the adapter sequence for read1. ```{r} clipr_json_report <- rfastp(read1 = se_read1, outputFastq = paste0(outputPrefix, '_clipr'), disableTrimPolyG = TRUE, cutLowQualFront = TRUE, cutFrontWindowSize = 29, cutFrontMeanQual = 20, cutLowQualTail = TRUE, cutTailWindowSize = 1, cutTailMeanQual = 5, minReadLength = 29, adapterSequenceRead1 = 'GTGTCAGTCACTTCCAGCGG' ) ``` ## multiple input files for read1/2 in a vector. rfastq can accept multiple input files, and it will concatenate the input files into one and the run fastp. ```{r} pe001_read1 <- system.file("extdata","splited_001_R1.fastq.gz", package="Rfastp") pe002_read1 <- system.file("extdata","splited_002_R1.fastq.gz", package="Rfastp") pe003_read1 <- system.file("extdata","splited_003_R1.fastq.gz", package="Rfastp") pe004_read1 <- system.file("extdata","splited_004_R1.fastq.gz", package="Rfastp") inputfiles <- c(pe001_read1, pe002_read1, pe003_read1, pe004_read1) cat_rjson_report <- rfastp(read1 = inputfiles, outputFastq = paste0(outputPrefix, "_merged1")) ``` # concatenate multiple fastq files. ## catfastq concatenate all the input files into a new file. ```{r} pe001_read2 <- system.file("extdata","splited_001_R2.fastq.gz", package="Rfastp") pe002_read2 <- system.file("extdata","splited_002_R2.fastq.gz", package="Rfastp") pe003_read2 <- system.file("extdata","splited_003_R2.fastq.gz", package="Rfastp") pe004_read2 <- system.file("extdata","splited_004_R2.fastq.gz", package="Rfastp") inputR2files <- c(pe001_read2, pe002_read2, pe003_read2, pe004_read2) catfastq(output = paste0(outputPrefix,"_merged2_R2.fastq.gz"), inputFiles = inputR2files) ``` # Generate report tables/plots ## A data frame for the summary. ```{r} dfsummary <- qcSummary(pe_json_report) ``` ## a ggplot2 object of base quality plot. ```{r} p1 <- curvePlot(se_json_report) p1 ``` ## a ggplot2 object of GC Content plot. ```{r} p2 <- curvePlot(se_json_report, curve="content_curves") p2 ``` ## a data frame for the trimming summary. ```{r} dfTrim <- trimSummary(pe_json_report) ``` # Miscellaneous helper functions usage of rfastp: ```{r} ?rfastp ``` usage of catfastq: ```{r} ?catfastq ``` usage of qcSummary: ```{r} ?qcSummary ``` usage of trimSummary: ```{r} ?trimSummary ``` usage of curvePlot: ```{r} ?curvePlot ``` # Acknowledgments Thank you to Ji-Dung Luo for testing/vignette review/critical feedback, Doug Barrows for critical feedback/vignette review and Ziwei Liang for their support. # Session info ```{r sessionInfo} sessionInfo() ``` # References