## ----style, echo = FALSE, results = 'asis'-------------------------------------------------------- BiocStyle::markdown() options(width = 100, max.print = 1000) knitr::opts_chunk$set( eval = as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache = as.logical(Sys.getenv("KNITR_CACHE", "TRUE")), tidy.opts = list(width.cutoff = 100), tidy = FALSE) ## ----setting, eval=TRUE, echo=FALSE--------------------------------------------------------------- if (file.exists("spr_project")) unlink("spr_project", recursive = TRUE) is_win <- Sys.info()[['sysname']] != "Windows" ## ----load_library, eval=TRUE, include=FALSE------------------------------------------------------- suppressPackageStartupMessages({ library(systemPipeR) }) ## ----utilities, eval=TRUE, warning= FALSE, echo=FALSE, out.width="100%", fig.align = "center", fig.cap= "Important functionalities of systemPipeR. (A) Illustration of workflow design concepts, and (B) examples of visualization functionalities for NGS data.", warning=FALSE---- knitr::include_graphics("images/utilities.png") ## ----sysargslistImage, warning= FALSE, eval=TRUE, echo=FALSE, out.width="100%", fig.align = "center", fig.cap= "Workflow management class. Workflows are defined and managed by the `SYSargsList` (`SAL`) control class. Components of `SAL` include `SYSargs2` and/or `LineWise` for defining CL- and R-based workflow steps, respectively. The former are constructed from a `targets` and two CWL *param* files, and the latter comprises mainly R code.", warning=FALSE---- knitr::include_graphics("images/SYSargsList.png") ## ----sprandCWL, warning=FALSE, eval=TRUE, echo=FALSE, out.width="100%", fig.align = "center", fig.cap= "Parameter files. Illustration how the different fields in cwl, yml and targets files are connected to assemble command-line calls, here for 'Hello World' example.", warning=FALSE---- knitr::include_graphics("images/SPR_CWL_hello.png") ## ----general, warning= FALSE, eval=TRUE, echo=FALSE, out.width="100%", fig.align = "center", fig.cap= "Overview of `systemPipeR` workflows management instances. (A) A typical analysis workflow requires multiple software tools (red), metadata for describing the input (green) and output data, and analysis reports for interpreting the results (purple). B) The environment provides utilities for designing and building workflows containing R and/or command-line steps, for managing the workflow runs. C) Options are provided to execute single or multiple workflow steps. This includes a high level of scalability, functionalities for checkpointing, and generating of technical and scientific reports.", warning=FALSE---- knitr::include_graphics("images/general.png") ## ----install, eval=FALSE-------------------------------------------------------------------------- # if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager") # BiocManager::install("systemPipeR") # BiocManager::install("systemPipeRdata") ## ----eval=FALSE----------------------------------------------------------------------------------- # systemPipeRdata::genWorkenvir(workflow = "rnaseq") # setwd("rnaseq") ## ----eval=FALSE----------------------------------------------------------------------------------- # library(systemPipeR) # # Initialize workflow project # sal <- SPRproject() # ## Creating directory '/home/myuser/systemPipeR/rnaseq/.SPRproject' # ## Creating file '/home/myuser/systemPipeR/rnaseq/.SPRproject/SYSargsList.yml' # sal <- importWF(sal, file_path = "systemPipeRNAseq.Rmd") # import into sal the WF steps defined by chosen Rmd file # # ## The following print statements, issued during the import, are shortened for brevity # ## Import messages for first 3 of 20 steps total # ## Parse chunk code # ## Now importing step 'load_SPR' # ## Now importing step 'preprocessing' # ## Now importing step 'trimming' # ## Now importing step '...' # ## ... # # ## Now check if required CL tools are installed # ## Messages for 4 of 7 CL tools total # ## step_name tool in_path # ## 1 trimming trimmomatic TRUE # ## 2 hisat2_index hisat2-build TRUE # ## 3 hisat2_mapping hisat2 TRUE # ## 4 hisat2_mapping samtools TRUE # ## ... ## ----eval=FALSE----------------------------------------------------------------------------------- # sal # ## Instance of 'SYSargsList': # ## WF Steps: # ## 1. load_SPR --> Status: Pending # ## 2. preprocessing --> Status: Pending # ## Total Files: 36 | Existing: 0 | Missing: 36 # ## 2.1. preprocessReads-pe # ## cmdlist: 18 | Pending: 18 # ## 3. trimming --> Status: Pending # ## Total Files: 72 | Existing: 0 | Missing: 72 # ## 4. - 20. not shown here for brevity ## ----eval=FALSE----------------------------------------------------------------------------------- # sal <- runWF(sal) ## ----eval=FALSE----------------------------------------------------------------------------------- # sal ## ----wf-status-image, warning=FALSE, eval=TRUE, echo=FALSE, out.width="100%", fig.align = "center", fig.cap= "Status check of workflow. The run status flags of each workflow step are given in its summary view.", warning=FALSE---- knitr::include_graphics("images/runwf.png") ## ----eval=FALSE----------------------------------------------------------------------------------- # plotWF(sal) ## ----rnaseq-toplogy, eval=TRUE, warning= FALSE, echo=FALSE, out.width="100%", fig.align = "center", fig.cap= "Toplogy graph of RNA-Seq workflow.", warning=FALSE---- knitr::include_graphics("images/plotWF.png") ## ----eval=FALSE----------------------------------------------------------------------------------- # # Scietific report # sal <- renderReport(sal) # rmarkdown::render("systemPipeRNAseq.Rmd", clean = TRUE, output_format = "BiocStyle::html_document") # # # Technical (log) report # sal <- renderLogs(sal) ## ----dir, eval=TRUE, echo=FALSE, warning= FALSE, out.width="100%", fig.align = "center", fig.cap= "Directory structure of workflows.", warning=FALSE---- knitr::include_graphics("images/spr_project.png") ## ----targetsSE, eval=TRUE------------------------------------------------------------------------- targetspath <- system.file("extdata", "targets.txt", package = "systemPipeR") showDF(read.delim(targetspath, comment.char = "#")) ## ----targetsPE, eval=TRUE------------------------------------------------------------------------- targetspath <- system.file("extdata", "targetsPE.txt", package = "systemPipeR") showDF(read.delim(targetspath, comment.char = "#")) ## ----comment_lines, echo=TRUE--------------------------------------------------------------------- readLines(targetspath)[1:4] ## ----targetscomp, eval=TRUE----------------------------------------------------------------------- readComp(file = targetspath, format = "vector", delim = "-") ## ----targetsFig, eval=TRUE, echo=FALSE, out.width="100%", fig.align = "center", fig.cap= "_`systemPipeR`_ automatically creates the downstream `targets` files based on the previous steps outfiles. A) Usually, users provide the initial `targets` files, and this step will generate some outfiles, as demonstrated on B. Then, those files are used to build the new `targets` files as inputs in the next step. _`systemPipeR`_ (C) manages this connectivity among the steps automatically for the users.", warning=FALSE---- knitr::include_graphics("images/targets_con.png") ## ----cleaning1, eval=TRUE, include=FALSE---------------------------------------------------------- if (file.exists(".SPRproject")) unlink(".SPRproject", recursive = TRUE) ## NOTE: Removes previous project created in the quick-start section ## ----SPRproject1a, eval=FALSE--------------------------------------------------------------------- # sal <- SPRproject() ## ----SPRproject1, eval=TRUE----------------------------------------------------------------------- sal <- SPRproject(projPath = getwd(), overwrite = TRUE) ## ----SPRproject_dir, eval=FALSE------------------------------------------------------------------- # sal <- SPRproject(data = "data", param = "param", results = "results") ## ----SPRproject_logs, eval=FALSE------------------------------------------------------------------ # sal <- SPRproject(logs.dir= ".SPRproject", sys.file=".SPRproject/SYSargsList.yml") ## ----SPRproject_env, eval=FALSE------------------------------------------------------------------- # sal <- SPRproject(envir = new.env()) ## ----projectInfo, eval=TRUE----------------------------------------------------------------------- sal projectInfo(sal) ## ----length, eval=TRUE---------------------------------------------------------------------------- length(sal) ## ----sal_check, eval=TRUE------------------------------------------------------------------------- sal ## ----firstStep_R, eval=TRUE----------------------------------------------------------------------- appendStep(sal) <- LineWise(code = { mapply(function(x, y) write.csv(x, y), split(iris, factor(iris$Species)), file.path("results", paste0(names(split(iris, factor(iris$Species))), ".csv")) ) }, step_name = "export_iris") ## ----show, eval=TRUE------------------------------------------------------------------------------ sal ## ----codeLine, eval=TRUE-------------------------------------------------------------------------- codeLine(sal) ## ----gzip_secondStep, eval=TRUE------------------------------------------------------------------- targetspath <- system.file("extdata/cwl/gunzip", "targets_gunzip.txt", package = "systemPipeR") appendStep(sal) <- SYSargsList(step_name = "gzip", targets = targetspath, dir = TRUE, wf_file = "gunzip/workflow_gzip.cwl", input_file = "gunzip/gzip.yml", dir_path = system.file("extdata/cwl", package = "systemPipeR"), inputvars = c(FileName = "_FILE_PATH_", SampleName = "_SampleName_"), dependency = "export_iris") ## ------------------------------------------------------------------------------------------------- sal ## ------------------------------------------------------------------------------------------------- cmdlist(sal, step = "gzip") # cmdlist(sal, step = "gzip", targets=c("SE")) ## ------------------------------------------------------------------------------------------------- # outfiles(sal) # output files of all steps in sal outfiles(sal)['gzip'] # output files of 'gzip' step # colnames(outfiles(sal)$gzip) # returns column name passed on to `inputvars` ## ----gunzip, eval=TRUE---------------------------------------------------------------------------- appendStep(sal) <- SYSargsList(step_name = "gunzip", targets = "gzip", dir = TRUE, wf_file = "gunzip/workflow_gunzip.cwl", input_file = "gunzip/gunzip.yml", dir_path = system.file("extdata/cwl", package = "systemPipeR"), inputvars = c(gzip_file = "_FILE_PATH_", SampleName = "_SampleName_"), rm_targets_col = "FileName", dependency = "gzip") ## ------------------------------------------------------------------------------------------------- sal ## ----targetsWF_3, eval=TRUE----------------------------------------------------------------------- targetsWF(sal['gunzip']) ## ----outfiles_2, eval=TRUE------------------------------------------------------------------------ outfiles(sal['gunzip']) ## ----eval=TRUE------------------------------------------------------------------------------------ cmdlist(sal["gunzip"], targets = 1) ## ----getColumn, eval=TRUE------------------------------------------------------------------------- getColumn(sal, step = "gunzip", 'outfiles') ## ----iris_stats, eval=TRUE------------------------------------------------------------------------ appendStep(sal) <- LineWise(code = { df <- lapply(getColumn(sal, step = "gunzip", 'outfiles'), function(x) read.delim(x, sep = ",")[-1]) df <- do.call(rbind, df) stats <- data.frame(cbind(mean = apply(df[,1:4], 2, mean), sd = apply(df[,1:4], 2, sd))) stats$size <- rownames(stats) plot <- ggplot2::ggplot(stats, ggplot2::aes(x = size, y = mean, fill = size)) + ggplot2::geom_bar(stat = "identity", color = "black", position = ggplot2::position_dodge()) + ggplot2::geom_errorbar(ggplot2::aes(ymin = mean-sd, ymax = mean+sd), width = .2, position = ggplot2::position_dodge(.9)) }, step_name = "iris_stats", dependency = "gzip") ## ------------------------------------------------------------------------------------------------- sal ## ----importWF_rmd, eval=TRUE---------------------------------------------------------------------- sal_rmd <- SPRproject(logs.dir = ".SPRproject_rmd") sal_rmd <- importWF(sal_rmd, file_path = system.file("extdata", "spr_simple_wf.Rmd", package = "systemPipeR")) ## ----importWF_details, eval=FALSE----------------------------------------------------------------- # sal_rmd # stepsWF(sal_rmd) # dependency(sal_rmd) # cmdlist(sal_rmd) # codeLine(sal_rmd) # targetsWF(sal_rmd) # statusWF(sal_rmd) ## ----fromFile_example_rules_cmd, eval=FALSE------------------------------------------------------- # targetspath <- system.file("extdata/cwl/example/targets_example.txt", package = "systemPipeR") # appendStep(sal) <- SYSargsList(step_name = "Example", # targets = targetspath, # wf_file = "example/example.cwl", input_file = "example/example.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c(Message = "_STRING_", SampleName = "_SAMPLE_")) ## ----fromFile_example_rules_r, eval=FALSE--------------------------------------------------------- # appendStep(sal) <- LineWise(code = { # library(systemPipeR) # }, # step_name = "load_lib") ## ----runWF, eval=is_win--------------------------------------------------------------------------- sal <- runWF(sal) ## ----runWF_error, eval=FALSE---------------------------------------------------------------------- # sal <- runWF(sal, steps = c(1,3)) ## ----runWF_force, eval=FALSE---------------------------------------------------------------------- # sal <- runWF(sal, force = TRUE, warning.stop = FALSE, error.stop = TRUE) ## ----runWF_env, eval=FALSE------------------------------------------------------------------------ # viewEnvir(sal) ## ----runWF_saveenv, eval=FALSE-------------------------------------------------------------------- # sal <- runWF(sal, saveEnv = TRUE) ## ----show_statusWF_details1, eval=TRUE------------------------------------------------------------ sal ## ----show_statusWF_details2, eval=FALSE----------------------------------------------------------- # stepsWF(sal) # dependency(sal) # cmdlist(sal) # codeLine(sal) # targetsWF(sal) # statusWF(sal) # projectInfo(sal) ## ----save_sal, eval=FALSE------------------------------------------------------------------------- # sal <- write_SYSargsList(sal) ## ----module_cmds, eval=FALSE---------------------------------------------------------------------- # module(action_type="load", module_name="hisat2") # moduleload("hisat2") # alternative command # moduleunload("hisat2") # modulelist() # list software loaded into current session # moduleAvail() # list all software available in module system ## ----list_module, eval=FALSE---------------------------------------------------------------------- # listCmdModules(sal) # listCmdTools(sal) ## ----runWF_cluster, eval=FALSE-------------------------------------------------------------------- # resources <- list(conffile=".batchtools.conf.R", # template="batchtools.slurm.tmpl", # Njobs=18, # walltime=120, ## in minutes # ntasks=1, # ncpus=4, # memory=1024, ## in Mb # partition = "short" # ) # sal <- addResources(sal, step=c("hisat2_mapping"), resources = resources) # sal <- runWF(sal) ## ----eval=TRUE------------------------------------------------------------------------------------ plotWF(sal, show_legend = TRUE, width = "80%", rstudio = TRUE) ## ----eval=FALSE----------------------------------------------------------------------------------- # sal <- renderReport(sal) # # rmarkdown::render("my.Rmd", clean = TRUE, output_format = "BiocStyle::html_document") ## ----eval=FALSE----------------------------------------------------------------------------------- # sal <- renderLogs(sal) ## ----eval=FALSE----------------------------------------------------------------------------------- # sal2rmd(sal) ## ----eval=FALSE----------------------------------------------------------------------------------- # sal2bash(sal) ## ----SPR_resume, eval=FALSE----------------------------------------------------------------------- # sal <- SPRproject(resume = TRUE) ## ----resume_load, eval=FALSE---------------------------------------------------------------------- # sal <- SPRproject(resume = TRUE, load.envir = TRUE) ## ----envir, eval=FALSE---------------------------------------------------------------------------- # viewEnvir(sal) # copyEnvir(sal, list="plot", new.env = globalenv()) ## ----restart_load, eval=FALSE--------------------------------------------------------------------- # sal <- SPRproject(restart = TRUE) ## ----SPR_overwrite, eval=FALSE-------------------------------------------------------------------- # sal <- SPRproject(overwrite = TRUE) ## ------------------------------------------------------------------------------------------------- length(sal) ## ------------------------------------------------------------------------------------------------- stepName(sal) ## ------------------------------------------------------------------------------------------------- listCmdTools(sal) ## ------------------------------------------------------------------------------------------------- listCmdModules(sal) modules(stepsWF(sal)$gzip) ## ------------------------------------------------------------------------------------------------- names(sal) ## ------------------------------------------------------------------------------------------------- stepsWF(sal) ## ------------------------------------------------------------------------------------------------- names(stepsWF(sal)$gzip) ## ------------------------------------------------------------------------------------------------- statusWF(sal) ## ------------------------------------------------------------------------------------------------- targetsWF(sal[2]) ## ----eval=FALSE----------------------------------------------------------------------------------- # targetsheader(sal, step = "Quality") ## ------------------------------------------------------------------------------------------------- outfiles(sal[2]) ## ------------------------------------------------------------------------------------------------- dependency(sal) ## ------------------------------------------------------------------------------------------------- SampleName(sal, step = "gzip") ## ------------------------------------------------------------------------------------------------- getColumn(sal, "outfiles", step = "gzip", column = "gzip_file") getColumn(sal, "targetsWF", step = "gzip", column = "FileName") ## ------------------------------------------------------------------------------------------------- yamlinput(sal, step = "gzip") ## ------------------------------------------------------------------------------------------------- cmdlist(sal, step = c(2,3), targets = 1) ## ------------------------------------------------------------------------------------------------- codeLine(sal, step = "export_iris") ## ------------------------------------------------------------------------------------------------- viewEnvir(sal) ## ------------------------------------------------------------------------------------------------- copyEnvir(sal, list = c("plot"), new.env = globalenv(), silent = FALSE) ## ------------------------------------------------------------------------------------------------- length(sal) sal[1:2] ## ----eval=FALSE----------------------------------------------------------------------------------- # sal_sub <- subset(sal, subset_steps = c(2,3), input_targets = c("SE", "VE"), keep_steps = TRUE) # stepsWF(sal_sub) # targetsWF(sal_sub) # outfiles(sal_sub) ## ----eval=FALSE----------------------------------------------------------------------------------- # sal[1] + sal[2] + sal[3] ## ----eval=TRUE------------------------------------------------------------------------------------ ## create a copy of sal for testing sal_c <- sal ## view original value, here restricted to 'ext' slot yamlinput(sal_c, step = "gzip")$ext ## Replace value under 'ext' yamlinput(sal_c, step = "gzip", paramName = "ext") <- "txt.gz" ## view modified value, here restricted to 'ext' slot yamlinput(sal_c, step = "gzip")$ext ## Evaluate resulting CL call cmdlist(sal_c, step = "gzip", targets = 1) ## ----sal_lw_rep, eval=TRUE------------------------------------------------------------------------ appendCodeLine(sal_c, step = "export_iris", after = 1) <- "log_cal_100 <- log(100)" codeLine(sal_c, step = "export_iris") ## ----sal_lw_rep2, eval=TRUE----------------------------------------------------------------------- replaceCodeLine(sal_c, step="export_iris", line = 2) <- LineWise(code={ log_cal_100 <- log(50) }) codeLine(sal_c, step = "export_iris") ## ------------------------------------------------------------------------------------------------- renameStep(sal_c, c(1, 2)) <- c("newStep2", "newIndex") sal_c names(outfiles(sal_c)) names(targetsWF(sal_c)) dependency(sal_c) ## ----eval=FALSE----------------------------------------------------------------------------------- # sal_test <- sal[c(1,2)] # replaceStep(sal_test, step = 1, step_name = "gunzip" ) <- sal[3] # sal_test ## ------------------------------------------------------------------------------------------------- sal_test <- sal[-2] sal_test ## ------------------------------------------------------------------------------------------------- dir_path <- system.file("extdata/cwl", package = "systemPipeR") cwl <- yaml::read_yaml(file.path(dir_path, "example/example.cwl")) ## ------------------------------------------------------------------------------------------------- cwl[1:2] ## ------------------------------------------------------------------------------------------------- cwl[3] ## ------------------------------------------------------------------------------------------------- cwl[4] ## ------------------------------------------------------------------------------------------------- cwl[5] ## ------------------------------------------------------------------------------------------------- cwl[6] ## ------------------------------------------------------------------------------------------------- cwl.wf <- yaml::read_yaml(file.path(dir_path, "example/workflow_example.cwl")) ## ------------------------------------------------------------------------------------------------- cwl.wf[1:2] ## ------------------------------------------------------------------------------------------------- cwl.wf[3] ## ------------------------------------------------------------------------------------------------- cwl.wf[4] ## ------------------------------------------------------------------------------------------------- cwl.wf[5] ## ------------------------------------------------------------------------------------------------- yaml::read_yaml(file.path(dir_path, "example/example_single.yml")) ## ----fromFile, eval=TRUE-------------------------------------------------------------------------- HW <- SYSargsList(wf_file = "example/workflow_example.cwl", input_file = "example/example_single.yml", dir_path = system.file("extdata/cwl", package = "systemPipeR")) HW cmdlist(HW) ## ------------------------------------------------------------------------------------------------- yml <- yaml::read_yaml(file.path(dir_path, "example/example.yml")) yml ## ------------------------------------------------------------------------------------------------- targetspath <- system.file("extdata/cwl/example/targets_example.txt", package = "systemPipeR") read.delim(targetspath, comment.char = "#") ## ----fromFile_example, eval=TRUE------------------------------------------------------------------ HW_mul <- SYSargsList(step_name = "echo", targets=targetspath, wf_file="example/workflow_example.cwl", input_file="example/example.yml", dir_path = dir_path, inputvars = c(Message = "_STRING_", SampleName = "_SAMPLE_")) HW_mul ## ----fromFile_example2, eval=TRUE----------------------------------------------------------------- cmdlist(HW_mul) ## ----cmd_orig, eval=FALSE------------------------------------------------------------------------- # hisat2 -S ./results/M1A.sam -x ./data/tair10.fasta -k 1 -min-intronlen 30 -max-intronlen 3000 -threads 4 -U ./data/SRR446027_1.fastq.gz ## ----cmd, eval=TRUE------------------------------------------------------------------------------- command <- " hisat2 \ -S \ -x \ -k \ -min-intronlen \ -max-intronlen \ -threads \ -U " ## ------------------------------------------------------------------------------------------------- cmd <- createParam(command, writeParamFiles = TRUE, overwrite=TRUE, confirm=TRUE) ## ------------------------------------------------------------------------------------------------- cmdlist(cmd) ## ----saving, eval=FALSE--------------------------------------------------------------------------- # writeParamFiles(cmd, overwrite = TRUE) ## ----sysargs2, eval=TRUE, results="hide"---------------------------------------------------------- command2 <- " hisat2 \ -S \ -x \ -k \ -min-intronlen \ -max-intronlen \ -threads \ -U " WF <- createParam(command2, overwrite = TRUE, writeParamFiles = TRUE, confirm = TRUE) targetspath <- system.file("extdata", "targets.txt", package = "systemPipeR") WF_test <- loadWorkflow(targets = targetspath, wf_file="hisat2.cwl", input_file="hisat2.yml", dir_path = "param/cwl/hisat2/") WF_test <- renderWF(WF_test, inputvars = c(FileName = "_FASTQ_PATH1_")) ## ----sysargs2b, eval=TRUE------------------------------------------------------------------------- WF_test cmdlist(WF_test)[1:2] ## ----eval=FALSE----------------------------------------------------------------------------------- # printParam(cmd, position = "baseCommand") ## Return baseCommand # printParam(cmd, position = "outputs") ## Return outputs # printParam(cmd, position = "inputs", index = 1:2) ## Return components by index # printParam(cmd, position = "inputs", index = -1:-2) ## Negative index subsetting ## ----eval=FALSE----------------------------------------------------------------------------------- # cmd2 <- subsetParam(cmd, position = "inputs", index = 1:2, trim = TRUE) # cmdlist(cmd2) # # cmd2 <- subsetParam(cmd, position = "inputs", index = c("S", "x"), trim = TRUE) # cmdlist(cmd2) ## ----eval=FALSE----------------------------------------------------------------------------------- # cmd3 <- replaceParam(cmd, "base", index = 1, replace = list(baseCommand = "bwa")) # cmdlist(cmd3) ## Replacement changed baseCommand ## ----eval=FALSE----------------------------------------------------------------------------------- # new_inputs <- new_inputs <- list( # "new_input1" = list(type = "File", preF="-b", yml ="myfile"), # "new_input2" = "-L " # ) # cmd4 <- replaceParam(cmd, "inputs", index = 1:2, replace = new_inputs) # cmdlist(cmd4) ## ----eval=FALSE----------------------------------------------------------------------------------- # newIn <- new_inputs <- list( # "new_input1" = list(type = "File", preF="-b1", yml ="myfile1"), # "new_input2" = list(type = "File", preF="-b2", yml ="myfile2"), # "new_input3" = "-b3 " # ) # cmd5 <- appendParam(cmd, "inputs", index = 1:2, append = new_inputs) # cmdlist(cmd5) # # cmd6 <- appendParam(cmd, "inputs", index = 1:2, after=0, append = new_inputs) # cmdlist(cmd6) ## ----eval=FALSE----------------------------------------------------------------------------------- # new_outs <- list( # "sam_out" = "" # ) # cmd7 <- replaceParam(cmd, "outputs", index = 1, replace = new_outs) # output(cmd7) ## ----SYSargs2_structure, eval=TRUE---------------------------------------------------------------- library(systemPipeR) targetspath <- system.file("extdata", "targets.txt", package = "systemPipeR") dir_path <- system.file("extdata/cwl", package = "systemPipeR") WF <- loadWF(targets = targetspath, wf_file = "hisat2/hisat2-mapping-se.cwl", input_file = "hisat2/hisat2-mapping-se.yml", dir_path = dir_path) WF <- renderWF(WF, inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_")) ## ----cmdlist, eval=TRUE--------------------------------------------------------------------------- cmdlist(WF)[1] ## ----names_WF, eval=TRUE-------------------------------------------------------------------------- names(WF) ## ----output_WF, eval=TRUE------------------------------------------------------------------------- output(WF)[1] ## ----targets_WF, eval=TRUE------------------------------------------------------------------------ targets(WF)[1] as(WF, "DataFrame") ## ----module_WF, eval=TRUE------------------------------------------------------------------------- modules(WF) ## ----other_WF, eval=FALSE------------------------------------------------------------------------- # files(WF) # inputvars(WF) ## ----lw, eval=TRUE-------------------------------------------------------------------------------- rmd <- system.file("extdata", "spr_simple_lw.Rmd", package = "systemPipeR") sal_lw <- SPRproject(overwrite = TRUE) sal_lw <- importWF(sal_lw, rmd, verbose = FALSE) codeLine(sal_lw) ## ----lw_coerce, eval=TRUE------------------------------------------------------------------------- lw <- stepsWF(sal_lw)[[2]] ## Coerce ll <- as(lw, "list") class(ll) lw <- as(ll, "LineWise") lw ## ----lw_access, eval=TRUE------------------------------------------------------------------------- length(lw) names(lw) codeLine(lw) codeChunkStart(lw) rmdPath(lw) ## ----lw_sub, eval=TRUE---------------------------------------------------------------------------- l <- lw[2] codeLine(l) l_sub <- lw[-2] codeLine(l_sub) ## ----lw_rep, eval=TRUE---------------------------------------------------------------------------- replaceCodeLine(lw, line = 2) <- "5+5" codeLine(lw) appendCodeLine(lw, after = 0) <- "6+7" codeLine(lw) ## ----sal_rep_append, eval=FALSE------------------------------------------------------------------- # replaceCodeLine(sal_lw, step = 2, line = 2) <- LineWise(code={ # "5+5" # }) # codeLine(sal_lw, step = 2) # # appendCodeLine(sal_lw, step = 2) <- "66+55" # codeLine(sal_lw, step = 2) # # appendCodeLine(sal_lw, step = 1, after = 1) <- "66+55" # codeLine(sal_lw, step = 1) ## ----table_tools, echo=FALSE, message=FALSE------------------------------------------------------- library(magrittr) SPR_software <- system.file("extdata", "SPR_software.csv", package = "systemPipeR") software <- read.delim(SPR_software, sep = ",", comment.char = "#") colors <- colorRampPalette((c("darkseagreen", "indianred1")))(length(unique(software$Category))) id <- as.numeric(c((unique(software$Category)))) software %>% dplyr::mutate(Step = kableExtra::cell_spec(Step, color = "white", bold = TRUE, background = factor(Category, id, colors))) %>% dplyr::select(Tool, Step, Description) %>% dplyr::arrange(Tool) %>% kableExtra::kable("html", escape = FALSE, col.names = c("Tool Name", "Description", "Step")) %>% kableExtra::kable_material(c("striped", "hover", "condensed")) %>% kableExtra::scroll_box(width = "80%", height = "500px") ## ----cleaning3, eval=TRUE, include=FALSE---------------------------------------------------------- if (file.exists(".SPRproject")) unlink(".SPRproject", recursive = TRUE) ## NOTE: Removing previous project create in the quick starts section ## ----SPRproject2, eval=FALSE---------------------------------------------------------------------- # sal <- SPRproject(projPath = getwd(), overwrite = TRUE) ## ----load_SPR, message=FALSE, eval=FALSE, spr=TRUE------------------------------------------------ # appendStep(sal) <- LineWise({ # library(systemPipeR) # }, # step_name = "load_SPR") ## ----preprocessing, message=FALSE, eval=FALSE, spr=TRUE------------------------------------------- # targetspath <- system.file("extdata", "targetsPE.txt", package = "systemPipeR") # appendStep(sal) <- SYSargsList( # step_name = "preprocessing", # targets = targetspath, dir = TRUE, # wf_file = "preprocessReads/preprocessReads-pe.cwl", # input_file = "preprocessReads/preprocessReads-pe.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c( # FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", # SampleName = "_SampleName_" # ), # dependency = c("load_SPR")) ## ----custom_preprocessing_function, eval=FALSE---------------------------------------------------- # appendStep(sal) <- LineWise( # code = { # filterFct <- function(fq, cutoff = 20, Nexceptions = 0) { # qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE) # # Retains reads where Phred scores are >= cutoff with N exceptions # fq[qcount <= Nexceptions] # } # save(list = ls(), file = "param/customFCT.RData") # }, # step_name = "custom_preprocessing_function", # dependency = "preprocessing" # ) ## ----editing_preprocessing, message=FALSE, eval=FALSE--------------------------------------------- # yamlinput(sal, "preprocessing")$Fct # yamlinput(sal, "preprocessing", "Fct") <- "'filterFct(fq, cutoff=20, Nexceptions=0)'" # yamlinput(sal, "preprocessing")$Fct ## check the new function # cmdlist(sal, "preprocessing", targets = 1) ## check if the command line was updated with success ## ----trimGalore, eval=FALSE, spr=TRUE------------------------------------------------------------- # targetspath <- system.file("extdata", "targets.txt", package = "systemPipeR") # appendStep(sal) <- SYSargsList(step_name = "trimGalore", # targets = targetspath, dir = TRUE, # wf_file = "trim_galore/trim_galore-se.cwl", # input_file = "trim_galore/trim_galore-se.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_"), # dependency = "load_SPR", # run_step = "optional") ## ----trimmomatic, eval=FALSE, spr=TRUE------------------------------------------------------------ # targetspath <- system.file("extdata", "targets.txt", package = "systemPipeR") # appendStep(sal) <- SYSargsList(step_name = "trimmomatic", # targets = targetspath, dir = TRUE, # wf_file = "trimmomatic/trimmomatic-se.cwl", # input_file = "trimmomatic/trimmomatic-se.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_"), # dependency = "load_SPR", # run_step = "optional") ## ----fastq_report, eval=FALSE, message=FALSE, spr=TRUE-------------------------------------------- # appendStep(sal) <- LineWise(code = { # fastq <- getColumn(sal, step = "preprocessing", "targetsWF", column = 1) # fqlist <- seeFastq(fastq = fastq, batchsize = 10000, klength = 8) # pdf("./results/fastqReport.pdf", height = 18, width = 4*length(fqlist)) # seeFastqPlot(fqlist) # dev.off() # }, step_name = "fastq_report", # dependency = "preprocessing") ## ----hisat_index, eval=FALSE, spr=TRUE------------------------------------------------------------ # appendStep(sal) <- SYSargsList(step_name = "hisat_index", # targets = NULL, dir = FALSE, # wf_file = "hisat2/hisat2-index.cwl", # input_file = "hisat2/hisat2-index.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = NULL, # dependency = "preprocessing") ## ----hisat_mapping_samtools, eval=FALSE, spr=TRUE------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "hisat_mapping", # targets = "preprocessing", dir = TRUE, # wf_file = "workflow-hisat2/workflow_hisat2-se.cwl", # input_file = "workflow-hisat2/workflow_hisat2-se.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars=c(FileName1="_FASTQ_PATH1_", SampleName="_SampleName_"), # dependency = c("hisat_index"), # run_session = "compute") ## ----bowtie_index, eval=FALSE, spr=TRUE----------------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "bowtie_index", # targets = NULL, dir = FALSE, # wf_file = "bowtie2/bowtie2-index.cwl", # input_file = "bowtie2/bowtie2-index.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = NULL, # dependency = "preprocessing", # run_step = "optional") ## ----tophat2_mapping, eval=FALSE, spr=TRUE-------------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "tophat2_mapping", # targets = "preprocessing", dir = TRUE, # wf_file = "tophat2/workflow_tophat2-mapping-se.cwl", # input_file = "tophat2/tophat2-mapping-se.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars=c(preprocessReads_se="_FASTQ_PATH1_", SampleName="_SampleName_"), # dependency = c("bowtie_index"), # run_session = "remote", # run_step = "optional") ## ----bowtie2_mapping, eval=FALSE, spr=TRUE-------------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "bowtie2_mapping", # targets = "preprocessing", dir = TRUE, # wf_file = "bowtie2/workflow_bowtie2-mapping-se.cwl", # input_file = "bowtie2/bowtie2-mapping-se.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars=c(preprocessReads_se="_FASTQ_PATH1_", SampleName="_SampleName_"), # dependency = c("bowtie_index"), # run_session = "remote", # run_step = "optional") ## ----bwa_index, eval=FALSE, spr=TRUE-------------------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "bwa_index", # targets = NULL, dir = FALSE, # wf_file = "bwa/bwa-index.cwl", # input_file = "bwa/bwa-index.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = NULL, # dependency = "preprocessing", # run_step = "optional") ## ----bwa_mapping, eval=FALSE, spr=TRUE------------------------------------------------------------ # appendStep(sal) <- SYSargsList(step_name = "bwa_mapping", # targets = "preprocessing", dir = TRUE, # wf_file = "bwa/bwa-se.cwl", # input_file = "bwa/bwa-se.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars=c(preprocessReads_se="_FASTQ_PATH1_", SampleName="_SampleName_"), # dependency = c("bwa_index"), # run_session = "remote", # run_step = "optional") ## ----rsubread_index, eval=FALSE, spr=TRUE--------------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "rsubread_index", # targets = NULL, dir = FALSE, # wf_file = "rsubread/rsubread-index.cwl", # input_file = "rsubread/rsubread-index.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = NULL, # dependency = "preprocessing", # run_step = "optional") ## ----rsubread_mapping, eval=FALSE, spr=TRUE------------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "rsubread", # targets = "preprocessing", dir = TRUE, # wf_file = "rsubread/rsubread-mapping-se.cwl", # input_file = "rsubread/rsubread-mapping-se.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars=c(FileName1="_FASTQ_PATH1_", SampleName="_SampleName_"), # dependency = c("rsubread_index"), # run_session = "compute", # run_step = "optional") ## ----gsnap_index, eval=FALSE, spr=TRUE------------------------------------------------------------ # appendStep(sal) <- SYSargsList(step_name = "gsnap_index", # targets = NULL, dir = FALSE, # wf_file = "gsnap/gsnap-index.cwl", # input_file = "gsnap/gsnap-index.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = NULL, # dependency = "preprocessing", # run_step = "optional") ## ----gsnap_mapping, eval=FALSE, spr=TRUE---------------------------------------------------------- # appendStep(sal) <- SYSargsList(step_name = "gsnap", # targets = "targetsPE.txt", dir = TRUE, # wf_file = "gsnap/gsnap-mapping-pe.cwl", # input_file = "gsnap/gsnap-mapping-pe.yml", # dir_path = system.file("extdata/cwl", package = "systemPipeR"), # inputvars = c(FileName1 = "_FASTQ_PATH1_", # FileName2 = "_FASTQ_PATH2_", SampleName = "_SampleName_"), # dependency = c("gsnap_index"), # run_session = "remote", # run_step = "optional") ## ----bam_IGV, eval=FALSE, spr=TRUE---------------------------------------------------------------- # appendStep(sal) <- LineWise( # code = { # bampaths <- getColumn(sal, step = "hisat2_mapping", "outfiles", # column = "samtools_sort_bam") # symLink2bam( # sysargs = bampaths, htmldir = c("~/.html/", "somedir/"), # urlbase = "https://cluster.hpcc.ucr.edu//", # urlfile = "./results/IGVurl.txt") # }, # step_name = "bam_IGV", # dependency = "hisat_mapping", # run_step = "optional" # ) ## ----create_txdb, message=FALSE, eval=FALSE, spr=TRUE--------------------------------------------- # appendStep(sal) <- LineWise(code = { # library(txdbmaker) # txdb <- makeTxDbFromGFF(file="data/tair10.gff", format="gff", # dataSource="TAIR", organism="Arabidopsis thaliana") # saveDb(txdb, file="./data/tair10.sqlite") # }, # step_name = "create_txdb", # dependency = "hisat_mapping") ## ----read_counting, message=FALSE, eval=FALSE, spr=TRUE------------------------------------------- # appendStep(sal) <- LineWise({ # library(BiocParallel) # txdb <- loadDb("./data/tair10.sqlite") # eByg <- exonsBy(txdb, by="gene") # outpaths <- getColumn(sal, step = "hisat_mapping", 'outfiles', column = 2) # bfl <- BamFileList(outpaths, yieldSize=50000, index=character()) # multicoreParam <- MulticoreParam(workers=4); register(multicoreParam); registered() # counteByg <- bplapply(bfl, function(x) summarizeOverlaps(eByg, x, mode="Union", # ignore.strand=TRUE, # inter.feature=TRUE, # singleEnd=TRUE)) # # Note: for strand-specific RNA-Seq set 'ignore.strand=FALSE' and for PE data set 'singleEnd=FALSE' # countDFeByg <- sapply(seq(along=counteByg), # function(x) assays(counteByg[[x]])$counts) # rownames(countDFeByg) <- names(rowRanges(counteByg[[1]])) # colnames(countDFeByg) <- names(bfl) # rpkmDFeByg <- apply(countDFeByg, 2, function(x) returnRPKM(counts=x, ranges=eByg)) # write.table(countDFeByg, "results/countDFeByg.xls", # col.names=NA, quote=FALSE, sep="\t") # write.table(rpkmDFeByg, "results/rpkmDFeByg.xls", # col.names=NA, quote=FALSE, sep="\t") # }, # step_name = "read_counting", # dependency = "create_txdb") ## ----align_stats, message=FALSE, eval=FALSE, spr=TRUE--------------------------------------------- # appendStep(sal) <- LineWise({ # read_statsDF <- alignStats(args) # write.table(read_statsDF, "results/alignStats.xls", # row.names = FALSE, quote = FALSE, sep = "\t") # }, # step_name = "align_stats", # dependency = "hisat_mapping") ## ----align_stats2, eval=TRUE---------------------------------------------------------------------- read.table(system.file("extdata", "alignStats.xls", package = "systemPipeR"), header = TRUE)[1:4,] ## ----read_counting_mirna, message=FALSE, eval=FALSE, spr=TRUE------------------------------------- # appendStep(sal) <- LineWise({ # system("wget https://www.mirbase.org/download/ath.gff3 -P ./data/") # gff <- rtracklayer::import.gff("./data/ath.gff3") # gff <- split(gff, elementMetadata(gff)$ID) # bams <- getColumn(sal, step = "bowtie2_mapping", 'outfiles', column = 2) # bfl <- BamFileList(bams, yieldSize=50000, index=character()) # countDFmiR <- summarizeOverlaps(gff, bfl, mode="Union", # ignore.strand = FALSE, inter.feature = FALSE) # countDFmiR <- assays(countDFmiR)$counts # # Note: inter.feature=FALSE important since pre and mature miRNA ranges overlap # rpkmDFmiR <- apply(countDFmiR, 2, function(x) returnRPKM(counts = x, ranges = gff)) # write.table(assays(countDFmiR)$counts, "results/countDFmiR.xls", # col.names=NA, quote=FALSE, sep="\t") # write.table(rpkmDFmiR, "results/rpkmDFmiR.xls", col.names=NA, quote=FALSE, sep="\t") # }, # step_name = "read_counting_mirna", # dependency = "bowtie2_mapping") ## ----sample_tree_rlog, message=FALSE, eval=FALSE, spr=TRUE---------------------------------------- # appendStep(sal) <- LineWise({ # library(DESeq2, warn.conflicts=FALSE, quietly=TRUE) # library(ape, warn.conflicts=FALSE) # countDFpath <- system.file("extdata", "countDFeByg.xls", package="systemPipeR") # countDF <- as.matrix(read.table(countDFpath)) # colData <- data.frame(row.names = targetsWF(sal)[[2]]$SampleName, # condition=targetsWF(sal)[[2]]$Factor) # dds <- DESeqDataSetFromMatrix(countData = countDF, colData = colData, # design = ~ condition) # d <- cor(assay(rlog(dds)), method = "spearman") # hc <- hclust(dist(1-d)) # plot.phylo(as.phylo(hc), type = "p", edge.col = 4, edge.width = 3, # show.node.label = TRUE, no.margin = TRUE) # }, # step_name = "sample_tree_rlog", # dependency = "read_counting") ## ----edger, message=FALSE, eval=FALSE, spr=TRUE--------------------------------------------------- # appendStep(sal) <- LineWise({ # targetspath <- system.file("extdata", "targets.txt", package = "systemPipeR") # targets <- read.delim(targetspath, comment = "#") # cmp <- readComp(file = targetspath, format = "matrix", delim = "-") # countDFeBygpath <- system.file("extdata", "countDFeByg.xls", package = "systemPipeR") # countDFeByg <- read.delim(countDFeBygpath, row.names = 1) # edgeDF <- run_edgeR(countDF = countDFeByg, targets = targets, cmp = cmp[[1]], # independent = FALSE, mdsplot = "") # DEG_list <- filterDEGs(degDF = edgeDF, filter = c(Fold = 2, FDR = 10)) # }, # step_name = "edger", # dependency = "read_counting") ## ----deseq2, message=FALSE, eval=FALSE, spr=TRUE-------------------------------------------------- # appendStep(sal) <- LineWise({ # degseqDF <- run_DESeq2(countDF=countDFeByg, targets=targets, cmp=cmp[[1]], # independent=FALSE) # DEG_list2 <- filterDEGs(degDF=degseqDF, filter=c(Fold=2, FDR=10)) # }, # step_name = "deseq2", # dependency = "read_counting") ## ----vennplot, message=FALSE, eval=FALSE, spr=TRUE------------------------------------------------ # appendStep(sal) <- LineWise({ # vennsetup <- overLapper(DEG_list$Up[6:9], type="vennsets") # vennsetdown <- overLapper(DEG_list$Down[6:9], type="vennsets") # vennPlot(list(vennsetup, vennsetdown), mymain="", mysub="", # colmode=2, ccol=c("blue", "red")) # }, # step_name = "vennplot", # dependency = "edger") ## ----get_go_biomart, message=FALSE, eval=FALSE, spr=TRUE------------------------------------------ # appendStep(sal) <- LineWise({ # library("biomaRt") # listMarts() # To choose BioMart database # listMarts(host="plants.ensembl.org") # m <- useMart("plants_mart", host="https://plants.ensembl.org") # listDatasets(m) # m <- useMart("plants_mart", dataset="athaliana_eg_gene", host="https://plants.ensembl.org") # listAttributes(m) # Choose data types you want to download # go <- getBM(attributes=c("go_id", "tair_locus", "namespace_1003"), mart=m) # go <- go[go[,3]!="",]; go[,3] <- as.character(go[,3]) # go[go[,3]=="molecular_function", 3] <- "F" # go[go[,3]=="biological_process", 3] <- "P" # go[go[,3]=="cellular_component", 3] <- "C" # go[1:4,] # dir.create("./data/GO") # write.table(go, "data/GO/GOannotationsBiomart_mod.txt", # quote=FALSE, row.names=FALSE, col.names=FALSE, sep="\t") # catdb <- makeCATdb(myfile="data/GO/GOannotationsBiomart_mod.txt", # lib=NULL, org="", colno=c(1,2,3), idconv=NULL) # save(catdb, file="data/GO/catdb.RData") # }, # step_name = "get_go_biomart", # dependency = "edger") ## ----go_enrichment, message=FALSE, eval=FALSE, spr=TRUE------------------------------------------- # appendStep(sal) <- LineWise({ # load("data/GO/catdb.RData") # DEG_list <- filterDEGs(degDF=edgeDF, filter=c(Fold=2, FDR=50), plot=FALSE) # up_down <- DEG_list$UporDown; names(up_down) <- paste(names(up_down), "_up_down", sep="") # up <- DEG_list$Up; names(up) <- paste(names(up), "_up", sep="") # down <- DEG_list$Down; names(down) <- paste(names(down), "_down", sep="") # DEGlist <- c(up_down, up, down) # DEGlist <- DEGlist[sapply(DEGlist, length) > 0] # BatchResult <- GOCluster_Report(catdb=catdb, setlist=DEGlist, method="all", # id_type="gene", CLSZ=2, cutoff=0.9, # gocats=c("MF", "BP", "CC"), recordSpecGO=NULL) # library("biomaRt") # m <- useMart("plants_mart", dataset="athaliana_eg_gene", host="https://plants.ensembl.org") # goslimvec <- as.character(getBM(attributes=c("goslim_goa_accession"), mart=m)[,1]) # BatchResultslim <- GOCluster_Report(catdb=catdb, setlist=DEGlist, method="slim", # id_type="gene", myslimv=goslimvec, CLSZ=10, # cutoff=0.01, gocats=c("MF", "BP", "CC"), # recordSpecGO=NULL) # gos <- BatchResultslim[grep("M6-V6_up_down", BatchResultslim$CLID), ] # gos <- BatchResultslim # pdf("GOslimbarplotMF.pdf", height=8, width=10); goBarplot(gos, gocat="MF"); dev.off() # goBarplot(gos, gocat="BP") # goBarplot(gos, gocat="CC") # }, # step_name = "go_enrichment", # dependency = "get_go_biomart") ## ----hierarchical_clustering, message=FALSE, eval=FALSE, spr=TRUE--------------------------------- # appendStep(sal) <- LineWise({ # library(pheatmap) # geneids <- unique(as.character(unlist(DEG_list[[1]]))) # y <- assay(rlog(dds))[geneids, ] # pdf("heatmap1.pdf") # pheatmap(y, scale="row", clustering_distance_rows="correlation", # clustering_distance_cols="correlation") # dev.off() # }, # step_name = "hierarchical_clustering", # dependency = c("sample_tree_rlog", "edger")) ## ----sessionInfo---------------------------------------------------------------------------------- sessionInfo()