--- title: "ORFik_Ribo-seq_pipeline" author: "Haakon Tjeldnes & Kornel Labun" date: "`r BiocStyle::doc_date()`" package: "`r pkg_ver('ORFik')`" output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{ORFik_Ribo-seq_pipeline} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Introduction Welcome to the `ORFik` package. `ORFik` is an R package for analysis of transcript and translation features through manipulation of sequence data and NGS data. This vignette will preview a simple Ribo-seq pipeline using ORFik. It is important you read all the other vignettes before this one, since functions will not be explained here in detail. # Pipeline This pipeline will shows steps needed to analyse Ribo-seq from Alexaki et al, 2020 The following steps are done: 1. Define directory paths 2. Download Ribo-seq & RNA-seq data from SRA (subset to 2 million reads per library) 3. Download annotation and contaminants 4. Align data 5. Make ORFik experiment 6. QC 7. Heatmaps 8. Differentially translated genes 9. Peak analysis 10. Feature table 11. uORF analysis ```{r eval = FALSE, echo = TRUE, message = FALSE} #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Ribo-seq HEK293 (2020) Investigative analysis of quality of new Ribo-seq data #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Article: https://f1000research.com/articles/9-174/v2#ref-5 # Design: Wild type (WT) vs codon optimized (CO) (gene F9) library(ORFik) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Config #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Specify paths wanted for NGS data, genome, annotation and STAR index # If you use local files, make a conf variable with existing directories conf <- config.exper(experiment = "Alexaki_Human", assembly = "Homo_sapiens_GRCh38_101", type = c("Ribo-seq", "RNA-seq")) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Download fastq files for experiment and rename (Skip if you have the files already) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # SRA Meta data download (work for ERA and DRA too) study <- download.SRA.metadata("PRJNA591214", outdir = conf["fastq Ribo-seq"]) # Subset study.rfp <- study[grep("mRNA-Seq", sample_title, invert = TRUE),] study.rna <- study[grep("Ribo-Seq", sample_title, invert = TRUE),] # Download fastq files (uses SRR numbers (RUN column) from study)) download.SRA(study.rfp, conf["fastq Ribo-seq"], rename = study.rfp$sample_title, subset = 2000000) download.SRA(study.rna, conf["fastq RNA-seq"], rename = study.rna$sample_title, subset = 2000000) # Which organism is this, scientific name, like "Homo sapiens" or "Danio rerio" organism <- study$ScientificName[1] # Usually you find organism here, else set it yourself paired.end.rfp <- study.rfp$LibraryLayout == "PAIRED" paired.end.rna <- study.rna$LibraryLayout == "PAIRED" #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Annotation #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# annotation <- getGenomeAndAnnotation( organism = organism, genome = TRUE, GTF = TRUE, phix = TRUE, ncRNA = TRUE, tRNA = TRUE, rRNA = TRUE, output.dir = conf["ref"], assembly_type = "primary_assembly" ) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # STAR index #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Remove max.ram = 20 and SAsparse = 2, if you have more than 64GB ram index <- STAR.index(annotation, wait = TRUE, max.ram = 20, SAsparse = 2) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Alignment (with depletion of phix, rRNA, ncRNA and tRNAs) & (with MultiQC of final STAR alignment) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# STAR.align.folder(conf["fastq Ribo-seq"], conf["bam Ribo-seq"], index, paired.end = paired.end.rfp, steps = "tr-co-ge", # (trim needed: adapters found, then genome) adapter.sequence = "auto", # Adapters are auto detected trim.front = 0, min.length = 20) STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index, paired.end = paired.end.rna, steps = "tr-co-ge", # (trim needed: adapters found, then genome) adapter.sequence = "auto", # Adapters are auto detected trim.front = 0, min.length = 20) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Create experiment (Starting point if alignment is finished) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# library(ORFik) create.experiment(paste0(conf["bam Ribo-seq"], "/aligned/"), exper = conf["exp Ribo-seq"], fa = annotation["genome"], txdb = paste0(annotation["gtf"], ".db"), organism = organism, pairedEndBam = paired.end.rfp, rep = c(1,2,3,1,2,3), condition = c(rep("CO", 3), rep("WT", 3)), viewTemplate = FALSE) create.experiment(paste0(conf["bam RNA-seq"], "/aligned/"), exper = conf["exp RNA-seq"], fa = annotation["genome"], txdb = paste0(annotation["gtf"], ".db"), organism = organism, pairedEndBam = paired.end.rna, rep = c(1,2,3,1,2,3), condition = c(rep("CO", 3), rep("WT", 3)), viewTemplate = FALSE) library(ORFik) df.rfp <- read.experiment("Alexaki_Human_Ribo-seq") df.rna <- read.experiment("Alexaki_Human_RNA-seq") #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Convert files and run Annotation vs alignment QC #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # General QC ORFikQC(df.rfp) ORFikQC(df.rna) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # P-shifting of Ribo-seq reads: #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # From ORFikQC it looks like 20, 21, 27 and 28 are candidates for Ribosomal footprints shiftFootprintsByExperiment(df.rfp, accepted.lengths = c(20:21, 27:28)) # Ribo-seq specific QC remove.experiments(df.rfp) # Remove loaded data (it is not pshifted) ORFik:::RiboQC.plot(df.rfp, BPPARAM = BiocParallel::SerialParam(progressbar = T)) remove.experiments(df.rfp) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Create heatmaps (Ribo-seq) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Pre-pshifting heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "ofst", outdir = paste0(dirname(df.rfp$filepath[1]), "/QC_STATS/heatmaps/pre-pshift/")) remove.experiments(df.rfp) # After pshifting heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "pshifted", outdir = paste0(dirname(df.rfp$filepath[1]), "/QC_STATS/heatmaps/pshifted/")) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Differential translation analysis (condition: WT vs CO) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # We here get 11 unique DTEG genes # Now let's check if the CO group overexpress the F9 Gene (ENSG00000101981): res <- DTEG.analysis(df.rfp, df.rna, design = "condition") significant_genes <- res[Regulation != "No change",] gene_names <- txNamesToGeneNames(significant_genes$id, df.rfp) "ENSG00000101981" %in% unique(gene_names) # TRUE # It does, good good. # How is it regulated ? significant_genes[which(gene_names == "ENSG00000101981"),] # By mRNA abundance #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Peak detection (strong peaks in CDS) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# peaks <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_WT_r1, type = "max") ORFik::windowCoveragePlot(peaks, type = "cds", scoring = "transcriptNormalized") # The gene does not have a strong max peak in WT rep1 "ENSG00000101981" %in% peaks$gene_id # FALSE peaks_CO <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_CO_r1, type = "max") # The gene does not have a strong max peak in CO rep1 either "ENSG00000101981" %in% peaks_CO$gene_id # FALSE #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # Feature table (From WT rep 1) #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# cds <- loadRegion(df.rfp, "cds") cds <- ORFik:::removeMetaCols(cds) # Dont need them cds <- cds[filterTranscripts(df.rfp)] # Filter to sane transcripts (annotation is not perfect) dt <- computeFeatures(cds, RFP = fimport(filepath(df.rfp[6,], "pshifted")), RNA = fimport(filepath(df.rna[6,], "ofst")), Gtf = df.rfp, grl.is.sorted = TRUE, faFile = df.rfp, weight.RFP = "score", weight.RNA = "score", riboStart = 21, uorfFeatures = FALSE) # The 8 remaining significant DTEGs. dt[names(cds) %in% significant_genes$id,] # All genes with strong 3nt periodicity of Ribo-seq dt[ORFScores > 5,] # Not all genes start with ATG, possible errors in annotation table(dt$StartCodons) # All genes with strong start codon peak dt[startCodonCoverage > 5,] #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# # uORF analysis (advanced) (using the extension package to ORFik: uORFomePipe) # Will create a mysql database + files with results #¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤# devtools::install_github("Roleren/uORFomePipe", dependencies = TRUE) library(uORFomePipe) find_uORFome("/media/roler/S/data/Bio_data/projects/Alexaki_uORFome/", df.rfp = df.rfp, df.rna = df.rna, df.cage = NULL, biomart = NULL, startCodons = "ATG|CTG|TTG|GTG", BPPARAM = BiocParallel::MulticoreParam(2)) grl <- getUorfsInDb() pred <- readTable("finalPredWithProb")$prediction cov <- readTable("startCodonCoverage") grl[pred == 1 & rowSums(cov) > 5] ```