---
title: "ORFik_Ribo-seq_pipeline"
author: "Haakon Tjeldnes & Kornel Labun"
date: "`r BiocStyle::doc_date()`"
package: "`r pkg_ver('ORFik')`"
output: BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{ORFik_Ribo-seq_pipeline}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Introduction

Welcome to the `ORFik` package. 
`ORFik` is an R package for analysis of transcript and translation features through manipulation of sequence data and NGS data.
This vignette will preview a simple Ribo-seq pipeline using ORFik. It is important you read all the other vignettes before this one, since functions will not be explained here in detail. 

# Pipeline
This pipeline will shows steps needed to analyse Ribo-seq from Alexaki et al, 2020
The following steps are done:

1. Define directory paths
2. Download Ribo-seq & RNA-seq data from SRA (subset to 2 million reads per library)
3. Download annotation and contaminants
4. Align data
5. Make ORFik experiment
6. QC
7. Heatmaps
8. Differentially translated genes
9. Peak analysis
10. Feature table
11. uORF analysis

```{r eval = FALSE, echo = TRUE, message = FALSE}
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Ribo-seq HEK293 (2020) Investigative analysis of quality of new Ribo-seq data
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Article: https://f1000research.com/articles/9-174/v2#ref-5
# Design: Wild type (WT) vs codon optimized (CO) (gene F9)
library(ORFik)

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Config
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Specify paths wanted for NGS data, genome, annotation and STAR index
# If you use local files, make a conf variable with existing directories
conf <- config.exper(experiment = "Alexaki_Human",
                     assembly = "Homo_sapiens_GRCh38_101",
                     type = c("Ribo-seq", "RNA-seq"))

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Download fastq files for experiment and rename (Skip if you have the files already)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# SRA Meta data download (work for ERA and DRA too)
study <- download.SRA.metadata("PRJNA591214", outdir = conf["fastq Ribo-seq"])
# Subset
study.rfp <- study[grep("mRNA-Seq", sample_title, invert = TRUE),]
study.rna <- study[grep("Ribo-Seq", sample_title, invert = TRUE),]
# Download fastq files (uses SRR numbers (RUN column) from study))
download.SRA(study.rfp, conf["fastq Ribo-seq"],
             rename = study.rfp$sample_title, subset = 2000000)
download.SRA(study.rna, conf["fastq RNA-seq"],
             rename = study.rna$sample_title, subset = 2000000)

# Which organism is this, scientific name, like "Homo sapiens" or "Danio rerio"
organism <- study$ScientificName[1] # Usually you find organism here, else set it yourself
paired.end.rfp <- study.rfp$LibraryLayout == "PAIRED"
paired.end.rna <- study.rna$LibraryLayout == "PAIRED"
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Annotation
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#

annotation <- getGenomeAndAnnotation(
  organism = organism,
  genome = TRUE, GTF = TRUE,
  phix = TRUE, ncRNA = TRUE, tRNA = TRUE, rRNA = TRUE,
  output.dir = conf["ref"],
  assembly_type = "primary_assembly"
)

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# STAR index
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Remove max.ram = 20 and SAsparse = 2, if you have more than 64GB ram
index <- STAR.index(annotation, wait = TRUE, max.ram = 20, SAsparse = 2)

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Alignment (with depletion of phix, rRNA, ncRNA and tRNAs) & (with MultiQC of final STAR alignment)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#

STAR.align.folder(conf["fastq Ribo-seq"], conf["bam Ribo-seq"], index,
                  paired.end = paired.end.rfp,
                  steps = "tr-co-ge", # (trim needed: adapters found, then genome)
                  adapter.sequence = "auto", # Adapters are auto detected
                  trim.front = 0, min.length = 20)

STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index,
                  paired.end = paired.end.rna,
                  steps = "tr-co-ge", # (trim needed: adapters found, then genome)
                  adapter.sequence = "auto", # Adapters are auto detected
                  trim.front = 0, min.length = 20)

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Create experiment (Starting point if alignment is finished)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
library(ORFik)
create.experiment(paste0(conf["bam Ribo-seq"], "/aligned/"),
                  exper = conf["exp Ribo-seq"],
                  fa = annotation["genome"],
                  txdb = paste0(annotation["gtf"], ".db"),
                  organism = organism,
                  pairedEndBam = paired.end.rfp,
                  rep = c(1,2,3,1,2,3),
                  condition = c(rep("CO", 3), rep("WT", 3)),
                  viewTemplate = FALSE)
create.experiment(paste0(conf["bam RNA-seq"], "/aligned/"),
                  exper = conf["exp RNA-seq"],
                  fa = annotation["genome"],
                  txdb = paste0(annotation["gtf"], ".db"),
                  organism = organism,
                  pairedEndBam = paired.end.rna,
                  rep = c(1,2,3,1,2,3),
                  condition = c(rep("CO", 3), rep("WT", 3)),
                  viewTemplate = FALSE)

library(ORFik)
df.rfp <- read.experiment("Alexaki_Human_Ribo-seq")
df.rna <- read.experiment("Alexaki_Human_RNA-seq")
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Convert files and run Annotation vs alignment QC
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# General QC
ORFikQC(df.rfp)
ORFikQC(df.rna)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# P-shifting of Ribo-seq reads:
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# From ORFikQC it looks like 20, 21, 27 and 28 are candidates for Ribosomal footprints
shiftFootprintsByExperiment(df.rfp, accepted.lengths = c(20:21, 27:28))
# Ribo-seq specific QC
remove.experiments(df.rfp) # Remove loaded data (it is not pshifted)
ORFik:::RiboQC.plot(df.rfp, BPPARAM = BiocParallel::SerialParam(progressbar = T))
remove.experiments(df.rfp)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Create heatmaps (Ribo-seq)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Pre-pshifting
heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "ofst",
              outdir = paste0(dirname(df.rfp$filepath[1]), "/QC_STATS/heatmaps/pre-pshift/"))
remove.experiments(df.rfp)
# After pshifting
heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "pshifted",
              outdir = paste0(dirname(df.rfp$filepath[1]), "/QC_STATS/heatmaps/pshifted/"))

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Differential translation analysis (condition: WT vs CO)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# We here get 11 unique DTEG genes
# Now let's check if the CO group overexpress the F9 Gene (ENSG00000101981):
res <- DTEG.analysis(df.rfp, df.rna, design = "condition")
significant_genes <- res[Regulation != "No change",]
gene_names <- txNamesToGeneNames(significant_genes$id, df.rfp)
"ENSG00000101981" %in% unique(gene_names) # TRUE
# It does, good good.
# How is it regulated ?
significant_genes[which(gene_names == "ENSG00000101981"),] # By mRNA abundance

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Peak detection (strong peaks in CDS)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#

peaks <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_WT_r1, type = "max")
ORFik::windowCoveragePlot(peaks, type = "cds", scoring = "transcriptNormalized")
# The gene does not have a strong max peak in WT rep1
"ENSG00000101981" %in% peaks$gene_id # FALSE

peaks_CO <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_CO_r1, type = "max")
# The gene does not have a strong max peak in CO rep1 either
"ENSG00000101981" %in% peaks_CO$gene_id # FALSE

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Feature table (From WT rep 1)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
cds <- loadRegion(df.rfp, "cds")
cds <- ORFik:::removeMetaCols(cds) # Dont need them
cds <- cds[filterTranscripts(df.rfp)] # Filter to sane transcripts (annotation is not perfect)
dt <- computeFeatures(cds,
                RFP = fimport(filepath(df.rfp[6,], "pshifted")),
                RNA = fimport(filepath(df.rna[6,], "ofst")), Gtf = df.rfp,
                grl.is.sorted = TRUE, faFile = df.rfp,
                weight.RFP = "score", weight.RNA = "score",
                riboStart = 21, uorfFeatures = FALSE)
# The 8 remaining significant DTEGs.
dt[names(cds) %in%  significant_genes$id,]
# All genes with strong 3nt periodicity of Ribo-seq
dt[ORFScores > 5,]
# Not all genes start with ATG, possible errors in annotation
table(dt$StartCodons)
# All genes with strong start codon peak
dt[startCodonCoverage > 5,]
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# uORF analysis (advanced) (using the extension package to ORFik: uORFomePipe)
# Will create a mysql database + files with results
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
devtools::install_github("Roleren/uORFomePipe", dependencies = TRUE)
library(uORFomePipe)
find_uORFome("/media/roler/S/data/Bio_data/projects/Alexaki_uORFome/",
             df.rfp = df.rfp, df.rna = df.rna, df.cage = NULL, biomart = NULL,
             startCodons = "ATG|CTG|TTG|GTG", BPPARAM = BiocParallel::MulticoreParam(2))

grl <- getUorfsInDb()
pred <- readTable("finalPredWithProb")$prediction
cov <- readTable("startCodonCoverage")
grl[pred == 1 & rowSums(cov) > 5]

```