Contents

1 Introduction

Welcome to the ORFik package. ORFik is an R package for analysis of transcript and translation features through manipulation of sequence data and NGS data.

This vignette will preview a simple Ribo-seq pipeline using ORFik. It is important you read all the other vignettes before this one, since functions will not be explained here in detail.

2 Pipeline

This pipeline will shows steps needed to analyse Ribo-seq from Alexaki et al, 2020

The following steps are done:

  1. Define directory paths
  2. Download Ribo-seq & RNA-seq data from SRA (subset to 2 million reads per library)
  3. Download genome annotation and contaminants
  4. Trim & Align data
  5. Make ORFik experiment
  6. QC
  7. Heatmaps
  8. Count table analysis: TE tables
  9. Differentially translated genes
  10. Peak analysis
  11. Feature table
  12. Gene plotting (Advanced)
  13. uORF analysis (Advanced)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Ribo-seq HEK293 (2020) Investigative analysis of quality of new Ribo-seq data
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Article: https://f1000research.com/articles/9-174/v2#ref-5
# Design: Wild type (WT) vs codon optimized (CO) (gene F9)
library(ORFik)

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Config
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Specify paths wanted for NGS data, genome, annotation and STAR index
# If you use local files, make a conf variable with existing directories
# Seperate Ribo-seq and RNA-seq into separate folders with type argument
conf <- config.exper(experiment = "Alexaki_Human",
                     assembly = "Homo_sapiens_GRCh38_101",
                     type = c("Ribo-seq", "RNA-seq"))
# Will create default config paths, if you want more control of where the
# data is stored, check out function config() function

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Download fastq files for experiment and rename (Skip if you have the files already)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# SRA Meta data download (work for ERA, DRA and GEO too)
study <- download.SRA.metadata("PRJNA591214", auto.detect = TRUE)
# Auto detection worked, all Ribo-seq and RNA-seq samples detected
# NOTE: Could not detect condition CO, only wild type (WT)

# Split study into (Ribo-seq / RNA-seq)
study.rfp <- study[LIBRARYTYPE == "RFP",]
study.rna <- study[LIBRARYTYPE == "RNA",]
# Download fastq files (uses SRR numbers (RUN column) from study))
# The sample_title column had good names to rename files:
download.SRA(study.rfp, conf["fastq Ribo-seq"],
             rename = study.rfp$sample_title, subset = 2000000)
download.SRA(study.rna, conf["fastq RNA-seq"],
             rename = study.rna$sample_title, subset = 2000000)

# Which organism is this, scientific name, like "Homo sapiens" or "Danio rerio"
organism <- study$ScientificName[1] # Usually you find organism here, else set it yourself
paired.end.rfp <- study.rfp$LibraryLayout == "PAIRED"
paired.end.rna <- study.rna$LibraryLayout == "PAIRED"
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Annotation (Download genome, transcript annotation and contaminants)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#

annotation <- getGenomeAndAnnotation(
  organism = organism,
  genome = TRUE, GTF = TRUE,
  phix = TRUE, ncRNA = TRUE, tRNA = TRUE, rRNA = TRUE,
  output.dir = conf["ref"],
  assembly_type = "primary_assembly"
)

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# STAR index (index the genome and contaminants seperatly)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Remove max.ram = 20 and SAsparse = 2, if you have more than 64GB ram
index <- STAR.index(annotation, wait = TRUE, max.ram = 20, SAsparse = 2)

# Show all annotations you have made with ORFik so far
list.genomes()
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Alignment (with depletion of phix, rRNA, ncRNA and tRNAs) & (with MultiQC of final STAR alignment)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#

STAR.align.folder(conf["fastq Ribo-seq"], conf["bam Ribo-seq"], index,
                  paired.end = paired.end.rfp,
                  steps = "tr-co-ge", # (trim needed: adapters found, then genome)
                  adapter.sequence = "auto", # Adapters are auto detected
                  trim.front = 0, min.length = 20)

STAR.align.folder(conf["fastq RNA-seq"], conf["bam RNA-seq"], index,
                  paired.end = paired.end.rna,
                  steps = "tr-co-ge", # (trim needed: adapters found, then genome)
                  adapter.sequence = "auto", # Adapters are auto detected
                  trim.front = 0, min.length = 20)

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Create experiment (Starting point if alignment is finished)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
library(ORFik)
create.experiment(file.path(conf["bam Ribo-seq"], "aligned/"),
                  exper = conf["exp Ribo-seq"],
                  fa = annotation["genome"],
                  txdb = paste0(annotation["gtf"], ".db"),
                  organism = organism,
                  pairedEndBam = paired.end.rfp,
                  rep = c(1,2,3,1,2,3),
                  condition = rep(c("CO", "WT"), each = 3))
create.experiment(file.path(conf["bam RNA-seq"], "aligned/"),
                  exper = conf["exp RNA-seq"],
                  fa = annotation["genome"],
                  txdb = paste0(annotation["gtf"], ".db"),
                  organism = organism,
                  pairedEndBam = paired.end.rna,
                  rep = c(1,2,3,1,2,3),
                  condition = rep(c("CO", "WT"), each = 3))

library(ORFik)
# Show the experiments you have made with ORFik so far
list.experiments()
df.rfp <- read.experiment("Alexaki_Human_Ribo-seq")
df.rna <- read.experiment("Alexaki_Human_RNA-seq")


#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Convert files and run Annotation vs alignment QC
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# General QC
ORFikQC(df.rfp)
ORFikQC(df.rna)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# P-shifting of Ribo-seq reads:
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# From ORFikQC it looks like 20, 21, 27 and 28 are candidates for Ribosomal footprints
shiftFootprintsByExperiment(df.rfp, accepted.lengths = c(20:21, 27:28))
# Now check if you are happy with shifts, these libraries have some interesting
# periodicity for read length 20 and 27,
# it has identical amount of reads in frame 0 and 1, not optimal for ORF detection.
shiftPlots(df.rfp, output = "auto", downstream = 30) # Barplots, better details
shiftPlots(df.rfp, output = "auto", downstream = 30, type = "heatmap") # Heatmaps, better overview

# Ribo-seq specific QC
remove.experiments(df.rfp) # Remove loaded data (it is not pshifted)
RiboQC.plot(df.rfp, BPPARAM = BiocParallel::SerialParam(progressbar = TRUE))
remove.experiments(df.rfp)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Create heatmaps (Ribo-seq)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Pre-pshifting
heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "ofst",
              outdir = file.path(QCfolder(df.rfp), "heatmaps/pre-pshift/"))
remove.experiments(df.rfp)
# After pshifting
heatMapRegion(df.rfp, region = c("TIS", "TTS"), shifting = "5prime", type = "pshifted",
              outdir = file.path(QCfolder(df.rfp), "heatmaps/pshifted/"))

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Count table analysis: TE tables
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Shifting looks good, let's make count tables of pshifted libraries:
# As a note: Correlation between count tables of pshifted vs raw libs is ~ 40% usually.
countTable_regions(df.rfp, lib.type = "pshifted", rel.dir = "pshifted")

# TE per library match
countsRFP <- countTable(df.rfp, region = "cds", type = "fpkm", collapse = FALSE, count.folder = "pshifted")
countsRNA <- countTable(df.rna, region = "mrna", type = "fpkm", collapse = FALSE)
countsTE <- (countsRFP + 1) / (countsRNA + 1) # with pseudo count
# TE per condition (WT vs CO) (collapse replicates)
countsRFP <- countTable(df.rfp, region = "cds", type = "fpkm", collapse = TRUE, count.folder = "pshifted")
countsRNA <- countTable(df.rna, region = "mrna", type = "fpkm", collapse = TRUE)
countsTE <- (countsRFP + 1) / (countsRNA + 1) # with pseudo count
# TE merged all libraries
countsRFP <- countTable(df.rfp, region = "cds", type = "fpkm", collapse = "all", count.folder = "pshifted")
countsRNA <- countTable(df.rna, region = "mrna", type = "fpkm", collapse = "all")
countsTE <- (countsRFP + 1) / (countsRNA + 1) # with pseudo count

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Differential translation analysis (condition: WT vs CO)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# The design is by default chosen by this factor: The condition column in this case
design(df.rfp, multi.factor = FALSE)
# We now run, and here get 11 unique DTEG genes
res <- DTEG.analysis(df.rfp, df.rna)
# Now let's check if the CO group overexpress the F9 Gene (ENSG00000101981):
significant_genes <- res[Regulation != "No change",]
gene_names <- txNamesToGeneNames(significant_genes$id, df.rfp)
"ENSG00000101981" %in% unique(gene_names) # TRUE
# It does, good good.
# How is it regulated ?
significant_genes[which(gene_names == "ENSG00000101981"),] # By mRNA abundance
# If you downloaded the full libraries, do this to use pshifted libraries instead.
# Not a valid result for pshifted libraries using subset
res <- DTEG.analysis(df.rfp, df.rna, design = "condition",
                     RFP_counts = countTable(df.rfp, region = "cds", type = "summarized",
                                             count.folder = "pshifted"))
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Peak detection (strong peaks in CDS)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#

peaks <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_WT_r1, type = "max")
ORFik::windowCoveragePlot(peaks, type = "cds", scoring = "transcriptNormalized")
# The gene does not have a strong max peak in WT rep1
"ENSG00000101981" %in% peaks$gene_id # FALSE

peaks_CO <- findPeaksPerGene(loadRegion(df.rfp, "cds"), reads = RFP_CO_r1, type = "max")
# The gene does not have a strong max peak in CO rep1 either
"ENSG00000101981" %in% peaks_CO$gene_id # FALSE

#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Feature table (From WT rep 1)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
cds <- loadRegion(df.rfp, "cds")
cds <- ORFik:::removeMetaCols(cds) # Dont need them
cds <- cds[filterTranscripts(df.rfp)] # Filter to sane transcripts (annotation is not perfect)
dt <- computeFeatures(cds,
                RFP = fimport(filepath(df.rfp[6,], "pshifted")),
                RNA = fimport(filepath(df.rna[6,], "ofst")), Gtf = df.rfp,
                grl.is.sorted = TRUE, faFile = df.rfp,
                weight.RFP = "score", weight.RNA = "score",
                riboStart = 21, uorfFeatures = FALSE)
# The features of significant DTEGs.
dt[names(cds) %in%  significant_genes$id,]
# All genes with strong 3nt periodicity of Ribo-seq
dt[ORFScores > 5,]
# Not all genes start with ATG, possible errors in annotation
table(dt$StartCodons)
# All genes with strong start codon peak
dt[startCodonCoverage > 5,]
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# Gene plotting (advanced under development!)
# (Using package that extends ORFik for interactive html plots (RiboCrypt))
# Will create interactive plot for Ribo-seq and RNA-seq sample: Wild type rep 3
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# This package also available on Bioconductor since Bioc version 3.14
# BiocManager::install("RiboCrypt")
devtools::install_github("m-swirski/RiboCrypt", dependencies = TRUE)
library(RiboCrypt)
cds <- loadRegion(df.rfp, "cds")
RiboCrypt::multiOmicsPlot_list(cds[1640], cds[1640], reference_sequence = findFa(df.rfp@fafile), reads = list(fimport(filepath(df.rna[6,], "ofst")), fimport(filepath(df.rfp[6,], "pshifted"))),
                               ylabels = c("RNA", "RFP"), withFrames = c(F, T), frames_type = "columns")
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# All ORF type predictions
# Prediction using peridicity (Similar to RiboCode, ORFScore, minimum coverage, and comparison
# to upstream and downstream window)
# Will create 3 files in format (.rds), GRangesList of candidate ORFs, of predicted ORFs and a table
# of all scores per ORF used for prediction
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
prediction_output_folder <- file.path(libFolder(df.rfp), "predicted_orfs")
tx_subset <- c("ENST00000315377", "ENST00000311672", "ENST00000578186", "ENST00000549017",
               "ENST00000644684", "ENST00000436146") # Use 6 transcript subset
# Run on 2 first libraries
ORFik::detect_ribo_orfs(df.rfp[1:2,], prediction_output_folder,
                        c("uORF", "uoORF", "annotated", "NTE", "NTT", "doORF", "dORF"), 
                        startCodon = "ATG|CTG|TTG|GTG", 
                        mrna = loadRegion(df.rfp, "mrna", tx_subset),
                        cds = loadRegion(df.rfp, "cds", tx_subset)) # Human also has a lot of ACG uORFs btw
table <- riboORFs(df.rfp[1:2,], type = "table", prediction_output_folder)
# Remember we are only predicting on 2 million reads, so we wont find that much
print(table(table[predicted == TRUE,]$type)) # 3 cds, 5 NTT, 3 uORFs
table[ensembl_tx_name == "ENST00000315377" & predicted == TRUE,]
# this tx has 3 NTTs predicted, 2 in RFP_CO_r1 and 1 in RFP_CO_r2
print(nrow(table[type %in% c("annotated") & predicted == TRUE ])) # 86 cds predicted
print(nrow(table[type %in% c("uORF", "uoORF") & predicted == TRUE ])) # 3 uORFs predicted
# I highly advice to check results with results of the python predictor RiboCode,
# it is by far the best alternative to ORFik prediction out there (I have tested: 
# RiboTaper (deprecated), ORFquant (very bad), RiboTricer (very bad), RiboNT (OK), RiboCode (very good!))
# I will link to my optimized github fork of RiboCode which supports input of ORFik covRle objects later 
# (100x speedup compared to bam input, by using directly an internal hdf5 file!)
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
# uORF analysis (deprecated, but take a look if you want)
# (using the extension package to ORFik: uORFomePipe)
# Will create a mysql database + bed12 file of uORFs with color codes + plots + files with results
#¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤#
remotes::install_github("Roleren/uORFomePipe", dependencies = TRUE) # Then restart R session!
library(uORFomePipe)
find_uORFome("/media/roler/S/data/Bio_data/projects/Alexaki_uORFome/",
             df.rfp = df.rfp, df.rna = df.rna, df.cage = NULL, biomart = NULL,
             startCodons = "ATG|CTG|TTG|GTG", BPPARAM = BiocParallel::MulticoreParam(2))

grl <- getUorfsInDb()
pred <- readTable("finalPredWithProb")$prediction
cov <- readTable("startCodonCoverage")
grl[pred == 1 & rowSums(cov) > 5]