## ----setup, include=FALSE, echo=F, warning= F, message=F----------------------
knitr::opts_chunk$set(
  message = FALSE, 
  warning = FALSE, 
  error = FALSE, 
  tidy = FALSE,
  fig.align = "center", 
  dpi = 150, 
  cache = FALSE,
  progress = FALSE, 
  quite = TRUE
)

## ----echo=FALSE, results="hide", warning=FALSE,message=FALSE------------------
library(TCGAWorkflow)

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  if (!"BiocManager" %in% rownames(installed.packages()))
#    install.packages("BiocManager")
#  BiocManager::install("TCGAWorkflow")
#  BiocManager::install("TCGAWorkflowData")

## ----eval=TRUE, message=FALSE, warning=FALSE, include=TRUE--------------------
library(TCGAWorkflowData)
library(DT)

## ----eval=TRUE, message=FALSE, warning=FALSE, include=FALSE-------------------
library(TCGAbiolinks)

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  library(TCGAbiolinks)
#  query_met_gbm <- GDCquery(
#    project = "TCGA-GBM",
#    data.category = "DNA Methylation",
#    data.type = "Methylation Beta Value",
#    platform = "Illumina Human Methylation 450",
#    barcode = c("TCGA-76-4926-01B-01D-1481-05", "TCGA-28-5211-01C-11D-1844-05")
#  )
#  GDCdownload(query_met_gbm)
#  
#  met_gbm_450k <- GDCprepare(
#    query = query_met_gbm,
#    summarizedExperiment = TRUE
#  )
#  
#  query_met_lgg <- GDCquery(
#    project = "TCGA-LGG",
#    data.category = "DNA Methylation",
#    data.type = "Methylation Beta Value",
#    platform = "Illumina Human Methylation 450",
#    barcode = c("TCGA-HT-7879-01A-11D-2399-05", "TCGA-HT-8113-01A-11D-2399-05")
#  )
#  GDCdownload(query_met_lgg)
#  met_lgg_450k <- GDCprepare(
#    query = query_met_lgg,
#    summarizedExperiment = TRUE
#  )
#  
#  met_lgg_450k$days_to_death <- NA
#  met_lgg_450k$year_of_death <- NA
#  met_gbm_lgg <- SummarizedExperiment::cbind(
#    met_lgg_450k,
#    met_gbm_450k
#  )
#  
#  
#  # A total of 2.27 GB
#  query_exp_lgg <- GDCquery(
#    project = "TCGA-LGG",
#    data.category = "Transcriptome Profiling",
#    data.type = "Gene Expression Quantification",
#    workflow.type = "STAR - Counts"
#  )
#  
#  GDCdownload(query_exp_lgg)
#  exp_lgg <- GDCprepare(
#    query = query_exp_lgg
#  )
#  
#  query_exp_gbm <- GDCquery(
#    project = "TCGA-GBM",
#    data.category = "Transcriptome Profiling",
#    data.type = "Gene Expression Quantification",
#    workflow.type = "STAR - Counts"
#  )
#  GDCdownload(query_exp_gbm)
#  exp_gbm <- GDCprepare(
#    query = query_exp_gbm
#  )
#  
#  # The following clinical data is not available in GBM
#  missing_cols <- setdiff(colnames(colData(exp_lgg)),colnames(colData(exp_gbm)))
#  for(i in missing_cols){
#    exp_lgg[[i]] <- NULL
#  }
#  
#  exp_gbm_lgg <- SummarizedExperiment::cbind(
#    exp_lgg,
#    exp_gbm
#  )

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  #-----------------------------------------------------------------------------
#  #                   Data.category: Copy number variation aligned to hg38
#  #-----------------------------------------------------------------------------
#  query <- GDCquery(
#    project = "TCGA-ACC",
#    data.category = "Copy Number Variation",
#    data.type = "Copy Number Segment",
#    barcode = c( "TCGA-OR-A5KU-01A-11D-A29H-01", "TCGA-OR-A5JK-01A-11D-A29H-01")
#  )
#  GDCdownload(query)
#  data <- GDCprepare(query)
#  
#  query <- GDCquery(
#    project = "TCGA-ACC",
#    data.category = "Copy Number Variation",
#    data.type = "Masked Copy Number Segment",
#    sample.type = c("Primary Tumor")
#  ) # see the barcodes with getResults(query)$cases
#  GDCdownload(query)
#  data <- GDCprepare(query)

## ----eval=TRUE, include=TRUE--------------------------------------------------
library(SummarizedExperiment)

# Load object from TCGAWorkflowData package
# This object will be created in subsequent sections for enhanced clarity and understanding.
data(TCGA_GBM_Transcriptome_20_samples) 

# get gene expression matrix
data <- assay(exp_gbm)
datatable(
  data = data[1:10,], 
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
  rownames = TRUE
)

# get genes information
genes.info <- rowRanges(exp_gbm)
genes.info

# get sample information
sample.info <- colData(exp_gbm)
datatable(
  data = as.data.frame(sample.info), 
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
  rownames = FALSE
)


## ----eval=TRUE, include=TRUE--------------------------------------------------
# get indexed clinical patient data for GBM samples
gbm_clin <- GDCquery_clinic(
  project = "TCGA-GBM", 
  type = "Clinical"
)

# get indexed clinical patient data for LGG samples
lgg_clin <- GDCquery_clinic(
  project = "TCGA-LGG", 
  type = "Clinical"
)

# Bind the results, as the columns might not be the same,
# we will will plyr rbind.fill, to have all columns from both files
clinical <- plyr::rbind.fill(
  gbm_clin,
  lgg_clin
)

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
datatable(
  clinical[1:10,], 
  options = list(scrollX = TRUE, keys = TRUE), 
  rownames = FALSE
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Fetch clinical data directly from the clinical XML files.
# if barcode is not set, it will consider all samples.
# We only set it to make the example faster
query <- GDCquery(
  project = "TCGA-GBM",
  data.format = "bcr xml",
  data.category = "Clinical",
  barcode = c("TCGA-08-0516","TCGA-02-0317")
) 
GDCdownload(query)
clinical <- GDCprepare_clinic(
  query = query, 
  clinical.info = "patient"
)

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
datatable(
  data = clinical, 
  options = list(scrollX = TRUE, keys = TRUE), 
  rownames = FALSE
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
clinical_drug <- GDCprepare_clinic(
  query = query, 
  clinical.info = "drug"
)

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
clinical_drug |>
  datatable(
    options = list(scrollX = TRUE, keys = TRUE), 
    rownames = FALSE
  )

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
clinical_radiation <- GDCprepare_clinic(
  query = query, 
  clinical.info = "radiation"
)

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
clinical_radiation |> 
  datatable(
    options = list(scrollX = TRUE, keys = TRUE), 
    rownames = FALSE
  )

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
clinical_admin <- GDCprepare_clinic(
  query = query, 
  clinical.info = "admin"
)

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
clinical_admin |>
  datatable(
    options = list(scrollX = TRUE, keys = TRUE), 
    rownames = FALSE
  )

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  query <- GDCquery(
#    project = c("TCGA-LGG","TCGA-GBM"),
#    data.category = "Simple Nucleotide Variation",
#    access = "open",
#    data.type = "Masked Somatic Mutation",
#    workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
#  )
#  GDCdownload(query)
#  maf <- GDCprepare(query)

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
data(maf_lgg_gbm)
maf[1:10,] |>
  datatable(
    options = list(scrollX = TRUE, keys = TRUE), 
    rownames = FALSE
  )

## ----eval=TRUE, include=TRUE--------------------------------------------------
gbm_subtypes <- TCGAquery_subtype(
  tumor = "gbm"
)

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
datatable(
  gbm_subtypes[1:10,], 
  options = list(scrollX = TRUE, keys = TRUE), 
  rownames = FALSE
)

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  library(RTCGAToolbox)

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  # Get the last run dates
#  lastRunDate <- getFirehoseRunningDates()[1]
#  
#  # get DNA methylation data, RNAseq2 and clinical data for GBM
#  gbm_data <- getFirehoseData(
#    dataset = "GBM",
#    runDate = lastRunDate,
#    gistic2Date = getFirehoseAnalyzeDates(1),
#    Methylation = FALSE,
#    clinical = TRUE,
#    RNASeq2GeneNorm  = FALSE,
#    Mutation = TRUE,
#    fileSizeLimit = 10000
#  )
#  
#  gbm_mut <- getData(gbm_data,"Mutation")
#  gbm_clin <- getData(gbm_data,"clinical")

## ----eval=FALSE, message=FALSE, warning=FALSE, include=TRUE-------------------
#  # Download GISTIC results
#  lastanalyzedate <- getFirehoseAnalyzeDates(1)
#  gistic <- getFirehoseData(
#    dataset = "GBM",
#    GISTIC = TRUE,
#    gistic2Date = lastanalyzedate
#  )
#  
#  # get GISTIC results
#  gistic_allbygene <- getData(
#    object = gistic,
#    type = "GISTIC",
#    platform = "AllByGene"
#  )
#  gistic_thresholedbygene <- getData(
#    object = gistic,
#    type = "GISTIC",
#    platform = "ThresholdedByGene"
#  )

## ----eval=TRUE, message=FALSE, warning=FALSE, include=TRUE--------------------
data(gbm_gistic)
gistic_allbygene %>% head() %>% gt::gt()
gistic_thresholedbygene %>% head() %>% gt::gt()

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE------------------
library(maftools)
# recovering data from TCGAWorkflowData package.
data(maf_lgg_gbm)

# To prepare for maftools we will also include clinical data
# For a mutant vs WT survival analysis 
# get indexed clinical patient data for GBM samples
gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical")
# get indexed clinical patient data for LGG samples
lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical")
# Bind the results, as the columns might not be the same,
# we will will plyr rbind.fill, to have all columns from both files
clinical <- plyr::rbind.fill(gbm_clin,lgg_clin)
colnames(clinical)[grep("submitter_id",colnames(clinical))] <- "Tumor_Sample_Barcode"

# we need to create a binary variable 1 is dead 0 is not dead
plyr::count(clinical$vital_status)
clinical$Overall_Survival_Status <- 1 # dead
clinical$Overall_Survival_Status[which(clinical$vital_status != "Dead")] <- 0

# If patient is not dead we don't have days_to_death (NA)
# we will set it as the last day we know the patient is still alive
clinical$time <- clinical$days_to_death
clinical$time[is.na(clinical$days_to_death)] <- clinical$days_to_last_follow_up[is.na(clinical$days_to_death)]

# Create object to use in maftools
maf <- read.maf(
  maf = maf, 
  clinicalData = clinical, 
  isTCGA = TRUE
)

## ----echo=TRUE, message=FALSE, warning=FALSE,fig.width=10---------------------
plotmafSummary(
  maf = maf,
  rmOutlier = TRUE,
  addStat = 'median',
  dashboard = TRUE
)

## ----echo=TRUE, message=FALSE, warning=FALSE,fig.height=10,fig.width=15,eval=FALSE----
#  oncoplot(
#    maf = maf,
#    top = 20,
#    legendFontSize = 8,
#    clinicalFeatures = c("tissue_or_organ_of_origin")
#  )

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
plot <- mafSurvival(
  maf = maf,
  genes = "TP53",
  time = 'time',
  Status = 'Overall_Survival_Status',
  isTCGA = TRUE
)

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  query_exp_lgg <- GDCquery(
#    project = "TCGA-LGG",
#    data.category = "Transcriptome Profiling",
#    data.type = "Gene Expression Quantification",
#    workflow.type = "STAR - Counts"
#  )
#  # Get only first 20 samples to make example faster
#  query_exp_lgg$results[[1]] <- query_exp_lgg$results[[1]][1:20,]
#  GDCdownload(query_exp_lgg)
#  exp_lgg <- GDCprepare(
#    query = query_exp_lgg
#  )
#  
#  query_exp_gbm <- GDCquery(
#    project = "TCGA-GBM",
#    data.category = "Transcriptome Profiling",
#    data.type = "Gene Expression Quantification",
#    workflow.type = "STAR - Counts"
#  )
#  # Get only first 20 samples to make example faster
#  query_exp_gbm$results[[1]] <- query_exp_gbm$results[[1]][1:20,]
#  GDCdownload(query_exp_gbm)
#  exp_gbm <- GDCprepare(
#    query = query_exp_gbm
#  )

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE-------------------
data("TCGA_LGG_Transcriptome_20_samples")
data("TCGA_GBM_Transcriptome_20_samples")

exp_lgg_preprocessed <- TCGAanalyze_Preprocessing(
  object = exp_lgg,
  cor.cut = 0.6,    
  datatype = "unstranded",
  filename = "LGG_IlluminaHiSeq_RNASeqV2.png"
)

exp_gbm_preprocessed <- TCGAanalyze_Preprocessing(
  object = exp_gbm,
  cor.cut = 0.6, 
  datatype = "unstranded",
  filename = "GBM_IlluminaHiSeq_RNASeqV2.png"
)
exp_preprocessed <- cbind(
  exp_lgg_preprocessed, 
  exp_gbm_preprocessed
)

exp_normalized <- TCGAanalyze_Normalization(
  tabDF = cbind(exp_lgg_preprocessed, exp_gbm_preprocessed),
  geneInfo = TCGAbiolinks::geneInfoHT,
  method = "gcContent"
) # 60513   40

exp_filtered <- TCGAanalyze_Filtering(
  tabDF = exp_normalized,
  method = "quantile",
  qnt.cut =  0.25
)  # 44630   40

exp_filtered_lgg <- exp_filtered[
  ,substr(colnames(exp_filtered),1,12) %in% lgg_clin$bcr_patient_barcode
]

exp_filtered_gbm <-   exp_filtered[
  ,substr(colnames(exp_filtered),1,12) %in% gbm_clin$bcr_patient_barcode
]

diff_expressed_genes <- TCGAanalyze_DEA(
  mat1 = exp_filtered_lgg,
  mat2 = exp_filtered_gbm,
  Cond1type = "LGG",
  Cond2type = "GBM",
  fdr.cut = 0.01 ,
  logFC.cut = 1,
  method = "glmLRT"
)

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE-------------------
# Number of differentially expressed genes (DEG)
nrow(diff_expressed_genes)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10----
#-------------------  4.2 EA: enrichment analysis             --------------------
ansEA <- TCGAanalyze_EAcomplete(
  TFname = "DEA genes LGG Vs GBM", 
  RegulonList = diff_expressed_genes$gene_name
)

TCGAvisualize_EAbarplot(
  tf = rownames(ansEA$ResBP),
  filename = NULL,
  GOBPTab = ansEA$ResBP,
  nRGTab = diff_expressed_genes$gene_name,
  nBar = 20
)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10----
TCGAvisualize_EAbarplot(
  tf = rownames(ansEA$ResBP),
  filename = NULL,
  GOCCTab = ansEA$ResCC,
  nRGTab = diff_expressed_genes$gene_name,
  nBar = 20
)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=10, fig.width=10----
TCGAvisualize_EAbarplot(
  tf = rownames(ansEA$ResBP),
  filename = NULL,
  GOMFTab = ansEA$ResMF,
  nRGTab = diff_expressed_genes$gene_name,
  nBar = 20
)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE,fig.height=12, fig.width=15----
TCGAvisualize_EAbarplot(
  tf = rownames(ansEA$ResBP),
  filename = NULL,
  PathTab = ansEA$ResPat,
  nRGTab = rownames(diff_expressed_genes),
  nBar = 20
)

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE-------------------
library(SummarizedExperiment)

# DEGs TopTable
dataDEGsFiltLevel <- TCGAanalyze_LevelTab(
  FC_FDR_table_mRNA = diff_expressed_genes,
  typeCond1 = "LGG",
  typeCond2 = "GBM",
  TableCond1 = exp_filtered[,colnames(exp_filtered_lgg)],
  TableCond2 = exp_filtered[,colnames(exp_filtered_gbm)]
)

dataDEGsFiltLevel$GeneID <- 0

library(clusterProfiler)
# Converting Gene symbol to geneID
eg = as.data.frame(
  bitr(
    dataDEGsFiltLevel$mRNA,
    fromType = "ENSEMBL",
    toType = c("ENTREZID","SYMBOL"),
    OrgDb = "org.Hs.eg.db"
  )
)
eg <- eg[!duplicated(eg$SYMBOL),]
eg <- eg[order(eg$SYMBOL,decreasing=FALSE),]

dataDEGsFiltLevel <- dataDEGsFiltLevel[dataDEGsFiltLevel$mRNA %in% eg$ENSEMBL,]
dataDEGsFiltLevel <- dataDEGsFiltLevel[eg$ENSEMBL,]
rownames(dataDEGsFiltLevel) <- eg$SYMBOL

all(eg$SYMBOL == rownames(dataDEGsFiltLevel))
dataDEGsFiltLevel$GeneID <- eg$ENTREZID

dataDEGsFiltLevel_sub <- subset(dataDEGsFiltLevel, select = c("GeneID", "logFC"))
genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID
library(pathview)
# pathway.id: hsa05214 is the glioma pathway
# limit: sets the limit for gene expression legend and color
hsa05214 <- pathview::pathview(
  gene.data  = genelistDEGs,
  pathway.id = "hsa05214",
  species    = "hsa",
  limit = list(gene = as.integer(max(abs(genelistDEGs))))
)


## ----eval=FALSE, include=TRUE-------------------------------------------------
#  ### read biogrid info (available in TCGAWorkflowData as "biogrid")
#  ### Check last version in https://thebiogrid.org/download.php
#  file <- "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-ALL-LATEST.tab2.zip"
#  if(!file.exists(gsub("zip","txt",basename(file)))){
#    downloader::download(file,basename(file))
#    unzip(basename(file),junkpaths =TRUE)
#  }
#  
#  tmp.biogrid <- vroom::vroom(
#    dir(pattern = "BIOGRID-ALL.*\\.txt")
#  )

## ----eval=TRUE, include=TRUE--------------------------------------------------
### plot details (colors & symbols)
mycols <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628')

### load network inference libraries
library(minet)
library(c3net)

### deferentially identified genes using TCGAbiolinks
# we will use only a subset (first 50 genes) of it to make the example faster
names.genes.de <- rownames(diff_expressed_genes)[1:30]

data(biogrid)
net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)

mydata <- exp_filtered_lgg[names.genes.de, ]

### infer networks
t.mydata <- t(mydata)
net.aracne <- minet(t.mydata, method = "aracne")
net.mrnet <- minet(t.mydata)
net.clr <- minet(t.mydata, method = "clr")
net.c3net <- c3net(mydata)

### validate compared to biogrid network
tmp.val <- list(
  validate(net.aracne, net.biogrid.de), 
  validate(net.mrnet, net.biogrid.de),
  validate(net.clr, net.biogrid.de), 
  validate(net.c3net, net.biogrid.de)
)

### plot roc and compute auc for the different networks
dev1 <- show.roc(tmp.val[[1]],cex=0.3,col=mycols[1],type="l")
res.auc <- auc.roc(tmp.val[[1]])
for(count in 2:length(tmp.val)){
  show.roc(tmp.val[[count]],device=dev1,cex=0.3,col=mycols[count],type="l")
  res.auc <- c(res.auc, auc.roc(tmp.val[[count]]))
}

legend(
  "bottomright", 
  legend = paste(c("aracne","mrnet","clr","c3net"), signif(res.auc,4), sep=": "),
  col = mycols[1:length(tmp.val)],
  lty = 1, 
  bty = "n" 
)
# Please, uncomment this line to produce the pdf files.
# dev.copy2pdf(width=8,height=8,device = dev1, file = paste0("roc_biogrid_",cancertype,".pdf"))

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  #----------------------------
#  # Obtaining DNA methylation
#  #----------------------------
#  # Samples
#  lgg.samples <- matchedMetExp("TCGA-LGG", n = 10)
#  gbm.samples <- matchedMetExp("TCGA-GBM", n = 10)
#  samples <- c(lgg.samples,gbm.samples)
#  
#  #-----------------------------------
#  # 1 - Methylation
#  # ----------------------------------
#  # For DNA methylation it is quicker in this case to download the tar.gz file
#  # and get the samples we want instead of downloading files by files
#  query <- GDCquery(
#    project = c("TCGA-LGG","TCGA-GBM"),
#    data.category = "DNA Methylation",
#    platform = "Illumina Human Methylation 450",
#    data.type = "Methylation Beta Value",
#    barcode = samples
#  )
#  GDCdownload(query)
#  met <- GDCprepare(
#    query = query,
#    save = FALSE
#  )
#  
#  # We will use only chr9 to make the example faster
#  met <- subset(met,subset = as.character(seqnames(met)) %in% c("chr9"))
#  # This data is avaliable in the package (object elmerExample)

## ----echo=TRUE, message=FALSE,warning=FALSE, fig.height=8,fig.width=8---------
data(elmerExample)
#----------------------------
# Mean methylation
#----------------------------
# Plot a barplot for the groups in the disease column in the
# summarizedExperiment object

# remove probes with NA (similar to na.omit)
met <- met[rowSums(is.na(assay(met))) == 0,]

df <- data.frame(
  "Sample.mean" = colMeans(assay(met), na.rm = TRUE),
  "groups" = met$project_id
)

library(ggpubr)
ggpubr::ggboxplot(
  data = df,
  y = "Sample.mean",
  x = "groups",
  color = "groups",
  add = "jitter",
  ylab = "Mean DNA methylation (beta-values)",
  xlab = ""
) + stat_compare_means() 

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE-------------------
#------- Searching for differentially methylated CpG sites     ----------
dmc <- TCGAanalyze_DMC(
  data = met,
  groupCol = "project_id", # a column in the colData matrix
  group1 = "TCGA-GBM", # a type of the disease type column
  group2 = "TCGA-LGG", # a type of the disease column
  p.cut = 0.05,
  diffmean.cut = 0.15,
  save = FALSE,
  legend = "State",
  plot.filename = "LGG_GBM_metvolcano.png",
  cores = 1 # if set to 1 there will be a progress bar
)

## ----results='hide', echo=TRUE, message=FALSE,warning=FALSE-------------------
#--------------------------
# DNA Methylation heatmap
#-------------------------
library(ComplexHeatmap)
clinical <- plyr::rbind.fill(
  gbm_clin,
  lgg_clin
)

# get the probes that are Hypermethylated or Hypomethylated
# met is the same object of the section 'DNA methylation analysis'
status.col <- "status"
probes <- rownames(dmc)[grep("hypo|hyper",dmc$status,ignore.case = TRUE)]
sig.met <- met[probes,]


# top annotation, which samples are LGG and GBM
# We will add clinical data as annotation of the samples
# we will sort the clinical data to have the same order of the DNA methylation matrix
clinical.ordered <- clinical[match(substr(colnames(sig.met),1,12),clinical$bcr_patient_barcode),]

ta <- HeatmapAnnotation(
  df = clinical.ordered[, c("primary_diagnosis", "gender", "vital_status", "race")],
  col = list(
    disease = c("LGG" = "grey", "GBM" = "black"),
    gender = c("male" = "blue", "female" = "pink")
  )
)

# row annotation: add the status for LGG in relation to GBM
# For exmaple: status.gbm.lgg Hypomethyated means that the
# mean DNA methylation of probes for lgg are hypomethylated
# compared to GBM ones.
ra = rowAnnotation(
  df = dmc[probes, status.col],
  col = list(
    "status.TCGA.GBM.TCGA.LGG" =
      c(
        "Hypomethylated" = "orange",
        "Hypermethylated" = "darkgreen"
      )
  ),
  width = unit(1, "cm")
)

heatmap  <- Heatmap(
  matrix = assay(sig.met),
  name = "DNA methylation",
  col = matlab::jet.colors(200),
  show_row_names = FALSE,
  cluster_rows = TRUE,
  cluster_columns = FALSE,
  show_column_names = FALSE,
  bottom_annotation = ta,
  column_title = "DNA Methylation"
) 
# Save to pdf
png("heatmap.png",width = 600, height = 400)
draw(heatmap, annotation_legend_side =  "bottom")
dev.off()

## ----results='asis', echo=TRUE, message=FALSE,warning=FALSE-------------------
library(rGADEM)
library(BSgenome.Hsapiens.UCSC.hg19)
library(motifStack)
library(SummarizedExperiment)
library(dplyr)

probes <- rowRanges(met)[rownames(dmc)[grep("hypo|hyper",dmc$status,ignore.case = TRUE)],]

# Get hypo/hyper methylated probes and make a 200bp window 
# surrounding each probe.
sequence <- GRanges(
  seqnames = as.character(seqnames(probes)),
  IRanges(
    start = ranges(probes) %>% as.data.frame() %>% dplyr::pull("start") - 100,
    end = ranges(probes) %>% as.data.frame() %>% dplyr::pull("end") + 100), 
  strand = "*"
)
#look for motifs
gadem <- GADEM(sequence, verbose = FALSE, genome = Hsapiens)

# How many motifs were found?
nMotifs(gadem)

# get the number of occurrences
nOccurrences(gadem)

# view all sequences consensus
consensus(gadem)

# Print motif
pwm <- getPWM(gadem)
pfm  <- new("pfm",mat = pwm[[1]],name = "Novel Site 1")
plotMotifLogo(pfm)

# Number of instances of motif 1?
length(gadem@motifList[[1]]@alignList)


## ----table4, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'--------
tabl <- "
|                  Histone marks                 |                                                   Role                                                  |
|:----------------------------------------------:|:-------------------------------------------------------------------------------------------------------:|
| Histone H3 lysine 4 trimethylation (H3K4me3)   | Promoter regions [@heintzman2007distinct,@bernstein2005genomic]                                      |
| Histone H3 lysine 4 monomethylation (H3K4me1)  | Enhancer regions [@heintzman2007distinct]                                                           |
| Histone H3 lysine 36 trimethylation (H3K36me3) | Transcribed regions                                                                                     |
| Histone H3 lysine 27 trimethylation (H3K27me3) | Polycomb repression [@bonasio2010molecular]                                                         |
| Histone H3 lysine 9 trimethylation (H3K9me3)   | Heterochromatin regions  [@peters2003partitioning]                                                  |
| Histone H3 acetylated at lysine 27 (H3K27ac)   | Increase activation of genomic elements [@heintzman2009histone,@rada2011unique,@creyghton2010histone] |
| Histone H3 lysine 9 acetylation  (H3K9ac)      | Transcriptional activation [@nishida2006histone]                                                    |
"
cat(tabl) 

## ----table5, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'--------
tabl <- "  
|                File                |                               Description                              |
|:----------------------------------:|:----------------------------------------------------------------------:|
| fc.signal.bigwig                   | Bigwig File containing  fold enrichment signal tracks                  |
| pval.signal.bigwig                 | Bigwig File containing -log10(p-value) signal tracks                   |
| hotspot.fdr0.01.broad.bed.gz       | Broad domains on enrichment for  DNase-seq for consolidated epigenomes |
| hotspot.broad.bed.gz               | Broad domains on enrichment for DNase-seq for consolidated epigenomes  |
| broadPeak.gz                       | Broad ChIP-seq peaks for consolidated  epigenomes                      |
| gappedPeak.gz                      | Gapped ChIP-seq peaks for consolidated   epigenomes                    |
| narrowPeak.gz                      | Narrow ChIP-seq peaks for consolidated epigenomes                      |
| hotspot.fdr0.01.peaks.bed.gz       | Narrow DNasePeaks for   consolidated epigenomes                        |
| hotspot.all.peaks.bed.gz           | Narrow DNasePeaks for  consolidated epigenomes                         |
| .macs2.narrowPeak.gz               | Narrow DNasePeaks for consolidated epigenomes                          |
| coreMarks_mnemonics.bed.gz        | 15 state chromatin segmentations                                       |
| mCRF_FractionalMethylation.bigwig | MeDIP/MRE(mCRF) fractional methylation calls                           |
| RRBS_FractionalMethylation.bigwig | RRBS fractional methylation calls                                      |
| WGBS_FractionalMethylation.bigwig | Whole genome bisulphite fractional methylation calls                   |
"
cat(tabl) 

## ----results='hide', eval=TRUE, echo=TRUE, message=FALSE,warning=FALSE--------
library(ChIPseeker)
library(pbapply)
library(ggplot2)

## ----results='hide', eval=FALSE, echo=TRUE, message=FALSE,warning=FALSE-------
#  #------------------ Working with ChipSeq data ---------------
#  # Step 1: download histone marks for a brain and non-brain samples.
#  #------------------------------------------------------------
#  # loading annotation hub database
#  library(AnnotationHub)
#  ah = AnnotationHub()
#  
#  # Searching for brain consolidated epigenomes in the roadmap database
#  bpChipEpi_brain <- query(ah , c("EpigenomeRoadMap", "narrowPeak", "chip", "consolidated","brain","E068"))
#  # Get chip-seq data
#  histone.marks <- pblapply(names(bpChipEpi_brain), function(x) {ah[[x]]})
#  names(histone.marks) <- names(bpChipEpi_brain)
#  # OBS: histone.marks is available in TCGAWorkflowData package

## ----getTagMatrix, results='hide', echo=TRUE, message=FALSE,warning=FALSE-----
data(histoneMarks)
# Create a GR object based on the hypo/hypermethylated probes.
probes <- keepStandardChromosomes(
  rowRanges(met)[rownames(dmc)[dmc$status %in% c("Hypermethylated in TCGA-GBM", "Hypomethylated in TCGA-GBM")],]
)
# Defining a window of 3kbp - 3kbp_probe_3kbp
# to make it work with ChIPseeker package version "1.31.3.900"
attributes(probes)$type <- "start_site"
attributes(probes)$downstream <- 3000
attributes(probes)$upstream <- 3000
probes <- GenomicRanges::resize(probes,6001,fix = "center") 

### Profile of ChIP peaks binding to TSS regions
# First of all, to calculate the profile of ChIP peaks binding to TSS regions, we should
# prepare the TSS regions, which are defined as the flanking sequence of the TSS sites.
# Then align the peaks that are mapping to these regions and generate the tagMatrix.
tagMatrixList <- pbapply::pblapply(histone.marks, function(x) {
  getTagMatrix(keepStandardChromosomes(x), windows = probes, weightCol = "score")
})
# change names retrieved with the following command: basename(bpChipEpi_brain$title)
names(tagMatrixList) <- c("H3K4me1","H3K4me3", "H3K9ac", "H3K9me3", "H3K27ac",  "H3K27me3", "H3K36me3")

## ----tagHeatmap, eval=FALSE, include=TRUE-------------------------------------
#  tagHeatmap(tagMatrixList)

## ----results='asis', echo=FALSE, fig.width = 20, message=FALSE,warning=FALSE----
tagHeatmap(tagMatrixList)

## ----plotAvgProf, eval=FALSE, include=TRUE------------------------------------
#  p <- plotAvgProf(tagMatrixList, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3', centered on CpG)")
#  # We are centreing in the CpG instead of the TSS. So we'll change the labels manually
#  p <- p + scale_x_continuous(
#    breaks = c(-3000,-1500,0,1500,3000),
#    labels = c(-3000,-1500,"CpG",1500,3000)
#  )
#  
#  library(ggthemes)
#  p + theme_few() + scale_colour_few(name = "Histone marks") +  guides(colour = guide_legend(override.aes = list(size=4)))

## ----results='asis', echo=FALSE, message=FALSE,warning=FALSE------------------
p <- plotAvgProf(tagMatrixList, xlim = c(-3000,3000), xlab = "Genomic Region (5'->3', centered on CpG)")
# We are centreing in the CpG instead of the TSS. So we'll change the labels manually

if (requireNamespace("ggplot2", quietly = TRUE)) {
  p <- p + ggplot2::scale_x_continuous(
    breaks = c(-3000,-1500,0,1500,3000),
    labels = c(-3000,-1500,"CpG",1500,3000)
  )
}

if (requireNamespace("ggthemes", quietly = TRUE)){ 
  p + ggthemes::theme_few() + 
    ggthemes::scale_colour_few(name="Histone marks") + 
    ggplot2::guides(colour = guide_legend(override.aes = list(size=4)))
}

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  #----------- 8.3 Identification of Regulatory Enhancers   -------
#  library(TCGAbiolinks)
#  # Samples: primary solid tumor w/ DNA methylation and gene expression
#  lgg.samples <- matchedMetExp("TCGA-LGG", n = 10)
#  gbm.samples <- matchedMetExp("TCGA-GBM", n = 10)
#  samples <- c(lgg.samples,gbm.samples)
#  
#  #-----------------------------------
#  # 1 - Methylation
#  # ----------------------------------
#  query_met <- GDCquery(
#    project = c("TCGA-LGG","TCGA-GBM"),
#    data.category = "DNA Methylation",
#    platform = "Illumina Human Methylation 450",
#    data.type = "Methylation Beta Value",
#    barcode = samples
#  )
#  GDCdownload(query_met)
#  met <- GDCprepare(query_met, save = FALSE)
#  met <- subset(met,subset = as.character(GenomicRanges::seqnames(met)) %in% c("chr9"))
#  
#  #-----------------------------------
#  # 2 - Expression
#  # ----------------------------------
#  query_exp <- GDCquery(
#    project = c("TCGA-LGG","TCGA-GBM"),
#    data.category = "Gene expression",
#    data.type = "Gene expression quantification",
#    platform = "Illumina HiSeq",
#    file.type  = "results",
#    legacy = TRUE,
#    barcode =  samples
#  )
#  GDCdownload(query_exp)
#  exp <- GDCprepare(query_exp, save = FALSE)
#  save(exp, met, gbm.samples, lgg.samples, file = "elmer.example.rda", compress = "xz")

## ----elmer, message = FALSE,warning = FALSE-----------------------------------
library(ELMER)
library(MultiAssayExperiment)
library(GenomicRanges)
distal.probes <- get.feature.probe(
  genome = "hg19", 
  met.platform = "450K"
)

# Recover the data created in the last step
data(elmerExample)
rownames(exp) <- values(exp)$ensembl_gene_id
mae <- createMAE(
  exp = assay(exp), 
  met = met,
  save = TRUE,
  linearize.exp = TRUE,
  save.filename = "mae.rda",
  filter.probes = distal.probes,
  met.platform = "450K",
  genome = "hg19",
  TCGA = TRUE
)
mae

## ----warning=FALSE, results="hide"--------------------------------------------
cores <- 1 # you can use more cores if you want
group.col <- "project_id"
group1 <- "TCGA-GBM"
group2 <- "TCGA-LGG"

# Available directions are hypo and hyper, we will use only hypo
# due to speed constraint
direction <- "hypo"
dir.out <- paste0("elmer/",direction)
dir.create(dir.out, showWarnings = FALSE, recursive = TRUE)

## ----warning=FALSE, results="hide"--------------------------------------------
#--------------------------------------
# STEP 3: Analysis                     |
#--------------------------------------
# Step 3.1: Get diff methylated probes |
#--------------------------------------
message("Get diff methylated probes")
Sig.probes <- get.diff.meth(
  data = mae, 
  group.col = group.col,
  group1 = group1,
  group2 =  group2,
  minSubgroupFrac = 1.0, # Use all samples
  sig.dif = 0.2, # defualt is 0.3
  diff.dir = direction, # Search for hypomethylated probes in group 1
  cores = cores, 
  dir.out = dir.out, 
  pvalue = 0.1
)

## ----warning=FALSE------------------------------------------------------------
datatable(
  data = Sig.probes[1:10,], 
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
  rownames = TRUE
)

## ----warning=FALSE, results="hide"--------------------------------------------
#-------------------------------------------------------------
# Step 3.2: Identify significant probe-gene pairs            |
#-------------------------------------------------------------
# Collect nearby 20 genes for Sig.probes
message("Get nearby genes")
nearGenes <- GetNearGenes(
  data = mae,
  numFlankingGenes = 4, # default is 20 genes
  probes = Sig.probes$probe
)

## -----------------------------------------------------------------------------
length(Sig.probes$probe)
dim(nearGenes)
head(nearGenes)

## ----warning=FALSE, results="hide"--------------------------------------------
message("Get anti correlated probes-genes")
pair <- get.pair(
  data = mae,
  group.col = group.col,
  group1 = group1,
  group2 =  group2,
  nearGenes = nearGenes,
  mode = "supervised",
  minSubgroupFrac = 1, # % of samples to use in to create groups U/M
  raw.pvalue = 0.1,   # defualt is 0.001
  Pe = 0.5, # Please set to 0.001 to get significant results
  filter.probes = TRUE, # See preAssociationProbeFiltering function
  filter.percentage = 0.05,
  save = FALSE, # Create CVS file
  filter.portion = 0.3,
  dir.out = dir.out,
  diff.dir = direction,
  cores = cores,
  label = direction
)

## ----warning=FALSE------------------------------------------------------------
datatable(
  pair[1:10,], 
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
  rownames = TRUE
)

## ----warning=FALSE, results="hide"--------------------------------------------
#-------------------------------------------------------------
# Step 3.3: Motif enrichment analysis on the selected probes |
#-------------------------------------------------------------
enriched.motif <- get.enriched.motif(
  data = mae,
  probes = pair$Probe, 
  dir.out = dir.out, 
  label = direction,
  pvalue = 1, # default is FDR < 0.05
  min.incidence = 10,
  lower.OR = 1.1
)
# One of the output from the  previous function is a file with the motif, OR and Number of probes
# It will be used for plotting purposes
motif.enrichment <- read.csv(paste0(dir.out,"/getMotif.",direction, ".motif.enrichment.csv"))

## ----warning=FALSE------------------------------------------------------------
head(enriched.motif[names(enriched.motif)[1]]) ## probes in the given set that have the first motif.
motif.enrichment %>% head %>% gt::gt()

## ----warning=FALSE, results="hide"--------------------------------------------
#-------------------------------------------------------------
# Step 3.4: Identifying regulatory TFs                        |
#-------------------------------------------------------------
TF <- get.TFs(
  data = mae, 
  group.col = group.col,
  group1 = group1,
  group2 =  group2,
  mode = "supervised",
  enriched.motif = enriched.motif,
  dir.out = dir.out, 
  cores = cores,
  save.plots = FALSE,
  diff.dir = direction,
  label = direction
)

# One of the output from the previous function is a file with the raking of TF,
# for each motif. It will be used for plotting purposes
TF.meth.cor <- get(load(paste0(dir.out,"/getTF.",direction,".TFs.with.motif.pvalue.rda")))

## ----warning=FALSE------------------------------------------------------------
datatable(
  TF, 
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
  rownames = FALSE
)
datatable(
  TF.meth.cor[1:10,1:6], 
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5), 
  rownames = TRUE
)

## ----echo=TRUE, message=FALSE,eval = FALSE, warnings=FALSE, results='asis',fig.height=6, fig.width=10----
#  heatmapPairs(
#    data = mae,
#    group.col = group.col,
#    group1 = group1,
#    group2 = group2,
#    annotation.col = c("gender"),
#    pairs = pair,
#    filename =  NULL
#  )

## ----echo=TRUE, message=FALSE, warnings=FALSE, results='asis',fig.height=7,fig.width=15----
motif.enrichment.plot(
  motif.enrichment = motif.enrichment, 
  save = FALSE,
  significant = list(lowerOR = 1.1)
) # Filter motifs in the plot lowerOR > 1.3

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  grid:TF.rank.plot(motif.pvalue=TF.meth.cor, motif=TF$motif[1], save=FALSE)

## ----results='asis', echo=FALSE, message=FALSE,warning=FALSE,  fig.align='center',fig.height=8,fig.width=8----
gg <- TF.rank.plot(motif.pvalue = TF.meth.cor, motif=TF$motif[1], save=FALSE)
grid::grid.draw(gg[[1]])

## ----results='asis', echo=TRUE, message=FALSE, warning=FALSE------------------
png("TF.png",width = 800, height = 400)
scatter.plot(
  data = mae, 
  category = group.col, 
  save = FALSE, 
  lm_line = TRUE,
  byTF = list(
    TF = unlist(stringr::str_split(TF[1,"top_5percent_TFs"],";"))[1:4], 
    probe = enriched.motif[[TF$motif[1]]]
  )
)
dev.off()

## -----------------------------------------------------------------------------
pander::pander(sessionInfo(), compact = FALSE)