## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(dpi = 300)
knitr::opts_chunk$set(cache=FALSE)
knitr::opts_chunk$set(echo = TRUE)

## ----message=FALSE, warning=FALSE, include=FALSE------------------------------
library(TCGAbiolinks)
library(SummarizedExperiment)
#library(dplyr)
#library(DT)

## ----echo=TRUE, warning=FALSE, eval=TRUE--------------------------------------
projects <- getGDCprojects()$project_id
as.data.frame(grep("TARGET", projects,value = TRUE))

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# #Downloading and prepare TARGET CASE
# query_Target <- GDCquery(project = "TARGET-AML",
#                          data.category = "Transcriptome Profiling",
#                          data.type = "Gene Expression Quantification",
#                          workflow.type = "HTSeq - Counts")
# 
# samplesDown_Target <- getResults(query_Target, cols = c("cases"))
# 
# dataSmTB_Target <- TCGAquery_SampleTypes(barcode = samplesDown_Target,
#                                          typesample = "TB")
# 
# dataSmNB_Target <- TCGAquery_SampleTypes(barcode = samplesDown_Target,
#                                          typesample = "TBM")
# dataSmTB_short_Target <- dataSmTB_Target[1:10]
# dataSmNB_short_Target <- dataSmNB_Target[1:10]
# 
# queryDown_Target <- GDCquery(project = "TARGET-AML",
#                              data.category = "Transcriptome Profiling",
#                              data.type = "Gene Expression Quantification",
#                              workflow.type = "HTSeq - Counts",
#                              barcode = c(dataSmTB_short_Target, dataSmNB_short_Target))
# 
# GDCdownload(queryDown_Target)
# 
# ### SummarizedExperiment = TRUE
# data <- GDCprepare(queryDown_Target)
# 
# dataPrep_Target <- TCGAanalyze_Preprocessing(object = data,
#                                              cor.cut = 0.6,
#                                              datatype = "HTSeq - Counts")
# 
# dataNorm_Target <- TCGAanalyze_Normalization(tabDF = dataPrep_Target,
#                                              geneInfo = geneInfoHT,
#                                              method = "gcContent")
# 
# boxplot(dataPrep_Target, outline = FALSE)
# 
# boxplot(dataNorm_Target, outline = FALSE)
# 
# dataFilt_Target <- TCGAanalyze_Filtering(tabDF = dataNorm_Target,
#                                          method = "quantile",
#                                          qnt.cut =  0.25)

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# CancerProject <- "TCGA-BRCA"
# DataDirectory <- paste0("../GDC/",gsub("-","_",CancerProject))
# FileNameData <- paste0(DataDirectory, "_","HTSeq_Counts",".rda")
# 
# query <- GDCquery(project = CancerProject,
#                   data.category = "Transcriptome Profiling",
#                   data.type = "Gene Expression Quantification",
#                   workflow.type = "HTSeq - Counts")
# 
# samplesDown <- getResults(query,cols = c("cases"))
# 
# dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,
#                                   typesample = "TP")
# 
# dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,
#                                   typesample = "NT")
# dataSmTP_short <- dataSmTP[1:10]
# dataSmNT_short <- dataSmNT[1:10]
# 
# queryDown <- GDCquery(project = CancerProject,
#                       data.category = "Transcriptome Profiling",
#                       data.type = "Gene Expression Quantification",
#                       workflow.type = "HTSeq - Counts",
#                       barcode = c(dataSmTP_short, dataSmNT_short))
# 
# GDCdownload(query = queryDown)
# 
# dataPrep1 <- GDCprepare(query = queryDown,
#                         save = TRUE,
#                         save.filename = "TCGA_BRCA_HTSeq_Countds.rda")
# 
# dataPrep <- TCGAanalyze_Preprocessing(object = dataPrep1,
#                                       cor.cut = 0.6,
#                                       datatype = "HTSeq - Counts")
# 
# 
# dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
#                                       geneInfo = geneInfoHT,
#                                       method = "gcContent")
# dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
#                                       geneInfo = geneInfoHT,
#                                       method = "geneLength")
# 
# dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
#                                   method = "quantile",
#                                   qnt.cut =  0.25)

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# ### dataframe will have genes from dataFilt but raw counts from dataPrep
# dataPrep_raw <- UseRaw_afterFilter(dataPrep, dataFilt)

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# ##use previously fetched BRCA data:
# Pam50.subtypes<-TCGA_MolecularSubtype(colnames(dataFilt))

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,dataSmTP_short],
#                             mat2 = dataFilt[,dataSmNT_short],
#                             pipeline="limma",
#                             Cond1type = "Normal",
#                             Cond2type = "Tumor",
#                             fdr.cut = 0.01 ,
#                             logFC.cut = 1,
#                             method = "glmLRT", ClinicalDF = data.frame())

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# ### ##download and prepare lung data through GDC### ##
# query.lung <- GDCquery(project = "TCGA-LUSC",
#                        data.category = "Transcriptome Profiling",
#                        data.type = "Gene Expression Quantification",
#                        workflow.type = "HTSeq - Counts")
# 
# samplesDown.lusc <- getResults(query.lung,cols=c("cases"))
# 
# dataSmTP.lusc <- TCGAquery_SampleTypes(barcode = samplesDown.lusc,
#                                        typesample = "TP")
# 
# dataSmNT.lusc <- TCGAquery_SampleTypes(barcode = samplesDown.lusc,
#                                        typesample = "NT")
# dataSmTP_short.lusc <- dataSmTP.lusc[1:10]
# dataSmNT_short.lusc <- dataSmNT.lusc[1:10]
# length(dataSmTP.lusc)
# 
# 
# queryDown.lung <- GDCquery(project = "TCGA-LUSC",
#                            data.category = "Transcriptome Profiling",
#                            data.type = "Gene Expression Quantification",
#                            workflow.type = "HTSeq - Counts",
#                            barcode = c(dataSmTP.lusc, dataSmNT.lusc))
# 
# GDCdownload(query = queryDown.lung)
# 
# dataPrep1.tcga <- GDCprepare(query = queryDown.lung,
#                              save = TRUE,
#                              save.filename = "TCGA_LUSC_HTSeq_Countds.rda")
# dataPrep.tcga <- TCGAanalyze_Preprocessing(object = dataPrep1.tcga,
#                                            cor.cut = 0.6,
#                                            datatype = "HTSeq - Counts")
# 
# dataNorm.tcga <- TCGAanalyze_Normalization(tabDF = dataPrep.tcga,
#                                            geneInfo = geneInfoHT,
#                                            method = "gcContent")
# 
# dataNorm.tcga <- TCGAanalyze_Normalization(tabDF = dataNorm.tcga,
#                                            geneInfo = geneInfoHT,
#                                            method = "geneLength")
# dataFilt.tcga <- TCGAanalyze_Filtering(tabDF = dataNorm.tcga,
#                                        method = "quantile",
#                                        qnt.cut =  0.25)
# 
# ### #Filtering data so all samples have a pam50 subtype for LUSC
# 
# diff<-setdiff(colnames(dataFilt.tcga), TCGA_MolecularSubtype(colnames(dataFilt.tcga))$filtered)
# TCGA_MolecularSubtype(diff)$subtypes$barcodes
# 
# dataFilt.tcga.lusc<-dataFilt.tcga[,diff]
# LUSC.subtypes<-TCGA_MolecularSubtype(colnames(dataFilt.tcga.lusc))$subtypes$subtype
# 
# ### Differential expression analysis after correcting for "Plate" factor.
# DEG.lusc <- TCGAanalyze_DEA(MAT=dataFilt.tcga.lusc,
#                             pipeline="limma",
#                             batch.factors = c("Plate"),
#                             Cond1type = "Normal",
#                             Cond2type = "Tumor",
#                             fdr.cut = 0.01 ,
#                             logFC.cut = 1,
#                             method = "glmLRT", ClinicalDF = data.frame(),
#                             Condtypes = LUSC.subtypes,
#                             contrast.formula = "LUSC.basalvsLUSC.classical=LUSC.basal - LUSC.classical")

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# 
# batch.corrected <- TCGAbatch_Correction(tabDF = dataFilt.tcga, batch.factor = "Plate")
# 
# batch.corrected.adjusted <- TCGAbatch_Correction(tabDF = dataFilt.tcga,
#                                                  batch.factor = "Plate",
#                                                  adjustment=c("TSS"))

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# 
# #dataFilt is a gene expression matrix from pancancer33 RNASeq data that can be generated following our previous described workflow
# 
# sampleTP <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), typesample = "TP")
# sampleNT <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), typesample = "NT")
# dataFiltTP <- dataFilt[,sampleTP]
# dataFiltNT <- dataFilt[,sampleNT]
# 
# dataSubt_LUAD <-TCGAquery_subtype(tumor = "LUAD")
# sampleStageI_LUAD <- dataSubt_LUAD[dataSubt_LUAD$Tumor.stage %in% c("Stage I","Stage IA","Stage IB"),]
# dataFilt.LUAD.stageI.TP <- dataFiltTP[,substr(colnames(dataFiltTP),1,12) %in% sampleStageI_LUAD$patient]
# dataFilt.LUAD.stageI.NT <- dataFiltNT[,substr(colnames(dataFiltNT),1,12) %in% sampleStageI_LUAD$patient]
# 
# # here for example the end-user can replace LUSC with unpublished data in a dataframe containing counts.
# # and adding the information for LUSC.stageI.TP or LUSC.stageI.NT with unpublished data
# 
# dataSubt_LUSC <-TCGAquery_subtype(tumor = "LUSC")
# sampleStageI_LUSC <- dataSubt_LUSC[dataSubt_LUSC$T.stage %in% c("T1","T1a","T1b"),]
# dataFilt.LUSC.stageI.TP <- dataFiltTP[,substr(colnames(dataFiltTP),1,12) %in% sampleStageI_LUSC$patient]
# dataFilt.LUSC.stageI.NT <- dataFiltNT[,substr(colnames(dataFiltNT),1,12) %in% sampleStageI_LUSC$patient]
# 
# countsTable <-cbind(dataFilt.LUAD.stageI.TP,dataFilt.LUAD.stageI.NT,
#                     dataFilt.LUSC.stageI.TP, dataFilt.LUSC.stageI.NT)
# 
# AnnotationCounts <- matrix(0,ncol(countsTable),2)
# colnames(AnnotationCounts) <- c("Samples","Batch")
# rownames(AnnotationCounts) <- colnames(countsTable)
# 
# AnnotationCounts <- as.data.frame(AnnotationCounts)
# AnnotationCounts$Samples <- colnames(countsTable)
# 
# AnnotationCounts[colnames(dataFilt.LUAD.stageI.TP),"Batch"] <- "LUAD.stageI.TP"
# AnnotationCounts[colnames(dataFilt.LUAD.stageI.NT),"Batch"] <- "LUAD.stageI.NT"
# AnnotationCounts[colnames(dataFilt.LUSC.stageI.TP),"Batch"] <- "LUSC.stageI.TP"
# AnnotationCounts[colnames(dataFilt.LUSC.stageI.NT),"Batch"] <- "LUSC.stageI.NT"
# 
# countsCorrected <- TCGAbatch_Correction(tabDF =countsTable,
#                                         UnpublishedData = TRUE,
#                                         AnnotationDF = AnnotationCounts)

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# Purity.BRCA <- TCGAtumor_purity(colnames(dataPrep.tcga), 0, 0, 0, 0, 0.7)

## ----echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE------
# ##Brain data through Recount2
# 
# brain.rec <- TCGAquery_recount2(project = "gtex", tissue = "brain")