## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----install, eval = FALSE----------------------------------------------------
#  if (!"BiocManager" %in% rownames(installed.packages()))
#      install.packages("BiocManager", repos = "https://cran.r-project.org")

## ----install-Bioconductor, eval = FALSE---------------------------------------
#  BiocManager::install("SurfR")

## ----install-github, eval = FALSE---------------------------------------------
#  # install.packages("devtools")
#  devtools::install_github("auroramaurizio/SurfR", build_vignettes = TRUE)

## ----gene2protein-------------------------------------------------------------
library(SurfR)

GeneNames <- c("CIITA", "EPCAM", "CD24", "CDCP1", "LYVE1")
SurfaceProteins_df <- Gene2SProtein(GeneNames, 
                                    input_type = "gene_name", 
                                    output_tsv = FALSE,
                                    Surfy_version = "new")
#The output dataframe contains several information retrieved from Surfy.
colnames(SurfaceProteins_df)
#These are the 5 surface protein coding genes detected by SurfR.
SurfaceProteins_df$UniProt.gene

## ----simulate zhang, eval=FALSE-----------------------------------------------
#  library(SPsimSeq)
#  
#  data("zhang.data.sub")
#  zhang.counts <- zhang.data.sub$counts
#  MYCN.status  <- zhang.data.sub$MYCN.status
#  
#  # Simulation of bulk RNA data
#  sim.data.bulk <- SPsimSeq(n.sim = 1,
#                            s.data = zhang.counts,
#                            group = MYCN.status,
#                            n.genes = 1000,
#                            batch.config = 1,
#                            group.config = c(0.5, 0.5),
#                            tot.samples = ncol(zhang.counts),
#                            pDE = 0.2,
#                            lfc.thrld = 0.5,
#                            result.format = "list",
#                            return.details = TRUE)
#  
#  sim.data.bulk1 <- sim.data.bulk$sim.data.list[[1]]
#  
#  countMatrix <- sim.data.bulk1$counts
#  row.names(countMatrix) <- row.names(zhang.counts)
#  metadata <- sim.data.bulk1$colData
#  metadata$Group <- as.factor(metadata$Group)

## ----DGE zhang, eval=FALSE----------------------------------------------------
#  library(SurfR)
#  
#  df_zhang <- DGE(expression = countMatrix,
#                  metadata = metadata,
#                  Nreplica = 50,
#                  design = "~Group",
#                  condition = "Group",
#                  alpha = 0.05,
#                  TEST = "1", CTRL =  "0",
#                  output_tsv = FALSE)
#  
#  head(df_zhang)

## ----G2P zhang, eval=FALSE----------------------------------------------------
#  # remove NA values
#  df_zhang <- df_zhang[!is.na(df_zhang$padj), ]
#  
#  # all fdr
#  fdr_GeneID <- df_zhang[df_zhang$padj < 0.05, "GeneID"]
#  SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name")
#  
#  # upregulated fdr
#  fdrUP_GeneID <- df_zhang[df_zhang$padj < 0.05 & df_zhang$log2FoldChange > 0,
#                           "GeneID"]
#  SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
#  
#  # dowregulated fdr
#  fdrDW_GeneID <- df_zhang[df_zhang$padj < 0.05 & df_zhang$log2FoldChange < 0,
#                           "GeneID"]
#  SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name")

## ----GEO input meta_counts, eval=FALSE----------------------------------------
#  library(SurfR)
#  library(stringr)
#  
#  # Download metadata from GEO
#  mGSE121810 <- GEOmetadata(GSE = "GSE121810")
#  
#  # create new metadata column in order to remove unwanted special characters
#  unwanted_character <- " "
#  fx <- function(x) {
#    str_split(string = x, pattern = unwanted_character)[[1]][1]
#  }
#  mGSE121810$condition <- sapply(mGSE121810$therapy, fx)
#  mGSE121810$condition <- as.factor(mGSE121810$condition)
#  
#  # Preview metadata
#  head(mGSE121810)
#  
#  # only select 3 samples per condition to save time
#  na_samples <- c("GSM3447013", "GSM3447018", "GSM3447019")
#  a_samples <- c("GSM3447023", "GSM3447024", "GSM3447026")
#  mGSE121810 <- mGSE121810[c(na_samples, a_samples), ]
#  
#  # Download count matrix from ArchS4
#  cGSE121810 <- DownloadArchS4(mGSE121810$GSM,
#                               species = "human",
#                               print_tsv = FALSE,
#                               filename = NULL)
#  
#  # Preview count matrix
#  head(cGSE121810[, ])

## ----GEO dge, eval=FALSE------------------------------------------------------
#  # Perform DGE
#  df_GEO <- DGE(expression = cGSE121810,
#                metadata = mGSE121810,
#                Nreplica = 3,
#                design = "~condition",
#                condition = "condition",
#                alpha = 0.05,
#                TEST = "neoadjuvant", CTRL =  "adjuvant",
#                output_tsv = FALSE)
#  
#  # remove NA values
#  df_GEO <- df_GEO[!is.na(df_GEO$padj), ]
#  
#  head(df_GEO)

## ----GEO SP, eval=FALSE-------------------------------------------------------
#  # Detect SP amoung differentially expressed genes
#  fdr_GeneID <- df_GEO[df_GEO$padj < 0.1, "GeneID"]
#  
#  SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name")
#  
#  fdrUP_GeneID <- df_GEO[df_GEO$padj < 0.1 & df_GEO$log2FoldChange > 0, "GeneID"]
#  SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
#  
#  fdrDW_GeneID <- df_GEO[df_GEO$padj < 0.1 & df_GEO$log2FoldChange < 0, "GeneID"]
#  SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name")
#  

## ----TCGA install, eval=FALSE-------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  
#  BiocManager::install("BioinformaticsFMRP/TCGAbiolinksGUI.data")
#  BiocManager::install("BioinformaticsFMRP/TCGAbiolinks")

## ----echo=FALSE, warning=FALSE, include=FALSE, message = FALSE, eval=FALSE----
#  # Download TCGA data
#  # To save time only download 3 Tumor samples and 3 normal samples.
#  # If barcodes are not specified TCGA_download function will download all the
#  # available samples in the TCGA study
#  barcodes2Download <- c("TCGA-X7-A8D6-11A-22R-A42C-07",
#                         "TCGA-X7-A8D7-11A-11R-A42C-07",
#                         "TCGA-XM-A8RB-01A-11R-A42C-07",
#                         "TCGA-X7-A8M0-01A-11R-A42C-07")
#  TCGA.THYM <- TCGA_download(project = "TCGA-THYM", barcodes = barcodes2Download)
#  
#  cTCGA.THYM <- TCGA.THYM[[1]]
#  mTCGA.THYM <- TCGA.THYM[[2]]
#  table(mTCGA.THYM$shortLetterCode)
#  
#  mTCGA.THYM$shortLetterCode <- as.factor(mTCGA.THYM$shortLetterCode)

## ----TCGA DGE, eval=FALSE-----------------------------------------------------
#  df_TCGA <- DGE(expression = cTCGA.THYM,
#                 metadata = mTCGA.THYM,
#                 Nreplica = 2,
#                 design = "~shortLetterCode",
#                 condition = "shortLetterCode",
#                 alpha = 0.05,
#                 TEST = "TP", CTRL =  "NT",
#                 output_tsv = FALSE)
#  
#  head(df_TCGA)

## ----TCGA SP, eval=FALSE------------------------------------------------------
#  # remove NA values
#  df_TCGA <- df_TCGA[!is.na(df_TCGA$padj), ]
#  
#  fdr_GeneID <- df_TCGA[df_TCGA$padj < 0.05,
#                        "GeneID"]
#  
#  SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name")
#  
#  fdrUP_GeneID <- df_TCGA[df_TCGA$padj < 0.05 & df_TCGA$log2FoldChange > 0,
#                          "GeneID"]
#  SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
#  
#  fdrDW_GeneID <- df_TCGA[df_TCGA$padj < 0.05 & df_TCGA$log2FoldChange < 0,
#                          "GeneID"]
#  SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name")

## ----echo=FALSE, warning=FALSE, include=FALSE, message = FALSE----------------
# Download TCGA data
# To save time only download 3 Tumor samples and 3 normal samples.
# If barcodes are not specified TCGA_download function will download all the 
# available samples in the TCGA study
barcodes2Download <- c("TCGA-BH-A1FU-11A-23R-A14D-07", 
                       "TCGA-BH-A1FC-11A-32R-A13Q-07",
                       "TCGA-BH-A0DO-11A-22R-A12D-07", 
                       "TCGA-B6-A0RH-01A-21R-A115-07",
                       "TCGA-BH-A1FU-01A-11R-A14D-07", 
                       "TCGA-A1-A0SE-01A-11R-A084-07")
TCGA.BRCA <- TCGA_download(project = "TCGA-BRCA", barcodes = barcodes2Download)

## ----meta TCGA_input----------------------------------------------------------
cTCGA.BRCA <- TCGA.BRCA[[1]]
mTCGA.BRCA <- TCGA.BRCA[[2]]
table(mTCGA.BRCA$shortLetterCode)


## ----meta GEO_input metadata--------------------------------------------------
mGSE58135 <- GEOmetadata("GSE58135")
mGSE58135 <- mGSE58135[mGSE58135$tissue != "Breast Cancer Cell Line", ]
mGSE58135$condition <- "NT"
mGSE58135$condition[mGSE58135$tissue %in% c("ER+ Breast Cancer Primary Tumor", 
                                            "Triple Negative Breast Cancer Primary Tumor")] <- "TP"

# only select 3 samples per condition to save time
TP_samples <- c("GSM1401694", "GSM1401717", "GSM1401729")
NT_samples <- c("GSM1401799", "GSM1401813", "GSM1401814")
mGSE58135 <- mGSE58135[c(TP_samples, NT_samples), ]

## ----meta GEO_input counts----------------------------------------------------
cGSE58135 <- DownloadArchS4(mGSE58135$GSM, species = "human")
cGSE58135 <- cGSE58135[, row.names(mGSE58135)]

table(mGSE58135$condition)


## ----meta DGE TCGA------------------------------------------------------------
# TCGA DGE
df.TCGA <- DGE(expression = cTCGA.BRCA,
               metadata = mTCGA.BRCA,
               Nreplica = 3,
               design = "~shortLetterCode",
               condition = "shortLetterCode",
               alpha = 0.05,
               TEST = "TP", CTRL =  "NT",
               output_tsv = FALSE)
head(df.TCGA)

## ----meta DGE GEO-------------------------------------------------------------
# GSE58135 DGE
df.GSE58135 <- DGE(expression = cGSE58135,
                   metadata = mGSE58135,
                   Nreplica = 3,
                   design = "~condition",
                   condition = "condition",
                   alpha = 0.05,
                   TEST = "TP", CTRL =  "NT",
                   output_tsv = FALSE)
head(df.GSE58135)

## ----meta fishercomb----------------------------------------------------------

L_fishercomb <- metaRNAseq(ind_deg = list(TCGA.BRCA =  df.TCGA, GEO.GSE58135 = df.GSE58135),
                           test_statistic = "fishercomb",
                           BHth = 0.05,
                           adjpval.t = 0.05)

## ----meta invnorm-------------------------------------------------------------
L_invnorm <- metaRNAseq(ind_deg = list(TCGA.BRCA =  df.TCGA, GEO.GSE58135 = df.GSE58135),
                        test_statistic = "invnorm",
                        BHth = 0.05,
                        adjpval.t = 0.05,
                        nrep = c(102, 56))

## ----metacombine--------------------------------------------------------------
metacomb <- combine_fisher_invnorm(ind_deg = list(TCGA.BRCA =  df.TCGA, 
                                                  GEO.GSE58135 = df.GSE58135),
                                   invnorm = L_invnorm,
                                   fishercomb = L_fishercomb,
                                   adjpval = 0.05)


metacomb_GeneID <- metacomb[metacomb$signFC != 0, "GeneID"]
SP <- Gene2SProtein(genes = metacomb_GeneID, input_type = "gene_name")

## ----metacombine up-----------------------------------------------------------
metacombUP_GeneID <- metacomb[metacomb$signFC == 1, "GeneID"]
SPup <- Gene2SProtein(genes = metacombUP_GeneID, input_type = "gene_name")

## ----metacombine dw-----------------------------------------------------------
metacombDW_GeneID <- metacomb[metacomb$signFC == -1, "GeneID"]
SPdw <- Gene2SProtein(genes = metacombDW_GeneID, input_type = "gene_name")


## ----enrich, eval=FALSE-------------------------------------------------------
#  library(enrichR)
#  
#  dfList <- list(GEO = df.GSE58135,
#                 TCGA = df.TCGA)
#  
#  # Enrichment analysis
#  Enrich <- Enrichment(dfList,
#                       enrich.databases = c("GO_Biological_Process_2021",
#                                            "KEGG_2021_Human"),
#                       p_adj = 0.05, logFC = 1)
#  
#  head(Enrich$GEO$fdr_up$GO_Biological_Process_2021)

## ----fig.width=7, fig.height=2.5, eval=FALSE----------------------------------
#  library(ggplot2)
#  
#  # barplot of the top 5 upregulated pathways
#  Enrichment_barplot(Enrich$GEO,
#                     enrich.databases <- c("GO_Biological_Process_2021",
#                                           "KEGG_2021_Human"),
#                     p_adj = 0.05,
#                     num_term = 5,
#                     cond = "UP")
#  
#  # barplot of the top 5 downregulated pathways
#  Enrichment_barplot(Enrich$GEO,
#                     enrich.databases <- c("GO_Biological_Process_2021",
#                                           "KEGG_2021_Human"),
#                     p_adj = 0.05,
#                     num_term = 5,
#                     cond = "DOWN")

## ----anno spid, eval=FALSE----------------------------------------------------
#  annotated <- Annotate_SPID(df.GSE58135, "WikiPathway_2021_Human")
#  head(annotated, 10)

## ----fig.height= 4.5, fig.width=7---------------------------------------------
# upregulated genes in GEO dataset
fdrUP_GeneID <- df.GSE58135[df.GSE58135$padj < 0.05 & df.GSE58135$log2FoldChange > 0, "GeneID"]
# corresponding Surface Proteins
SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name")
# Barplot of Almen classification 
Splot(SPup,
      group.by = "Membranome.Almen.main-class",
      main = "Almen class")

## ----fig.height= 6, fig.width=7-----------------------------------------------

S_list <- list(SP_1 = c("EPCAM", "CD24",  "DLK1",  "CDCP1", "LYVE1"),
               SP_2 = c("DLK1", "EPCAM", "EGFR", "UPK1A", "UPK2"))

SVenn(S_list,
      cols.use = c("green", "blue"),
      opacity = 0.5,
      output_intersectionFile = FALSE)


## ----fig.height= 6, fig.width=7-----------------------------------------------

SurfR::plotPCA(matrix = edgeR::cpm(cGSE58135), metadata = mGSE58135,               
               dims = c(1, 2),
               color.by = "condition", shape.by = "condition", 
               label = FALSE, main = "PCA GSE58135")

## -----------------------------------------------------------------------------
sessionInfo()