## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----install, eval = FALSE---------------------------------------------------- # if (!"BiocManager" %in% rownames(installed.packages())) # install.packages("BiocManager", repos = "https://cran.r-project.org") ## ----install-Bioconductor, eval = FALSE--------------------------------------- # BiocManager::install("SurfR") ## ----install-github, eval = FALSE--------------------------------------------- # # install.packages("devtools") # devtools::install_github("auroramaurizio/SurfR", build_vignettes = TRUE) ## ----gene2protein------------------------------------------------------------- library(SurfR) GeneNames <- c("CIITA", "EPCAM", "CD24", "CDCP1", "LYVE1") SurfaceProteins_df <- Gene2SProtein(GeneNames, input_type = "gene_name", output_tsv = FALSE, Surfy_version = "new") #The output dataframe contains several information retrieved from Surfy. colnames(SurfaceProteins_df) #These are the 5 surface protein coding genes detected by SurfR. SurfaceProteins_df$UniProt.gene ## ----simulate zhang, eval=FALSE----------------------------------------------- # library(SPsimSeq) # # data("zhang.data.sub") # zhang.counts <- zhang.data.sub$counts # MYCN.status <- zhang.data.sub$MYCN.status # # # Simulation of bulk RNA data # sim.data.bulk <- SPsimSeq(n.sim = 1, # s.data = zhang.counts, # group = MYCN.status, # n.genes = 1000, # batch.config = 1, # group.config = c(0.5, 0.5), # tot.samples = ncol(zhang.counts), # pDE = 0.2, # lfc.thrld = 0.5, # result.format = "list", # return.details = TRUE) # # sim.data.bulk1 <- sim.data.bulk$sim.data.list[[1]] # # countMatrix <- sim.data.bulk1$counts # row.names(countMatrix) <- row.names(zhang.counts) # metadata <- sim.data.bulk1$colData # metadata$Group <- as.factor(metadata$Group) ## ----DGE zhang, eval=FALSE---------------------------------------------------- # library(SurfR) # # df_zhang <- DGE(expression = countMatrix, # metadata = metadata, # Nreplica = 50, # design = "~Group", # condition = "Group", # alpha = 0.05, # TEST = "1", CTRL = "0", # output_tsv = FALSE) # # head(df_zhang) ## ----G2P zhang, eval=FALSE---------------------------------------------------- # # remove NA values # df_zhang <- df_zhang[!is.na(df_zhang$padj), ] # # # all fdr # fdr_GeneID <- df_zhang[df_zhang$padj < 0.05, "GeneID"] # SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name") # # # upregulated fdr # fdrUP_GeneID <- df_zhang[df_zhang$padj < 0.05 & df_zhang$log2FoldChange > 0, # "GeneID"] # SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name") # # # dowregulated fdr # fdrDW_GeneID <- df_zhang[df_zhang$padj < 0.05 & df_zhang$log2FoldChange < 0, # "GeneID"] # SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name") ## ----GEO input meta_counts, eval=FALSE---------------------------------------- # library(SurfR) # library(stringr) # # # Download metadata from GEO # mGSE121810 <- GEOmetadata(GSE = "GSE121810") # # # create new metadata column in order to remove unwanted special characters # unwanted_character <- " " # fx <- function(x) { # str_split(string = x, pattern = unwanted_character)[[1]][1] # } # mGSE121810$condition <- sapply(mGSE121810$therapy, fx) # mGSE121810$condition <- as.factor(mGSE121810$condition) # # # Preview metadata # head(mGSE121810) # # # only select 3 samples per condition to save time # na_samples <- c("GSM3447013", "GSM3447018", "GSM3447019") # a_samples <- c("GSM3447023", "GSM3447024", "GSM3447026") # mGSE121810 <- mGSE121810[c(na_samples, a_samples), ] # # # Download count matrix from ArchS4 # cGSE121810 <- DownloadArchS4(mGSE121810$GSM, # species = "human", # print_tsv = FALSE, # filename = NULL) # # # Preview count matrix # head(cGSE121810[, ]) ## ----GEO dge, eval=FALSE------------------------------------------------------ # # Perform DGE # df_GEO <- DGE(expression = cGSE121810, # metadata = mGSE121810, # Nreplica = 3, # design = "~condition", # condition = "condition", # alpha = 0.05, # TEST = "neoadjuvant", CTRL = "adjuvant", # output_tsv = FALSE) # # # remove NA values # df_GEO <- df_GEO[!is.na(df_GEO$padj), ] # # head(df_GEO) ## ----GEO SP, eval=FALSE------------------------------------------------------- # # Detect SP amoung differentially expressed genes # fdr_GeneID <- df_GEO[df_GEO$padj < 0.1, "GeneID"] # # SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name") # # fdrUP_GeneID <- df_GEO[df_GEO$padj < 0.1 & df_GEO$log2FoldChange > 0, "GeneID"] # SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name") # # fdrDW_GeneID <- df_GEO[df_GEO$padj < 0.1 & df_GEO$log2FoldChange < 0, "GeneID"] # SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name") # ## ----TCGA install, eval=FALSE------------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("BioinformaticsFMRP/TCGAbiolinksGUI.data") # BiocManager::install("BioinformaticsFMRP/TCGAbiolinks") ## ----echo=FALSE, warning=FALSE, include=FALSE, message = FALSE, eval=FALSE---- # # Download TCGA data # # To save time only download 3 Tumor samples and 3 normal samples. # # If barcodes are not specified TCGA_download function will download all the # # available samples in the TCGA study # barcodes2Download <- c("TCGA-X7-A8D6-11A-22R-A42C-07", # "TCGA-X7-A8D7-11A-11R-A42C-07", # "TCGA-XM-A8RB-01A-11R-A42C-07", # "TCGA-X7-A8M0-01A-11R-A42C-07") # TCGA.THYM <- TCGA_download(project = "TCGA-THYM", barcodes = barcodes2Download) # # cTCGA.THYM <- TCGA.THYM[[1]] # mTCGA.THYM <- TCGA.THYM[[2]] # table(mTCGA.THYM$shortLetterCode) # # mTCGA.THYM$shortLetterCode <- as.factor(mTCGA.THYM$shortLetterCode) ## ----TCGA DGE, eval=FALSE----------------------------------------------------- # df_TCGA <- DGE(expression = cTCGA.THYM, # metadata = mTCGA.THYM, # Nreplica = 2, # design = "~shortLetterCode", # condition = "shortLetterCode", # alpha = 0.05, # TEST = "TP", CTRL = "NT", # output_tsv = FALSE) # # head(df_TCGA) ## ----TCGA SP, eval=FALSE------------------------------------------------------ # # remove NA values # df_TCGA <- df_TCGA[!is.na(df_TCGA$padj), ] # # fdr_GeneID <- df_TCGA[df_TCGA$padj < 0.05, # "GeneID"] # # SP <- Gene2SProtein(genes = fdr_GeneID, input_type = "gene_name") # # fdrUP_GeneID <- df_TCGA[df_TCGA$padj < 0.05 & df_TCGA$log2FoldChange > 0, # "GeneID"] # SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name") # # fdrDW_GeneID <- df_TCGA[df_TCGA$padj < 0.05 & df_TCGA$log2FoldChange < 0, # "GeneID"] # SPdw <- Gene2SProtein(genes = fdrDW_GeneID, input_type = "gene_name") ## ----echo=FALSE, warning=FALSE, include=FALSE, message = FALSE---------------- # Download TCGA data # To save time only download 3 Tumor samples and 3 normal samples. # If barcodes are not specified TCGA_download function will download all the # available samples in the TCGA study barcodes2Download <- c("TCGA-BH-A1FU-11A-23R-A14D-07", "TCGA-BH-A1FC-11A-32R-A13Q-07", "TCGA-BH-A0DO-11A-22R-A12D-07", "TCGA-B6-A0RH-01A-21R-A115-07", "TCGA-BH-A1FU-01A-11R-A14D-07", "TCGA-A1-A0SE-01A-11R-A084-07") TCGA.BRCA <- TCGA_download(project = "TCGA-BRCA", barcodes = barcodes2Download) ## ----meta TCGA_input---------------------------------------------------------- cTCGA.BRCA <- TCGA.BRCA[[1]] mTCGA.BRCA <- TCGA.BRCA[[2]] table(mTCGA.BRCA$shortLetterCode) ## ----meta GEO_input metadata-------------------------------------------------- mGSE58135 <- GEOmetadata("GSE58135") mGSE58135 <- mGSE58135[mGSE58135$tissue != "Breast Cancer Cell Line", ] mGSE58135$condition <- "NT" mGSE58135$condition[mGSE58135$tissue %in% c("ER+ Breast Cancer Primary Tumor", "Triple Negative Breast Cancer Primary Tumor")] <- "TP" # only select 3 samples per condition to save time TP_samples <- c("GSM1401694", "GSM1401717", "GSM1401729") NT_samples <- c("GSM1401799", "GSM1401813", "GSM1401814") mGSE58135 <- mGSE58135[c(TP_samples, NT_samples), ] ## ----meta GEO_input counts---------------------------------------------------- cGSE58135 <- DownloadArchS4(mGSE58135$GSM, species = "human") cGSE58135 <- cGSE58135[, row.names(mGSE58135)] table(mGSE58135$condition) ## ----meta DGE TCGA------------------------------------------------------------ # TCGA DGE df.TCGA <- DGE(expression = cTCGA.BRCA, metadata = mTCGA.BRCA, Nreplica = 3, design = "~shortLetterCode", condition = "shortLetterCode", alpha = 0.05, TEST = "TP", CTRL = "NT", output_tsv = FALSE) head(df.TCGA) ## ----meta DGE GEO------------------------------------------------------------- # GSE58135 DGE df.GSE58135 <- DGE(expression = cGSE58135, metadata = mGSE58135, Nreplica = 3, design = "~condition", condition = "condition", alpha = 0.05, TEST = "TP", CTRL = "NT", output_tsv = FALSE) head(df.GSE58135) ## ----meta fishercomb---------------------------------------------------------- L_fishercomb <- metaRNAseq(ind_deg = list(TCGA.BRCA = df.TCGA, GEO.GSE58135 = df.GSE58135), test_statistic = "fishercomb", BHth = 0.05, adjpval.t = 0.05) ## ----meta invnorm------------------------------------------------------------- L_invnorm <- metaRNAseq(ind_deg = list(TCGA.BRCA = df.TCGA, GEO.GSE58135 = df.GSE58135), test_statistic = "invnorm", BHth = 0.05, adjpval.t = 0.05, nrep = c(102, 56)) ## ----metacombine-------------------------------------------------------------- metacomb <- combine_fisher_invnorm(ind_deg = list(TCGA.BRCA = df.TCGA, GEO.GSE58135 = df.GSE58135), invnorm = L_invnorm, fishercomb = L_fishercomb, adjpval = 0.05) metacomb_GeneID <- metacomb[metacomb$signFC != 0, "GeneID"] SP <- Gene2SProtein(genes = metacomb_GeneID, input_type = "gene_name") ## ----metacombine up----------------------------------------------------------- metacombUP_GeneID <- metacomb[metacomb$signFC == 1, "GeneID"] SPup <- Gene2SProtein(genes = metacombUP_GeneID, input_type = "gene_name") ## ----metacombine dw----------------------------------------------------------- metacombDW_GeneID <- metacomb[metacomb$signFC == -1, "GeneID"] SPdw <- Gene2SProtein(genes = metacombDW_GeneID, input_type = "gene_name") ## ----enrich, eval=FALSE------------------------------------------------------- # library(enrichR) # # dfList <- list(GEO = df.GSE58135, # TCGA = df.TCGA) # # # Enrichment analysis # Enrich <- Enrichment(dfList, # enrich.databases = c("GO_Biological_Process_2021", # "KEGG_2021_Human"), # p_adj = 0.05, logFC = 1) # # head(Enrich$GEO$fdr_up$GO_Biological_Process_2021) ## ----fig.width=7, fig.height=2.5, eval=FALSE---------------------------------- # library(ggplot2) # # # barplot of the top 5 upregulated pathways # Enrichment_barplot(Enrich$GEO, # enrich.databases <- c("GO_Biological_Process_2021", # "KEGG_2021_Human"), # p_adj = 0.05, # num_term = 5, # cond = "UP") # # # barplot of the top 5 downregulated pathways # Enrichment_barplot(Enrich$GEO, # enrich.databases <- c("GO_Biological_Process_2021", # "KEGG_2021_Human"), # p_adj = 0.05, # num_term = 5, # cond = "DOWN") ## ----anno spid, eval=FALSE---------------------------------------------------- # annotated <- Annotate_SPID(df.GSE58135, "WikiPathway_2021_Human") # head(annotated, 10) ## ----fig.height= 4.5, fig.width=7--------------------------------------------- # upregulated genes in GEO dataset fdrUP_GeneID <- df.GSE58135[df.GSE58135$padj < 0.05 & df.GSE58135$log2FoldChange > 0, "GeneID"] # corresponding Surface Proteins SPup <- Gene2SProtein(genes = fdrUP_GeneID, input_type = "gene_name") # Barplot of Almen classification Splot(SPup, group.by = "Membranome.Almen.main-class", main = "Almen class") ## ----fig.height= 6, fig.width=7----------------------------------------------- S_list <- list(SP_1 = c("EPCAM", "CD24", "DLK1", "CDCP1", "LYVE1"), SP_2 = c("DLK1", "EPCAM", "EGFR", "UPK1A", "UPK2")) SVenn(S_list, cols.use = c("green", "blue"), opacity = 0.5, output_intersectionFile = FALSE) ## ----fig.height= 6, fig.width=7----------------------------------------------- SurfR::plotPCA(matrix = edgeR::cpm(cGSE58135), metadata = mGSE58135, dims = c(1, 2), color.by = "condition", shape.by = "condition", label = FALSE, main = "PCA GSE58135") ## ----------------------------------------------------------------------------- sessionInfo()