## ----install, eval=FALSE------------------------------------------------- # install.packages("BiocManager") # BiocManager::install("psichomics") ## ----load, message=FALSE------------------------------------------------- library(psichomics) ## ----TCGA options-------------------------------------------------------- # Available tumour types cohorts <- getFirebrowseCohorts() # Available sample dates date <- getFirebrowseDates() # Available data types dataTypes <- getFirebrowseDataTypes() ## ----download, eval=FALSE------------------------------------------------ # # Set download folder # folder <- getDownloadsFolder() # # # Download and load most recent junction quantification and clinical data from # # TCGA/Firebrowse for Breast Cancer # data <- loadFirebrowseData(folder=folder, # cohort="BRCA", # data=c("clinical", "junction_quantification", # "RSEM_genes"), # date="2016-01-28") # # # Select clinical and junction quantification dataset # clinical <- data[[1]]$`Clinical data` # sampleInfo <- data[[1]]$`Sample metadata` # junctionQuant <- data[[1]]$`Junction quantification (Illumina HiSeq)` # geneExpr <- data[[1]]$`Gene expression` ## ----prepare examples, include=FALSE------------------------------------- clinical <- readRDS("BRCA_clinical.RDS") geneExpr <- readRDS("BRCA_geneExpr.RDS") ## ----normalise gene expression------------------------------------------- # Check genes where min. counts are available in at least N samples and filter # out genes with mean expression and variance of 0 checkCounts <- rowSums(geneExpr >= 10) >= 10 filter <- rowMeans(geneExpr) > 0 & rowVars(geneExpr) > 0 & checkCounts geneExprFiltered <- geneExpr[filter, ] # Normalise gene expression and perform log2-transformation (after adding 0.5 # to avoid zeroes) geneExprNorm <- normaliseGeneExpression(geneExprFiltered, log2transform=TRUE) ## ----quantify options---------------------------------------------------- # Available alternative splicing annotation annotList <- listSplicingAnnotations() annotList ## ----prepare to quantify splicing, eval=FALSE---------------------------- # # Load Human (hg19/GRCh37 assembly) annotation # human <- listSplicingAnnotations()[[1]] # annotation <- loadAnnotation(human) ## ----event types--------------------------------------------------------- # Available alternative splicing event types (skipped exon, alternative # first/last exon, mutually exclusive exons, etc.) getSplicingEventTypes() ## ----quantify splicing, eval=FALSE--------------------------------------- # # Discard alternative splicing quantified using few reads # minReads <- 10 # default # # psi <- quantifySplicing(annotation, junctionQuant, minReads=minReads) ## ----load splicing, echo=FALSE------------------------------------------- psi <- readRDS("BRCA_psi.RDS") sampleInfo <- parseTcgaSampleInfo(colnames(psi)) ## ----check splicing events----------------------------------------------- # Check the identifier of the splicing events in the resulting table events <- rownames(psi) head(events) ## ----data grouping------------------------------------------------------- # Group by normal and tumour samples types <- createGroupByAttribute("Sample types", sampleInfo) normal <- types$`Solid Tissue Normal` tumour <- types$`Primary solid Tumor` # Group by tumour stage (I, II, III or IV) or normal samples stages <- createGroupByAttribute( "patient.stage_event.pathologic_stage_tumor_stage", clinical) groups <- list() for (i in c("i", "ii", "iii", "iv")) { stage <- Reduce(union, stages[grep(sprintf("stage %s[a|b|c]{0,1}$", i), names(stages))]) # Include only tumour samples stageTumour <- names(getSubjectFromSample(tumour, stage)) elem <- list(stageTumour) names(elem) <- paste("Tumour Stage", toupper(i)) groups <- c(groups, elem) } groups <- c(groups, Normal=list(normal)) # Prepare group colours (for consistency across downstream analyses) colours <- c("#6D1F95", "#FF152C", "#00C7BA", "#FF964F", "#00C65A") names(colours) <- names(groups) attr(groups, "Colour") <- colours # Prepare normal versus tumour stage I samples normalVSstage1Tumour <- groups[c("Tumour Stage I", "Normal")] attr(normalVSstage1Tumour, "Colour") <- attr(groups, "Colour") # Prepare normal versus tumour samples normalVStumour <- list(Normal=normal, Tumour=tumour) attr(normalVStumour, "Colour") <- c(Normal="#00C65A", Tumour="#EFE35C") ## ----perform pca--------------------------------------------------------- # PCA of PSI between normal and tumour stage I samples psi_stage1Norm <- psi[ , unlist(normalVSstage1Tumour)] pcaPSI_stage1Norm <- performPCA(t(psi_stage1Norm)) ## ----plot pca------------------------------------------------------------ # Explained variance across principal components plotVariance(pcaPSI_stage1Norm) # Score plot (clinical individuals) plotPCA(pcaPSI_stage1Norm, groups=normalVSstage1Tumour) # Loading plot (variable contributions) plotPCA(pcaPSI_stage1Norm, loadings=TRUE, individuals=FALSE) # Table of variable contributions (as used to plot PCA, also) table <- calculateLoadingsContribution(pcaPSI_stage1Norm) knitr::kable(head(table, 5)) ## ----perform and plot remaining pca, eval=FALSE-------------------------- # # PCA of PSI between all samples (coloured by tumour stage and normal samples) # pcaPSI_all <- performPCA(t(psi)) # plotPCA(pcaPSI_all, groups=groups) # plotPCA(pcaPSI_all, loadings=TRUE, individuals=FALSE) # # # PCA of gene expression between all samples (coloured by tumour stage and # # normal samples) # pcaGE_all <- performPCA(t(geneExprNorm)) # plotPCA(pcaGE_all, groups=groups) # plotPCA(pcaGE_all, loadings=TRUE, individuals=FALSE) # # # PCA of gene expression between normal and tumour stage I samples # ge_stage1Norm <- geneExprNorm[ , unlist(normalVSstage1Tumour)] # pcaGE_stage1Norm <- performPCA(t(ge_stage1Norm)) # plotPCA(pcaGE_stage1Norm, groups=normalVSstage1Tumour) # plotPCA(pcaGE_stage1Norm, loadings=TRUE, individuals=FALSE) ## ----diff splicing NUMB exon 12------------------------------------------ # Find the right event ASevents <- rownames(psi) (tmp <- grep("NUMB", ASevents, value=TRUE)) NUMBskippedExon12 <- tmp[1] # Plot its PSI distribution plotDistribution(psi[NUMBskippedExon12, ], normalVStumour) ## ----correlation, warning=FALSE------------------------------------------ # Find the right gene genes <- rownames(geneExprNorm) (tmp <- grep("QKI", genes, value=TRUE)) QKI <- tmp[1] # "QKI|9444" # Plot its gene expression distribution plotDistribution(geneExprNorm[QKI, ], normalVStumour, psi=FALSE) plotCorrelation(correlateGEandAS( geneExprNorm, psi, QKI, NUMBskippedExon12, method="spearman")) ## ----exploratory diff analysis, message=FALSE---------------------------- diffSplicing <- diffAnalyses(psi, normalVSstage1Tumour) # Filter based on |∆ Median PSI| > 0.1 and q-value < 0.01 deltaPSIthreshold <- abs(diffSplicing$`∆ Median`) > 0.1 pvalueThreshold <- diffSplicing$`Wilcoxon p-value (BH adjusted)` < 0.01 # Plot results library(ggplot2) ggplot(diffSplicing, aes(`∆ Median`, -log10(`Wilcoxon p-value (BH adjusted)`))) + geom_point(data=diffSplicing[deltaPSIthreshold & pvalueThreshold, ], colour="orange", alpha=0.5, size=3) + geom_point(data=diffSplicing[!deltaPSIthreshold | !pvalueThreshold, ], colour="gray", alpha=0.5, size=3) + theme_light(16) + ylab("-log10(q-value)") ## ----survival------------------------------------------------------------ # Events already tested which have prognostic value events <- c( "SE_9_+_6486925_6492303_6492401_6493826_UHRF2", "SE_4_-_87028376_87024397_87024339_87023185_MAPK10", "SE_2_+_152324660_152324988_152325065_152325155_RIF1", "SE_2_+_228205096_228217230_228217289_228220393_MFF", "MXE_15_+_63353138_63353397_63353472_63353912_63353987_63354414_TPM1", "SE_2_+_173362828_173366500_173366629_173368819_ITGA6", "SE_1_+_204957934_204971724_204971876_204978685_NFASC") # Survival curves based on optimal PSI cutoff library(survival) # Assign alternative splicing quantification to patients based on their samples samples <- colnames(psi) match <- getPatientFromSample(samples, clinical, sampleInfo=sampleInfo) survPlots <- list() for (event in events) { # Find optimal cutoff for the event eventPSI <- assignValuePerPatient(psi[event, ], match, clinical, samples=unlist(tumour)) opt <- optimalSurvivalCutoff(clinical, eventPSI, censoring="right", event="days_to_death", timeStart="days_to_death") (optimalCutoff <- opt$par) # Optimal exon inclusion level (optimalPvalue <- opt$value) # Respective p-value label <- labelBasedOnCutoff(eventPSI, round(optimalCutoff, 2), label="PSI values") survTerms <- processSurvTerms(clinical, censoring="right", event="days_to_death", timeStart="days_to_death", group=label, scale="years") surv <- survfit(survTerms) pvalue <- testSurvival(survTerms) plotSurvivalCurves(surv, pvalue=pvalue, mark=FALSE) } ## ----exploratory diff expression, message=FALSE, warning=FALSE----------- # Prepare groups of samples to analyse and further filter unavailable samples in # selected groups for gene expression ge <- geneExprNorm[ , unlist(normalVSstage1Tumour), drop=FALSE] isFromGroup1 <- colnames(ge) %in% normalVSstage1Tumour[[1]] design <- cbind(1, ifelse(isFromGroup1, 0, 1)) # Fit a gene-wise linear model based on selected groups library(limma) fit <- lmFit(ge, design) # Calculate moderated t-statistics and DE log-odds using limma::eBayes ebayesFit <- eBayes(fit, trend=TRUE) # Prepare data summary pvalueAdjust <- "BH" # Benjamini-Hochberg p-value adjustment (FDR) summary <- topTable(ebayesFit, number=nrow(fit), coef=2, sort.by="none", adjust.method=pvalueAdjust, confint=TRUE) names(summary) <- c("log2 Fold-Change", "CI (low)", "CI (high)", "Average expression", "moderated t-statistics", "p-value", paste0("p-value (", pvalueAdjust, " adjusted)"), "B-statistics") attr(summary, "groups") <- normalVSstage1Tumour # Calculate basic statistics stats <- diffAnalyses(ge, normalVSstage1Tumour, "basicStats", pvalueAdjust=NULL) final <- cbind(stats, summary) # Differential gene expression between breast tumour stage I and normal samples library(ggplot2) library(ggrepel) cognateGenes <- unlist(parseSplicingEvent(events)$gene) logFCthreshold <- abs(final$`log2 Fold-Change`) > 1 pvalueThreshold <- final$`p-value (BH adjusted)` < 0.01 final$genes <- gsub("\\|.*$", "\\1", rownames(final)) ggplot(final, aes(`log2 Fold-Change`, -log10(`p-value (BH adjusted)`))) + geom_point(data=final[logFCthreshold & pvalueThreshold, ], colour="orange", alpha=0.5, size=3) + geom_point(data=final[!logFCthreshold | !pvalueThreshold, ], colour="gray", alpha=0.5, size=3) + geom_text_repel(data=final[cognateGenes, ], aes(label=genes), box.padding=0.4, size=5) + theme_light(16) + ylab("-log10(q-value)") ## ----UHRF2 exon 10 diff splicing----------------------------------------- # UHRF2 skipped exon 10's PSI values per tumour stage I and normal samples UHRF2skippedExon10 <- events[1] plotDistribution(psi[UHRF2skippedExon10, ], normalVSstage1Tumour) ## ----UHRF2 PSI survival-------------------------------------------------- # Find optimal cutoff for the event UHRF2skippedExon10 <- events[1] eventPSI <- assignValuePerPatient(psi[UHRF2skippedExon10, ], match, clinical, samples=unlist(tumour)) opt <- optimalSurvivalCutoff(clinical, eventPSI, censoring="right", event="days_to_death", timeStart="days_to_death") (optimalCutoff <- opt$par) # Optimal exon inclusion level (optimalPvalue <- opt$value) # Respective p-value label <- labelBasedOnCutoff(eventPSI, round(optimalCutoff, 2), label="PSI values") survTerms <- processSurvTerms(clinical, censoring="right", event="days_to_death", timeStart="days_to_death", group=label, scale="years") surv <- survfit(survTerms) pvalue <- testSurvival(survTerms) plotSurvivalCurves(surv, pvalue=pvalue, mark=FALSE) ## ----UHRF2 GE diff expression-------------------------------------------- plotDistribution(geneExprNorm["UHRF2", ], normalVSstage1Tumour, psi=FALSE) ## ----UHRF2 GE survival--------------------------------------------------- UHRF2ge <- assignValuePerPatient(geneExprNorm["UHRF2", ], match, clinical, samples=unlist(tumour)) # Survival curves based on optimal gene expression cutoff opt <- optimalSurvivalCutoff(clinical, UHRF2ge, censoring="right", event="days_to_death", timeStart="days_to_death") (optimalCutoff <- opt$par) # Optimal exon inclusion level (optimalPvalue <- opt$value) # Respective p-value # Process again after rounding the cutoff roundedCutoff <- round(optimalCutoff, 2) label <- labelBasedOnCutoff(UHRF2ge, roundedCutoff, label="Gene expression") survTerms <- processSurvTerms(clinical, censoring="right", event="days_to_death", timeStart="days_to_death", group=label, scale="years") surv <- survfit(survTerms) pvalue <- testSurvival(survTerms) plotSurvivalCurves(surv, pvalue=pvalue, mark=FALSE) ## ----load GTEx, eval=FALSE----------------------------------------------- # # Replace with the correct path to these files # subjects <- "~/Downloads/GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt" # sampleAttr <- "~/Downloads/GTEx_Data_V6_Annotations_SampleAttributesDS.txt" # junctionQuant <- "~/Downloads/GTEx_junction_reads.txt" # # # Check GTEx tissues available based on the sample attributes # getGtexTissues(sampleAttr) # # tissues <- c("blood", "brain") # gtex <- loadGtexData(subjects, sampleAttr, junctionQuant, tissues) ## ----load all GTEx, eval=FALSE------------------------------------------- # tissues <- NULL # gtex <- loadGtexData(subjectPhenotype, sampleAttr, junctionQuant, tissues) ## ----load recount, eval=FALSE-------------------------------------------- # library(recount) # View(recount_abstract) # sra <- loadSRAproject("SRP053101") ## ----load local, eval=FALSE---------------------------------------------- # folder <- "~/Downloads/GTEx/" # ignore <- c(".aux.", ".mage-tab.") # data <- loadLocalFiles(folder, ignore=ignore) # # # Select clinical and junction quantification dataset # clinical <- data[[1]]$`Clinical data` # sampleInfo <- data[[1]]$`Sample metadata` # junctionQuant <- data[[1]]$`Junction quantification (Illumina HiSeq)`