## ----include = FALSE---------------------------------------------------------- # Prevent certificate issues for GitHub actions options(gemma.SSL = FALSE, datatable.print.trunc.cols = FALSE) # temporary options # options(gemma.API = "https://dev.gemma.msl.ubc.ca/rest/v2/") # options(gemma.memoised = TRUE) knitr::opts_chunk$set( comment = "", cache = FALSE ) ## ----setup, message = FALSE--------------------------------------------------- library(gemma.R) library(dplyr) library(poolr) library(ggplot2) library(stringr) ## ----include = FALSE---------------------------------------------------------- gemma.R:::setGemmaPath('prod') ## ----------------------------------------------------------------------------- annots <- search_annotations("parkinson's") annots %>% head %>% gemma_kable() ## ----------------------------------------------------------------------------- annot_counts <- annots$value.URI %>% sapply(\(x){ attributes(get_datasets(uris = x))$totalElements }) annots$counts = annot_counts annots %>% filter(counts>0) %>% arrange(desc(counts)) %>% gemma_kable() ## ----------------------------------------------------------------------------- filter_properties()$dataset %>% head %>% gemma_kable ## ----------------------------------------------------------------------------- # getting all resulting datasets using limit and offset arguments results <- get_datasets(filter = "allCharacteristics.valueUri = http://purl.obolibrary.org/obo/MONDO_0005180 and allCharacteristics.valueUri = http://purl.obolibrary.org/obo/UBERON_0000955", taxa ='human') %>% get_all_pages() results %>% select(experiment.shortName,experiment.name) %>% head ## ----------------------------------------------------------------------------- results <- results %>% filter(experiment.batchEffect == 1) ## ----------------------------------------------------------------------------- experiment_contrasts <- results$experiment.shortName %>% lapply(function(x){ out = get_dataset_differential_expression_analyses(x) }) %>% do.call(rbind,.) ## ----------------------------------------------------------------------------- parkin_contrasts <- experiment_contrasts %>% filter(factor.category == 'disease') %>% filter(sapply(experimental.factors,function(x){ "http://purl.obolibrary.org/obo/MONDO_0005180" %in% x$value.URI }) & sapply(baseline.factors,function(x){ "http://purl.obolibrary.org/obo/OBI_0000220" %in% x$value.URI })) ## ----------------------------------------------------------------------------- dup_exp <- parkin_contrasts$experiment.ID %>% table %>% {.[.>1]} %>% names dup_exp parkin_contrasts %>% filter(experiment.ID %in% dup_exp) ## ----------------------------------------------------------------------------- parkin_contrasts %>% filter(experiment.ID %in% dup_exp) %>% {.$subsetFactor} ## ----------------------------------------------------------------------------- # We are using the first 5 contrasts to keep this analysis short # sanity_sets = c('GSE8397','GSE7621','GSE20168','GSE20291','GSE20292') # sanity_ids = results %>% filter(experiment.Accession %in% sanity_sets) %>% .$experiment.ID # parkin_contrasts <- parkin_contrasts %>% filter(experiment.ID %in% sanity_ids) # parkin_contrasts <- parkin_contrasts[1:10,] ## ----------------------------------------------------------------------------- differentials <- parkin_contrasts$result.ID %>% lapply(function(x){ # take the first and only element of the output. the function returns a list # because single experiments may have multiple resultSets. Here we use the # resultSet argument to directly access the results we need get_differential_expression_values(resultSets = x)[[1]] }) # some datasets might not have all the advertised differential expression results # calculated due to a variety of factors. here we remove the empty differentials missing_contrasts <- differentials %>% sapply(nrow) %>% {.==0} differentials <- differentials[!missing_contrasts] parkin_contrasts <- parkin_contrasts[!missing_contrasts,] ## ----------------------------------------------------------------------------- condition_diffs <- seq_along(differentials) %>% lapply(function(i){ # iterate over the differentials diff = differentials[[i]] # get the contrast information about the differential contrast = parkin_contrasts[i,] p_vals = diff[[paste0('contrast_',contrast$contrast.ID,"_pvalue")]] log2fc = diff[[paste0('contrast_',contrast$contrast.ID,"_log2fc")]] genes = diff$GeneSymbol data.frame(genes,p_vals,log2fc) }) # we can use result.IDs and contrast.IDs to uniquely name this. # we add the experiment.id for readability names(condition_diffs) = paste0(parkin_contrasts$experiment.ID,'.', parkin_contrasts$result.ID,'.', parkin_contrasts$contrast.ID) condition_diffs[[1]] %>% head ## ----------------------------------------------------------------------------- all_genes <- condition_diffs %>% lapply(function(x){ x$genes %>% unique }) %>% unlist %>% table # we will remove any gene that doesn't appear in all of the results # while this criteria is too strict, it does help this example to run # considerably faster all_genes <- all_genes[all_genes==max(all_genes)] all_genes <- names(all_genes) # remove any probesets matching multiple genes. gemma separates these by using "|" all_genes <- all_genes[!grepl("|",all_genes,fixed = TRUE)] # remove the "". This comes from probesets not aligned to any genes all_genes <- all_genes[all_genes != ""] all_genes %>% head ## ----eval = FALSE------------------------------------------------------------- # fisher_results <- all_genes %>% lapply(function(x){ # p_vals <- condition_diffs %>% sapply(function(y){ # # we will resolve multiple probesets by taking the minimum p value for # # this example # out = y[y$genes == x,]$p_vals # if(length(out) == 0 ||all(is.na(out))){ # return(NA) # } else{ # return(min(out)) # } # }) # # fold_changes <- condition_diffs %>% sapply(function(y){ # pv = y[y$genes == x,]$p_vals # if(length(pv) == 0 ||all(is.na(pv))){ # return(NA) # } else{ # return(y[y$genes == x,]$log2fc[which.min(pv)]) # } # }) # # median_fc = fold_changes %>% na.omit() %>% median # names(median_fc) = 'Median FC' # # combined = p_vals %>% na.omit() %>% fisher() %>% {.$p} # names(combined) = 'Combined' # c(p_vals,combined,median_fc) # }) %>% do.call(rbind,.) # fisher_results <- as.data.frame(fisher_results) # rownames(fisher_results) = all_genes # # fisher_results[,'Adjusted'] <- p.adjust(fisher_results[,'Combined'], # method = 'fdr') # # fisher_results %>% # arrange(Adjusted) %>% # select(Combined,Adjusted,`Median FC`) %>% # head ## ----fisher, include = FALSE-------------------------------------------------- library(parallel) cores = detectCores() if (Sys.info()['sysname'] == 'Windows'){ cores = 1 } fisher_results <- all_genes %>% mclapply(function(x){ p_vals <- condition_diffs %>% sapply(function(y){ # we will resolve multiple probesets by taking the minimum p value for # this example out = y[y$genes == x,]$p_vals if(length(out) == 0 ||all(is.na(out))){ return(NA) } else{ return(min(out)) } }) fold_changes <- condition_diffs %>% sapply(function(y){ pv = y[y$genes == x,]$p_vals if(length(pv) == 0 ||all(is.na(pv))){ return(NA) } else{ return(y[y$genes == x,]$log2fc[which.min(pv)]) } }) median_fc = fold_changes %>% na.omit() %>% median names(median_fc) = 'Median FC' combined = p_vals %>% na.omit() %>% fisher() %>% {.$p} names(combined) = 'Combined' c(p_vals,combined,median_fc) },mc.cores = ceiling(cores/2)) %>% do.call(rbind,.) fisher_results <- as.data.frame(fisher_results) rownames(fisher_results) = all_genes fisher_results[,'Adjusted'] <- p.adjust(fisher_results[,'Combined'], method = 'fdr') fisher_results %>% arrange(Adjusted) %>% select(Combined,Adjusted,`Median FC`) %>% head ## ----------------------------------------------------------------------------- sum(fisher_results$Adjusted<0.05) # FDR<0.05 nrow(fisher_results) # number of all genes ## ----------------------------------------------------------------------------- # markers are taken from https://www.eneuro.org/content/4/6/ENEURO.0212-17.2017 dopa_markers <- c("ADCYAP1", "ATP2B2", "CACNA2D2", "CADPS2", "CALB2", "CD200", "CDK5R2", "CELF4", "CHGA", "CHGB", "CHRNA6", "CLSTN2", "CNTNAP2", "CPLX1", "CYB561", "DLK1", "DPP6", "ELAVL2", "ENO2", "GABRG2", "GRB10", "GRIA3", "KCNAB2", "KLHL1", "LIN7B", "MAPK8IP2", "NAPB", "NR4A2", "NRIP3", "HMP19", "NTNG1", "PCBP3", "PCSK1", "PRKCG", "RESP18", "RET", "RGS8", "RNF157", "SCG2", "SCN1A", "SLC12A5", "SLC4A10", "SLC6A17", "SLC6A3", "SMS", "SNCG", "SPINT2", "SPOCK1", "SYP", "SYT4", "TACR3", "TENM1", "TH", "USP29") fisher_results %>% arrange(Combined) %>% rownames %>% {.%in% dopa_markers} %>% which %>% hist(breaks=20, main = 'Rank distribution of dopaminergic markers') ## ----------------------------------------------------------------------------- # the top gene from our results gene <- fisher_results %>% arrange(Adjusted) %>% .[1,] %>% rownames p_values <- fisher_results %>% arrange(Adjusted) %>% .[1,] %>% select(-Combined,-`Median FC`,-Adjusted) %>% unlist %>% na.omit() #sum(p_values<0.05) #length(p_values) # p values of the result in individual studies p_values %>% log10() %>% data.frame(`log p value` = .,dataset = 1:length(.),check.names = FALSE) %>% ggplot(aes(y = `log p value`,x = dataset)) + geom_point() + geom_hline(yintercept = log10(0.05),color = 'firebrick') + geom_text(y = log10(0.05), x = 0, label = 'p < 0.05',vjust =-1,hjust = -0.1) + theme_bw() + ggtitle(paste("P-values for top gene (", gene, ") in the data sets")) + theme(axis.text.x = element_blank()) ## ----------------------------------------------------------------------------- # we need the NCBI id of the gene in question, lets get that from the original # results NCBIid <- differentials[[1]] %>% filter(GeneSymbol == gene) %>% .$NCBIid %>% unique #NCBIid expression_frame <- get_dataset_object(datasets = parkin_contrasts$experiment.ID, resultSets = parkin_contrasts$result.ID, contrasts = parkin_contrasts$contrast.ID, genes = NCBIid,type = 'tidy',consolidate = 'pickvar') # get the contrast names for significance markers signif_contrasts <- which(p_values < 0.05) %>% names ## ----------------------------------------------------------------------------- expression_frame <- expression_frame %>% filter(!is.na(expression)) %>% # add a column to represent the contrast dplyr::mutate(contrasts = paste0(experiment.ID,'.', result.ID,'.', contrast.ID)) %>% # simplify the labels dplyr::mutate(disease = ifelse(disease == 'reference subject role','Control','PD')) # for adding human readable labels on the plot labeller <- function(x){ x %>% mutate(contrasts = contrasts %>% strsplit('.',fixed = TRUE) %>% purrr::map_chr(1) %>% {expression_frame$experiment.shortName[match(.,expression_frame$experiment.ID)]}) } # pass it all to ggplot expression_frame %>% ggplot(aes(x = disease,y = expression)) + facet_wrap(~contrasts,scales = 'free',labeller = labeller) + theme_bw() + geom_boxplot(width = 0.5) + geom_point() + ggtitle(paste("Expression of", gene, " per study")) + geom_text(data = data.frame(contrasts = signif_contrasts, expression_frame %>% group_by(contrasts) %>% summarise(expression = max(expression)) %>% .[match(signif_contrasts,.$contrasts),]), x = 1.5, label = '*',size=5,vjust= 1) ## ----------------------------------------------------------------------------- sessionInfo()