## ----init, results='hide', echo=FALSE, warning=FALSE, message=FALSE------ library(knitr) ## ----install_github, eval=FALSE------------------------------------------ # source('https://bioconductor.org/biocLite.R') # biocLite('GenomicDataCommons') ## ----libraries, message=FALSE-------------------------------------------- library(GenomicDataCommons) ## ----statusQS------------------------------------------------------------ GenomicDataCommons::status() ## ----findQS-------------------------------------------------------------- library(magrittr) ge_manifest = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' & type == 'gene_expression' & analysis.workflow_type == 'HTSeq - Counts') %>% manifest() ## ----downloadQS---------------------------------------------------------- destdir = tempdir() fnames = lapply(ge_manifest$id[1:20],gdcdata, destination_dir=destdir,overwrite=TRUE, progress=FALSE) ## ----metadataQS---------------------------------------------------------- expands = c("diagnoses","annotations", "demographic","exposures") clinResults = cases() %>% GenomicDataCommons::select(NULL) %>% GenomicDataCommons::expand(expands) %>% results(size=50) clinDF = as.data.frame(clinResults) library(DT) datatable(clinDF, extensions = 'Scroller', options = list( deferRender = TRUE, scrollY = 200, scrollX = TRUE, scroller = TRUE )) ## ----projectquery-------------------------------------------------------- pquery = projects() ## ----pquery-------------------------------------------------------------- str(pquery) ## ----pquerycount--------------------------------------------------------- pcount = count(pquery) # or pcount = pquery %>% count() pcount ## ----pqueryresults------------------------------------------------------- presults = pquery %>% results() ## ----presultsstr--------------------------------------------------------- str(presults) ## ----presultsall--------------------------------------------------------- length(ids(presults)) presults = pquery %>% results_all() length(ids(presults)) # includes all records length(ids(presults)) == count(pquery) ## ----simplifyProjects---------------------------------------------------- head(as.data.frame(presults)) ## ----defaultfields------------------------------------------------------- default_fields('files') # The number of fields available for files endpoint length(available_fields('files')) # The first few fields available for files endpoint head(available_fields('files')) ## ----selectexample------------------------------------------------------- # Default fields here qcases = cases() qcases$fields # set up query to use ALL available fields # Note that checking of fields is done by select() qcases = cases() %>% GenomicDataCommons::select(available_fields('cases')) head(qcases$fields) ## ----aggexample---------------------------------------------------------- # total number of files of a specific type res = files() %>% facet(c('type','data_type')) %>% aggregations() res$type ## ----allfilesunfiltered-------------------------------------------------- qfiles = files() qfiles %>% count() # all files ## ----onlyGeneExpression-------------------------------------------------- qfiles = files() %>% filter(~ type == 'gene_expression') # here is what the filter looks like after translation str(get_filter(qfiles)) ## ----filtAvailFields----------------------------------------------------- grep('pro',available_fields('files'),value=TRUE) ## ----filtProgramID------------------------------------------------------- files() %>% facet('cases.project.project_id') %>% aggregations() ## ----filtfinal----------------------------------------------------------- qfiles = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' & type == 'gene_expression') str(get_filter(qfiles)) qfiles %>% count() ## ----filtAndManifest----------------------------------------------------- manifest_df = qfiles %>% manifest() head(manifest_df) ## ----filterForHTSeqCounts------------------------------------------------ qfiles = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' & type == 'gene_expression' & analysis.workflow_type == 'HTSeq - Counts') manifest_df = qfiles %>% manifest() nrow(manifest_df) ## ----authenNoRun, eval=FALSE--------------------------------------------- # token = gdc_token() # transfer(...,token=token) # # or # transfer(...,token=get_token()) ## ----singlefileDL-------------------------------------------------------- fnames = gdcdata(manifest_df$id[1:2],progress=FALSE) ## ----bulkDL-------------------------------------------------------------- mfile = tempfile() write.table(manifest_df[1:50,],mfile, col.names=TRUE, row.names=FALSE, quote=FALSE,sep="\t") transfer(mfile,gdc_client='gdc-client') ## ----casesPerProject----------------------------------------------------- res = cases() %>% facet("project.project_id") %>% aggregations() head(res) library(ggplot2) ggplot(res$project.project_id,aes(x = key, y = doc_count)) + geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ## ----casesInTCGA--------------------------------------------------------- cases() %>% filter(~ project.program.name=='TARGET') %>% count() ## ----casesInTARGET------------------------------------------------------- cases() %>% filter(~ project.program.name=='TCGA') %>% count() ## ----casesTCGABRCASampleTypes-------------------------------------------- # The need to do the "&" here is a requirement of the # current version of the GDC API. I have filed a feature # request to remove this requirement. resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' & project.project_id=='TCGA-BRCA' ) %>% facet('samples.sample_type') %>% aggregations() resp$samples.sample_type ## ----casesTCGABRCASolidNormal-------------------------------------------- # The need to do the "&" here is a requirement of the # current version of the GDC API. I have filed a feature # request to remove this requirement. resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' & samples.sample_type=='Solid Tissue Normal') %>% GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) %>% response_all() count(resp) res = resp %>% results() str(res[1],list.len=6) head(ids(resp)) ## ----filesVCFCount------------------------------------------------------- res = files() %>% facet('type') %>% aggregations() res$type ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ## ----filesRNAseqGeneGBM-------------------------------------------------- q = files() %>% GenomicDataCommons::select(available_fields('files')) %>% filter(~ cases.project.project_id=='TCGA-GBM' & data_type=='Gene Expression Quantification') q %>% facet('analysis.workflow_type') %>% aggregations() # so need to add another filter file_ids = q %>% filter(~ cases.project.project_id=='TCGA-GBM' & data_type=='Gene Expression Quantification' & analysis.workflow_type == 'HTSeq - Counts') %>% GenomicDataCommons::select('file_id') %>% response_all() %>% ids() ## ----filesRNAseqGeneGBMforBAM-------------------------------------------- q = files() %>% GenomicDataCommons::select(available_fields('files')) %>% filter(~ cases.project.project_id == 'TCGA-GBM' & data_type == 'Aligned Reads' & experimental_strategy == 'RNA-Seq' & data_format == 'BAM') file_ids = q %>% response_all() %>% ids() ## ----slicing10, eval=FALSE----------------------------------------------- # bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token()) # library(GenomicAlignments) # aligns = readGAlignments(bamfile) ## ----sessionInfo--------------------------------------------------------- sessionInfo()