In this example we investigate the ENCODE ChIP-seq narrow and broad data, extracted from the GMQL remote repository, to automate all the steps needed to identify transcription factor (TF) high accumulation DNA zones using RGMQL together with TFHAZ, another R/Bioconductor package. The knowledge of the DNA regions in which transcription factors bind, in particular the HOT (High Occupancy Target) regions occupied by many different factors, is crucial to understand cancer genesis and develop new targeted therapies.
Load the RGMQL package and initialize the remote GMQL context of the scalable data management engine, specifying remote_processing = TRUE and, optionally, an authenticated login:
# Attach RGMQL, which provides the GMQL functions used throughout this example.
library(RGMQL)
## Loading required package: RGMQLlib
## GMQL successfully loaded
# REST endpoint of the remote GMQL service; reused by the remote calls below.
remote_url <- "http://www.gmql.eu/gmql-rest"
# Open the session with remote processing enabled. Credentials are optional:
# supply the commented-out username/password for an authenticated login.
init_gmql(url = remote_url, remote_processing = TRUE) #, username = 'XXXX', password = 'XXXX')
## [1] "your Token is 39281b47-d51e-477a-a3e2-63de076bedb0"
Download and extract the list of datasets in the curated remote repository and focus on those concerning ENCODE:
# Fetch the catalogue of datasets available in the remote repository.
dataset_list <- show_datasets_list(remote_url)
# Keep only the dataset names. The vector is called `dataset_names` rather than
# `list` so that it does not mask base::list().
dataset_names <- unlist(lapply(dataset_list[["datasets"]], function(x) x$name))
# Show every dataset whose name contains "ENCODE".
grep(pattern = "ENCODE", x = dataset_names, value = TRUE)
## [1] "GRCh38_ANNOTATION_GENCODE" "GRCh38_ENCODE_BROAD_2019_01"
## [3] "GRCh38_ENCODE_BROAD_2019_07" "GRCh38_ENCODE_BROAD_2020_01"
## [5] "GRCh38_ENCODE_BROAD_AUG_2017" "GRCh38_ENCODE_BROAD_NOV_2017"
## [7] "GRCh38_ENCODE_NARROW_2019_01" "GRCh38_ENCODE_NARROW_2019_07"
## [9] "GRCh38_ENCODE_NARROW_2020_01" "GRCh38_ENCODE_NARROW_AUG_2017"
## [11] "GRCh38_ENCODE_NARROW_NOV_2017" "HG19_ANNOTATION_GENCODE"
## [13] "HG19_ENCODE_BROAD_2019_01" "HG19_ENCODE_BROAD_AUG_2017"
## [15] "HG19_ENCODE_BROAD_NOV_2016" "HG19_ENCODE_BROAD_NOV_2017"
## [17] "HG19_ENCODE_NARROW_2019_01" "HG19_ENCODE_NARROW_AUG_2017"
## [19] "HG19_ENCODE_NARROW_NOV_2016" "HG19_ENCODE_NARROW_NOV_2017"
Select ChIP-seq data from the ENCODE NARROW dataset AUG_2017 and ENCODE BROAD dataset AUG_2017, aligned to HG19:
# Open the public HG19 ENCODE BROAD and NARROW datasets (AUG_2017 release).
# is_local = FALSE because both datasets are read from the remote repository.
Enc_Broad <- read_gmql("public.HG19_ENCODE_BROAD_AUG_2017",
is_local = FALSE)
Enc_Narrow <- read_gmql("public.HG19_ENCODE_NARROW_AUG_2017",
is_local = FALSE)
Select the ENCODE ChIP-seq samples aligned to HG19 and related to the human embryonic stem cell line (H1-hESC):
# Keep the released ChIP-seq samples of the H1-hESC cell line. The two datasets
# are filtered on different output_type values: "peaks" for the broad dataset
# and "optimal idr thresholded peaks" for the narrow one.
HM_TF_rep_broad <- filter(Enc_Broad, assay == "ChIP-seq" & file_status ==
"released" & biosample_term_name == "H1-hESC" &
output_type == "peaks")
HM_TF_rep_narrow <- filter(Enc_Narrow, assay == "ChIP-seq" & file_status ==
"released" & biosample_term_name == "H1-hESC" &
output_type == "optimal idr thresholded peaks")
Take the union of the two previously generated datasets:
# Put the filtered broad and narrow samples together in a single dataset.
HM_TF_rep <- union(HM_TF_rep_broad, HM_TF_rep_narrow)
Filter out samples subjected to pharmacological treatment or flagged with specific “audit” markers:
# Quality filter on sample metadata. A sample is kept only if it:
#   - has no biosample_treatments value ("*" presumably matches any value --
#     TODO confirm against the GMQL predicate documentation);
#   - carries none of the listed audit_error, audit_warning and
#     audit_not_compliant markers.
HM_TF_rep_good_0 <- filter(HM_TF_rep, !biosample_treatments == "*" & ! (audit_error == "extremely low read depth" | audit_error == "extremely low read length") & !(audit_warning == "insufficient read depth") & !(audit_not_compliant == "insufficient read depth" | audit_not_compliant =="insufficient replicate concordance" | audit_not_compliant == "missing input control" | audit_not_compliant == "severe bottlenecking" | audit_not_compliant == "unreplicated experiment"))
Filter out samples related to histone modifications (HM):
# Drop the histone-modification experiments so that only transcription factor
# samples remain. Every target is spelled "<name>-human"; the first entry used
# to read "H2AFZhuman", which matches no sample and let H2AFZ data through.
TF_rep_good_0 <- filter(HM_TF_rep_good_0, !(
  experiment_target == "H2AFZ-human" |
  experiment_target == "H3F3A-human" |
  experiment_target == "H3K27ac-human" |
  experiment_target == "H3K27me3-human" |
  experiment_target == "H3K36me3-human" |
  experiment_target == "H3K4me1-human" |
  experiment_target == "H3K4me2-human" |
  experiment_target == "H3K4me3-human" |
  experiment_target == "H3K79me2-human" |
  experiment_target == "H3K9ac-human" |
  experiment_target == "H3K9me1-human" |
  experiment_target == "H3K9me2-human" |
  experiment_target == "H3K9me3-human" |
  experiment_target == "H4K20me1-human"
))
Add the length of each region as a new region attribute and, for each sample, compute the number of regions and the sum of the region lengths just created; then merge the samples, order the regions by length and materialize the result:
# Add a `length` region attribute computed as right - left.
TF_rep_good_1 <- select(TF_rep_good_0, regions_update = list(length = right - left))
# Per-sample metadata: the number of regions and the sum of their lengths.
# TF_rep_good is also the input of the length filter applied further below.
TF_rep_good <- extend(TF_rep_good_1, Region_number = COUNT(),
sum_length = SUM("length"))
# Group the samples by biosample_term_name. Only H1-hESC samples were kept
# above, so this presumably yields one merged sample -- the download step
# below indexes the result by a single sample name. TODO confirm.
TF_rep_good_merged <- aggregate(TF_rep_good, groupBy =
conds(default = c("biosample_term_name")))
# Order the regions by ascending length (presumably this is what produces the
# `order` attribute read after the download -- TODO confirm).
TF_rep_good_ordered <- arrange(TF_rep_good_merged,
regions_ordering = list(ASC("length")))
# Register the dataset for materialization under the given name, then launch
# the remote job; `job` is used below to trace it and to find the output name.
collect(TF_rep_good_ordered, name = "TF_rep_good_ordered")
job <- execute()
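(1.1) Monitor the job status, then download the ordered regions and take as threshold the length of the region ranked at 95% of the total number of regions: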
# Check the state of the remote job launched by execute().
trace_job(remote_url, job$id)
# Name under which the first output dataset of the job was stored.
dataset_name <- job$datasets[[1]]$name
print(dataset_name)
# Get the result both in main memory (GRangesList) and on disk.
GRL_TF_rep_good_ordered <- download_as_GRangesList(remote_url, dataset_name)
download_dataset(remote_url, datasetName = dataset_name,
                 path = "./Results_use_case_3")
# The dataset is expected to hold one merged sample; extract its regions.
name_sample <- names(GRL_TF_rep_good_ordered)
g <- GRL_TF_rep_good_ordered[[name_sample]]
Region_number_tot <- length(g)
# Rank of the region sitting at 95% of the length ordering.
n_up <- Region_number_tot * 0.95
# Select exactly one rank. The former window
# [ceiling(n_up), floor(n_up + 1)] covers two ranks whenever n_up is a whole
# number (e.g. 20 regions -> ranks 19 and 20), which made `threshold` a
# length-2 vector; for every other n_up both bounds equal ceiling(n_up).
index <- which(g$order == ceiling(n_up))
region <- g[index]
# Length of the selected region, used as the upper bound for region lengths.
threshold <- as.numeric(region$length)
threshold
## [1] 1291
Going back to RGMQL remote processing, take only the regions whose length is at least 100 and at most the threshold:
# Keep the regions of the per-sample dataset whose length lies between 100 and
# the threshold, both bounds included.
# NOTE(review): confirm that filter() substitutes the value of the R variable
# `threshold` (1291 above) into r_predicate rather than passing the bare name.
TF_rep_good_filtered_0 <- filter(TF_rep_good, r_predicate = length >= 100 & length <= threshold)
Create new metadata for each sample, with number of filtered regions and the sum of their lengths:
# Per-sample metadata after the length filter: the number of remaining regions
# and the sum of their lengths.
TF_rep_good_filtered <- extend(TF_rep_good_filtered_0,
region_number_filtered = COUNT(),
sum_length_filtered = SUM("length"))
Combine multiple replicate samples of the same TF experiment:
# Combine the replicate samples that share the same experiment_target.
# 1 and ANY() are presumably the minimum and maximum accumulation bounds --
# TODO confirm against the cover() documentation.
TF_0 <- cover(TF_rep_good_filtered, 1, ANY(), groupBy =
conds("experiment_target"))
Add new region attribute as length of each region after sample combination:
# Add a `length_cov` region attribute (right - left) to the combined regions.
TF_1 <- select(TF_0, regions_update = list(length_cov = right - left))
Create new metadata for each sample, i.e. number of combined regions and min, max and sum of their lengths:
# Per-sample metadata on the combined regions: their count, plus the sum,
# minimum and maximum of length_cov.
TF <- extend(TF_1, region_number_cover = COUNT(), sum_length_cover =
SUM("length_cov"), min_length_cover = MIN("length_cov"), max_length_cover = MAX("length_cov"))
Materialize the TF dataset into the repository and download it to mass memory and also into main memory as a GRangesList:
# Register the TF dataset for materialization as "TF_res" and launch the job.
collect(TF, name= "TF_res")
res <- execute()
# Monitor job status:
trace_job(remote_url, res$id)
# Name under which the first output dataset of the job was stored.
res_name <- res$datasets[[1]]$name
# Save the result on disk and also load it in memory as a GRangesList;
# `samples` is the input of the post-processing step.
download_dataset(remote_url, res_name, path = './Results_use_case_3')
samples <- download_as_GRangesList(remote_url, res_name)
Log out from remote engine:
# Close the session on the remote service; nothing below uses remote_url.
logout_gmql(remote_url)
## [1] "Logout"
Post-processing before the analysis with TFHAZ
# Build one TF label per region, attach the labels to the region coordinates
# and convert everything into the single GRanges object handed to TFHAZ.

# Number of regions in each downloaded sample.
len_rep <- vapply(samples, length, integer(1))
# For every sample, repeat its experiment_target once per region.
# SIMPLIFY = FALSE always returns a list. With the default, mapply() collapses
# equally sized results into a matrix, which unlist() leaves untouched, so
# cbind() would no longer receive one label per region.
TF_rep <- mapply(function(meta, n_regions) {
  rep(meta$experiment_target, n_regions)
}, samples@metadata, len_rep, SIMPLIFY = FALSE)
TF <- unlist(TF_rep)
# Flatten all the samples into one object and view it as a data frame.
H1_hESC_0 <- unlist(samples)
data1 <- data.frame(H1_hESC_0)
# Keep the first five columns (presumably the region coordinates -- confirm)
# and append the TF label as a sixth column.
data <- cbind(data1[1:5], TF)
GR_H1_hESC <- as(data, "GRanges")
After loading the TFHAZ package, find the transcription factor HOT DNA zones focusing on one chromosome at a time (e.g. chr21), by executing the following instructions:
# Attach TFHAZ, used here for the accumulation and high-accumulation-zone steps.
library(TFHAZ)
# Accumulation on chr21; the arguments "TF", "chr21" and 0 show up in the
# printed result as acctype, chr and w respectively.
TF_acc_21_w_0 <- accumulation(GR_H1_hESC, "TF", "chr21", 0)
plot_accumulation(TF_acc_21_w_0)
## png
## 2
# Find the high accumulation zones with method "overlaps" and threshold "std";
# plotZones = TRUE presumably also plots the zones found -- TODO confirm.
d_zones <- high_accumulation_zones(TF_acc_21_w_0, method =
"overlaps", threshold = "std", plotZones = TRUE)
# Print the zones with their summary statistics.
print(d_zones)
## $zones
## GRanges object with 186 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr21 16817665-16817920 *
## [2] chr21 17102271-17102482 *
## [3] chr21 17125366-17125783 *
## [4] chr21 18811080-18811303 *
## [5] chr21 18827001-18827280 *
## ... ... ... ...
## [182] chr21 47743787-47743850 *
## [183] chr21 47744158-47744473 *
## [184] chr21 47878223-47878892 *
## [185] chr21 48055239-48055335 *
## [186] chr21 48055358-48055756 *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
##
## $n_zones
## [1] 186
##
## $n_bases
## [1] 56339
##
## $lengths
## TH n_zones length_zone_min length_zone_max
## 5.554748 186.000000 9.000000 1279.000000
## length_zone_mean length_zone_median length_zone_sd
## 302.897849 295.000000 194.842182
##
## $distances
## TH n_zones dist_zone_min dist_zone_max
## 5.554748e+00 1.860000e+02 3.000000e+00 4.196696e+06
## dist_zone_mean dist_zone_median dist_zone_sd
## 1.685510e+05 5.256100e+04 4.266607e+05
##
## $TH
## [1] 5.554748
##
## $chr
## [1] "chr21"
##
## $w
## [1] 0
##
## $acctype
## [1] "TF"