## ----knitr_setup, include = FALSE--------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.height = 4, fig.width = 4 ) options(rmarkdown.html_vignette.check_title = FALSE) ## ----load_libraries, message=FALSE-------------------------------------------- library(VariantAnnotation) library(GenomicRanges) library(BSgenome.Hsapiens.UCSC.hg19) library(SEMPLR) ## ----load_data---------------------------------------------------------------- SEMC ## ----display_data------------------------------------------------------------- SEMC ## ----view_sem_metadata-------------------------------------------------------- semData(SEMC) ## ----subset_sem_collection---------------------------------------------------- getSEMs(SEMC, semId = "JUN") ## ----inputs_scoreBinding------------------------------------------------------ # create a GRanges object gr <- GenomicRanges::GRanges( seqnames = "chr12", ranges = 94136009 ) ## ----scoreBinding------------------------------------------------------------- # calculate binding propensity sb <- scoreBinding(x = gr, sem = SEMC, genome = Hsapiens) sb ## ----scores_accessor---------------------------------------------------------- scores(sb) ## ----plot_sem----------------------------------------------------------------- # subset JUN score jun_score <- scores(sb)[SEM == "JUN" & rc == "fwd"] jun_score # plot the JUN motif with the scored sequence plotSEM(SEMC, motif = "JUN", motifSeq = jun_score$seq ) ## ----simulation_functions----------------------------------------------------- # create random sequences weighted by ppm probabilities simulatePPMSeqs <- function(ppm, nSeqs) { ppm_t <- t(ppm) position_samples <- lapply( seq_len(nrow(ppm_t)), function(i) { sample(colnames(ppm_t), size = nSeqs, replace = TRUE, prob = ppm_t[i, ] ) } ) rand_pwm_seq_mtx <- position_samples |> unlist() |> matrix(ncol = nrow(ppm_t), nrow = nSeqs) rand_pwm_seqs <- apply( rand_pwm_seq_mtx, 1, function(x) paste0(x, collapse = "") ) return(rand_pwm_seqs) } # generate DNA sequences simulateRandSeqs <- function(seqLength, nSeqs = 1) { bps <- c("A", "C", "G", "T") seqs <- sample(bps, replace = TRUE, size = seqLength ) |> stringi::stri_c(collapse = "") |> replicate(n = nSeqs) return(seqs) } ## ----build_enrich_scenario---------------------------------------------------- ppm <- convertSEMsToPPMs(getSEMs(SEMC, "JUN"))[[1]] # simulate sequences from the JUN PPM sim_seqs <- simulatePPMSeqs(ppm = ppm, nSeqs = 200) # add flanks to the simulated sequences to make them 3x the length of the motif sim_seqs <- lapply( sim_seqs, function(x) { paste0( simulateRandSeqs(seqLength = ncol(ppm)), x, simulateRandSeqs(seqLength = ncol(ppm)) ) } ) |> unlist() # generate random DNA sequences rand_seqs <- simulateRandSeqs(seqLength = ncol(ppm) * 3, nSeqs = 800) ## ----scoring_for_enrich_test-------------------------------------------------- # combine all sequences into a single vector all_seqs <- c(sim_seqs, rand_seqs) sb <- scoreBinding( x = all_seqs, sem = SEMC, genome = BSgenome.Hsapiens.UCSC.hg19::Hsapiens ) sb ## ----enrichSEMs--------------------------------------------------------------- e <- enrichSEMs(sb, sem = SEMC, seqs = all_seqs) # order the results by adjusted pvalue head(e[order(padj, decreasing = FALSE)]) ## ----plotEnrich--------------------------------------------------------------- plotEnrich(e, sem = SEMC, threshold = 0.05, method = "WPCC", pvalRange = c(0, 50), heatmapCols = c("lightgrey", "dodgerblue2") ) ## ----promoter_enrichment------------------------------------------------------ library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(BSgenome.Hsapiens.UCSC.hg38) library(org.Hs.eg.db) txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene orgdb <- org.Hs.eg.db genes_oi <- c("ENSG00000139618", "ENSG00000157764") background_genes <- c("ENSG00000111640", "ENSG00000075624", "ENSG00000165704") result <- enrichmentSets( foreground_ids = genes_oi, background_ids = background_genes, txdb = txdb, orgdb = orgdb, id_type = "ENSEMBL" ) result ## ----make_vranges------------------------------------------------------------- vr <- VRanges( seqnames = c("chr12", "chr19"), ranges = c(94136009, 10640062), ref = c("G", "T"), alt = c("C", "A"), id = c("variant1", "variant2") ) ## ----score_variants----------------------------------------------------------- sempl_results <- scoreVariants( x = vr, sem = SEMC, genome = BSgenome.Hsapiens.UCSC.hg38::Hsapiens, varId = "id" ) sempl_results ## ----SEMScores_accessors, eval = FALSE---------------------------------------- # # access the variants slot # getRanges(sempl_results) # # # access the semData slot # semData(sempl_results) # # # access the scores slot # scores(sempl_results) ## ----plotSemMotifs, fig.height=4, fig.width=4--------------------------------- plotSEMMotifs( s = sempl_results, variant = "variant2", label = "transcription_factor" ) ## ----plotSEMVariants---------------------------------------------------------- plotSEMVariants(sempl_results, sem = "HLF") ## ----load_custom_sems--------------------------------------------------------- # find .sem files sem_folder <- system.file("extdata", "SEMs", package = "SEMPLR") sem_files <- list.files(sem_folder, full.names = TRUE) # load metadata sempl_metadata_file <- system.file("extdata", "sempl_metadata.csv", package = "SEMPLR" ) sempl_metadata <- read.csv(sempl_metadata_file) ## ----format_meta_data--------------------------------------------------------- ix <- lapply( sem_files, function(x) which(sempl_metadata$SEM == basename(x)) ) |> unlist() sem_ids <- sempl_metadata$transcription_factor[ix] ## ----loadSEMCollection-------------------------------------------------------- sc <- loadSEMCollection( semFiles = sem_files, semMetaData = sempl_metadata, semMetaKey = "transcription_factor", semIds = sem_ids ) sc ## ----session_info------------------------------------------------------------- devtools::session_info()