## ----setup, include = TRUE----------------------------------------------------
library(knitr)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----Initialization-----------------------------------------------------------
library(PubScore)
# list of genes :
selected_genes <- c('CD79A', 'CD14', 'NKG7', 'CST3', 'AIF1')

# These genes were selected from a panel containing the following genes
total_genes <- c('CD79A', 'CD14', 'NKG7', 'CST3', 'AIF1', 'FOXA1', 'PPT2', 'ZFP36L1','AFF4', 'ANTXR2', "HDAC8", "VKORC1" )

terms_of_interest <- c("B cells", "macrophages", "NK cells")

## ----Creating PubScore Object-------------------------------------------------
pub <- pubscore(terms_of_interest = terms_of_interest, genes = selected_genes )
print(getScore(pub))

## ----Testing the Pubscore Object----------------------------------------------
pub <- test_score(pub, total_genes =  total_genes)


## ----Heatmap Visualization----------------------------------------------------
p <- heatmapViz(pub)
library(plotly)
ggplotly(p)

## ----Netowrk Visualization----------------------------------------------------
p <- networkViz(pub)
plot(p)

## ----messages = FALSE---------------------------------------------------------
library("FCBF")
library('SingleCellExperiment')
# load expression data
data(scDengue)

## ----Running FCBF-------------------------------------------------------------
infection <- SummarizedExperiment::colData(scDengue)
target <- infection$infection

exprs <- as.data.frame(SummarizedExperiment::assay(scDengue,'logcounts'))
discrete_expression <- as.data.frame(FCBF::discretize_exprs(exprs))

fcbf_features <- fcbf(discrete_expression, target, thresh = 0.05, verbose = FALSE)
total_features <- rownames(exprs)
head(fcbf_features)

## ----Running biomaRt----------------------------------------------------------
library(biomaRt)

ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
reference = biomaRt::getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol'), 
              mart = ensembl)

total_genes <- reference[reference$ensembl_gene_id %in% total_features,]$hgnc_symbol

fcbf_genes<- reference[reference$ensembl_gene_id %in% rownames(fcbf_features),]$hgnc_symbol

## -----------------------------------------------------------------------------
total_genes <- total_genes[total_genes != ""]
fcbf_genes <- fcbf_genes[fcbf_genes != ""]

## -----------------------------------------------------------------------------
terms_of_interest <- c('Dengue')

pub <- pubscore(terms_of_interest = terms_of_interest, genes = fcbf_genes )
print(getScore(pub))

p_map <- heatmapViz(pub)
library(plotly)
ggplotly(p_map)

p_net <- networkViz(pub)
plot(p_net)

## -----------------------------------------------------------------------------

#' pub <- test_score(pub, total_genes = total_genes)
#' all_counts <- get_all_counts(pub)
#' save(all_counts, file= '../data/all_counts.RData')
data("all_counts")
set_all_counts(pub) <- all_counts

## -----------------------------------------------------------------------------

# A table with all possible combinations is generated
simulation_of_literature_null <- get_all_counts(pub)

# Then, a number of genes equal to your original list is picked
# and, from this list, we get a score.
# This process is repeated 10 thousand times.

      message('Running 10000 simulations')
      distribution_of_scores <- c()
      for (i in seq_len(10000)){
        
        genes_to_sample_now <- sample(total_genes, 62)
        
        simu_now <- simulation_of_literature_null[simulation_of_literature_null$Genes %in% genes_to_sample_now,]$count
        
        #the list score is made by the sum of all scores divided by the number of combinations (62 genes x 1 term)
        list_score <- sum(simu_now) / (62 * 1)
        distribution_of_scores <- c(distribution_of_scores,list_score )
        
      }
      
    distribution_of_scores<- data.frame(d = distribution_of_scores)
    
    # The score to compare is the one of the original list  
    score <- getScore(pub)
      
    # Then you ask: how many of the null simulations had a score
    # as high as the one of the original list?
    # This is the simulated p-value!

    pvalue <-sum(distribution_of_scores[,1] >= score)/length(distribution_of_scores[,1])
    
    
    print('The p-value by simulation is:')
    print(pvalue) 
      

## -----------------------------------------------------------------------------
all_counts <- get_all_counts(pub)
head(all_counts$Genes[order(all_counts$count, decreasing = TRUE)], 10)

## -----------------------------------------------------------------------------

pub <- test_score(pub, remove_ambiguous = TRUE, nsim = 10000)

## -----------------------------------------------------------------------------
retested_literature_object <- test_score(pub, remove_ambiguous = TRUE,
                                                      ambiguous_terms = 
                                                      c("PC", "JUN", "IMPACT"), nsim = 10000)