params <-
list(seed = 20188102)

## ----eval=FALSE---------------------------------------------------------------
#  if (!require("BiocManager")) {
#      install.packages("BiocManager")
#  }
#  BiocManager::install("glmSparseNet")

## ----packages, message=FALSE, warning=FALSE, results='hide'-------------------
library(dplyr)
library(Matrix)
library(ggplot2)
library(forcats)
library(parallel)
library(reshape2)
library(survival)
library(VennDiagram)
library(futile.logger)
library(curatedTCGAData)
library(MultiAssayExperiment)
library(TCGAutils)
#
library(glmSparseNet)
#
#
# Some general options for futile.logger the debugging package
flog.layout(layout.format("[~l] ~m"))
options(
    "glmSparseNet.show_message" = FALSE,
    "glmSparseNet.base_dir" = withr::local_tempdir()
)
# Setting ggplot2 default theme as minimal
theme_set(ggplot2::theme_minimal())

## ----eval=FALSE---------------------------------------------------------------
#  # Not evaluated in vignette as it takes too long to download and process
#  allInteractions700 <- stringDBhomoSapiens(scoreThreshold = 700)
#  stringNetwork <- buildStringNetwork(allInteractions700, "external")

## ----include=FALSE------------------------------------------------------------
data("string.network.700.cache", package = "glmSparseNet")
stringNetwork <- string.network.700.cache

## -----------------------------------------------------------------------------
stringNetworkUndirected <- stringNetwork + Matrix::t(stringNetwork)
stringNetworkUndirected <- (stringNetworkUndirected != 0) * 1

## ----echo=FALSE, collapse=TRUE------------------------------------------------
flog.info("Directed graph (score_threshold = %d)", 700)
flog.info("  *       total edges: %d", sum(stringNetwork != 0))
flog.info("  *    unique protein: %d", nrow(stringNetwork))
flog.info(
    "  * edges per protein: %f",
    sum(stringNetwork != 0) / nrow(stringNetwork)
)
flog.info("")
flog.info("Undirected graph (score_threshold = %d)", 700)
flog.info("  *       total edges: %d", sum(stringNetworkUndirected != 0) / 2)
flog.info("  *    unique protein: %d", nrow(stringNetworkUndirected))
flog.info(
    "  * edges per protein: %f",
    sum(stringNetworkUndirected != 0) / 2 / nrow(stringNetworkUndirected)
)

## ----echo=FALSE---------------------------------------------------------------
stringNetworkBinary <- (stringNetworkUndirected != 0) * 1
degreeNetworkVector <- (
    Matrix::rowSums(stringNetworkBinary) +
        Matrix::colSums(stringNetworkBinary)
) / 2

flog.info("Summary of degree:", summary(degreeNetworkVector), capture = TRUE)

## ----warning=FALSE------------------------------------------------------------
qplot(
    degreeNetworkVector[
        degreeNetworkVector <= quantile(degreeNetworkVector, probs = .99999)
    ],
    geom = "histogram", fill = my.colors(2), bins = 100
) +
    theme(legend.position = "none") + xlab("Degree (up until 99.999% quantile)")

## ----include=FALSE------------------------------------------------------------
# chunk not included as it produces to many unnecessary messages
brca <- tryCatch(
    {
        curatedTCGAData(
            diseaseCode = "BRCA",
            assays = "RNASeq2GeneNorm",
            version = "1.1.38",
            dry.run = FALSE
        )
    },
    error = function(err) {
        NULL
    }
)

## ----eval=FALSE---------------------------------------------------------------
#  brca <- curatedTCGAData(
#      diseaseCode = "BRCA", assays = "RNASeq2GeneNorm",
#      version = "1.1.38", dry.run = FALSE
#  )

## ----data.show, warning=FALSE, error=FALSE, eval=!is.null(brca)---------------
# keep only solid tumour (code: 01)
brcaPrimarySolidTumor <- TCGAutils::TCGAsplitAssays(brca, "01")
xdataRaw <- t(assay(brcaPrimarySolidTumor[[1]]))

# Get survival information
ydataRaw <- colData(brcaPrimarySolidTumor) |>
    as.data.frame() |>
    # Convert days to integer
    dplyr::mutate(
        Days.to.date.of.Death = as.integer(Days.to.date.of.Death),
        Days.to.Last.Contact  = as.integer(Days.to.Date.of.Last.Contact)
    ) |>
    # Find max time between all days (ignoring missings)
    dplyr::rowwise() |>
    dplyr::mutate(
        time = max(days_to_last_followup, Days.to.date.of.Death,
            Days.to.Last.Contact, days_to_death,
            na.rm = TRUE
        )
    ) |>
    # Keep only survival variables and codes
    dplyr::select(patientID, status = vital_status, time) |>
    # Discard individuals with survival time less or equal to 0
    dplyr::filter(!is.na(time) & time > 0) |>
    as.data.frame()

# Set index as the patientID
rownames(ydataRaw) <- ydataRaw$patientID

# keep only features that are in degreeNetworkVector and have
#  standard deviation > 0
validFeatures <- colnames(xdataRaw)[
    colnames(xdataRaw) %in% names(degreeNetworkVector[degreeNetworkVector > 0])
]
xdataRaw <- xdataRaw[
    TCGAbarcode(rownames(xdataRaw)) %in% rownames(ydataRaw), validFeatures
]
xdataRaw <- scale(xdataRaw)

# Order ydata the same as assay
ydataRaw <- ydataRaw[TCGAbarcode(rownames(xdataRaw)), ]

# Using only a subset of genes previously selected to keep this short example.
set.seed(params$seed)
smallSubset <- c(
    "AAK1", "ADRB1", "AK7", "ALK", "APOBEC3F", "ARID1B", "BAMBI",
    "BRAF", "BTG1", "CACNG8", "CASP12", "CD5", "CDA", "CEP72",
    "CPD", "CSF2RB", "CSN3", "DCT", "DLG3", "DLL3", "DPP4",
    "DSG1", "EDA2R", "ERP27", "EXD1", "GABBR2", "GADD45A",
    "GBP1", "HTR1F", "IFNK", "IRF2", "IYD", "KCNJ11", "KRTAP5-6",
    "MAFA", "MAGEB4", "MAP2K6", "MCTS1", "MMP15", "MMP9",
    "NFKBIA", "NLRC4", "NT5C1A", "OPN4", "OR13C5", "OR13C8",
    "OR2T6", "OR4K2", "OR52E6", "OR5D14", "OR5H1", "OR6C4",
    "OR7A17", "OR8J3", "OSBPL1A", "PAK6", "PDE11A", "PELO",
    "PGK1", "PIK3CB", "PMAIP1", "POLR2B", "POP1", "PPFIA3",
    "PSME1", "PSME2", "PTEN", "PTGES3", "QARS", "RABGAP1",
    "RBM3", "RFC3", "RGPD8", "RPGRIP1L", "SAV1", "SDC1", "SDC3",
    "SEC16B", "SFPQ", "SFRP5", "SIPA1L1", "SLC2A14", "SLC6A9",
    "SPATA5L1", "SPINT1", "STAR", "STXBP5", "SUN3", "TACC2",
    "TACR1", "TAGLN2", "THPO", "TNIP1", "TP53", "TRMT2B", "TUBB1",
    "VDAC1", "VSIG8", "WNT3A", "WWOX", "XRCC4", "YME1L1",
    "ZBTB11", "ZSCAN21"
) |>
    sample(size = 50) |>
    sort()

# make sure we have 100 genes
smallSubset <- c(smallSubset, sample(colnames(xdataRaw), 51)) |>
    unique() |>
    sort()

xdata <- xdataRaw[, smallSubset[smallSubset %in% colnames(xdataRaw)]]
ydata <- ydataRaw |>
    dplyr::select(time, status) |>
    dplyr::filter(!is.na(time) | time < 0)

## ----eval=!is.null(brca)------------------------------------------------------
#
# Add degree 0 to genes not in STRING network

myDegree <- degreeNetworkVector[smallSubset]
myString <- stringNetworkBinary[smallSubset, smallSubset]

## ----echo=FALSE, warning=FALSE, eval=!is.null(brca)---------------------------
qplot(myDegree, bins = 100, fill = my.colors(3)) +
    theme(legend.position = "none")

## ----eval=!is.null(brca)------------------------------------------------------
set.seed(params$seed)
foldid <- glmSparseNet:::balancedCvFolds(ydata$status)$output

## ----include=FALSE, eval=!is.null(brca)---------------------------------------
# List that will store all selected genes
selectedGenes <- list()

## ----warning=FALSE, error=FALSE, eval=!is.null(brca)--------------------------
resultCVHub <- cv.glmHub(xdata,
    Surv(ydata$time, ydata$status),
    family = "cox",
    foldid = foldid,
    network = myString,
    network.options = networkOptions(minDegree = 0.2)
)

## ----echo=FALSE, eval=!is.null(brca)------------------------------------------
separate2GroupsCox(
    as.vector(coef(resultCVHub, s = "lambda.min")[, 1]),
    xdata, ydata,
    plot.title = "Full dataset",
    legend.outside = FALSE
)

selectedGenes[["Hub"]] <- Filter(
    function(.x) .x != 0,
    coef(resultCVHub, s = "lambda.min")[, 1]
) |>
    names() |>
    geneNames() |>
    magrittr::extract2("external_gene_name")

## ----warning=FALSE, error=FALSE, eval=!is.null(brca)--------------------------
resultCVOrphan <- cv.glmOrphan(xdata,
    Surv(ydata$time, ydata$status),
    family = "cox",
    foldid = foldid,
    network = myString,
    network.options = networkOptions(minDegree = 0.2)
)

## ----echo=FALSE, eval=!is.null(brca)------------------------------------------
separate2GroupsCox(
    as.vector(coef(resultCVOrphan, s = "lambda.min")[, 1]),
    xdata, ydata,
    plot.title = "Full dataset",
    legend.outside = FALSE
)

selectedGenes[["Orphan"]] <- Filter(
    function(.x) .x != 0,
    coef(resultCVOrphan, s = "lambda.min")[, 1]
) |>
    names() |>
    geneNames() |>
    magrittr::extract2("external_gene_name")

## ----warning=FALSE, error=FALSE, eval=!is.null(brca)--------------------------
library(glmnet)
resultCVGlmnet <- cv.glmnet(xdata,
    Surv(ydata$time, ydata$status),
    family = "cox",
    foldid = foldid
)

## ----echo=FALSE, eval=!is.null(brca)------------------------------------------
separate2GroupsCox(
    as.vector(coef(resultCVGlmnet, s = "lambda.min")[, 1]),
    xdata, ydata,
    plotTitle = "Full dataset",
    legendOutside = FALSE
)

selectedGenes[["GLMnet"]] <- Filter(
    function(.x) .x != 0,
    coef(resultCVGlmnet, s = "lambda.min")[, 1]
) |>
    names() |>
    geneNames() |>
    magrittr::extract2("external_gene_name")

## ----echo=FALSE, warning=FALSE, eval=!is.null(brca)---------------------------
vennPlot <- venn.diagram(
    selectedGenes,
    NULL,
    fill = c(
        myColors(5), myColors(3),
        myColors(4)
    ),
    alpha = c(0.3, 0.3, .3),
    cex = 2,
    cat.fontface = 4,
    category.names = names(selectedGenes)
)
grid.draw(vennPlot)

## ----echo=FALSE, warning=FALSE, eval=!is.null(brca)---------------------------
melt(selectedGenes) |>
    mutate(
        Degree = myDegree[value],
        value = factor(value),
        L1 = factor(L1)
    ) |>
    mutate(value = fct_reorder(value, Degree)) |>
    as.data.frame() |>
    ggplot() +
    geom_point(
        aes(value, L1, size = Degree),
        shape = mySymbols(3), color = myColors(3)
    ) +
    theme(
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0),
        legend.position = "top"
    ) +
    ylab("Model") +
    xlab("Gene") +
    scale_size_continuous(trans = "log10")

## ----sessionInfo--------------------------------------------------------------
sessionInfo()