## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.align = "center"
)

.DT <- function(x) {
    dt_options <- list(
        scrollX = TRUE,
        pageLength = 6,
        dom = "t",
        initComplete = DT::JS(
            "function(settings, json) {",
            "$(this.api().table().header()).css({'font-size':'10pt'});",
            "}"
        )
    )

    x %>%
        DT::datatable(options = dt_options, rownames = FALSE) %>%
        DT::formatStyle(
            columns = colnames(x),
            fontSize = "10pt"
        )
}

library(BiocStyle)

## ----eval = FALSE, include = TRUE---------------------------------------------
#  # install BiocManager if not present
#  if (!requireNamespace("BiocManager", quietly = TRUE)) {
#      install.packages("BiocManager")
#  }
#  
#  # install MetMashR and dependencies
#  BiocManager::install("MetMashR")

## ----eval=TRUE, include=FALSE-------------------------------------------------
suppressPackageStartupMessages({
    # load the packages
    library(MetMashR)
    library(ggplot2)
    library(structToolbox)
    library(dplyr)
    library(DT)
})

## ----eval=FALSE, include=TRUE-------------------------------------------------
#  # load the packages
#  library(MetMashR)
#  library(ggplot2)
#  library(dplyr)

## -----------------------------------------------------------------------------
# prepare workflow
M <- import_source() +
    add_labels(
        labels = c(
            assay = "placeholder", # will be replaced later
            source_name = "CD"
        )
    ) +
    filter_labels(
        column_name = "compound_match",
        labels = "Full match",
        mode = "include"
    ) +
    filter_labels(
        column_name = "compound_match",
        labels = "Full match",
        mode = "include"
    ) +
    filter_range(
        column_name = "library_ppm_diff",
        upper_limit = 2,
        lower_limit = -2,
        equal_to = FALSE
    ) +
    combine_records(
        group_by = c("compound", "ion"),
        default_fcn = select_max(
            max_col = "mzcloud_score",
            keep_NA = FALSE,
            use_abs = TRUE
        )
    )

# place to store results
CD <- list()
for (assay in c("HILIC_NEG", "HILIC_POS", "LIPIDS_NEG", "LIPIDS_POS")) {
    # prepare source
    AT <- cd_source(
        source = c(
            system.file(
                paste0("extdata/MTox/CD/", assay, ".xlsx"),
                package = "MetMashR"
            ),
            system.file(
                paste0("extdata/MTox/CD/", assay, "_comp.xlsx"),
                package = "MetMashR"
            )
        ),
        tag = paste0("CD_", assay)
    )

    # update labels in workflow
    M[2]$labels$assay <- assay

    # apply workflow to source
    CD[[assay]] <- model_apply(M, AT)
}

## -----------------------------------------------------------------------------
predicted(CD$HILIC_NEG)

## ----echo=FALSE---------------------------------------------------------------
.DT(predicted(CD$HILIC_NEG)$data)

## -----------------------------------------------------------------------------
C <- annotation_pie_chart(
    factor_name = "compound_match",
    label_rotation = FALSE,
    label_location = "outside",
    legend = TRUE,
    label_type = "percent",
    centre_radius = 0.5,
    centre_label = ".total"
)

## -----------------------------------------------------------------------------
# plot individual charts
g1 <- chart_plot(C, predicted(CD$HILIC_NEG[3])) +
    ggtitle("Compound matches\nafter filtering") +
    theme(plot.title = element_text(hjust = 0.5))


g2 <- chart_plot(C, predicted(CD$HILIC_NEG[2])) +
    ggtitle("Compound matches\nbefore filtering") +
    theme(plot.title = element_text(hjust = 0.5))


# get legend
leg <- cowplot::get_legend(g2)

# layout
cowplot::plot_grid(
    g2 + theme(legend.position = "none"),
    g1 + theme(legend.position = "none"),
    leg,
    nrow = 1,
    rel_widths = c(1, 1, 0.5)
)

## -----------------------------------------------------------------------------
C <- annotation_histogram(
    factor_name = "library_ppm_diff", vline = c(-2, 2)
)
G <- list()
G$HILIC_NEG <- chart_plot(C, predicted(CD$HILIC_NEG[2]))
G$HILIC_POS <- chart_plot(C, predicted(CD$HILIC_POS[3]))

cowplot::plot_grid(plotlist = G, labels = c("HILIC_NEG", "HILIC_POS"))

## -----------------------------------------------------------------------------
# prepare workflow
M <- import_source() +
    add_labels(
        labels = c(
            assay = "placeholder", # will be replaced later
            source_name = "LS"
        )
    ) +
    filter_labels(
        column_name = "Grade",
        labels = c("A", "B"),
        mode = "include"
    ) +
    filter_labels(
        column_name = "Grade",
        labels = c("A", "B"),
        mode = "include"
    ) +
    combine_records(
        group_by = c("LipidIon"),
        default_fcn = select_min(
            min_col = "library_ppm_diff",
            keep_NA = FALSE,
            use_abs = TRUE
        )
    )

# place to store results
LS <- list()
for (assay in c("HILIC_NEG", "HILIC_POS", "LIPIDS_NEG", "LIPIDS_POS")) {
    # prepare source
    AT <- ls_source(
        source = system.file(
            paste0("extdata/MTox/LS/MTox_2023_", assay, ".txt"),
            package = "MetMashR"
        ),
        tag = paste0("LS_", assay)
    )

    # update labels in workflow
    M[2]$labels$assay <- assay

    # apply workflow to source
    LS[[assay]] <- model_apply(M, AT)
}

## -----------------------------------------------------------------------------
predicted(LS$LIPIDS_NEG)

## ----echo=FALSE---------------------------------------------------------------
.DT(predicted(LS$LIPIDS_NEG)$data)

## -----------------------------------------------------------------------------
C <- annotation_pie_chart(
    factor_name = "Grade",
    label_rotation = FALSE,
    label_location = "outside",
    label_type = "percent",
    legend = FALSE,
    centre_radius = 0.5,
    centre_label = ".total"
)
g1 <- chart_plot(C, predicted(LS$LIPIDS_NEG)) +
    ggtitle("Grades after filtering") +
    theme(plot.margin = unit(c(1, 1.5, 1, 1.5), "cm"))

g2 <- chart_plot(C, predicted(LS$LIPIDS_NEG[1])) +
    ggtitle("Grades before filtering") +
    theme(plot.margin = unit(c(1, 1.5, 1, 1.5), "cm"))


cowplot::plot_grid(g2, g1, nrow = 1, align = "v")

## -----------------------------------------------------------------------------
# prepare venn chart object
C <- annotation_venn_chart(
    factor_name = "compound",
    line_colour = "white",
    fill_colour = ".group",
    legend = TRUE,
    labels = FALSE
)

## plot
# get all CD tables
cd <- lapply(CD, predicted)
g1 <- chart_plot(C, cd) + ggtitle("Compounds in CD per assay")

# get all LS tables
C$factor_name <- "LipidName"
ls <- lapply(LS, predicted)
g2 <- chart_plot(C, ls) + ggtitle("Compounds in LS per assay")

# prepare upset object
C2 <- annotation_upset_chart(
    factor_name = "compound",
    n_intersections = 10
)
g3 <- chart_plot(C2, cd)
C2$factor_name <- "LipidName"
g4 <- chart_plot(C2, ls)

# layout
cowplot::plot_grid(g1, g2, g3, g4, nrow = 2)

## -----------------------------------------------------------------------------
# get all the cleaned annotation tables in one list
all_source_tables <- lapply(c(CD, LS), predicted)

# prepare to merge
combine_workflow <-
    combine_sources(
        source_list = all_source_tables,
        matching_columns = c(
            name = "LipidName",
            name = "compound",
            adduct = "ion",
            adduct = "LipidIon"
        ),
        keep_cols = ".all",
        source_col = "annotation_table",
        exclude_cols = NULL,
        tag = "combined"
    )

# merge
combine_workflow <- model_apply(combine_workflow, lcms_table())

# show
predicted(combine_workflow)

## -----------------------------------------------------------------------------
C <- annotation_pie_chart(
    factor_name = "assay",
    label_rotation = FALSE,
    label_location = "outside",
    label_type = "percent",
    legend = TRUE,
    centre_radius = 0.5,
    centre_label = ".total"
)

chart_plot(C, predicted(combine_workflow)) +
    ggtitle("Annotations per assay") +
    theme(plot.margin = unit(c(1, 1.5, 1, 1.5), "cm")) +
    guides(fill = guide_legend(title = "Assay"))

## -----------------------------------------------------------------------------
# change to plot source_name column
C$factor_name <- "source_name"
chart_plot(C, predicted(combine_workflow)) +
    ggtitle("Annotations per source") +
    theme(plot.margin = unit(c(1, 1.5, 1, 1.5), "cm")) +
    guides(fill = guide_legend(title = "Source"))

## -----------------------------------------------------------------------------
# import cached results
inchikey_cache <- rds_database(
    source = file.path(
        system.file("cached", package = "MetMashR"),
        "pubchem_inchikey_mtox_cache.rds"
    )
)

id_workflow <-
    pubchem_property_lookup(
        query_column = "name",
        search_by = "name",
        suffix = "",
        property = "InChIKey",
        records = "best",
        cache = inchikey_cache
    )

id_workflow <- model_apply(id_workflow, predicted(combine_workflow))

## -----------------------------------------------------------------------------
# prepare cached results for vignette
inchikey_cache2 <- rds_database(
    source = file.path(
        system.file("cached", package = "MetMashR"),
        "pubchem_inchikey_mtox_cache2.rds"
    )
)
inchikey_cache3 <- rds_database(
    source = file.path(
        system.file("cached", package = "MetMashR"),
        "pubchem_inchikey_mtox_cache3.rds"
    )
)

N <- normalise_strings(
    search_column = "name",
    output_column = "normalised_name",
    dictionary = c(
        # custom dictionary
        list(
            # replace "NP" with "Compound NP"
            list(pattern = "^NP-", replace = "Compound NP-"),
            # replace ? with NA, since this is ambiguous
            list(pattern = "?", replace = NA, fixed = TRUE),
            # remove terms in trailing brackets e.g." (ATP)"
            list(pattern = "\\ \\([^\\)]*\\)$", replace = ""),
            # replace known abbreviations
            list(
                pattern = "(+/-)9-HpODE",
                replace = "9-hydroperoxy-10E,12Z-octadecadienoic acid",
                fixed = TRUE
            ),
            list(
                pattern = "(+/-)19(20)-DiHDPA",
                replace =
                    "19,20-dihydroxy-4Z,7Z,10Z,13Z,16Z-docosapentaenoic acid",
                fixed = TRUE
            )
        ),
        # replace greek characters
        greek_dictionary,
        # remove racemic properties
        racemic_dictionary
    )
) +
    pubchem_property_lookup(
        query_column = "normalised_name",
        search_by = "name",
        suffix = "_norm",
        property = "InChIKey",
        records = "best",
        cache = inchikey_cache2
    ) +
    opsin_lookup(
        query_column = "normalised_name",
        suffix = "_opsin",
        output = "stdinchikey",
        cache = inchikey_cache3
    ) +
    prioritise_columns(
        column_names = c("stdinchikey_opsin", "InChIKey_norm", "InChIKey"),
        output_name = "inchikey",
        source_name = "inchikey_source",
        clean = TRUE
    )

N <- model_apply(N, predicted(id_workflow))

## -----------------------------------------------------------------------------
# venn inchikey columns
C <- annotation_venn_chart(
    factor_name = c("InChIKey", "InChIKey_norm", "stdinchikey_opsin"),
    line_colour = "white",
    fill_colour = ".group",
    legend = TRUE,
    labels = FALSE
)
chart_plot(C, predicted(N[3])) +
    guides(
        fill = guide_legend(title = "Source"),
        colour = guide_legend(title = "Source")
    ) +
    theme(plot.margin = unit(c(1, 1.5, 1, 1.5), "cm"))

## -----------------------------------------------------------------------------
# upset inchikey columns
C <- annotation_upset_chart(
    factor_name = c("InChIKey", "InChIKey_norm", "stdinchikey_opsin"),
    min_size = 0,
    n_intersections = 10
)
chart_plot(C, predicted(N[3]))

## -----------------------------------------------------------------------------
# pie source of inchikey
C <- annotation_pie_chart(
    factor_name = "inchikey_source",
    label_rotation = FALSE,
    label_location = "outside",
    label_type = "percent",
    legend = TRUE,
    centre_radius = 0.5,
    centre_label = ".total",
    count_na = TRUE
)
chart_plot(C, predicted(N)) +
    guides(
        fill = guide_legend(title = "Source"),
        colour = guide_legend(title = "Source")
    ) +
    theme(plot.margin = unit(c(1, 1.5, 1, 1.5), "cm"))

## -----------------------------------------------------------------------------
# prepare workflow
FL <- filter_labels(
    column_name = "inchikey_source",
    labels = "InChIKey",
    mode = "exclude"
)
# apply
FF <- model_apply(FL, predicted(N))

# print summary
predicted(FF)

## -----------------------------------------------------------------------------
# prepare object
R <- rds_database(
    source = file.path(
        system.file("extdata", package = "MetMashR"),
        "standard_mixtures.rds"
    ),
    .writable = FALSE
)

# read
R <- read_source(R)

## ----echo=FALSE---------------------------------------------------------------
.DT(R$data)

## -----------------------------------------------------------------------------
# convert standard mixtures to source, then get inchikey from MTox700+
SM <- import_source() +
    filter_na(
        column_name = "rt"
    ) +
    filter_na(
        column_name = "median_ms2_scans"
    ) +
    filter_na(
        column_name = "mzcloud_id"
    ) +
    filter_range(
        column_name = "median_ms2_scans",
        upper_limit = Inf,
        lower_limit = 0,
        equal_to = TRUE
    ) +
    database_lookup(
        query_column = "hmdb_id",
        database_column = "hmdb_id",
        database = MTox700plus_database(),
        include = "inchikey",
        suffix = "",
        not_found = NA
    ) +
    id_counts(
        id_column = "inchikey",
        count_column = "inchikey_count",
        count_na = FALSE
    )
# apply
SM <- model_apply(SM, R)

## -----------------------------------------------------------------------------
C <- annotation_venn_chart(
    factor_name = "inchikey",
    group_column = "ion_mode",
    line_colour = "white",
    fill_colour = ".group",
    legend = TRUE,
    labels = FALSE
)

## plot
chart_plot(C, predicted(SM))

## ----fig.width=4--------------------------------------------------------------
# get processed data
AN <- predicted(FF)
AN$tag <- "Annotations"
sM <- predicted(SM)
sM$tag <- "Standards"

# prepare chart
C <- annotation_venn_chart(
    factor_name = "inchikey",
    line_colour = "white",
    fill_colour = ".group"
)

# plot
chart_plot(C, sM, AN) + ggtitle("All assays, all sources")

## ----fig.height  = 10, fig.width = 8------------------------------------------
G <- list()
VV <- list()
for (k in c("HILIC_NEG", "HILIC_POS", "LIPIDS_NEG", "LIPIDS_POS")) {
    wf <- filter_labels(
        column_name = "assay",
        labels = k,
        mode = "include"
    )
    wf1 <- model_apply(wf, AN)
    wf$column_name <- "ion_mode"
    wf2 <- model_apply(wf, sM)
    G[[k]] <- chart_plot(C, predicted(wf2), predicted(wf1))

    V <- filter_venn(
        factor_name = "inchikey",
        tables = list(predicted(wf1)),
        levels = "Standards/Annotations",
        mode = "include"
    )
    V <- model_apply(V, predicted(wf2))
    VV[[k]] <- predicted(V)
    VV[[k]]$tag <- k
}
r1 <- cowplot::plot_grid(
    plotlist = G, nrow = 2,
    labels = c("HILIC_NEG", "HILIC_POS", "LIPIDS_NEG", "LIPIDS_POS")
)

cowplot::plot_grid(r1, chart_plot(C, VV), nrow = 2, rel_heights = c(1, 0.5))

## ----fig.height  = 3, fig.width = 8-------------------------------------------
G <- list()
for (k in c("CD", "LS")) {
    wf <- filter_labels(
        column_name = "source_name",
        labels = k,
        mode = "include"
    )
    wf1 <- model_apply(wf, AN)

    G[[k]] <- chart_plot(C, sM, predicted(wf1))
}
cowplot::plot_grid(plotlist = G, nrow = 1, labels = c("CD", "LS"))

## -----------------------------------------------------------------------------
sessionInfo()