---
title: "Handling Binary and Categorical Variables"
date: "`r format(Sys.Date(), '%Y-%m-%d')`"
output: 
  rmarkdown::html_vignette:
    toc: true
    toc_depth: 2
    fig_width: 7
    fig_height: 5
    dpi: 600
vignette: >
  %\VignetteIndexEntry{Handling Binary and Categorical Variables}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

The CISS-VAE model can handle binary and categorical variables, but categorical variables must first be converted into binary dummy variables. 

```{r setup, echo=FALSE, include=FALSE}
# Hidden setup chunk: attach the packages used throughout the vignette and fix
# the RNG seed so the simulated missingness below is reproducible.
# The workspace is intentionally not cleared here: calling rm(list = ls())
# would delete the reader's own objects if the vignette is knitted from an
# interactive session.
library(rCISSVAE)
library(tidyverse)
library(kableExtra)
library(reticulate)

set.seed(42)

```

The [Palmer Penguins](https://allisonhorst.github.io/palmerpenguins/) dataset has both continuous (`bill_length_mm`, `bill_depth_mm`, `flipper_length_mm`, `body_mass_g`) and categorical (`species`, `island`, `sex`) variables, so it makes a good example. We can use the `dummy_cols()` function from the `fastDummies` package to create dummy variables for our categories. Set `ignore_na = TRUE` and `remove_selected_columns = TRUE` to avoid creating a new column for NA values and to remove the original categorical columns once the dummies are created. The example below also sets `remove_first_dummy = TRUE`, which drops the dummy for the first level of each variable, so a variable with k levels is encoded by k - 1 columns.

```{r start}
# Packages used in this example (attached visibly so readers can reproduce it)
library(tidyverse)
library(kableExtra)
library(reticulate)
library(rCISSVAE)
library(fastDummies)
library(palmerpenguins)

# Load the dataset by name; `data(package = ...)` without a dataset name only
# lists what the package provides and does not load anything.
data("penguins", package = "palmerpenguins")

# Drop rows that already contain missing values (for illustration purposes)
# and move `year` to the first column.
penguins_clean <- na.omit(penguins) %>%
  select(year, everything())

# NOTE(review): this reports the dimensions of the raw `penguins` data, not of
# `penguins_clean` -- confirm which one is intended.
glue::glue("Dimensions: {paste0(dim(penguins), collapse = ',')}")

head(penguins_clean) %>% kable()

## create penguins_missing
# Cells are drawn from all n * p positions, but column 1 (`year`) is skipped
# when the mask is applied below, so the realized missingness rate is somewhat
# lower than 20%.
n <- nrow(penguins_clean)
p <- ncol(penguins_clean)
m <- floor(0.20 * n * p)     # number of cells to mask
idx <- sample.int(n * p, m)  # linear positions in the n x p logical matrix

mask <- matrix(FALSE, nrow = n, ncol = p)
mask[idx] <- TRUE

penguins_missing <- penguins_clean

## anything can be missing except the year (column 1)
# seq_len(p)[-1] is empty when p < 2, whereas seq(2, p, 1) would raise an error.
for (j in seq_len(p)[-1]) {
  penguins_missing[[j]][mask[, j]] <- NaN
}

# quick check of missingness rate
glue::glue("\nMissingness proportion of penguins_missing: {round(mean(is.na(as.matrix(penguins_missing))), 2)}")

## create dummy vars
# The same encoding is applied to the complete data and to the data with
# simulated missingness:
#   ignore_na               - do not create an extra dummy column for NA
#   remove_first_dummy      - drop the dummy for the first level of each variable
#   remove_selected_columns - drop the original categorical columns
categorical_cols <- c("species", "island", "sex")

penguin_dummies_complete <- penguins_clean %>%
  dummy_cols(
    select_columns = categorical_cols,
    ignore_na = TRUE,
    remove_first_dummy = TRUE,
    remove_selected_columns = TRUE
  )

penguin_dummies <- penguins_missing %>%
  dummy_cols(
    select_columns = categorical_cols,
    ignore_na = TRUE,
    remove_first_dummy = TRUE,
    remove_selected_columns = TRUE
  )

head(penguin_dummies) %>% kable()

```

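Now that the dummy variables are created and the data contain missing values, we can create a `binary_feature_mask` (a logical vector with one entry per column that is `TRUE` for the binary dummy columns) and impute with `run_cissvae()`.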


```{r eval=FALSE}
# Flag the dummy-coded columns as binary. The mask has one entry per column of
# `penguin_dummies` (including `year`) and is derived from the column names, so
# it stays aligned with the data if the column order or the number of dummy
# columns changes. For the data built above this yields five FALSE entries
# followed by five TRUE entries.
binary_feature_mask <- grepl("^(species|island|sex)_", names(penguin_dummies))

glue::glue("Binary Feature Mask: {paste0(binary_feature_mask, collapse = ', ')}")

results <- run_cissvae(
  data = penguin_dummies,
  val_proportion = 0.20, ## small dataset so using higher val proportion
  columns_ignore = "year",
  binary_feature_mask = binary_feature_mask,
  clusters = NULL,
  n_clusters = 1,
  scale_features = TRUE,
  epochs = 500,
  debug = FALSE
)

# Compare the imputed data with the input that still contains missing values.
head(results$imputed_dataset)
head(penguin_dummies)
```

```{r echo=FALSE}
# Read the precomputed imputation result shipped in the package's extdata
# folder and show it next to the input that still contains missing values.
imputed_path <- system.file("extdata", "binary_res.rds", package = "rCISSVAE")
results <- readRDS(imputed_path)

head(results)
head(penguin_dummies)
```

As we can see above, the imputed values for the binary variables are returned as **probabilities** rather than hard 0/1 values, so we have to convert them to binary by thresholding at 0.5. The `imputed_dataset` is returned as a data.frame, so we can use `mutate()` from the tidyverse to convert the binary variables.

```{r eval=FALSE}
# Convert imputed probabilities to 0/1 using a 0.5 cut-off. Values for which
# neither comparison is TRUE (e.g. NA) fall through to the last case and are
# returned unchanged.
to_binary <- function(prob) {
  case_when(
    prob > 0.5 ~ 1,
    prob <= 0.5 ~ 0,
    TRUE ~ prob
  )
}

# Apply the cut-off to every column whose name matches a dummy-variable prefix.
results$imputed_dataset <- mutate(
  results$imputed_dataset,
  across(.cols = matches("species|island|sex"), .fns = to_binary)
)

head(results$imputed_dataset)
head(penguin_dummies)
head(penguin_dummies_complete)
```

```{r echo=FALSE}
# Hidden counterpart of the chunk shown above: threshold the probabilities in
# the dummy columns at 0.5, leaving values that match neither comparison
# (e.g. NA) unchanged.
results <- mutate(
  results,
  across(
    .cols = matches("species|island|sex"),
    .fns = function(prob) {
      case_when(
        prob > 0.5 ~ 1,
        prob <= 0.5 ~ 0,
        TRUE ~ prob
      )
    }
  )
)

# Show the thresholded result, the input with missing values, and the complete
# data side by side.
head(results)
head(penguin_dummies)
head(penguin_dummies_complete)
```