## ----message=FALSE, warning=FALSE, eval = requireNamespace(c("stringr", "tidyr"))----
library(dplyr)
library(stringr)
library(tidyr)
library(edgeR)
library(mice)
# Generate random covariate data
set.seed(2023)
x1 <- rnorm(n = 500)
y1 <- rnorm(n = 500)
z1 <- rnorm(n = 500)
a1 <- rbinom(n = 500, prob = .25, size = 1)
b1 <- rbinom(n = 500, prob = .5, size = 1)
data <- data.frame(x = x1, y = y1, z = z1, a = a1, b = b1)

# Generate random count data from Poisson distribution
nsamp <- nrow(data)
ngene <- 500
mat <- matrix(stats::rpois(n = nsamp * ngene, lambda = sample(1:500, ngene, replace = TRUE)),
    nrow = ngene,
    ncol = nsamp
)

# Make fake ENSEMBL gene numbers
annot <- tibble(number = seq_len(ngene), name1 = "ENS") %>%
    mutate(ENSEMBL = str_c(name1, number)) %>%
    dplyr::select(ENSEMBL)
rownames(mat) <- annot$ENSEMBL

# Make DGE list and set rownames to ENSEMBL gene numbers above
example_DGE <- DGEList(mat, genes = annot)
rownames(example_DGE) <- annot$ENSEMBL

## ----message=FALSE, warning=FALSE---------------------------------------------
# First get all combos of 0 or 1 for the 4 other variables (y, z, a, and b)
pattern_vars <- expand.grid(c(0, 1), c(0, 1), c(0, 1), c(0, 1))
# Then add back a column for the predictor of interest, which is never amputed, so the first col =1 the whole way down
pattern2 <- matrix(1, nrow = nrow(pattern_vars), ncol = 1)
pattern1 <- cbind(pattern2, pattern_vars)
# Remove last row which is all 1s (all 1s means no missingness induced)
pattern <- pattern1[seq_len(15), ]

# Specify proportion of missing data and missingness mechanism for the ampute function
prop_miss <- 25
miss_mech <- "MAR"
result <- ampute(data = data, prop = prop_miss, mech = miss_mech, patterns = pattern)

# Extract the missing data
example_data <- result$amp
# Ampute function turns factors to numeric, so convert back to factors
example_data <- example_data %>% mutate(
    a = as.factor(a),
    b = as.factor(b)
)

# Calculate the new sample size if NAs are dropped as in a complete case analysis
sample_size <- example_data %>%
    drop_na() %>%
    nrow()

# As shown below, we have 24.2% missingness.
100 * (nsamp - sample_size) / nsamp

## ----message=FALSE, warning=FALSE, echo=FALSE---------------------------------
example_data %>%
    head(10) %>%
    knitr::kable(digits = 3, caption = "The first 10 rows of simulated covariate data with missingness")

## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()