## ----eval=FALSE---------------------------------------------------------------
# install.packages("renv")
# library(renv)
# renv::init()

## ----eval=FALSE---------------------------------------------------------------
# install.packages("sparklyr") # version 1.9.1
# options(timeout = 6000)
# library(sparklyr)
# spark_install(version="4.0.0")

## ----eval=FALSE---------------------------------------------------------------
# sparklyr::spark_installed_versions()
# utils::packageVersion("sparklyr")

## ----eval=FALSE---------------------------------------------------------------
# 
# install.packages("bigMICE")

## ----eval=FALSE---------------------------------------------------------------
# library(bigMICE)

## ----eval=FALSE---------------------------------------------------------------
# library(bigMICE)
# library(dplyr)
# library(sparklyr)

## ----eval=FALSE---------------------------------------------------------------
# # Local Spark configuration: driver memory, memory fraction and core count.
# conf <- spark_config()
# conf[["sparklyr.shell.driver-memory"]] <- "10G"
# conf[["spark.memory.fraction"]] <- 0.8
# conf[["sparklyr.cores.local"]] <- 4
# # Checkpointing needs a local Spark directory. If one cannot be set, pass
# # checkpointing = FALSE to the mice.spark() call instead.
# # conf[["spark.local.dir"]] <- "/local/data/spark_tmp/"
# 
# sc <- spark_connect(master = "local", config = conf)

## ----eval=FALSE---------------------------------------------------------------
# # Load the `boys` data set from a local .rda file.
# load("boys.rda")
# 
# # Recode phb as a binary outcome: 1 for "P1", 0 for any other observed value,
# # missing values stay missing. NA_real_ keeps every branch numeric.
# boysBin <- boys %>%
#   mutate(
#     phb = as.factor(case_when(
#       phb == "P1" ~ 1,
#       is.na(phb) ~ NA_real_,
#       TRUE ~ 0
#     ))
#   )
# 
# # Export the recoded data (not the raw `boys`) so that phb really is the
# # binary variable declared in variable_types below.
# tmpdir <- tempdir()
# csv_file <- file.path(tmpdir, "data.csv")
# write.csv(boysBin, csv_file, row.names = FALSE)
# 
# sdf <- spark_read_csv(
#   sc, "data", csv_file,
#   header = TRUE, infer_schema = TRUE, null_value = "NA"
# ) %>%
#   select(-all_of(c("hgt", "wgt", "bmi", "hc")))
# 
# # Remove only the exported file: tempdir() is the session-wide temporary
# # directory and deleting it would break later tempfile() use.
# unlink(csv_file)
# 
# # Preparing the elements before running bigMICE
# variable_types <- c(
#   age = "Continuous_float",
#   gen = "Nominal",
#   phb = "Binary",
#   tv = "Continuous_int",
#   reg = "Nominal"
# )
# 
# analysis_formula <- as.formula("phb ~ age + gen + tv + reg")
# 

## ----eval=FALSE---------------------------------------------------------------
# # Run bigMICE on the Spark table with m = 2 and maxit = 1; checkpointing is
# # turned off for this call.
# imputation_results <- bigMICE::mice.spark(
#   data = sdf,
#   sc = sc,
#   variable_types = variable_types,
#   analysis_formula = analysis_formula,
#   predictorMatrix = NULL,
#   m = 2,
#   maxit = 1,
#   checkpointing = FALSE
# )
# 
# print(imputation_results)
# 
# # Close the Spark connection once the results have been printed.
# spark_disconnect(sc)