## ----eval=FALSE---------------------------------------------------------------
# # Optional: set up renv for this project before installing anything else.
# install.packages("renv")
# library(renv)
# renv::init()

## ----eval=FALSE---------------------------------------------------------------
# # Install sparklyr, raise the timeout option, then install Spark 4.0.0.
# install.packages("sparklyr") # version 1.9.1
# options(timeout = 6000)
# library(sparklyr)
# spark_install(version = "4.0.0")

## ----eval=FALSE---------------------------------------------------------------
# # Check the installed Spark versions and the installed sparklyr version.
# sparklyr::spark_installed_versions()
# utils::packageVersion("sparklyr")

## ----eval=FALSE---------------------------------------------------------------
# # install.packages("bigMICE")

## ----eval=FALSE---------------------------------------------------------------
# library(bigMICE)

## ----eval=FALSE---------------------------------------------------------------
# library(bigMICE)
# library(dplyr)
# library(sparklyr)

## ----eval=FALSE---------------------------------------------------------------
# # Settings for the local Spark connection: driver memory, memory fraction
# # and number of local cores.
# conf <- spark_config()
# conf$`sparklyr.shell.driver-memory` <- "10G"
# conf$`spark.memory.fraction` <- 0.8
# conf$`sparklyr.cores.local` <- 4
# #conf$`spark.local.dir` <- "/local/data/spark_tmp/" # needed for checkpointing.
# # If setting `spark.local.dir` is not possible, add the parameter
# # checkpointing = FALSE to the mice.spark call.
#
# sc <- spark_connect(master = "local", config = conf)

## ----eval=FALSE---------------------------------------------------------------
# # Loading the data
# load("boys.rda")
#
# # Making a binary outcome: phb becomes 1 for "P1", 0 for any other observed
# # value, and stays missing where it was missing.
# boysBin <- boys %>%
#   mutate(
#     phb = as.factor(case_when(
#       phb == "P1" ~ 1,
#       is.na(phb) ~ NA,
#       TRUE ~ 0
#     ))
#   )
#
# # Export the recoded data (boysBin, not the raw boys) so that the phb column
# # read by Spark matches the "Binary" type declared in variable_types below.
# tmpdir <- tempdir()
# csv_file <- file.path(tmpdir, "data.csv")
# write.csv(boysBin, csv_file, row.names = FALSE)
#
# sdf <- spark_read_csv(
#   sc, "data", csv_file,
#   header = TRUE, infer_schema = TRUE, null_value = "NA"
# ) %>%
#   select(-all_of(c("hgt", "wgt", "bmi", "hc")))
# # Remove only the exported file: tmpdir is the session-wide temporary
# # directory, so deleting it recursively would break later tempfile() use.
# unlink(csv_file)
#
# # preparing the elements before running bigMICE
#
# # One entry per column kept in sdf, naming the type passed to mice.spark.
# variable_types <- c(
#   age = "Continuous_float",
#   gen = "Nominal",
#   phb = "Binary",
#   tv = "Continuous_int",
#   reg = "Nominal"
# )
#
# analysis_formula <- as.formula("phb ~ age + gen + tv + reg")

## ----eval=FALSE---------------------------------------------------------------
# # checkpointing = FALSE because spark.local.dir is not set in conf above.
# imputation_results <- bigMICE::mice.spark(
#   data = sdf,
#   sc = sc,
#   variable_types = variable_types,
#   analysis_formula = analysis_formula,
#   predictorMatrix = NULL,
#   m = 2,
#   maxit = 1,
#   checkpointing = FALSE
# )
#
# print(imputation_results)
#
# spark_disconnect(sc)