--- title: "Adjusted Limited Dependent Variable Mixture Models of Health State Utilities in R" author: Mark Pletscher date: "last modified: 2025-11-18" package: aldvmm output: bookdown::html_document2: base_format: rmarkdown::html_vignette highlight: monochrome toc: true bibliography: aldvmm_bib.bib vignette: > %\VignetteIndexEntry{Adjusted Limited Dependent Variable Mixture Models of Health State Utilities in R} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, eval = TRUE, echo = FALSE, results = 'hide'} # Load packages #-------------- library("aldvmm") library("xtable") library("ggplot2") library("scales") library("reshape2") library("kableExtra") # Load functions #--------------- source("rep_tab_reg.R") # Figure position HERE #--------------------- knitr::opts_chunk$set(fig.pos = "!H", out.extra = "") warnings(file = "./R-warnings.txt") ``` ```{r load_data, eval = FALSE, warning = FALSE, echo = FALSE, results = 'hide'} # List to collect results used in text #------------------------------------- textout <- list() # Download data #-------------- temp <- tempfile() download.file(paste0("https://files.digital.nhs.uk/publicationimport", "/pub11xxx/pub11359/final-proms-eng-apr11-mar12", "-data-pack-csv.zip"), temp) df <- read.table(unz(description = temp, filename = 'Hip Replacement 1112.csv'), sep = ',', header = TRUE) unlink(temp) rm(temp) # Recode data #------------ df <- df[df$AGEBAND != '*' & df$SEX != '*', c('AGEBAND', 'SEX', 'Q2_EQ5D_INDEX', 'HR_Q2_SCORE')] df$eq5d <- df$Q2_EQ5D_INDEX df$hr <- df$HR_Q2_SCORE/10 # Select sample #-------------- textout[["n"]] <- nrow(df) df <- df[stats::complete.cases(df), ] textout[["ncplt"]] <- nrow(df) set.seed(101010101) df <- df[sample(1:nrow(df), size = nrow(df)*0.3), ] textout[["nspl"]] <- nrow(df) # Population average Oxford Hip Score #------------------------------------ textout[["meanhr"]] <- mean(df[, "hr"], na.rm = TRUE) # Save output for text #--------------------- save(textout, file = "textout.RData", compress = TRUE) rm(textout) # Save data for plots #-------------------- save(df, file = "df.RData", compress = FALSE) rm(df) ``` ```{r calc_fit, eval = FALSE, warning = FALSE, echo = FALSE, results = 'hide'} load("df.RData") #------------------------------------------------------------------------------ # Fit model 1 #------------------------------------------------------------------------------ formula <- eq5d ~ hr | 1 # Assess all optimization methods #-------------------------------- init.method <- c("zero", "random", "constant", "sann") optim.method <- c("Nelder-Mead", "BFGS", "CG", "L-BFGS-B", #"nlm", "nlminb", "Rcgmin", "Rvmmin","hjn") fit1_all <- list() for (i in init.method) { for (j in optim.method) { cat(paste0("\n", i, " - ", j, "\n")) start <- Sys.time() set.seed(101010101) fit <- tryCatch({ aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 2, init.method = i, optim.method = j, model = FALSE) }, error = function(e) { return(list()) }) end <- Sys.time() fit1_all[["fit"]][[i]][[j]] <- fit fit1_all[["time"]][[i]][[j]] <- difftime(end, start, units = c("mins")) rm(fit, start, end) } } save(fit1_all, file = "fit1_all.RData", compress = TRUE) rm(fit1_all, init.method, optim.method, i, j) # Default model ("nlminb") #------------------------- # With standard errors of fitted values for comparison to STATA fit1_nlminb <- aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 2, optim.method = "nlminb", se.fit = TRUE) save(fit1_nlminb, file = "fit1_nlminb.RData", 
compress = TRUE) rm(fit1_nlminb) # Constrained optimization #------------------------- # c(0.2358245, 0.1458986, -0.4306492, 0.3134739, 0.7282714, -2.4622704, -1.2480114) init <- c(0, 0, 0, 0, 0, 0, 0.7283) lo <- c(-Inf, -Inf, -3, -Inf, -Inf, -3, -Inf) hi <- c(Inf, Inf, Inf, Inf, Inf, Inf, Inf) fit1_cstr <- aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 2, init.est = init, init.lo = lo, init.hi = hi) save(fit1_cstr, file = "fit1_cstr.RData", compress = TRUE) rm(fit1_cstr, init, lo, hi) # Constrained optimization, Hernandez Alava and Wailoo (2015) #----------------------------------------------------------- # With standard errors of fitted values for comparison to STATA init <- c(-.0883435, .2307964, 100, 0, 7.320611, -1.646109, 1.00e-30) lo <- c(-Inf, -Inf, 100, -Inf, -Inf, -Inf, 1e-30) hi <- c(Inf, Inf, Inf, 0, Inf, Inf, Inf) fit1_cstr_stata <- aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 2, init.est = init, init.lo = lo, init.hi = hi, se.fit = TRUE) save(fit1_cstr_stata, file = "fit1_cstr_stata.RData", compress = TRUE) rm(fit1_cstr_stata, init, lo, hi) # Constant-only initial values ("nlminb") #---------------------------------------- # With standard errors of fitted values for comparison to STATA fit1_cons <- aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 2, init.method = "constant", optim.method = "nlminb", se.fit = TRUE) save(fit1_cons, file = "fit1_cons.RData", compress = TRUE) rm(fit1_cons) # Single-component model #----------------------- fit1_tobit <- aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 1, optim.method = "nlminb") save(fit1_tobit, file = "fit1_tobit.RData", compress = TRUE) rm(fit1_tobit) rm(formula) #------------------------------------------------------------------------------ # Fit model 2 #------------------------------------------------------------------------------ formula <- eq5d ~ hr | hr # Zero starting values #--------------------- fit2 <- aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 2, optim.method = "nlminb") save(fit2, file = "fit2.RData", compress = TRUE) rm(fit2) # Starting values from Hernandez Alava and Wailoo (2015) #------------------------------------------------------- # With standard errors of fitted values for comparison to STATA init <- c(-.40293118, .30502755, .22614716, .14801581, -.70755741, 0, -1.2632051, -2.4541401) fit2_stata <- aldvmm::aldvmm(data = df, formula = formula, psi = c(0.883, -0.594), ncmp = 2, init.est = init, optim.method = "nlminb", se.fit = TRUE) save(fit2_stata, file = "fit2_stata.RData", compress = TRUE) rm(fit2_stata, init) rm(formula) rm(df) ``` ```{r calc_plot, eval = FALSE, warning = FALSE, echo = FALSE, results = 'hide'} # Load data #---------- load("df.RData") load("textout.RData") # Load functions #--------------- source("est_mhl.R") # Custom ggplot theme #-------------------- ggplot_theme <- theme(panel.background = element_rect(fill = "white", colour = "white", #linewidth = 0.5, linetype = "solid"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.grid.major.y = element_line(#linewidth = 0.25, linetype = 'dashed', colour = "grey"), text = element_text(size = 11), plot.margin = margin(10, 10, 0, 10), #axis.line = element_line(linewidth = 0.25), axis.text = element_text(size = 11), #legend.title = element_text(size = 11), legend.title = element_blank(), legend.text = element_text(size = 11), legend.position = 'bottom', legend.direction = 
"vertical", legend.key = element_blank()) # Histogram of observed outcomes #------------------------------- plot_hist_obs <- ggplot2::ggplot(df, aes(x = eq5d)) + geom_histogram(binwidth = 0.02, color="black", fill="white") + xlab("EQ-5D-3L utilities") + ylab("Frequency") + scale_y_continuous(labels = scales::comma) + ggplot_theme ggsave("plot_hist_obs.png", width = 6, height = 2.9, dpi = 150) rm(plot_hist_obs) # Histogram of predicted outcomes Model 1, zero starting values, nlminb #--------------------------------------------------------------------------- load("fit1_all.RData") plotdf <- data.frame(yhat = fit1_all[["fit"]][["zero"]][["nlminb"]][["pred"]][["yhat"]]) plot_hist_pred <- ggplot2::ggplot(plotdf, aes(x = yhat)) + geom_histogram(binwidth = 0.02, color="black", fill="white") + xlab("EQ-5D-3L utilities") + ylab("Frequency") + scale_y_continuous(labels = scales::comma) + ggplot_theme ggsave("plot_hist_pred.png", width = 6, height = 2.9, dpi = 150) rm(plotdf, plot_hist_pred, fit1_all) # Comparison of predicted values from different optimization methods #------------------------------------------------------------------- load("fit1_all.RData") reslist <- lapply(fit1_all[["fit"]][["zero"]], function (x) { x[["pred"]][["yhat"]] - fit1_all[["fit"]][["zero"]][["BFGS"]][["pred"]][["yhat"]] }) tmpdf <- as.data.frame(do.call("cbind", reslist)) tmpdf$yhat <- fit1_all[["fit"]][["zero"]][["BFGS"]][["pred"]][["yhat"]] tmpdf <- melt(tmpdf, id.vars = "yhat", variable.name = "algorithm") tmpdf <- tmpdf[!duplicated(tmpdf[c("yhat", "algorithm")]), ] plot_comp_pred <- ggplot2::ggplot(tmpdf, aes(x = yhat, y = value, group = algorithm, color = algorithm)) + geom_line(aes(linetype = algorithm)) + #scale_linetype_manual(values = c("solid", "dashed", "dotted", "dotdash")) + #scale_color_manual(values = c("black", "green", "blue", "red")) + #scale_x_continuous(breaks = seq(-0.2, 1, by = 0.1)) + xlab("E[y|X] BFGS") + ylab("Difference from E[y|X] BFGS") + guides(linetype = guide_legend(nrow = 1)) + ggplot_theme + theme(panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank()) + guides(color = guide_legend(ncol = 5, byrow = TRUE), linetype = guide_legend(nrow = 2, byrow = TRUE)) ggsave("plot_comp_pred.png", width = 6, height = 2.9, dpi = 150) rm(fit1_all, reslist, tmpdf, plot_comp_pred) # Modified Hosmer-Lemeshow test by optimization method #----------------------------------------------------- load("fit1_all.RData") set.seed(101010101) plot_comp_mhl_bfgs <- est_mhl(fit1_all[["fit"]][["zero"]][["BFGS"]], ngroup = 10) ggsave("plot_comp_mhl_bfgs.png", width = 6, height = 2.9, dpi = 150) rm(plot_comp_mhl_bfgs) set.seed(101010101) plot_comp_mhl_nm <- est_mhl(fit1_all[["fit"]][["zero"]][["Nelder-Mead"]], ngroup = 10) ggsave("plot_comp_mhl_nm.png", width = 6, height = 2.9, dpi = 150) rm(plot_comp_mhl_nm) set.seed(101010101) plot_comp_mhl_nlminb <- est_mhl(fit1_all[["fit"]][["zero"]][["nlminb"]], ngroup = 10) ggsave("plot_comp_mhl_nlminb.png", width = 6, height = 2.9, dpi = 150) rm(plot_comp_mhl_nlminb) set.seed(101010101) plot_comp_mhl_hjn <- est_mhl(fit1_all[["fit"]][["zero"]][["hjn"]], ngroup = 10) ggsave("plot_comp_mhl_hjn.png", width = 6, height = 2.9, dpi = 150) rm(plot_comp_mhl_hjn) rm(fit1_all) # Box plot of fitted values in R and STATA #----------------------------------------- # Load aldvmm objects load("fit1_nlminb.RData") load("fit1_cstr_stata.RData") load("fit1_cons.RData") load("fit2_stata.RData") stata_yhat <- read.table("stata_yhat.csv", header = TRUE, sep = ";", dec = ".") 
stata_yhat_long <- stata_yhat names(stata_yhat_long) <- c("Ref. case 1", "Ref. case 2", "Ref. case 3", "Ref. case 4") suppressMessages(stata_yhat_long <- reshape2::melt(stata_yhat_long)) stata_yhat_long$variable <- as.character(stata_yhat_long$variable) stata_yhat_long$software <- "STATA" r_yhat_long <- rbind(cbind(variable = "Ref. case 1", value = fit1_nlminb[["pred"]][["yhat"]], software = "R"), cbind(variable = "Ref. case 2", value = fit1_cstr_stata[["pred"]][["yhat"]], software = "R"), cbind(variable = "Ref. case 3", value = fit1_cons[["pred"]][["yhat"]], software = "R"), cbind(variable = "Ref. case 4", value = fit2_stata[["pred"]][["yhat"]], software = "R")) r_yhat_long <- as.data.frame(r_yhat_long) plotdf <- rbind(stata_yhat_long, r_yhat_long) suppressWarnings( plot_box_yhat <- ggplot2::ggplot(plotdf, aes(x = variable, y = as.numeric(value), fill = software)) + geom_boxplot() + ylab("Fitted values") + ggplot_theme + theme(axis.title.x = element_blank()) + guides(fill = guide_legend(nrow = 1, byrow = TRUE)) ) ggsave("plot_box_yhat.png", width = 6, height = 2.9, dpi = 150) rm(plotdf, plot_box_yhat) rm(fit1_cons, fit1_cstr_stata, fit1_nlminb) rm(fit2_stata) rm(stata_yhat, stata_yhat_long) rm(r_yhat_long) # Box plot of standard errors of fitted values in R and STATA #------------------------------------------------------------ load("fit1_nlminb.RData") load("fit1_cstr_stata.RData") load("fit1_cons.RData") load("fit2_stata.RData") stata_se <- read.table("stata_se.csv", header = TRUE, sep = ";", dec = ".") stata_se_long <- stata_se names(stata_se_long) <- c("Ref. case 1", "Ref. case 2", "Ref. case 3", "Ref. case 4") suppressMessages(stata_se_long <- reshape2::melt(stata_se_long)) stata_se_long$variable <- as.character(stata_se_long$variable) stata_se_long$software <- "STATA" r_se_long <- rbind(cbind(variable = "Ref. case 1", value = fit1_nlminb[["pred"]][["se.fit"]], software = "R"), # cbind(variable = "Ref. case 2", # value = fit1_cstr_stata[["pred"]][["se.fit"]], # software = "R"), cbind(variable = "Ref. case 3", value = fit1_cons[["pred"]][["se.fit"]], software = "R"), cbind(variable = "Ref. 
case 4", value = fit2_stata[["pred"]][["se.fit"]], software = "R")) r_se_long <- as.data.frame(r_se_long) plotdf <- rbind(stata_se_long, r_se_long) suppressWarnings( plot_box_se <- ggplot2::ggplot(plotdf, aes(x = variable, y = as.numeric(value), fill = software)) + geom_boxplot() + ylab("Standard errors of fitted values") + ggplot_theme + theme(axis.title.x = element_blank()) + guides(fill = guide_legend(nrow = 1, byrow = TRUE)) ) ggsave("plot_box_se.png", width = 6, height = 2.9, dpi = 150) rm(plotdf, plot_box_se) rm(fit1_cons, fit1_cstr_stata, fit1_nlminb) rm(fit2_stata) rm(stata_se, stata_se_long) rm(r_se_long) rm(df) rm(est_mhl, ggplot_theme) ``` ```{r calc_tab, eval = FALSE, warning = FALSE, echo = FALSE, results = 'hide'} # Load data #---------- load("df.RData") # Load functions #--------------- source("rep_tab_fit.R") # Model 1, comparison of optimization methods #-------------------------------------------- load("fit1_all.RData") tab_comp_ll <- matrix(NA, nrow = length(fit1_all[["fit"]]), ncol = length(fit1_all[["fit"]][[1]]), dimnames = list(names(fit1_all[["fit"]]), names(fit1_all[["fit"]][[1]]))) tab_comp_time <- tab_comp_ll tab_comp_cov <- tab_comp_ll tab_comp_mse <- tab_comp_ll tab_comp_mae <- tab_comp_ll for (i in names(fit1_all[["fit"]])) { # Initial value methods for (j in names(fit1_all[["fit"]][[1]])) { # Optimization methods if (length(fit1_all[["fit"]][[i]][[j]]) != 0) { tab_comp_ll[i, j] <- -fit1_all[["fit"]][[i]][[j]][["gof"]][["ll"]] tab_comp_time[i, j] <- fit1_all[["time"]][[i]][[j]] cov <- fit1_all[["fit"]][[i]][[j]][["cov"]] tab_comp_cov[i, j] <- sum(diag(cov) < 0) == 0 & sum(is.na(diag(cov))) == 0 rm(cov) tab_comp_mse[i, j] <- fit1_all[["fit"]][[i]][[j]][["gof"]][["mse"]] tab_comp_mae[i, j] <- fit1_all[["fit"]][[i]][[j]][["gof"]][["mae"]] } } } save(tab_comp_ll, file = "tab_comp_ll.RData", compress = TRUE) save(tab_comp_time, file = "tab_comp_time.RData", compress = TRUE) save(tab_comp_cov, file = "tab_comp_cov.RData", compress = TRUE) save(tab_comp_mse, file = "tab_comp_mse.RData", compress = TRUE) save(tab_comp_mae, file = "tab_comp_mae.RData", compress = TRUE) rm(tab_comp_ll, tab_comp_time, tab_comp_cov, tab_comp_mse, tab_comp_mae, i, j) rm(fit1_all) # Model 1 comparison of coefficients by optimization method #---------------------------------------------------------- load("fit1_all.RData") comp <- rep_tab_fit(fit1_all[["fit"]][["zero"]][["BFGS"]])[["table"]][, 1] comp <- comp[-length(comp)] vars <- rep_tab_fit(fit1_all[["fit"]][["zero"]][["BFGS"]])[["table"]][, 2] vars <- vars[-length(vars)] tmplist <- lapply(fit1_all[["fit"]][["zero"]], function (x) { col <- rep_tab_fit(x)[["table"]][, 3] col <- col[-length(col)] }) tab_comp_coef <- cbind(" " = comp, " " = vars, do.call("cbind", tmplist)) save(tab_comp_coef, file = "tab_comp_coef.RData", compress = TRUE) rm(tmplist, comp, vars, tab_comp_coef) rm(fit1_all) # Summary table model 1, zero, BFGS #---------------------------------- load("fit1_all.RData") tab_sum_mod1bfgs <- rep_tab_fit(fit1_all[["fit"]][["zero"]][["BFGS"]]) save(tab_sum_mod1bfgs, file = "tab_sum_mod1bfgs.RData", compress = TRUE) rm(tab_sum_mod1bfgs) rm(fit1_all) # Summary table model 1, zero, nlminb #------------------------------------ load("fit1_all.RData") tab_sum_mod1nlminb <- rep_tab_fit(fit1_all[["fit"]][["zero"]][["nlminb"]]) save(tab_sum_mod1nlminb, file = "tab_sum_mod1nlminb.RData", compress = TRUE) rm(tab_sum_mod1nlminb) rm(fit1_all) # Summary table model 1, constant-only model as starting values, nlminb 
#---------------------------------------------------------------------- load("fit1_all.RData") tab_sum_const <- rep_tab_fit(fit1_all[["fit"]][["constant"]][["nlminb"]]) save(tab_sum_const, file = "tab_sum_const.RData", compress = TRUE) rm(tab_sum_const) rm(fit1_all) # Summary table model 1, zero, constrained #----------------------------------------- load("fit1_cstr.RData") tab_sum_cstr <- rep_tab_fit(fit1_cstr) save(tab_sum_cstr, file = "tab_sum_cstr.RData", compress = TRUE) rm(fit1_cstr, tab_sum_cstr) # Summary table model 1, constrained with stata estimates as starting values #--------------------------------------------------------------------------- load("fit1_cstr_stata.RData") tab_sum_cstata <- rep_tab_fit(fit1_cstr_stata) save(tab_sum_cstata, file = "tab_sum_cstata.RData", compress = TRUE) rm(fit1_cstr_stata, tab_sum_cstata) # Summary table model 1, single-component, zero starting values, nlminb #---------------------------------------------------------------------- load("fit1_tobit.RData") tab_sum_tobit <- rep_tab_fit(fit1_tobit) save(tab_sum_tobit, file = "tab_sum_tobit.RData", compress = TRUE) rm(fit1_tobit, tab_sum_tobit) # Summary table model 2, nlminb #------------------------------ load("fit2.RData") tab_sum_mod2 <- rep_tab_fit(fit2) save(tab_sum_mod2, file = "tab_sum_mod2.RData", compress = TRUE) rm(fit2, tab_sum_mod2) # Summary table model 2, stata estimates as starting values, nlminb #------------------------------------------------------------------ load("fit2_stata.RData") tab_sum_mod2stata <- rep_tab_fit(fit2_stata) save(tab_sum_mod2stata, file = "tab_sum_mod2stata.RData", compress = TRUE) rm(fit2_stata, tab_sum_mod2stata) # Summary statistics of differences in fitted values between R and STATA #----------------------------------------------------------------------- load("fit1_nlminb.RData") load("fit1_cstr_stata.RData") load("fit1_cons.RData") load("fit2_stata.RData") stata_yhat <- read.table("stata_yhat.csv", header = TRUE, sep = ";", dec = ".") stata_se <- read.table("stata_se.csv", header = TRUE, sep = ";", dec = ".") # Fitted values sumhat <- list() sumhat[["yhat"]][["Ref. case 1"]] <- summary(stata_yhat[, 1] - fit1_nlminb[["pred"]][["yhat"]]) sumhat[["yhat"]][["Ref. case 2"]] <- summary(stata_yhat[, 2] - fit1_cstr_stata[["pred"]][["yhat"]]) sumhat[["yhat"]][["Ref. case 3"]] <- summary(stata_yhat[, 3] - fit1_cons[["pred"]][["yhat"]]) sumhat[["yhat"]][["Ref. case 4"]] <- summary(stata_yhat[, 4] - fit2_stata[["pred"]][["yhat"]]) tab_diff_yhat <- suppressWarnings( round(do.call("cbind", sumhat[["yhat"]]), 6) ) tab_diff_yhat <- tab_diff_yhat[1:6, ] save(tab_diff_yhat, file = "tab_diff_yhat.RData", compress = TRUE) # Standard errors of fitted values sumhat[["se"]][["Ref. case 1"]] <- summary(stata_se[, 1] - fit1_nlminb[["pred"]][["se.fit"]]) sumhat[["se"]][["Ref. case 2"]] <- summary(stata_se[, 2] - fit1_cstr_stata[["pred"]][["se.fit"]]) sumhat[["se"]][["Ref. case 3"]] <- summary(stata_se[, 3] - fit1_cons[["pred"]][["se.fit"]]) sumhat[["se"]][["Ref. 
case 4"]] <- summary(stata_se[, 4] - fit2_stata[["pred"]][["se.fit"]]) tab_diff_se <- suppressWarnings( round(do.call("cbind", sumhat[["se"]]), 6) ) tab_diff_se <- tab_diff_se[1:6, ] save(tab_diff_se, file = "tab_diff_se.RData", compress = TRUE) rm(fit1_cons, fit1_cstr_stata, fit1_nlminb) rm(fit2_stata) rm(tab_diff_se, tab_diff_yhat) rm(stata_se, stata_yhat) rm(sumhat) # Comparison of R and STATA results #--------------------------------- # Load summary tables load("tab_sum_mod1nlminb.RData") load("tab_sum_cstata.RData") load("tab_sum_const.RData") load("tab_sum_mod2stata.RData") load("tab_sum_stata1.RData") load("tab_sum_stata2.RData") load("tab_sum_stata3.RData") load("tab_sum_stata4.RData") # Model keys tabvec <- c("tab_sum_mod1nlminb", "tab_sum_stata1", "tab_sum_cstata", "tab_sum_stata2", "tab_sum_const", "tab_sum_stata3", "tab_sum_mod2stata", "tab_sum_stata4") # Matrix of coefficients tab_rstata_coef <- matrix(NA, nrow = nrow(tab_sum_mod2stata$table), ncol = length(tabvec) + 2) tab_rstata_coef[, 1:2] <- as.matrix(tab_sum_mod2stata$table[, 1:2]) tab_rstata_coef[nrow(tab_rstata_coef), 2] <- "" # Matrix of standard errors tab_rstata_se <- tab_rstata_coef # Populate tables for (i in tabvec) { nr <- nrow(get(i)$table) for (j in 1:(nr - 1)) { lltmp <- format( as.numeric(gsub("[^0-9.-]", "", as.matrix(get(i)$table)[nr, 2])), big.mark = "'" ) tab_rstata_coef[j, 2 + match(i, tabvec)] <- as.matrix(get(i)$table)[j, 3] tab_rstata_coef[nrow(tab_rstata_coef), 2 + match(i, tabvec)] <- lltmp tab_rstata_coef[nrow(tab_rstata_coef), 2] <- "ll" tab_rstata_se[j, 2 + match(i, tabvec)] <- as.matrix(get(i)$table)[j, 4] tab_rstata_se[nrow(tab_rstata_se), 2 + match(i, tabvec)] <- lltmp tab_rstata_se[nrow(tab_rstata_se), 2] <- "ll" } rm(nr) } rm(i, j, lltmp) tab_rstata_coef <- as.data.frame(rbind(c("", "", "R", "STATA", "R", "STATA", "R", "STATA", "R", "STATA"), tab_rstata_coef)) tab_rstata_se <- as.data.frame(rbind(c("", "", "R", "STATA", "R", "STATA", "R", "STATA", "R", "STATA"), tab_rstata_se)) names(tab_rstata_coef) <- names(tab_rstata_se) <- c("", "", "Ref. case 1", "", "Ref. case 2", "", "Ref. case 3", "", "Ref. 
case 4", "") save(tab_rstata_coef, file = "tab_rstata_coef.RData", compress = TRUE) save(tab_rstata_se, file = "tab_rstata_se.RData", compress = TRUE) rm(tab_rstata_coef, tab_rstata_se, tabvec, tab_sum_const, tab_sum_cstata, tab_sum_mod1nlminb, tab_sum_mod2stata, tab_sum_stata1, tab_sum_stata2, tab_sum_stata3, tab_sum_stata4) # Remove fit objects #------------------- # file.remove(dir(path = ".", pattern="fi1_")) # file.remove(dir(path = ".", pattern="fi2_")) rm(df) rm(rep_tab_fit) ``` ```{r calc_tab_stata, eval = FALSE, warning = FALSE, echo = FALSE, results = 'hide'} # Load functions #--------------- source("rep_tab_stata.R") # Model 1, default options #------------------------- tab_sum_stata1 <- rep_tab_stata("stata_model1_default.csv") save(tab_sum_stata1, file = "tab_sum_stata1.RData", compress = TRUE) rm(tab_sum_stata1) # Model 1, constraints #---------------------- tab_sum_stata2 <- rep_tab_stata("stata_model1_cstr.csv") save(tab_sum_stata2, file = "tab_sum_stata2.RData", compress = TRUE) rm(tab_sum_stata2) # Model 1, constant-only initial values #-------------------------------------- tab_sum_stata3 <- rep_tab_stata("stata_model1_cons.csv") save(tab_sum_stata3, file = "tab_sum_stata3.RData", compress = TRUE) rm(tab_sum_stata3) # Model 2, user-defined initial values #------------------------------------- tab_sum_stata4 <- rep_tab_stata("stata_model2.csv") save(tab_sum_stata4, file = "tab_sum_stata4.RData", compress = TRUE) rm(tab_sum_stata4) rm(rep_tab_stata) ``` # Introduction Health-related quality of life is a key outcome in health technology assessments because it is patient-relevant and it is needed to calculate quality-adjusted life years. Quality of life instruments typically measure health problems in multiple domains using ordinal Likert scales. Value sets or valuation functions convert these profiles of ordinal measures into cardinal health state utilities between 1 (perfect health) and minus infinity, where 0 represents death, and negative values represent health states worse than death. Because 100% quality of life represents perfect health, health state utilities are limited at 1. The lowest possible utility in a local value set further defines a lower limit of health state utilities in a local population. Thus, health state utilities are limited dependent variables. In addition, health state utilities often show gaps between 1 and the next smaller utility in the value set. These gaps occur more frequently in quality of life instruments with few levels in the Likert scales such as the EQ-5D-3L [@Mulhern2018]. A last but important particularity of health state utilities is that they can be the consequence of multiple latent classes, or they can exhibit multi-modal marginal densities [@HernandezAlava2014]. Adjusted limited dependent variable mixture models are finite mixtures of normal distributions that account for limits, gaps between 1 and the next smaller utility value, and multi-modality [@HernandezAlava2012; @HernandezAlava2013; @HernandezAlava2014; @HernandezAlava2015; @Mukuria2019]. These features can improve empirical fit, parameter identification and predictive accuracy compared to standard regression models. Thus, adjusted limited dependent variable mixture models are particularly useful for mapping studies [@Gray2018a; @Gray2018; @Dixon2020; @Yang2019; @Xu2020; @Fuller2017; @Pennington2020]. 
The R 'aldvmm' package is an implementation of the adjusted limited dependent variable mixture model proposed by @HernandezAlava2015 using normal component distributions and a multinomial logit model of probabilities of component membership. The objectives of this vignette are to demonstrate the usage of the 'aldvmm' package, to show important challenges of fitting adjusted limited dependent variable mixture models, and to validate the R implementation against the STATA\textsuperscript{\textregistered} package [@HernandezAlava2015] using publicly available data.

# Methods

Adjusted limited dependent variable mixture models are finite mixtures of normal distributions in $C$ components $c$ with conditional expectations $E[y|X, c] = X\beta^{c}$ and standard deviations $\sigma^{c}$. Probabilities of component membership are estimated using a multinomial logit model $P[c|X] = \exp(X\delta^{c})/\sum_{k=1}^{C}\exp(X\delta^{k})$. The model accumulates the density mass of outcomes per component $y_{i}|c$ below a minimum value $\Psi_{1}$ at the value $\Psi_{1}$, and the density mass above a maximum value $\Psi_{2}$ at 1. If the maximum value $\Psi_{2}$ is smaller than 1, the model emulates a value set with a gap between 1 and the next smaller value.

\begin{equation}
\label{eq:limits}
\begin{array}{ll}
y_{i}|c =&
\begin{cases}
\begin{array}{ll}
1 & \text{if } y_{i}|c > \Psi_{2}\\
\Psi_{1} & \text{if } y_{i}|c \leq \Psi_{1}\\
y_{i}|c & \text{if } \Psi_{1} < y_{i}|c \leq \Psi_{2}\\
\end{array}
\end{cases}
\end{array}
\end{equation}

In this vignette, we estimate the same models of post-operative EQ-5D-3L utilities as @HernandezAlava2015 and include post-operative Oxford Hip Scores (divided by 10) as the only explanatory variable $x$.

\begin{equation}
\label{eq:models}
\begin{array}{lrl}
\text{Model 1:}& E[y|c, X] &= \beta_{0}^{c} + \beta_{1}^{c}x\\
& P[c|X] &= \text{mlogit}(\delta_{0}^{c})\\
&&\\
\text{Model 2:}& E[y|c, X] &= \beta_{0}^{c} + \beta_{1}^{c}x\\
& P[c|X] &= \text{mlogit}(\delta_{0}^{c} + \delta_{1}^{c}x)
\end{array}
\end{equation}

The `aldvmm()` function fits an adjusted limited dependent variable mixture model using the likelihood function from @HernandezAlava2015 ([appendix](#likfun)). The function calls `optimx::optimr()` to minimize the negative log-likelihood using analytical gradients from `aldvmm.gr()`. The `aldvmm()` function accepts all optimization methods available in `optimx::optimr()` except for "nlm", which requires a different implementation of the likelihood function. The default optimization method is "BFGS".

The model formula supplied to `aldvmm()` is an object of class "formula" with two parts on the right-hand side of `~`. The first part, on the left of the `|` delimiter, represents the model of expected values of the $C$ normal distributions. The second part, on the right of the `|` delimiter, represents the model of probabilities of component membership from a multinomial logit model. A minimal example of this two-part formula interface follows the list of starting value options below.

The 'aldvmm' package provides four options for the generation of starting values of the optimization algorithm.

1. "zero": A vector of zeroes (default).
2. "random": A vector of standard normal random values.
3. "constant": Parameter estimates of a constant-only model as starting values for intercepts and standard deviations, and zeroes for all other parameters. The auxiliary models for obtaining starting values are fitted using zero starting values.
4. "sann": Parameter estimates of a simulated annealing algorithm.
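The following sketch illustrates how the two model specifications above translate into two-part formulas and how a starting value method is selected. The data set `df`, the 'psi' limits and the number of components follow the examples later in this vignette; the object names `fit_mod1` and `fit_mod2` are illustrative.

```{r methods-formula-example, echo = TRUE, eval = FALSE}
# Model 1: the component means depend on the Oxford Hip Score, while the
# multinomial logit model of component membership only includes intercepts.
fit_mod1 <- aldvmm::aldvmm(eq5d ~ hr | 1,
                           data = df,
                           psi = c(0.883, -0.594),
                           ncmp = 2,
                           init.method = "zero")

# Model 2: the Oxford Hip Score also enters the multinomial logit model of
# component membership; starting values come from a constant-only model.
fit_mod2 <- aldvmm::aldvmm(eq5d ~ hr | hr,
                           data = df,
                           psi = c(0.883, -0.594),
                           ncmp = 2,
                           init.method = "constant")
```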
The 'aldvmm' package obtains fitted values using the expected value function from @HernandezAlava2015. Covariance matrices and standard errors of parameters are obtained from a numerical approximation of the hessian matrix using `numDeriv::hessian()`. Standard errors of fitted values in the estimation data $SE^{fit}_{i}$ and standard errors of predicted values in new data $SE^{pred}_{i}$ are calculated using the delta method [@Dowd2014; @Whitmore1986]. $G_{i}$ denotes the gradient of a fitted value with respect to changes in parameter estimates, $\Sigma$ denotes the covariance matrix of parameters, and $MSE$ denotes the mean squared error of fitted versus observed values in the estimation data. \begin{equation} \begin{array}{rl} SE^{fit}_{i} &= \sqrt{G_{i}'\Sigma G_{i}} \end{array} \end{equation} \begin{equation} \begin{array}{rl} SE^{pred}_{i} &= \sqrt{MSE + G_{i}'\Sigma G_{i}} \end{array} \end{equation} The `aldvmm()` function returns an object of S3 class "aldvmm" for which methods for generic functions `print()`, `summary()`, `stats::predict()`, `stats::coef()`, `stats::nobs()`, `stats::vcov()`, `stats::model.matrix()`, `stats::formula()`, `stats::residuals()`, `stats::update()` and `sandwich::estfun()` are available. Objects of class "aldvmm" can be supplied to `sandwich::sandwich()`, `sandwich::vcovCL()`, `sandwich::vcovPL()`, `sandwich::vcovHAC()` and `sandwich::vcovBS()` to estimate robust or clustered standard errors (see example in the [appendix](#swcode)). `sandwich::estfun()` calls `aldvmm.sc()` to return analytical gradients of the log-likelihood with respect to parameters for each observation (see gradient function in [appendix](#likfun)), which allows fast computation of robust standard errors. `sandwich::vcovBS()` allows re-estimating the covariance matrix using bootstrapping with and without clustering, which can be particularly useful in cases with no valid covariance matrix from model fitting. # Installation The latest version of the 'aldvmm' package can be installed from cran. ```{r install, eval = FALSE, echo = TRUE} install.packages("aldvmm") ``` # Data We analyze the same publicly available [EQ-5D-3L utility data](https://digital.nhs.uk/data-and-information/publications/statistical/patient-reported-outcome-measures-proms/finalised-patient-reported-outcome-measures-proms-in-england-april-2011-to-march-2012) from English patients after hip replacement in 2011 and 2012 [@NHSDigital2013] as @HernandezAlava2015 in their description of the STATA\textsuperscript{\textregistered} ALDVMM package. ```{r data, eval = FALSE, echo = TRUE} temp <- tempfile() url <- paste0("https://files.digital.nhs.uk/publicationimport/pub11xxx/", "pub11359/final-proms-eng-apr11-mar12-data-pack-csv.zip") download.file(url, temp) rm(url) df <- read.table(unz(description = temp, filename = "Hip Replacement 1112.csv"), sep = ",", header = TRUE) unlink(temp) rm(temp) df <- df[, c("AGEBAND", "SEX", "Q2_EQ5D_INDEX", "HR_Q2_SCORE")] df <- df[df$AGEBAND != "*" & df$SEX != "*", ] df$eq5d <- df$Q2_EQ5D_INDEX df$hr <- df$HR_Q2_SCORE/10 df <- df[stats::complete.cases(df), ] set.seed(101010101) df <- df[sample(1:nrow(df), size = nrow(df)*0.3), ] ``` ```{r loadtextout, eval = TRUE, echo = FALSE} load("textout.RData") ``` The data includes `r format(textout$ncplt, big.mark="'")` observations with complete information on patients' post-operative utilities, Oxford Hip Scores, age and sex. 
Like @HernandezAlava2015, we draw a 30% sub-sample of `r format(textout$nspl, big.mark="'")` observations from the population of complete observations of these variables. Although we follow a similar approach in data preparation as @HernandezAlava2015, our random sample is not identical to the data used in their study. Post-operative EQ-5D-3L utilities from English value sets [@Dolan1997] show a bimodal distribution, limits at -0.594 and 1 and a gap between 1 and 0.883 (figure \@ref(fig:plot-hist-obs)). ```{r plot-hist-obs, out.width = "90%", fig.cap = "Frequency distribution of observed EQ-5D-3L utilities", echo = FALSE} knitr::include_graphics("plot_hist_obs.png") ``` # Examples ## Model 1: Default settings {#sec:base} We first fit model 1 with the default "BFGS" optimization method and "zero" initial values. The values 0.883 and -0.594 in the argument 'psi' represent the maximum and minimum values smaller than 1 in the English value set [@Dolan1997]. As the data shows a bi-modal distribution (figure \@ref(fig:plot-hist-obs)), we estimate a mixture of 2 normal distributions ('ncmp' = 2). `aldvmm()` returns an object of class "aldvmm". ```{r model1-fit, echo = TRUE, eval = FALSE} library("aldvmm") fit <- aldvmm::aldvmm(eq5d ~ hr | 1, data = df, psi = c(0.883, -0.594), ncmp = 2) summary(fit) pred <- predict(fit, se.fit = TRUE, type = "fit") ``` ```{r tab-sum-mod1-load, echo = FALSE} load("tab_sum_mod1bfgs.RData") textout[["mod1bfgs"]][["aic"]] <- format( as.numeric( gsub("[^0-9.-]", "", tab_sum_mod1bfgs$table[nrow(tab_sum_mod1bfgs$table), 3]) ), big.mark = "'" ) textout[["mod1bfgs"]][["ll"]] <- format( as.numeric( gsub("[^0-9.-]", "", tab_sum_mod1bfgs$table[nrow(tab_sum_mod1bfgs$table), 2]) ), big.mark = "'" ) textout[["mod1bfgs"]][["intp1"]] <- format( as.numeric( gsub("[^0-9.-]", "", tab_sum_mod1bfgs$table[nrow(tab_sum_mod1bfgs$table) - 1, 3]) ), big.mark = "'" ) rm(tab_sum_mod1bfgs) ``` We obtain a summary table of regression results using the generic function `summary()`. The model converges at a log-likelihood of `r gsub("-", "", textout[["mod1bfgs"]][["ll"]])` and an Akaike information criterion value of `r gsub("-", "", textout[["mod1bfgs"]][["aic"]])` (table \@ref(tab:tab-sum-mod1-bfgs)). Larger values of the log-likelihood (ll), the Akaike information criterion (AIC), and the Bayesian information criterion (BIC) suggest better fit. The summary table returned by the generic function `summary()` reports negative goodness of fit measures. The coefficients of the intercepts and covariates for the expected values $E[y|c, X]$ of the normal distributions can be interpreted as marginal effects on component means. 'lnsigma' denotes the natural logarithm of the estimated standard deviation $\sigma^{c}$. The coefficients of covariates in the multinomial logit model of probabilities of component membership are log-transformed relative probabilities. Our model only includes two components, and the multinomial logit model collapses to a binomial logit model. The intercept of `r textout[["mod1bfgs"]][["intp1"]]` means that the average probability of an observation in the data to belong to component 1 is $\text{exp}(`r textout[["mod1bfgs"]][["intp1"]]`)$ or `r exp(as.numeric(textout[["mod1bfgs"]][["intp1"]]))` times the probability to belong to component 2. 
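As an illustration of this interpretation, the following sketch converts a hypothetical multinomial logit intercept into the implied average probabilities of component membership in a two-component model; the value of `delta0` is illustrative and not taken from the fitted model.

```{r mlogit-prob-sketch, echo = TRUE, eval = FALSE}
# Hypothetical intercept of the multinomial logit model in a two-component model.
delta0 <- 0.73

# Implied average probabilities of belonging to components 1 and 2.
p1 <- exp(delta0) / (1 + exp(delta0))
p2 <- 1 - p1

# The ratio p1/p2 equals exp(delta0).
c(p1 = p1, p2 = p2, ratio = p1 / p2)
```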
```{r tab-sum-mod1-bfgs, echo = FALSE, results = "asis"} load("tab_sum_mod1bfgs.RData") rep_tab_reg(tab_sum_mod1bfgs$table, caption = 'Regression results from model 1 with "BFGS" optimization method and "zero" starting values') rm(tab_sum_mod1bfgs) ``` We obtain expected values of observations in the estimation data using the generic function `predict()`. Standard errors of fitted (estimation data) or predicted (new data) values are calculated using the delta method. Expected values exhibit a smoother distribution than observed values and do not show a gap between 1 and 0.883, because they are weighted averages of component distributions and 1 (figure \@ref(fig:plot-hist-pred)). ```{r plot-hist-pred, out.width = "90%", fig.cap = "Expected values from base case model", echo = FALSE} knitr::include_graphics("plot_hist_pred.png") ``` Expected values of observations in the estimation data can also be used to calculate average incremental effects or average treatment effects on the treated. Average treatment effects on the treated compare predictions for treated individuals to predictions for the same individuals without the effect of the treatment indicator. Standard errors of average treatment effects on the treated can be calculated using the delta method (see example code in the [appendix](#atetcode)). A visual inspection of mean residuals over deciles of expected values (see example code of the modified Hosmer-Lemeshow test in the [appendix](#mhlcode)) shows that model 1 fits the data poorly (figure \@ref(fig:plot-comp-mhl-bfgs)). Although the test is overpowered with large data, and confidence bands should not be interpreted directly, the absolute over-prediction among individuals with low expected values is quite large. ```{r plot-comp-mhl-bfgs, out.width = "90%", fig.cap = "Mean residuals over deciles of expected values, BFGS with zero starting values", echo = FALSE} knitr::include_graphics("plot_comp_mhl_bfgs.png") ``` We can use the 'sandwich' package [@Zeileis2006] to calculate robust or clustered standard errors from objects of class "aldvmm" (see example code in the [appendix](#swcode)). ## Model 1: Comparison of optimization methods ```{r tab-comp-ll-load, echo = FALSE} load("tab_comp_ll.RData") ``` @HernandezAlava2015 suggested that the likelihood function of the adjusted limited dependent variable mixture model with the English EQ-5D-3L data might have multiple local optima, and that the estimation is sensitive to initial values. We thus fit model 1 with all optimization algorithms and methods for generating initial values available in `aldvmm()` to assess the sensitivity of model fits to optimization settings and to find the maximum likelihood estimates. ```{r fit-comp, echo = TRUE, eval = FALSE} init.method <- c("zero", "random", "constant", "sann") optim.method <- c("Nelder-Mead", "BFGS", "CG", "L-BFGS-B", "nlminb", "Rcgmin", "Rvmmin","hjn") fit1_all <- list() for (i in init.method) { for (j in optim.method) { set.seed(101010101) # Seed for random starting values fit1_all[[i]][[j]] <- aldvmm::aldvmm(eq5d ~ hr | 1, data = df, psi = c(0.883, -0.594), ncmp = 2, init.method = i, optim.method = j) } } ``` The maximum likelihood varies considerably across optimization methods and initial values which confirms the sensitivity of the model to changes in optimization settings (table \@ref(tab:tab-comp-ll)). 
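The comparison in table \@ref(tab:tab-comp-ll) can be assembled directly from the stored fits; a minimal sketch, assuming `fit1_all` was populated by the loop above and using the negative of the `gof$ll` element of each fit, as in the preparation of the table:

```{r tab-comp-ll-sketch, echo = TRUE, eval = FALSE}
# Matrix underlying the comparison table: the negative of the stored 'gof$ll'
# element by starting value method (rows) and optimization method (columns).
ll_tab <- t(sapply(fit1_all, function(by_init) {
  sapply(by_init, function(fit) -fit[["gof"]][["ll"]])
}))
round(ll_tab, 1)
```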
In `r sum(max(round(tab_comp_ll, 2)) == round(tab_comp_ll, 2))` of `r dim(tab_comp_ll)[1] * dim(tab_comp_ll)[2]` scenarios, the algorithm converges at the maximum value of `r format( names( which.max(table(round(tab_comp_ll, 2)) ) ), big.mark = "'")`. The "CG" algorithm never converges at `r format( names( which.max(table(round(tab_comp_ll, 2)) ) ), big.mark = "'")`, while the "nlminb" algorithm always converges at the maximum value. Starting values from a simulated annealing algorithm lead to convergence at `r format( names( which.max(table(round(tab_comp_ll, 2)) ) ), big.mark = "'")` most frequently but also require much more computation time (table \@ref(tab:tab-comp-time)). The stabilized multinomial logit likelihood implemented in version 0.9.0 of the 'aldvmm' package considerably increased the chance that the example model converges at the maximum log-likelihood.

```{r tab-comp-ll, echo = FALSE, results = 'asis'}
knitr::kable(round(tab_comp_ll, 1),
             format = "simple",
             row.names = TRUE,
             caption = 'Log-likelihood by optimization method')
```

```{r tab-comp-time-load, echo = FALSE, results = 'asis'}
load("tab_comp_time.RData")
```

The computation times of the optimization routines vary considerably across methods. The "BFGS", "L-BFGS-B", "nlminb", and "Rvmmin" algorithms converge faster than the other algorithms. The "hjn" algorithm and simulated annealing "sann" starting values require more than average computation time (table \@ref(tab:tab-comp-time)). The stabilized multinomial logit likelihood implemented in version 0.9.0 of the 'aldvmm' package decreases computation time for the optimization methods that did not converge to the maximum value with version 0.8.8 but do with version 0.9.0.

```{r tab-comp-time, echo = FALSE, results = 'asis'}
knitr::kable(round(tab_comp_time, 2),
             format = "simple",
             row.names = TRUE,
             caption = 'Estimation time [minutes] by optimization method')
rm(tab_comp_time)
```

Parameter estimates from optimization methods that did not converge at the maximum log-likelihood differ considerably from the estimates of methods that did converge at the maximum log-likelihood (table \@ref(tab:tab-comp-coef)). The solution of the "hjn" algorithm is identical to the solution of the "BFGS" algorithm, but the components are swapped in the table. When the model converges at the maximum log-likelihood, a complete covariance matrix is available (`TRUE`), but rarely otherwise (table \@ref(tab:tab-comp-cov) in the [appendix](#covmat)).

```{r tab-comp-coef, echo = FALSE, results = "asis"}
load("tab_comp_coef.RData")
rep_tab_reg(tab_comp_coef,
            caption = 'Regression results of model 1 with zero starting values in BFGS, Nelder-Mead, nlminb and hjn algorithms')
rm(tab_comp_coef)
```

As adjusted limited dependent variable mixture models are frequently used for tasks that rely on predictions, we also compare expected values across algorithms with "zero" starting values. Predictions of models that do not converge at the maximum log-likelihood deviate from the predictions based on "BFGS" results. The deviation is least pronounced for the Nelder-Mead solution and most pronounced for the "L-BFGS-B" solution (figure \@ref(fig:plot-comp-pred)).
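The deviations shown in figure \@ref(fig:plot-comp-pred) can be recomputed from the stored fits; a minimal sketch, assuming `fit1_all` was populated by the loop above and that the expected values of each fit are stored in the element `pred$yhat`:

```{r plot-comp-pred-sketch, echo = TRUE, eval = FALSE}
# Differences of expected values relative to the "BFGS" solution with "zero"
# starting values.
yhat_bfgs <- fit1_all[["zero"]][["BFGS"]][["pred"]][["yhat"]]
diff_yhat <- sapply(fit1_all[["zero"]], function(fit) {
  fit[["pred"]][["yhat"]] - yhat_bfgs
})
summary(diff_yhat)
```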
```{r plot-comp-pred, out.width = "90%", fig.cap = "Deviation of expected values by optimization method from the BFGS solution with zero starting values", echo = FALSE}
knitr::include_graphics("plot_comp_pred.png")
```

Based on the comparison of log-likelihoods, parameter estimates and predicted values, we deem the "nlminb" algorithm the most robust approach for the data and models used here, and we use the "nlminb" algorithm with "zero" starting values for the remaining examples in this vignette.

## Model 1: Constrained optimization with user-defined initial values

```{r tab-sum-cstr-load, echo = FALSE, results = "asis"}
load("tab_sum_cstr.RData")

textout[["cstr"]][["ll"]] <- format(
  as.numeric(
    gsub("[^0-9.-]", "", tab_sum_cstr[[1]][nrow(tab_sum_cstr[[1]]), 2])
  ),
  big.mark = "'"
)
```

We can also fit model 1 with user-defined starting values and box constraints. When constraints are imposed, the `aldvmm()` function uses the optimization method "L-BFGS-B", which proves to be very sensitive to starting values. We use zero initial values for all parameters except for the intercept in the multinomial logit model, which we set to the estimate from the "nlminb" optimization method with "zero" starting values (0.7283) (table \@ref(tab:tab-comp-coef)). We impose a lower limit of -3 on the log-standard deviations in both components.

The `aldvmm()` function returns a warning that the covariance matrix includes negative values on the diagonal. These values are the variances of the intercept and the log-standard deviation in component 2 (table \@ref(tab:tab-sum-cstr)). The log-likelihood amounts to -`r textout[["cstr"]][["ll"]]`, which is smaller than the value of the unconstrained model 1 (`r format( names( which.max(table(round(tab_comp_ll, 2)) ) ), big.mark = "'")`), suggesting worse fit of the constrained model 1. The parameter estimates in component 1 resemble the coefficients in the poorly converged models with "zero" starting values (table \@ref(tab:tab-comp-coef)), but the other coefficients do not. These results further emphasize the difficulties in finding a global optimum of the likelihood with English EQ-5D-3L utilities after hip replacement.

```{r tab-cstr-show, echo = TRUE, eval = FALSE}
init <- c(0, 0, 0, 0, 0, 0, 0.7283)
lo <- c(-Inf, -Inf, -3, -Inf, -Inf, -3, -Inf)
hi <- c(Inf, Inf, Inf, Inf, Inf, Inf, Inf)

fit1_cstr <- aldvmm::aldvmm(eq5d ~ hr | 1,
                            data = df,
                            psi = c(0.883, -0.594),
                            ncmp = 2,
                            init.est = init,
                            init.lo = lo,
                            init.hi = hi)

summary(fit1_cstr)
```

```{r tab-sum-cstr, echo = FALSE, results = "asis"}
load("tab_sum_cstr.RData")
rep_tab_reg(tab_sum_cstr$table,
            caption = 'Regression results of model 1 with the L-BFGS-B method, parameter constraints and user-defined starting values')
rm(tab_sum_cstr)
```

## Model 1: Single-component model

As three algorithms converge at zero probability of belonging to component 2 (table \@ref(tab:tab-comp-coef)), we also estimate a single-component model.
```{r tobit-fit, echo = TRUE, eval = FALSE} fit <- aldvmm::aldvmm(eq5d ~ hr, data = df, psi = c(0.883, -0.594), ncmp = 1, init.method = "zero", optim.method = "nlminb") summary(fit) ``` ```{r tab-sum-tobit-load, echo = FALSE, results = "asis"} load("tab_sum_tobit.RData") load("tab_comp_coef.RData") textout[["tobit"]][["aic"]] <- format( as.numeric( gsub("[^0-9.-]", "", tab_sum_tobit[[1]][nrow(tab_sum_tobit[[1]]), 3]) ), big.mark = "'" ) textout[["comp"]][["nlminb"]] <- format( as.numeric( gsub("[^0-9.-]", "", tab_comp_coef[nrow(tab_comp_coef), "nlminb"]) ), big.mark = "'" ) textout[["comp"]][["hjn"]] <- format( as.numeric( gsub("[^0-9.-]", "", tab_comp_coef[nrow(tab_comp_coef), "hjn"]) ), big.mark = "'" ) rm(tab_comp_coef, tab_sum_tobit) ``` The coefficients of the single-component model (table \@ref(tab:tab-sum-tobit)) are relatively similar to the parameters in the first component of the poorly converged model 1 from the "CG" and "Rcgmin" algorithms (table \@ref(tab:tab-comp-coef)). The Akaike information criterion amounts to -`r textout[["tobit"]][["aic"]]` which is smaller than the Akaike information criterion of the "nlminb" solution of the two-component model (`r gsub("-", "", textout[["mod1bfgs"]][["aic"]])`) and thus suggests worse fit of the single-component model even when model complexity is accounted for. Larger values of the log-likelihood (ll), the Akaike information criterion (AIC), and the Bayesian information criterion (BIC) suggest better fit. The summary table returned by the generic function `summary()` reports negative goodness of fit measures. ```{r tab-sum-tobit, echo = FALSE, results = "asis"} load("tab_sum_tobit.RData") rep_tab_reg(tab_sum_tobit$table, caption = 'Regression results of model 1 with 1 component, zero starting values in nlminb algorithm') rm(tab_sum_tobit) ``` ## Model 2: User-defined starting values As an alternative specification, we explore model 2 with a coefficient of the Oxford Hip score in the multinomial logit model of component membership. For this fit, we use the method "nlminb" with estimates from @HernandezAlava2015 as starting values. ```{r model2-fit, echo = TRUE, eval = FALSE} init <- c(-.40293118, .30502755, .22614716, .14801581, -.70755741, 0, -1.2632051, -2.4541401) fit2 <- aldvmm::aldvmm(eq5d ~ hr | hr, data = df, psi = c(0.883, -0.594), ncmp = 2, init.est = init, optim.method = "nlminb") summary(fit2) ``` ```{r tab-sum-mod2-load, echo = FALSE, results = "asis"} load("tab_sum_mod2.RData") textout[["mod2"]][["aic"]] <- format( as.numeric( gsub("[^0-9.-]", "", tab_sum_mod2[[1]][nrow(tab_sum_mod2[[1]]), 3]) ), big.mark = "'" ) rm(tab_sum_mod2) ``` The Akaike information criterion of model 2 fitted using the "nlminb" method and user-defined starting values amounts to `r gsub("-", "", textout[["mod2"]][["aic"]])` which is larger than the Akaike information criterion of model 1 (`r gsub("-", "", textout[["mod1bfgs"]][["aic"]])`). The larger Akaike information criterion suggests that the increase in the log-likelihood after inclusion of a coefficient of the Oxford Hip Score on the probability of component membership is sufficiently large to justify the extra parameter. Larger values of the log-likelihood (ll), the Akaike information criterion (AIC), and the Bayesian information criterion (BIC) suggest better fit. The summary table returned by the generic function `summary()` reports negative goodness of fit measures. 
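The in-sample accuracy of the two specifications can also be compared directly from their residuals; a minimal sketch, assuming `fit` (model 1) and `fit2` (model 2) are the objects estimated in the example chunks above:

```{r gof-compare-sketch, echo = TRUE, eval = FALSE}
# Mean squared and mean absolute residuals of model 1 ('fit') and model 2 ('fit2').
sapply(list(model1 = fit, model2 = fit2), function(x) {
  r <- residuals(x)
  c(mse = mean(r^2, na.rm = TRUE),
    mae = mean(abs(r), na.rm = TRUE))
})
```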
```{r tab-sum-mod2, echo = FALSE, results = "asis"}
load("tab_sum_mod2.RData")
rep_tab_reg(tab_sum_mod2$table,
            caption = 'Regression results of model 2 with user-defined starting values in the nlminb algorithm')
rm(tab_sum_mod2)
```

# Comparison to STATA\textsuperscript{\textregistered} results

To validate the R implementation of adjusted limited dependent variable mixture models, we estimate the four models presented in @HernandezAlava2015 as reference cases in R and STATA\textsuperscript{\textregistered}. The R and STATA\textsuperscript{\textregistered} code for the estimation of the reference cases is included in the appendix ([R code](#rcode), [STATA code](#statacode)).

1. Model 1 with default options
2. Model 1 with parameter constraints
3. Model 1 with initial values from a constant-only model
4. Model 2 with user-defined initial values

```{r tab-calc-stata, echo = FALSE, results = "asis"}
```

```{r tab-comp-stata, echo = FALSE, results = "asis"}
load("tab_rstata_coef.RData")
rep_tab_reg(tab_rstata_coef,
            caption = 'Comparison of point estimates to the results of the STATA package')
rm(tab_rstata_coef)
```

The parameter estimates and standard errors obtained in R are very similar to the results from STATA\textsuperscript{\textregistered} (table \@ref(tab:tab-comp-stata) and table \@ref(tab:tab-compse-stata)). R does not obtain standard errors in reference case 2, while STATA\textsuperscript{\textregistered} returns standard errors for the first component and the probability of belonging to component 1. Although reference cases 1 and 3 yield different parameter estimates, they converge at the same log-likelihood, which further supports the hypothesis of multiple local optima of the likelihood. The relative ordering of models is consistent across platforms.

```{r tab-compse-stata, echo = FALSE, results = "asis"}
load("tab_rstata_se.RData")
rep_tab_reg(tab_rstata_se,
            caption = 'Comparison of standard errors to the results of the STATA package')
rm(tab_rstata_se)
```

Fitted values show very similar marginal distributions on both platforms (figure \@ref(fig:box-pred-yhat-stata)), although R does not return fitted values in reference case 2. The summary statistics of differences in fitted values between R and STATA\textsuperscript{\textregistered} suggest that individual predictions are quite similar across platforms as well (table \@ref(tab:tab-pred-stata-yhat)). Standard errors of fitted values differ visibly between platforms (figure \@ref(fig:box-pred-se-stata) and table \@ref(tab:tab-pred-stata-se)). The difference is particularly pronounced in reference case 1, but the standard errors from STATA\textsuperscript{\textregistered} in that case seem quite extreme compared to all other reference cases. R does not return standard errors of fitted values in reference case 2.
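The summary statistics of differences in fitted values (table \@ref(tab:tab-pred-stata-yhat)) can be reproduced by exporting the fitted values from STATA\textsuperscript{\textregistered} and comparing them to the `pred$yhat` element of the corresponding R fit; a minimal sketch for reference case 1, assuming the STATA\textsuperscript{\textregistered} fitted values were saved to 'stata_yhat.csv' with one column per reference case and `fit1_default` was estimated as in the [appendix](#rcode):

```{r rstata-yhat-sketch, echo = TRUE, eval = FALSE}
# Fitted values exported from STATA (one column per reference case).
stata_yhat <- read.table("stata_yhat.csv", header = TRUE, sep = ";", dec = ".")

# Differences between STATA and R fitted values for reference case 1
# (positive values indicate larger fitted values in STATA).
summary(stata_yhat[, 1] - fit1_default[["pred"]][["yhat"]])
```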
```{r box-pred-yhat-stata, out.width="90%", fig.cap = "Fitted values in R and STATA", echo = FALSE} knitr::include_graphics("plot_box_yhat.png") ``` ```{r box-pred-se-stata, out.width="90%", fig.cap = "Standard errors of fitted values in R and STATA", echo = FALSE} knitr::include_graphics("plot_box_se.png") ``` ```{r tab-pred-stata-yhat, echo = FALSE, results = "asis"} load("tab_diff_yhat.RData") knitr::kable(tab_diff_yhat, format = "simple", row.names = TRUE, caption = 'Summary statistics of differences of fitted values in R and STATA (positive values suggest larger values in STATA)') rm(tab_diff_yhat) ``` ```{r tab-pred-stata-se, echo = FALSE, results = "asis"} load("tab_diff_se.RData") knitr::kable(tab_diff_se, format = "simple", row.names = TRUE, caption = 'Summary statistics of differences in standard errors of fitted values in R and STATA (positive values suggest larger values in STATA)') rm(tab_diff_se) ``` The comparison of the R and STATA\textsuperscript{\textregistered} packages shows that the R implementation sometimes behaves differently than the STATA\textsuperscript{\textregistered} package, but the results are not indicative of technical errors in the R implementation. # Discussion Adjusted limited dependent variable mixture models are powerful tools for regression analysis of health state utilities. Unlike standard regression models, adjusted limited dependent variable mixture models account for limits, gaps and multi-modal distributions. The comparison of different optimization methods with EQ-5D-3L utility data from English patients after hip replacement in 2011 and 2012 [@NHSDigital2013] showed that the likelihood function can be challenging to maximize and can converge at local optima or extreme solutions. Parameter estimates varied considerably across optimization methods and even across methods with the same maximum log-likelihood. However, fitted values were very similar across the four reference cases which suggests that the model is more robust for the identification of incremental and average marginal effects than for parameter identification. The 'aldvmm' package offers a variety of optimization algorithms and methods for generating initial values which is an important strength in such challenging situations. It is essential to assess different optimization algorithms and methods for generating initial values before interpreting the parameter estimates or predictions of adjusted limited dependent variable mixture models. The analysis of the EQ-5D-3L utility data also suggests that simpler models with fewer components should be considered when multi-component models are difficult to fit. Even single-component adjusted limited dependent variable mixture models can improve fit compared to traditional regression techniques because they account for limits and gaps. Although coefficients can be interpreted as marginal effects within each component, they cannot be interpreted in terms of overall expected values. Thus, average marginal effects and average treatment effects need to be calculated from predictions using the generic function `predict()`. Standard errors of marginal effects or average treatment effects can be calculated using the delta method (see example code in the [appendix](#atetcode)). When no valid covariance matrix of parameters can be obtained or the user questions the validity of standard errors, the function `sandwich::vcovBS()` can be used to obtain bootstrapped standard errors (see example code in the [appendix](#swcode)). 
Due to the large variability of results across model fits, a large number of bootstrap iterations might be needed, which increases computation time. Other functions from the 'sandwich' package can also be used to estimate robust or clustered standard errors.

In situations with repeated utility measures, the 'aldvmm' package only allows fixed effects estimation with group and time fixed effects, which can be an important limitation in the analysis of clinical data. However, time fixed effects can be an appropriate modeling strategy in the presence of general time trends and dynamic selection in the population, e.g. because health state utilities decrease over time and treated individuals survive longer and are thus over-represented in later measurements.

Possible extensions of the 'aldvmm' package could include additional component distributions, a mixed model implementation for repeated measures and functions for calculating average marginal effects and their standard errors.

# References {-}
# Appendix {-} ## Availability of a complete covariance matrix by optimization method {- #covmat} ```{r tab-comp-cov, echo = FALSE, results = 'asis'} load("tab_comp_cov.RData") knitr::kable(tab_comp_cov, format = "simple", row.names = FALSE, caption = 'Availability of complete covariance matrix by optimization method') rm(tab_comp_cov) ``` ## Example calculation of average treatment effects on the treated {- #atetcode} ```{r atet-example, echo = TRUE, eval = FALSE, results = 'hide'} # Create treatment indicator #--------------------------- df$treated <- as.numeric(df$SEX == "2") # Fit model #---------- formula <- eq5d ~ treated + hr | 1 fit <- aldvmm(formula, data = df, psi = c(-0.594, 0.883)) # Predict treated #---------------- # Subsample of treated observations tmpdf1 <- df[df$treated == 1, ] # Design matrix for treated observations X1 <- aldvmm.mm(data = tmpdf1, formula = fit$formula, ncmp = fit$k, lcoef = fit$label$lcoef) # Average expected outcome of treated observations mean1 <- mean(predict(fit, newdata = tmpdf1, type = "fit", se.fit = TRUE)[["yhat"]], na.rm = TRUE) # Predict counterfactual #----------------------- # Subsample of counterfactual observations tmpdf0 <- tmpdf1 rm(tmpdf1) tmpdf0$treated <- 0 # Design matrix for counterfactual observations X0 <- aldvmm.mm(data = tmpdf0, formula = fit$formula, ncmp = fit$k, lcoef = fit$label$lcoef) # Average expected outcome of counterfactual osbervations mean0 <- mean(predict(fit, newdata = tmpdf0, type = "fit", se.fit = TRUE)[["yhat"]], na.rm = TRUE) rm(tmpdf0) # Standard error of ATET #----------------------- atet.grad <- numDeriv::jacobian(func = function(z) { yhat1 <- aldvmm.pred(par = z, X = X1, y = rep(0, nrow(X1[[1]])), psi = fit$psi, ncmp = fit$k, dist = fit$dist, lcoef = fit$label$lcoef, lcmp = fit$label$lcmp, lcpar = fit$label$lcpar)[["yhat"]] yhat0 <- aldvmm.pred(par = z, X = X0, y = rep(0, nrow(X0[[1]])), psi = fit$psi, ncmp = fit$k, dist = fit$dist, lcoef = fit$label$lcoef, lcmp = fit$label$lcmp, lcpar = fit$label$lcpar)[["yhat"]] mean(yhat1 - yhat0, na.rm = TRUE) }, x = fit$coef) se.atet <- sqrt(atet.grad %*% fit$cov %*% t(atet.grad)) # Summarize #---------- out <- data.frame(atet = mean1 - mean0, se = se.atet, z = (mean1 - mean0) / se.atet) out$p <- 2*stats::pnorm(abs(out$z), lower.tail = FALSE) out$ul <- out$atet + stats::qnorm((1 + 0.95)/2) * out$se out$ll <- out$atet - stats::qnorm((1 + 0.95)/2) * out$se print(out) ``` ## Example calculation of robust and clustered standard errors {- #swcode} ```{r sandwich-example, echo = TRUE, eval = FALSE, results = 'hide'} # Create cluster indicator #------------------------- df$grp <- as.factor(round(0.5 + runif(nrow(df)) * 5, 0)) # Fit model #---------- formula <- eq5d ~ hr | 1 fit <- aldvmm(formula, data = df, psi = c(-0.594, 0.883)) # Calculate robust and clustered standard errors #----------------------------------------------- vc1 <- sandwich::sandwich(fit) vc2 <- sandwich::vcovCL(fit, cluster = ~ grp) vc3 <- sandwich::vcovPL(fit, cluster = ~ grp) vc4 <- sandwich::vcovHAC(fit, cluster = ~ grp) vc5 <- sandwich::vcovBS(fit) vc6 <- sandwich::vcovBS(fit, cluster = ~ grp) # Calculate test statistics #-------------------------- lmtest::coeftest(fit) lmtest::coeftest(fit, vcov = vc1) lmtest::coeftest(fit, vcov = vc2) lmtest::coeftest(fit, vcov = vc3) lmtest::coeftest(fit, vcov = vc4) lmtest::coeftest(fit, vcov = vc5) lmtest::coeftest(fit, vcov = vc6) ``` ## Example calculation of modified Hosmer-Lemeshow test {- #mhlcode} ```{r mhl-show, echo = TRUE, eval = FALSE} # Number of 
## Example calculation of modified Hosmer-Lemeshow test {- #mhlcode}

```{r mhl-show, echo = TRUE, eval = FALSE}
# Number of groups (percentiles of expected values)
ngroup <- 10

# Extract expected values and residuals
yhat <- fit1_all[["fit"]][["zero"]][["Nelder-Mead"]][["pred"]][["yhat"]]
res <- fit1_all[["fit"]][["zero"]][["Nelder-Mead"]][["pred"]][["res"]]

# Make groups
group <- as.numeric(cut(yhat, breaks = ngroup))

# Auxiliary regression
aux <- stats::lm(res ~ factor(group))

# Data set of predictions from auxiliary regression
newdf <- data.frame(group = unique(group)[order(unique(group))])

pred <- predict(aux,
                newdata = newdf,
                se.fit = TRUE,
                interval = 'confidence',
                level = 0.95)

plotdat <- as.data.frame(rbind(
  cbind(group = newdf$group, outcome = "mean", value = pred$fit[ , 'fit']),
  cbind(group = newdf$group, outcome = "ll",   value = pred$fit[ , 'lwr']),
  cbind(group = newdf$group, outcome = "ul",   value = pred$fit[ , 'upr'])
))

# Make plot
plot <- ggplot2::ggplot(plotdat,
                        aes(x = factor(as.numeric(group)),
                            y = as.numeric(value),
                            group = factor(outcome))) +
  geom_line(aes(linetype = factor(outcome)))
```

## R code for the estimation of reference cases {- #rcode}

```{r tab-comp-stata-show, echo = TRUE, eval = FALSE}
# (1) Reference case 1: Model 1 with default optimization settings
fit1_default <- aldvmm::aldvmm(eq5d ~ hr | 1,
                               data = df,
                               psi = c(0.883, -0.594),
                               ncmp = 2,
                               init.method = "zero",
                               optim.method = "nlminb")

# (2) Reference case 2: Model 1 with user-defined initial values and
#     constraints on parameters
init <- c(0, 0, 0, 0, 0, 0, 0.7283)
lo <- c(-Inf, -Inf, -3, -Inf, -Inf, -3, -Inf)
hi <- c(Inf, Inf, Inf, Inf, Inf, Inf, Inf)

fit1_cstr <- aldvmm::aldvmm(eq5d ~ hr | 1,
                            data = df,
                            psi = c(0.883, -0.594),
                            ncmp = 2,
                            init.est = init,
                            init.lo = lo,
                            init.hi = hi)

# (3) Reference case 3: Model 1 with initial values from constant-only model
fit1_const <- aldvmm::aldvmm(eq5d ~ hr | 1,
                             data = df,
                             psi = c(0.883, -0.594),
                             ncmp = 2,
                             init.method = "constant",
                             optim.method = "nlminb")

# (4) Reference case 4: Model 2 with user-defined initial values
init <- c(-.40293118, .30502755, .22614716, .14801581,
          -.70755741, 0, -1.2632051, -2.4541401)

fit2 <- aldvmm::aldvmm(eq5d ~ hr | hr,
                       data = df,
                       psi = c(0.883, -0.594),
                       ncmp = 2,
                       init.est = init,
                       optim.method = "nlminb")
```

## STATA\textsuperscript{\textregistered} code for the estimation of reference cases {- #statacode}

```{r stata_code, echo = TRUE, eval = FALSE}
* (1) Reference case 1: Model 1 with default optimization settings
aldvmm eq5d hr, ncomponents(2)

* (2) Reference case 2: Model 1 with user-defined initial values and constraints on parameters
matrix input a = (0, 0, 0, 0, 0, 0, 0.7283)

constraint 1 [Comp_2]:hr10 = 0
constraint 2 [Comp_2]:_cons = 100
constraint 3 [lns_2]:_cons = 1e-30

aldvmm eq5d hr, ncomp(2) from(a) c(1 2 3)

* (3) Reference case 3: Model 1 with initial values from constant-only model
aldvmm eq5d hr, ncomp(2) inim(cons)

* (4) Reference case 4: Model 2 with user-defined initial values
matrix input start = (.14801581, .22614716, .30502755, -.40293118, 0, -.70755741, -2.4541401, -1.2632051)

aldvmm eq5d hr, ncomp(2) prob(hr) from(start)
```

## ALDVMM likelihood and gradient functions {- #likfun}

We write the ALDVMM log-likelihood function [@HernandezAlava2015] as the sum across observations of the log of the sum across components of the product of the multinomial logit probability of component membership $A_{ic}$ and the density of expected values per component $f_{ic}$.

\begin{equation}
\label{eq:loglik}
ll=\sum_{i=1}^{n}\ln\left(\sum_{c=1}^{C}A_{ic}f_{ic}\right)
\end{equation}

We write the multinomial logit model as a softmax function and subtract the maximum linear predictor per observation across components from the observation's linear predictors to improve the numerical stability of the likelihood function. This stabilized softmax is equivalent to the original multinomial logit probability but has better numerical properties.

\begin{equation}
\label{eq:apart}
\begin{array}{ll}
A_{ic}&=\frac{\exp(w_{i}\delta_{c})}{\sum_{s=1}^{C}\exp(w_{i}\delta_{s})}\\
&=\frac{\exp(-\max(w_{i}\delta))}{\exp(-\max(w_{i}\delta))}\frac{\exp(w_{i}\delta_{c})}{\sum_{s=1}^{C}\exp(w_{i}\delta_{s})}\\
&=\frac{\exp(w_{i}\delta_{c} - \max(w_{i}\delta))}{\sum_{s=1}^{C}\exp(w_{i}\delta_{s} - \max(w_{i}\delta))}\\
\end{array}
\end{equation}

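As a brief numerical illustration of this stabilization (with made-up linear predictors, not package code), the sketch below shows that the naive softmax overflows for large linear predictors, whereas the shifted version returns the intended probabilities; the shift cancels algebraically and does not change the result.

```{r softmax-stab, echo = TRUE, eval = FALSE}
# Hypothetical linear predictors w_i * delta_c for one observation and
# three components; the large values are chosen to provoke overflow.
eta <- c(800, 802, 795)

# Naive softmax: exp(800) is Inf in double precision, so this returns NaN.
exp(eta) / sum(exp(eta))

# Stabilized softmax: subtracting the maximum linear predictor first
# yields the correct probabilities.
exp(eta - max(eta)) / sum(exp(eta - max(eta)))
```
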
The normal density of expected values per component depends on the range in which $y_{i}$ falls relative to the maximum value smaller than 1, $\Psi_{2}$, and the minimum value, $\Psi_{1}$, in the value set.

\begin{equation}
\label{eq:fpart}
f_{ic} =
\begin{cases}
1 - \Phi\left(\frac{\Psi_{2}-x_{i}\beta_{c}}{\sigma_{c}}\right) & \text{if } y_{i}|c > \Psi_{2}\\
\Phi\left(\frac{\Psi_{1}-x_{i}\beta_{c}}{\sigma_{c}}\right) & \text{if } y_{i}|c \leq \Psi_{1}\\
\frac{1}{\sigma_{c}}\phi\left(\frac{y_{i} - x_{i}\beta_{c}}{\sigma_{c}}\right) & \text{if } \Psi_{1} < y_{i}|c \leq \Psi_{2}
\end{cases}
\end{equation}

The gradient of the log-likelihood function with respect to all model parameters is the vector of first derivatives. The underlying score matrix contains each observation's first derivatives of its contribution to the log-likelihood.

\begin{equation}
\label{eq:gradll}
\Delta = \left[\frac{\partial ll}{\partial \delta_{1}}, ..., \frac{\partial ll}{\partial \delta_{C}}, \frac{\partial ll}{\partial \beta_{1}}, ..., \frac{\partial ll}{\partial \beta_{C}}, \frac{\partial ll}{\partial \sigma_{1}}, ..., \frac{\partial ll}{\partial \sigma_{C}}\right]
\end{equation}

The first derivative of the log-likelihood with respect to the parameters $\delta_{c}$ must consider the first derivatives of all components of the softmax denominator and depends on whether a component is the same as the one being differentiated ($s = c$) or not ($s \neq c$).

\begin{equation}
\label{eq:dlldd}
\frac{\partial ll}{\partial \delta_{c}} = \sum_{i=1}^{n}\frac{1}{\sum_{s=1}^{C}A_{is}f_{is}}\sum_{s=1}^{C}f_{is}\frac{\partial A_{is}}{\partial \delta_{c}}
\end{equation}

\begin{equation}
\label{eq:dadd}
\frac{\partial A_{is}}{\partial \delta_{c}} =
\begin{cases}
w_{i} A_{ic}(1-A_{ic}) & \text{if } s = c\\
-w_{i} A_{is}A_{ic} & \text{if } s \neq c
\end{cases}
\end{equation}

The first derivative of the log-likelihood with respect to the parameters $\beta_{c}$ depends on the range in which $y_{i}$ falls relative to the maximum value smaller than 1, $\Psi_{2}$, and the minimum value, $\Psi_{1}$, in the value set.

\begin{equation}
\label{eq:dllb}
\frac{\partial ll}{\partial \beta_{c}} = \sum_{i=1}^{n}\frac{A_{ic}}{\sum_{s=1}^{C}A_{is}f_{is}}\frac{\partial f_{ic}}{\partial \beta_{c}}
\end{equation}

\begin{equation}
\label{eq:dfdb}
\frac{\partial f_{ic}}{\partial \beta_{c}} =
\begin{cases}
\frac{x_{i}}{\sigma_{c}}\phi\left(\frac{\Psi_{2} - x_{i}\beta_{c}}{\sigma_{c}}\right) & \text{if } y_{i}|c > \Psi_{2}\\
-\frac{x_{i}}{\sigma_{c}}\phi\left(\frac{\Psi_{1} - x_{i}\beta_{c}}{\sigma_{c}}\right) & \text{if } y_{i}|c \leq \Psi_{1}\\
\frac{x_{i}}{\sigma_{c}^{2}}\frac{y_{i} - x_{i}\beta_{c}}{\sigma_{c}}\phi\left(\frac{y_{i} - x_{i}\beta_{c}}{\sigma_{c}}\right) & \text{if } \Psi_{1} < y_{i}|c \leq \Psi_{2}
\end{cases}
\end{equation}

The first derivative of the log-likelihood with respect to the parameters $\sigma_{c}$ also depends on the range in which $y_{i}$ falls relative to $\Psi_{2}$ and $\Psi_{1}$.

\begin{equation}
\label{eq:dllds}
\frac{\partial ll}{\partial \sigma_{c}} = \sum_{i=1}^{n}\frac{A_{ic}}{\sum_{s=1}^{C}A_{is}f_{is}}\frac{\partial f_{ic}}{\partial \sigma_{c}}
\end{equation}

\begin{equation}
\label{eq:dfds}
\frac{\partial f_{ic}}{\partial \sigma_{c}} =
\begin{cases}
\phi\left(\frac{\Psi_{2} - x_{i}\beta_{c}}{\sigma_{c}}\right)\frac{\Psi_{2} - x_{i}\beta_{c}}{\sigma_{c}^{2}} & \text{if } y_{i}|c > \Psi_{2}\\
-\phi\left(\frac{\Psi_{1} - x_{i}\beta_{c}}{\sigma_{c}}\right)\frac{\Psi_{1} - x_{i}\beta_{c}}{\sigma_{c}^{2}} & \text{if } y_{i}|c \leq \Psi_{1}\\
\phi\left(\frac{y_{i} - x_{i}\beta_{c}}{\sigma_{c}}\right)\left[\left(\frac{y_{i} - x_{i}\beta_{c}}{\sigma_{c}}\right)^{2} - 1 \right]\frac{1}{\sigma_{c}^{2}} & \text{if } \Psi_{1} < y_{i}|c \leq \Psi_{2}
\end{cases}
\end{equation}

```{r end, echo = FALSE, results = 'hide'}
rm(rep_tab_reg)
rm(tab_comp_ll)
rm(textout)
```

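To make the piecewise component density $f_{ic}$ more concrete, the following self-contained sketch evaluates the three cases of the density for hypothetical parameter values and the EQ-5D-3L limits used in this vignette. It is only an illustration of the formula above, not the package's internal likelihood code.

```{r fic-sketch, echo = TRUE, eval = FALSE}
# Illustrative evaluation of the component density f_ic for hypothetical
# values of the linear predictor (xb) and standard deviation (sigma).
# psi = c(minimum value, maximum value below 1) of the value set.
fic <- function(y, xb, sigma, psi) {
  ifelse(y > psi[2],
         # Case 1: observation at the upper limit (utility of 1)
         1 - stats::pnorm((psi[2] - xb) / sigma),
         ifelse(y <= psi[1],
                # Case 2: observation at or below the lower limit
                stats::pnorm((psi[1] - xb) / sigma),
                # Case 3: observation between the limits
                stats::dnorm((y - xb) / sigma) / sigma))
}

# One observation per case, using the EQ-5D-3L limits from this vignette.
fic(y = c(1, -0.594, 0.5), xb = 0.7, sigma = 0.2, psi = c(-0.594, 0.883))
```
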