1 Purpose

In this R Markdown file, we create a D-efficient experimental design for a descriptive system of ordinal attributes. This software is free for private, non-commercial use only. Please contact the authors for commercial usage.

2 Summary

This file provides a detailed R code for creating a D-efficient experimental design for a descriptive system of ordinal attributes. The code has five steps: (1) Construct all origin-destination pairs; (2) Select the candidate OD pairs; (3) Iteratively construct and evaluate blocks (i.e., sets of OD pairs); (4) Assess performance under worst-case scenarios; and (5) Assign hold-outs to OD pairs. As a demonstration, this code was used to construct the design for the second wave of the US EQ-5D-Y-3L valuation study (Jumamyradov et al., 2023) using priors from its first wave.

3 Load the libraries

# set directory
# setwd("C:/Users/maksat/Box Sync/USF PC/EuroQol/Kaisen task/paper1/wave2/experimental design")
# clear memory
rm(list = ls())
# List of libraries
library(weights) # for wtd.cors
library(psych) # for trace function tr()
library(reshape2) ## for command melt() and dcast()
library(dplyr) # for command %>%
library(stringr) # for str_locate
library(knitr) # for kable

4 Data notation

W is the vector representing the profile of the worst possible object
Y is the vector representing the profile of the best possible object
V is the absolute value of the object
H is the number of hold-outs in each OD pair
B is the number of blocks
P is the number of OD pairs in each block
I is the number of improvements
J is the number of steps in kaizen task
paths is the number of total paths from origin to destination
threshold1 is the threshold for choosing origin over destination (Step 2)
threshold2 is the threshold for choosing a response over alternatives (Step 2)
threshold3 is the threshold for minimum level frequency (Step 3)
threshold4 is the threshold for maximum absolute correlation (Step 3)
threshold5 is the threshold for minimum b-error (Step 3)
seed = 240121 is the seed number for random data generation

5 Specify the settings

W = 33333 # worst possible object
Y = 11111 # best possible object
H = 1  # number of hold-outs in each OD pair
B = 1  # number of blocks
P = 20 # number of OD pairs in each block
I = floor(log10(W)) + 1 - H # number of potential improvements
J = 3  # number of steps in kaizen task
paths = factorial(I)/factorial(I-J) # number of potential paths
threshold1 = 0.01  # threshold for choosing origin over destination (Step 2)
threshold2 = 0.025 # threshold for choosing a response over alternatives (Step 2)
threshold3 = 0.05   # threshold for minimum level frequency (Step 3)
threshold4 = 0.25  # threshold for maximum absolute correlation (Step 3)
threshold5 = 0.4   # threshold for minimum b-error (Step 3)
seed = 240223 # seed number for random data generation
block_save = 1 # the number of blocks to create 
block_iter = 2 # the number of block to evaluate

6 Specify the functions

In this section we define functions used in the R Markdown file. First we define the functions used in the estimation of the 15 Y-3L parameters. Then we define the functions used in the process of experimental design.

6.1 Functions for Y-3L estimation

# function to generate A0, A1, A2, A3, A4, and A5
gen_A <- function(a_i, b_i, c_i, alt1, alt2) {
  alt1 <- as.character(alt1)
  alt2 <- as.character(alt2)
  indices <- c(a_i, b_i, c_i)
  for (i in indices) {
    if (i %in% 0:6) {
      substr(alt1, i + 1, i + 1) <- substr(alt2, i + 1, i + 1)
    }
  } 
  return(alt1)
}

# function to generate the log-likelihood
llf1 <- function(param1, X) {
  param1 <- matrix(param1,nrow=1)
  suffixes <- c(".A0",".A1", ".A2", ".A3", ".A4", ".A5")
  v <- matrix(0, nrow = nrow(X), ncol = length(suffixes))
  for (i in seq_along(suffixes)) {
    v[, i] <- exp(as.matrix(X %>% select(ends_with(suffixes[i]))) %*% t(param1))
    v[, i] <- ifelse(as.matrix(rowSums(as.matrix(X %>% select(ends_with(suffixes[i])))))==0,0,v[, i])
  }
  return(colSums(log( v[,1] / (v %*% matrix(data = 1, nrow = length(suffixes), ncol = 1))) ))
}

6.2 Functions for experimental design

6.2.1 Origin and destination

# Function to generate all combinations of digits from an integer
generate_digit_combinations <- function(n) {
  # Convert the input integer to a character vector
  num_str <- as.character(n)
  # Extract individual digits
  digits <- as.integer(strsplit(num_str, NULL)[[1]])
  # Generate all combinations of digits
  combinations <- expand.grid(replicate(length(digits), 1:max(digits), 
                                        simplify = FALSE))
  # Combine the digits for each row
  result <- apply(combinations, 1, function(row) {
    combined_digits <- paste(row, collapse = "")
    as.integer(combined_digits)
  })
  return(result)
}

# Function to filter rows of a data frame based on a condition
keep_rows_condition <- function(data_frame,h) {
  # Convert the two columns to character vectors
  col1 <- as.character(data_frame[, 1])
  col2 <- as.character(data_frame[, 2])
  # Check the condition for each row
  condition <- sapply(seq_along(col1), function(i) {
    digits1 <- as.integer(strsplit(col1[i], NULL)[[1]])
    digits2 <- as.integer(strsplit(col2[i], NULL)[[1]])
    # Check if only one digit is equal and the rest are greater
    sum(digits1 == digits2) == h && all(digits1 >= digits2)
  })
  # Keep rows that satisfy the condition
  result <- data_frame[condition, ]
  O = t(apply(data.frame(result[,1]),1,function(x) 
    as.numeric(strsplit(as.character(x),"")[[1]])))
  D = t(apply(data.frame(result[,2]),1,function(x) 
    as.numeric(strsplit(as.character(x),"")[[1]])))
  hold_out = data.frame((O!=D)*1)
  O = apply(O*hold_out,1,function(x) paste(x,collapse = ""))
  D = apply(D*hold_out,1,function(x) paste(x,collapse = ""))
  result = cbind(O,D)
  return(result)
}

# Function that creates set of alternatives given the origin, destination 
# and the number of steps to get to the destination
set_of_alternatives = function(O,D,J0) {
  num_attr <- nchar(O) 
  num_imp <- sum(unlist(strsplit(O,""))!=unlist(strsplit(D,"")))
  J0 <- ifelse(J0==0,num_imp,J0)
  versions <- expand.grid(rep(list(0:1), num_attr))
  #Set values to 0 where characters are the same in D and O
  for (i in 1:num_attr) {
    if (substr(D,i,i)==substr(O,i,i)) {
      versions[,i] <- 0 
    }
  }
  #Filter rows where the number of 1s is equal to J
  versions <- unique(versions[rowSums(versions, dims = 1) == J0, ])
  #Replace 1s with corresponding characters from D, and 0s with corresponding characters
  versions[] <- lapply(1:num_attr, function(i)
    ifelse(versions[, i] == 1, substr(D, i, i), substr(O, i, i)))
  #Return the combined characters in each row into a single string
  return(as.matrix(apply(versions, 1, paste, collapse = ""),ncol=1))
}

6.2.2 Incremental and multi-level effects

# Function that creates incremental effects from a profile
incremental_effects = function(alt,W) {
  # Determine the number of attributes
  num_attr <- nchar(W)
  alt = data.frame(alt)
  # Iterate over each attribute
  for (i in 1:num_attr) {
    # Generate levels for the current attribute
    levels <- sequence(as.numeric(substr(W,i,i))-1)
    # Iterate over each level
    for (j in seq_along(levels)) {
      new_name <- paste0("x",i,"_",j)
      # Determine the incremental effect based on whether the attribute level 
      # is greater than the current level
      alt[,new_name] <- as.numeric(substr(alt$alt,i,i)>j)
    }
  }
  return(alt)
}

# Combine single-level effects into the five multi-level effects
multi_level_effects = function(data, vars){
  for (i in seq_along(vars)){
    a = data %>% select(starts_with(vars[i]))
    a = abs(apply(a,1,function(x) prod(x)))
    #name_third = paste0(vars[i],"_3")
    data[,paste0(vars[i],"_3")] = ifelse(a == 0, 0, -1)
    data[,paste0(vars[i],"_1")] = ifelse(data[,paste0(vars[i],"_3")] == 0,
                                         data[,paste0(vars[i],"_1")] ,0)
    data[,paste0(vars[i],"_2")] = ifelse(data[,paste0(vars[i],"_3")] == 0,
                                         data[,paste0(vars[i],"_2")] ,0)    
  }
  return(data)
}

6.2.3 Predict response probabilities and create blocks

# Function to predict response probabilities given priors
predict_probability = function(data, beta){
  uniq_od = unique(data$OD)
  uniq_set = unique(data$set)
  data$prob = 0
  for (i in seq_along(uniq_od)) {
    for (j in seq_along(uniq_set)) {
      x <- as.matrix(data[data$OD == uniq_od[i] & data$set == uniq_set[j],] %>% 
                       select(contains("x")))
      denom = sum(exp(x %*% as.matrix(beta)))
      data$prob[data$OD == uniq_od[i] & data$set == uniq_set[j]] <-
        apply(x,1,function(x) 
        exp(x%*%as.matrix(beta))/denom)
    }
  }
  return(data$prob)
}

# Function to create a block of data based on attribute levels
create_block = function(data,W){
  block = NULL # Initialize an empty block
  # Iterate over each attribute in W
  for (i in 1:nchar(W)) {
    # Select candidates where the attribute level is '0'
    candidates <- data[substr(data$O,i,i)=="0",]
    # Group candidates by 'O', arrange by 'O' and 'U', and select the first row 
    # for each 'O'
    candidates <- candidates %>% group_by(O) %>% arrange(O,U) %>% 
      filter(row_number()==1)
    # Arrange candidates by 'U' and select the first four rows
    candidates <- (candidates %>% arrange(U))[1:4,] 
    # Add candidates to the block
    block = rbind(block,candidates)
    data <- merge(data,candidates,by = c("O"),all.x = TRUE)
    data <- data[is.na(data$U.y),1:3]
    colnames(data)<-c("O","D","U")
  }
  return(block[,1:2])
}

6.2.4 Assign hold-outs

# Function to assign hold-outs
assign_holdout = function(data, W){
  vec <- data.frame(t(matrix(as.numeric(unlist(
    strsplit(data$O,""))=="0"),nrow=nchar(W))))
  data$vec <-  apply(vec,1,function(x) 
    paste(x,collapse = ""))

  holdouts <- rownames(as.matrix(table(data$vec)))
  data$severity <- rowSums(as.data.frame(t(matrix(as.numeric(unlist(
    strsplit(data$O,""))),nrow=nchar(W)))))
  data <- data[order(data$vec,data$severity),]
  
  data$maxvec <- ""
  for (i in 1:nchar(W)) {
    data$maxvec <- ifelse(vec[,i]==1,paste0(data$maxvec,substr(W,i,i)),data$maxvec)
  }
  
  h_alt <- NULL
  for (i in 1:nrow(data)) {
  h_replace <- as.data.frame(as.character(
    generate_digit_combinations(data$maxvec[i]),ncol=1))
  names(h_replace) <- "subvec"
  h_replace$vec <- data[i,3]
  h_replace$sev <- rowSums(as.data.frame(t(matrix(as.numeric(unlist(
    strsplit(h_replace$subvec,""))),nrow=nchar(data$maxvec[i])))))
  h_alt <- rbind(h_alt,h_replace)
  }
  h_alt <- unique(h_alt)
  
  # match the holdouts with potential replacements by inverse severity
  # i.e., lower severity of origin differential attributes is
  # matched with high severity of holdout and vice versa
  
  # assign half of the holdouts to the least severe holdout
  data_most_sev <- data %>% group_by(vec) %>% 
    filter(row_number()>(n()/2)) %>% ungroup()
  h_alt_least_sev <- h_alt %>% group_by(vec) %>% 
    filter(row_number()==1) %>% ungroup()
  data_most_sev <- merge(data_most_sev,h_alt_least_sev, by = "vec")
  
  # assign other half of the holdouts to a more severe holdout
  data_least_sev <- data %>% group_by(vec) %>% 
    filter(row_number()<=(n()/2)) %>% ungroup()
  h_alt_more_sev <- h_alt %>% group_by(vec) %>% filter(row_number()!=1) %>% ungroup()
  
  # create an assignment indicator (CODE is NOT SCALABLE)
  data_least_sev <- data_least_sev %>% group_by(vec) %>% mutate(ind=row_number())
  h_alt_more_sev$U <- runif(nrow(h_alt_more_sev)) ## randomize row order
  h_alt_more_sev <-   h_alt_more_sev[order(h_alt_more_sev$vec,h_alt_more_sev$U),]
  h_alt_more_sev <- h_alt_more_sev %>% 
    group_by(vec) %>% mutate(ind=row_number()) %>% ungroup()
  
  data_less_sev <- merge(data_least_sev,h_alt_more_sev, by = c("vec","ind"))
  data_less_sev <-  data_less_sev  %>% select(-c("ind","U"))
                  
  data_new <- rbind(data_most_sev,data_less_sev) %>%
    select(-c("maxvec","sev","severity"))
  data_new <- data_new[order(data_new$vec,data_new$subvec),]

  # replace holdouts in O and D (CODE is NOT SCALABLE)
  data_new$O = apply(data_new,1,function(x) gsub("0" , as.character(x[4]),x[2]))
  data_new$D = apply(data_new,1,function(x) gsub("0" , as.character(x[4]),x[3]))
  
  data_new <-  data_new  %>% select(c("O","D"))
   return(data_new)
}

6.2.5 Design efficiency

This function analyzes the efficiency of a block given a prior, and comperes the efficiencies across different blocks.

# Function for comparing different priors
compare_priors = function(data,beta, D_x, alt_x){
  x <- as.matrix(data %>% select(contains("x")))
  data$p = predict_probability(data,beta)
  AVC = cov.wt(x, wt=data$p)$cov
  D_err = det(AVC)^(1/ncol(x))
  A_err = tr(AVC)^(1/ncol(x))
  
  # Trade-off frequency
  mean_x <- matrix(0, ncol = ncol(x))
  for (j in 1:ncol(x)) {
    mean_x[,j] <- -(weighted.mean(x[,j],data$p))
  }
  min_freq = min(mean_x)
  
  # Multicollinearity
  corr_x <- wtd.cors(x, weight=data$p)
  max_corr = max(corr_x[row(corr_x) != col(corr_x)]) # max off-diagonal element
  min_corr = min(corr_x[row(corr_x) != col(corr_x)]) # min off-diagonal element
  
  # Utility Balance (B-error)
  data <- data %>% group_by(OD,set) %>% mutate(J=n()) %>% ungroup()
  data$logb <- log(data$p * data$J)
  data <- data %>% group_by(OD,set) %>% mutate(sumb=sum(logb)) %>% ungroup()
  data$sumb <- exp(data$sumb)
  B_err <- unique(data[,c("OD","set","sumb")])
  B_err = mean(B_err$sumb)
  
  table_x = t(data.frame(c(D_err,A_err,B_err,min_freq,min_corr,max_corr)))
  colnames(table_x) = c("D.error", "A.error", "B.error", "Min.frequency", 
                     "Min.correlation", "Max.correlation")
  return(table_x)
}

7 Load the data

# load the kaizen data
kz_it <- read.csv("kz_it_231010.csv") 
kz_it <- kz_it[(kz_it$tnum>1), ]

8 Recode the data

In this section we recode the data to create the variables defined above

8.1 Kaizen sets

We first extract actual choices of respondent and create a1, a2, a3 and a4, which are the first, second , third and fourth choices, respectively. Afterwards, we create three sets based on the sequential choices.

# kaizen sets
kz3_it<- subset(kz_it, select=c("id", "task", "tnum", "actual_choice", "alt1", "alt2"))

# extract actual choices and create a1, a2, a3, a4 
kz3_it$a1<-as.numeric(substr(kz3_it$actual_choice,1,1))
kz3_it$a2<-as.numeric(substr(kz3_it$actual_choice,3,3))
kz3_it$a3<-as.numeric(substr(kz3_it$actual_choice,5,5))
kz3_it$a4<-as.numeric(substr(kz3_it$actual_choice,7,7))

8.1.1 Set 1 of kaizen tasks

# Set 1 of kaizen tasks
set1 <- subset(kz3_it, select=c("id", "task","tnum"))

# Apply the function to generate A0, A1, A2 and A3 columns
set1$A0 <- mapply(gen_A, kz3_it$a1, -1, -1, kz3_it$alt1, kz3_it$alt2)
set1$A1 <- mapply(gen_A, kz3_it$a2, -1, -1, kz3_it$alt1, kz3_it$alt2)
set1$A2 <- mapply(gen_A, kz3_it$a3, -1, -1, kz3_it$alt1, kz3_it$alt2)
set1$A3 <- mapply(gen_A, kz3_it$a4, -1, -1, kz3_it$alt1, kz3_it$alt2)
set1_ita<-melt(set1, id.vars = c("id", "task","tnum"))
set1_ita$set <- 1

8.1.2 Set 2 of kaizen tasks

# Set 2 of kaizen tasks
set2 <- subset(kz3_it, select=c("id", "task","tnum"))

# Apply the function to generate A0, A1, A2, A3, A4 and A5 columns
set2$A0 <- mapply(gen_A, kz3_it$a1, kz3_it$a2, -1, kz3_it$alt1, kz3_it$alt2)
set2$A1 <- mapply(gen_A, kz3_it$a1, kz3_it$a3, -1, kz3_it$alt1, kz3_it$alt2)
set2$A2 <- mapply(gen_A, kz3_it$a1, kz3_it$a4, -1, kz3_it$alt1, kz3_it$alt2)
set2$A3 <- mapply(gen_A, kz3_it$a2, kz3_it$a3, -1, kz3_it$alt1, kz3_it$alt2)
set2$A4 <- mapply(gen_A, kz3_it$a2, kz3_it$a4, -1, kz3_it$alt1, kz3_it$alt2)
set2$A5 <- mapply(gen_A, kz3_it$a3, kz3_it$a4, -1, kz3_it$alt1, kz3_it$alt2)
set2_ita<-melt(set2, id.vars = c("id", "task", "tnum"))
set2_ita$set <- 2

8.1.3 Set 3 of kaizen tasks

# Set 3 of kaizen tasks
set3 <- subset(kz3_it, select=c("id", "task","tnum"))

# Apply the function to generate A0, A1, A2, and A3 columns
set3$A0 <- mapply(gen_A, kz3_it$a1, kz3_it$a2, kz3_it$a3, kz3_it$alt1, kz3_it$alt2)
set3$A1 <- mapply(gen_A, kz3_it$a1, kz3_it$a2, kz3_it$a4, kz3_it$alt1, kz3_it$alt2)
set3$A2 <- mapply(gen_A, kz3_it$a1, kz3_it$a3, kz3_it$a4, kz3_it$alt1, kz3_it$alt2)
set3$A3 <- mapply(gen_A, kz3_it$a2, kz3_it$a3, kz3_it$a4, kz3_it$alt1, kz3_it$alt2)
set3_ita<-melt(set3, id.vars = c("id", "task","tnum"))
set3_ita$set <- 3

8.1.4 Combine sets

# combine sets
kz3_ita <- rbind(set1_ita,set2_ita,set3_ita)
kz3_ita <- kz3_ita[kz3_ita$value!=-1,]
task_it <- dcast(kz3_ita, id+set+task+tnum~variable)

# Format from wide to long
id_var <- c("id","task","tnum","set")
task_ita = data.frame(melt(task_it, id.vars = id_var, value.name = "A"))

# Create attribute variables 
prefixes <- c("X1_", "X2_", "X3_", "X4_", "X5_", "X6_", "X7_")
for (i in c(1:7)) {
  task_ita[paste0(prefixes[[i]], 1)] <- as.numeric(as.numeric(
    substr(task_ita$A, i, i)) > 1 )
  task_ita[paste0(prefixes[[i]], 2)] <- as.numeric(as.numeric(
    substr(task_ita$A, i, i)) > 2 )
  task_ita[paste0(prefixes[[i]], 3)] <- as.numeric(as.numeric(
    substr(task_ita$A, i, i)) > 3 )
  task_ita[paste0(prefixes[[i]], 4)] <- as.numeric(as.numeric(
    substr(task_ita$A, i, i)) > 4 )
}
task_ita <- task_ita[, colSums(task_ita != 0, na.rm = TRUE) > 0]

# Format from long to wide
task_ita <- task_ita[,!(names(task_ita) %in% c("A"))]

# Save only the id and main effect variables
id_var <- c("id","task","tnum","set")
attr_var <- c("X1_1","X1_2", "X2_1","X2_2","X3_1","X3_2",
              "X4_1","X4_2","X5_1","X5_2")
suffixes <- c(".A0",".A1", ".A2", ".A3", ".A4", ".A5")

task_ita = task_ita %>% rename("alt" = "variable") ## rename variable 
id_var2 <- c("id","task","tnum","set","alt")

task_ita2 <- melt(task_ita, id.vars = id_var2)
task_ita2$variable <- paste(task_ita2$variable,task_ita2$alt, sep=".")
logit_it <- dcast(task_ita2,id+set+task+tnum~variable, value.var = "value")

logit_it[is.na(logit_it)] <- 0
logit_it <- logit_it %>% select(sort(names(.)))
logit_it <- data.frame(logit_it[order(logit_it$id,logit_it$task,logit_it$set),])

8.1.5 Remove overlaps

v <- matrix(0, nrow = nrow(logit_it), ncol = length(suffixes))
for (i in seq_along(suffixes)) {
  v[,i] <- as.matrix(rowSums(logit_it %>% select(ends_with(suffixes[i]))))
  v[v>0] <- 1
}
max_count <- as.matrix(rowSums(v))

v <- matrix(0, nrow = nrow(logit_it), ncol = length(attr_var))
for (i in seq_along(attr_var)) {
  v[,i] <- as.matrix(rowSums(logit_it %>% select(starts_with(attr_var[i]))))
}

for (i in seq_along(suffixes)) {
  for (j in seq_along(attr_var)) {
    var_name <- paste0(attr_var[j], suffixes[i])
    logit_it[, var_name] <- logit_it[, var_name] - as.numeric(v[, j]==max_count)*
      as.numeric(i <= max_count[, 1])
  }
}

for (i in seq_along(suffixes)) {
  for (j in seq_along(attr_var)) {
    var_name <- paste0(attr_var[j], suffixes[i])
    logit_it[, var_name] <- abs(logit_it[, var_name] - as.numeric(v[, j]>0)*
      as.numeric(v[, j]<max_count)*as.numeric(i <= max_count[, 1]))
  }
}

8.1.6 Add multi-level improvements

# add multi-level improvements
suffixes <- c(".A0",".A1", ".A2", ".A3", ".A4", ".A5")
dom <- c("X1_", "X2_", "X3_", "X4_", "X5_")
dom2 <- c("X1", "X2", "X3", "X4", "X5")
for (j in seq_along(dom)) {
  for (i in seq_along(suffixes)) {
    col_name_1 <- paste0(dom[j], "1", suffixes[i])
    col_name_2 <- paste0(dom[j], "2", suffixes[i])
    col_name_3 <- paste0(dom[j], "3", suffixes[i])
    col_name_4 <- paste0(dom[j], "4", suffixes[i])
    
    # 2-way interactions
    new_name_2_1 <- paste0(dom2[j], "21", suffixes[i])
    logit_it[, new_name_2_1] <- logit_it[, col_name_1] * logit_it[, col_name_2]
    logit_it[ , c(col_name_1, col_name_2)] <- 
      logit_it[ , c(col_name_1, col_name_2)] - 
      cbind(logit_it[, new_name_2_1], logit_it[, new_name_2_1])
    
  }
}

9 Estimation of priors (Step 0)

In this section we use the logit model to estimate coefficients for 15 improvements.

seed <- 231012  # random number generator
tol_ml <- 1e-8 # tolerance criterion for maximum likelihood
method_ml <- "BFGS" # maximization estimation technique for maximum likelihood
iteration_ml <- 150 # number of iteration for maximum likelihood 

## Y3L Estimation
id_var <- c("id","task","tnum","set")
y3l_var <- c("X1_1","X1_2","X2_1","X2_2","X3_1","X3_2","X4_1","X4_2",
             "X5_1","X5_2","X121","X221","X321","X421","X521")
logit_it <- logit_it %>% select(contains(id_var) | contains(y3l_var) )  # 15 improvements
y3l_param <- matrix(rep(0,15),nrow=1)
colnames(y3l_param) <- y3l_var
y3l_model <- (maxLik::maxLik(llf1, X=logit_it,
                             start = c(unlist(y3l_param[1,1:ncol(y3l_param)])),
                             method = method_ml, fixed = "X5_1",
                             control = list(iterlim=iteration_ml , tol=tol_ml, printLevel=0)))
summary(y3l_model)
## --------------------------------------------
## Maximum Likelihood estimation
## BFGS maximization, 94 iterations
## Return code 0: successful convergence 
## Log-Likelihood: -23670.1 
## 14  free parameters
## Estimates:
##      Estimate Std. error t value  Pr(> t)    
## X1_1  0.43096    0.04739   9.094  < 2e-16 ***
## X1_2  0.71539    0.04997  14.315  < 2e-16 ***
## X2_1  0.15793    0.04975   3.174   0.0015 ** 
## X2_2  0.36665    0.04795   7.646 2.07e-14 ***
## X3_1  0.20026    0.04824   4.151 3.31e-05 ***
## X3_2  0.43074    0.05194   8.293  < 2e-16 ***
## X4_1  1.38532    0.04892  28.319  < 2e-16 ***
## X4_2  1.74379    0.05161  33.788  < 2e-16 ***
## X5_1  0.00000    0.00000      NA       NA    
## X5_2  0.54288    0.04859  11.173  < 2e-16 ***
## X121  1.12740    0.04716  23.905  < 2e-16 ***
## X221  0.61774    0.04857  12.719  < 2e-16 ***
## X321  0.94674    0.04546  20.826  < 2e-16 ***
## X421  2.30461    0.05664  40.691  < 2e-16 ***
## X521  0.88326    0.05466  16.159  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## --------------------------------------------

10 Candidate sets (Step 1)

We now create all possible OD pairs given three conditions using the function generate_digit_combinations:
(1) keep OD pairs that have destination levels smaller or equal to origin levels
(2) keep OD pairs that have exactly H hold-outs and assign hold-out to be zero
(3) keep OD pairs that are unique

# create all possible pairs
object <- generate_digit_combinations(W) # all objects
OD_pairs <- expand.grid(object,object) # all pairs

# create all OD pairs
OD_pairs = data.frame(keep_rows_condition(OD_pairs,H)) %>% distinct()

10.0.1 Remove dominant destinations

Now we select the candidate OD pairs based on their destination. Specifically, we keep only the OD pairs whose predictive probability of destinations are higher than threshold1. In other words, the origin should be dominated by the destination.

# Remove OD pairs with an OD probability less than the 
# threshold (i.e., Prob(origin<destination) < threshold). In other words, 
# the origin should be dominated by the destination.

beta = matrix(y3l_model$estimate, ncol=1)
prefixes = c("x1","x2","x3","x4","x5")

# Identify the differential incremental effects (x) in the OD pairs
D_x = NULL
for (i in 1:nrow(OD_pairs)){
    alt_X <- incremental_effects(OD_pairs[i,2],W) # create single-level effects
    O_X <- incremental_effects(OD_pairs[i,1],W) # create single-level effects
    alt_x0 <- cbind(alt_X[,1],alt_X[,2:ncol(alt_X)]-as.data.frame(lapply(
                       O_X[,2:ncol(O_X)], rep, nrow(alt_X)))) # differences in X
    alt_x1 = multi_level_effects(alt_x0,prefixes) # create multi-level effects
    alt_new = cbind(rep(i,nrow(alt_x1)),rep(j,nrow(alt_x1)),
                    c(1:nrow(alt_x1)),rep(OD_pairs[i,1],nrow(alt_x1)),
                    rep(OD_pairs[i,2],nrow(alt_x1)),alt_x1)
    names(alt_new)[1:6] = c("OD","set","alt","O","D","object")
    D_x = rbind(D_x,alt_new)
}

# predict the likelihood of choosing each alternative in each set
D_x$p = predict_probability(D_x,beta)

# probability of choosing origin over destination > threshold1
OD_pairs2 = OD_pairs[D_x$p > threshold1,]

10.0.2 Remove OD pairs that with small responses probabilities

Similar to removing dominant destinations, we now remove the OD pairs with response probability smaller than threshold2.

# Remove OD pairs with an response probability less than the threshold 
# (i.e.,  Prob(response) < threshold). Under McFadden's positivity assumption, 
# each response must have a non-zero likelihood of being chosen. 

# Identify the differential incremental effects (x) in the sets of alternatives
alt_x = NULL
for (i in 1:nrow(OD_pairs2)){
  for (j in 1:J) {
    alt <- set_of_alternatives(OD_pairs2[i,1],OD_pairs2[i,2],j) # jth alt
    alt_X <- incremental_effects(alt,W) # create single-level effects
    O_X <- incremental_effects(OD_pairs2[i,1],W) # create single-level effects
    alt_x0 <- cbind(alt_X[,1],alt_X[,2:ncol(alt_X)]- 
                     as.data.frame(lapply(
                       O_X[,2:ncol(O_X)], rep, nrow(alt_X)))) # differences in X
    alt_x1 = multi_level_effects(alt_x0,prefixes) # create multi-level effects
    alt_new = cbind(rep(i,nrow(alt_x1)),rep(j,nrow(alt_x1)),
                    c(1:nrow(alt_x1)),rep(OD_pairs2[i,1],nrow(alt_x1)),
                    rep(OD_pairs2[i,2],nrow(alt_x1)),alt_x1)
    names(alt_new)[1:6] = c("OD","set","alt","O","D","object")
    alt_x = rbind(alt_x,alt_new)
  }
}

# predict the likelihood of choosing each alternative in each set
alt_x$p = predict_probability(alt_x,beta)

# probability of choosing alternative > threshold2
alt_x$small = as.numeric(alt_x$p < threshold2)
alt_x <- alt_x %>% group_by(OD) %>% mutate(sum = sum(small)) %>% ungroup()
OD_pairs3 <- unique(alt_x[alt_x$sum==0,c("O","D")])
alt_x <- alt_x[alt_x$sum==0,]

11 Block construction (Step 2)

In this section we iteratively select a candidate block based on three criteria: (1) Trade-off frequency, (2) Multicollinearity, (3) Utility balance. Trade-off frequency can be defined as the proportion of non-zero elements fro each trade-off. Balance of trade-offs is necessary to make sure that the block has enough variation to estimate the effect of an attribute. Multicollineraty is the pairwise relationship between ternary variables. If two variables are perfectly correlated (perfect collinearity), the design lacks the variation needed to allow the estimation of their independent effects. Finally, utility balance implies observing each row of the response matrix with equal likelihood.

set.seed(seed) # set the seed
start_time = Sys.time()
filename = NULL # empty array for the names of saved blocks
eval_table = NULL
block_final = NULL
block_x_final = NULL
for (b2 in 1:block_save){
  b = 1 # initial iteration value for the while loop
  D_crit = 100 # initial value for D-error
  while (b <= block_iter) { # start the while loop
    OD_pairs4 <- OD_pairs3
    OD_pairs4$U <- runif(nrow(OD_pairs4),0,1)
    
    # Generate the block 
    D_alt <- merge(OD_pairs4,D_x,by=c("O","D"),all.x = TRUE) %>%
      select(c("O","D")|contains("x"))
    
    # start a for loop select a block that fit two criteria: 
      # (1) OD pairs for each holdout location has to have a level-balance, 
      # (2) all origins within the selected block has to be unique
    block = NULL
    for (i in 1:nchar(D_alt$O[1])) {
      D_alt0 <- D_alt[substr(D_alt$O,i,i)==0,]
      D_alt0 <- Filter(function(x)!all((x==0)),D_alt0)
      
      # randomly sample holdout rows until all improvements occur once or more
      j = 0
      while (j<1){
        candidate = D_alt0[sample(1:nrow(D_alt0),4),]
        any_zero = any(colSums(candidate[3:14]) == 0)*1
        if ((any_zero == 0) & (length(unique(candidate$O)) == nrow(candidate))){
          # append holdout_block
          block = rbind(block, candidate[,1:2])
          j = j + 1}
      }
    }
    
    # Evaluate the block using three criteria
    crit1 <- 0
    crit2 <- 0
    crit3 <- 0
    
    # Criteria 1: Trade-off frequency
    block_x <- merge(alt_x,block,by = c("O","D"))
    x <- as.matrix(block_x %>% select(contains("x")))
    results = data.frame(compare_priors(block_x, beta, D_x, alt_x))
    if (results$Min.frequency < threshold3) {crit1 <- 1}
    
    # Criteria 2: Multicollinearity
    corr_x <- wtd.cors(x, weight=block_x$p)
    corr_count <- sum(abs(corr_x)> threshold4)
    if (sum(is.nan(corr_x))!=0){
      print("Correlation matrix has NaN")
      next}
    if(corr_count > ncol(corr_x)) {crit2 <- 1}
    
    # Criteria 3: Utility Balance (B-error)
    if (results$B.error< threshold5) {crit3 <- 1}
    
    # Save the block if the conditions are met
    if((crit1==0)&(crit2==0)&(crit3==0)&(results$D.error<D_crit)){
      
      # save as 'block_final' and 'block_x_final' if the blocks are D-efficient
      if (b == block_iter){
        block_final = rbind(block_final, cbind(b2, block))
        block_x_final = rbind(block_x_final, cbind(b2, block_x))
      }
      D_crit <- results$D.error
      total_time = difftime(Sys.time(), start_time, units = "mins")
      print_iter = t(data.frame(round(c(b2, b, results$D.error, 
                                        crit1, crit2, crit3, total_time), digits = 4)))
      print_iter_label = paste0("**Criteria and TIme of Block ",b2, " iteration ", b, "**")
      colnames(print_iter) = c("Block", "Iteration", "D-error", "Criteria 1", 
                                     "Criteria 2", "Criteria 3", "Minutes")
      rownames(print_iter) = ""
      print(kable(print_iter, align = "c", caption = print_iter_label, 
                  row.names = F, escape = FALSE, centering = T))
      
      # save the block
      fname = paste0("block_", b, "_", b2, ".csv")
      write.csv(block, fname, row.names = F)
      filename = rbind(filename,fname)
      assign(fname, block, envir = .GlobalEnv)  
      
      eval_table0 = cbind(b2,b,round(data.frame(compare_priors(block_x, 
                                                               beta, D_x, alt_x)), digits = 4))
      eval_table = rbind(eval_table,eval_table0)
      b = b + 1
    }
  }
}
Criteria and TIme of Block 1 iteration 1
Block Iteration D-error Criteria 1 Criteria 2 Criteria 3 Minutes
1 1 0.1 0 0 0 0.0432
Criteria and TIme of Block 1 iteration 2
Block Iteration D-error Criteria 1 Criteria 2 Criteria 3 Minutes
1 2 0.0984 0 0 0 0.0542
colnames(eval_table)[1:2] = c("Block1", "Iteration")
kable(eval_table, align = "c", caption = "**Comparison of Block Effectiveness Based 
      on Different Priors**", row.names = F, escape = FALSE, centering = T)
Comparison of Block Effectiveness Based on Different Priors
Block1 Iteration D.error A.error B.error Min.frequency Min.correlation Max.correlation
1 1 0.1000 1.0365 0.5166 0.0501 -0.2216 0.2413
1 2 0.0984 1.0359 0.4749 0.0508 -0.2326 0.1727

12 Block evaluation (Step 3)

In this section, we evaluate the chosen block with different priors and compute their A-error, B-error, D-error, minimum frequency and maximum correlation.

colnames(block_final)[1] = "block"
colnames(block_x_final)[1] = "block"

for (j in 1:block_save){
  
  block_final0 = block_final[block_final$block == j,2:ncol(block_final)]
  block_x_final0 = block_x_final[block_x_final$block == j,2:ncol(block_x_final)]
  
  # merge 'alt_x' with 'block_final'
  final_x <- merge(alt_x,block_final0,by = c("O","D"))
  x <- as.matrix(final_x %>% select(contains("x")))
  
  # Analyze block efficiency based on different priors
  beta0 = beta*1   # base priors
  beta1 = beta*0   # unweighted priors
  beta2 = beta*0.5 # half priors
  beta3 = beta*5   # five-times priors
  beta_all = cbind(beta0,beta1,beta2,beta3)
  table_x = NULL
  for (i in 1:ncol(beta_all)){
    results = round(data.frame(compare_priors(block_x_final0, beta_all[,i], D_x, alt_x)),
                    digits = 4)
    table_x = rbind(table_x,results)
  }
  rownames(table_x) = c("Base", "Unweighted", "Half", "Five-times")
  table_label = paste0("**Effectiveness of Block ",j, "**")
  print(kable(table_x, align = "c", caption = table_label, row.names = TRUE, 
              escape = FALSE, centering = T))
}
Effectiveness of Block 1
D.error A.error B.error Min.frequency Min.correlation Max.correlation
Base 0.0984 1.0359 0.4749 0.0508 -0.2326 0.1727
Unweighted 0.1060 1.0371 1.0000 0.1000 -0.1741 0.1389
Half 0.1037 1.0367 0.7917 0.0738 -0.2049 0.1551
Five-times 0.0437 1.0324 0.0318 0.0001 -0.3471 0.2331

13 Block formation (Step 4)

In this section we first assign hold-outs to the selected block and confirm confirm that the selected block is unique after holdout assignment

for (j in 1:block_save){
    
  block_final0 = block_final[block_final$block == j,2:ncol(block_final)]
  block_x_final0 = block_x_final[block_x_final$block == j,2:ncol(block_x_final)]
  
  # merge 'alt_x' with 'block_final'
  final_x <- merge(alt_x,block_final0,by = c("O","D"))
  
  # assign hold-outs and check if the new origins are unique
  final_block <- unique(final_x[,c("O","D")])
  i = 1
  while (i<2) {
    FINAL_BLOCK = assign_holdout(final_block,W)
    if (length(unique(FINAL_BLOCK$O)) == nrow(FINAL_BLOCK)) {
      print(paste0("The final block ", j, " consists of ", 
                   length(unique(FINAL_BLOCK$O)), " unique origins"))
      
      # save the block
      fname = paste0("FINAL_BLOCK_", j, ".csv")
      write.csv(FINAL_BLOCK, fname, row.names = F)
      i = i + 1} 
  }  
}
## [1] "The final block 1 consists of 20 unique origins"