Appendix 3: Fake Data Simulations
22.11 Performance of Blocking relative to a linear model
# use these to debug code
sigma=1
sigma_b0=1
sigma_b1=1
beta_0=10
beta_1=1
n_batch=4
n_subsamp=2
y_label="Y"
trt_levels=c("cn","tr")
block_label="block"
fake_lmm_data <- function(iterations=1000, sigma=1, sigma_b0=1, sigma_b1=1, beta_0=10, beta_1=1, n_batch=6, n_subsamp=10, y_label="y", trt_levels=c("cn","tr"), batch_label="block", confound=FALSE){
# this function is constrained to simulate a single treatment with two levels
#
# arguments
# iterations - number of datasets to generate
# sigma: conditional error sd
# sigma_b0: sd of random intercepts
# sigma_b1: sd of random slope
# beta_0: fixed intercept (mean of reference)
# beta_1: fixed slope (difference tr - cn)
# n_batch: number of batches
# n_subsamp: number of observations per batch per treatment level
# confound: FALSE is randomized complete block, TRUE is confounded case where
# there is only one treatment level per batch
#
# output
# A single matrix with each dataset stacked. The datasets are identified by
# the first column ("data_id")
if(sigma_b0==0){sigma_b0 <- 1e-10}
if(sigma_b1==0){sigma_b1 <- 1e-10}
n_iter <- iterations
levels_per_batch <- ifelse(confound==FALSE, 2, 1)
fake_data <- data.table(
data_id = rep(1:n_iter, each=n_batch*n_subsamp*levels_per_batch),
sigma = sigma,
sigma_b0 = sigma_b0,
sigma_b1 = sigma_b1,
beta_0 = beta_0,
beta_1 = beta_1,
treatment = rep(rep(trt_levels, each=n_subsamp), n_batch*levels_per_batch/2*n_iter),
batch = rep(rep(paste0("batch_", 1:(n_batch)), each=n_subsamp*levels_per_batch), n_iter),
beta_0_j = rep(rnorm(n_batch*n_iter, mean=0, sd=sigma_b0), each=n_subsamp*levels_per_batch),
beta_1_j = rep(rnorm(n_batch*n_iter, mean=0, sd=sigma_b1), each=n_subsamp*levels_per_batch),
x = rep(rep(c(0, 1), each=n_subsamp), n_batch*levels_per_batch/2*n_iter),
e = rnorm(n_subsamp*n_batch*levels_per_batch*n_iter, mean=0, sd=sigma)
)
fake_data[, y:= (beta_0 + beta_0_j) + (beta_1 + beta_1_j)*x + e]
setnames(fake_data, old=c("y", "batch"), new=c(y_label, batch_label))
fake_data[, treatment := factor(treatment)]
return(fake_data)
}
# depending on parameterization, can get many "failed to converge"
# and "isSingular" warnings
write_it <- FALSE
n_iter <- 5000
beta_1_i <- 0 # 0 = Type I, !0 = Power.
confound_i <- FALSE # FALSE is randomized complete block, TRUE is confounded
#case where there is only one treatment level per batch
n <- 3 # subsamples
k <- 8 # batches
# model_list <- c("lm_complete", "lm_mean", "lmm_slope", "lmm_inter")
model_list <- c("lm_complete", "lm_mean", "lmm_inter")
se <- matrix(NA, nrow=n_iter, ncol=length(model_list))
colnames(se) <- model_list
prob <- matrix(NA, nrow=n_iter, ncol=length(model_list))
colnames(prob) <- model_list
ci <- matrix(NA, nrow=n_iter, ncol=length(model_list))
colnames(ci) <- model_list
fd_set <- fake_lmm_data(n_iter,
sigma = 1,
sigma_b0 = 1, # 1 for big, 0.1 for small
sigma_b1 = 0.1,
beta_0 = 10,
beta_1 = beta_1_i,
n_batch = k,
n_subsamp = n,
confound = confound_i)
for(iter in 1:n_iter){
fd <- fd_set[data_id==iter,]
m1 <- lm(y ~ treatment, data=fd)
m2 <- lm(y ~ treatment, data=fd[, .(y=mean(y)), by=.(treatment, block)])
if("lmm_slope" %in% length(model_list)){
m3 <- lmer(y ~ treatment + (treatment|block), data=fd)
m3.pairs <- summary(contrast(emmeans(m3, specs="treatment"), method="revpairwise"), infer=c(TRUE, TRUE))
}
m4 <- lmer(y ~ treatment + (1|block), data=fd)
m4.pairs <- summary(contrast(emmeans(m4, specs="treatment"), method="revpairwise"), infer=c(TRUE, TRUE))
se[iter, "lm_complete"] <- coef(summary(m1))["treatmenttr", "Std. Error"]
se[iter, "lm_mean"] <- coef(summary(m2))["treatmenttr", "Std. Error"]
if("lmm_slope" %in% length(model_list)){
se[iter, "lmm_slope"] <- coef(summary(m3))["treatmenttr", "Std. Error"]
}
se[iter, "lmm_inter"] <- coef(summary(m4))["treatmenttr", "Std. Error"]
prob[iter, "lm_complete"] <- coef(summary(m1))["treatmenttr", "Pr(>|t|)"]
prob[iter, "lm_mean"] <- coef(summary(m2))["treatmenttr", "Pr(>|t|)"]
if("lmm_slope" %in% length(model_list)){
prob[iter, "lmm_slope"] <- coef(summary(m3))["treatmenttr", "Pr(>|t|)"]
}
prob[iter, "lmm_inter"] <- coef(summary(m4))["treatmenttr", "Pr(>|t|)"]
ci[iter, "lm_complete"] <- confint(m1)["treatmenttr", 2] -
confint(m1)["treatmenttr", 1]
ci[iter, "lm_mean"] <- confint(m2)["treatmenttr", 2] -
confint(m2)["treatmenttr", 1]
if("lmm_slope" %in% length(model_list)){
ci[iter, "lmm_slope"] <- m3.pairs[,"upper.CL"] - m3.pairs[,"lower.CL"]
}
ci[iter, "lmm_inter"] <- m4.pairs[,"upper.CL"] - m4.pairs[,"lower.CL"]
# m4.pairs.LT <- difflsmeans(m4, which="treatment", ddf="Kenward-Roger")
# m4.pairs.LT[, "upper"] - m4.pairs.LT[, "lower"]
}
if(write_it ==TRUE){
id <- paste(sample(c(letters, LETTERS), 4), collapse="")
fn <- paste0("lmm_fd_beta1=", beta_1_i,
"_confound=",confound_i,
"_id=", id,
".txt")
fp <- here("output", "chapter_lmm", fn)
write.table(fd_set, fp, sep="\t", quote=FALSE, row.names=FALSE)
fp <- here("output", "chapter_lmm", paste0("lmm_se-",id,".txt"))
write.table(se, fp, sep="\t", quote=FALSE, row.names=FALSE)
fp <- here("output", "chapter_lmm", paste0("lmm_prob-",id,".txt"))
write.table(prob, fp, sep="\t", quote=FALSE, row.names=FALSE)
fp <- here("output", "chapter_lmm", paste0("lmm_ci-",id,".txt"))
write.table(ci, fp, sep="\t", quote=FALSE, row.names=FALSE)
}
apply(se, 2, quantile, c(0.1, 0.5, 0.9))
apply(prob, 2, function(x) sum(x<0.05)/n_iter)
apply(ci, 2, quantile, c(0.1, 0.5, 0.9))
Gegenrezer↩︎
“The Pretty Good House - Finding the right balance between construction cost and energy performance”. https://www.greenbuildingadvisor.com/article/the-pretty-good-house↩︎
Rolig, A.S., Mittge, E.K., Ganz, J., Troll, J.V., Melancon, E., Wiles, T.J., Alligood, K., Stephens, W.Z., Eisen, J.S. and Guillemin, K., 2017. The enteric nervous system promotes intestinal health by constraining microbiota composition. PLoS biology, 15(2), p.e2000689↩︎
Amrhein, V., Trafimow, D. and Greenland, S., 2019. Inferential statistics as descriptive statistics: There is no replication crisis if we don’t expect replication. The American Statistician, 73(sup1), pp.262-270.↩︎
Giffard, B., Corcket, E., Barbaro, L., & Jactel, H. (2012). Bird predation enhances tree seedling resistance to insect herbivores in contrasting forest habitats. Oecologia, 168(2), 415-424↩︎
Sharon, G., Cruz, N.J., Kang, D.W., Gandal, M.J., Wang, B., Kim, Y.M., Zink, E.M., Casey, C.P., Taylor, B.C., Lane, C.J. and Bramer, L.M., 2019. Human Gut Microbiota from Autism Spectrum Disorder Promote Behavioral Symptoms in Mice. Cell, 177(6), pp.1600-1618.↩︎
a random slope is not modeled because many of the models specifying a random slope fail to converge, which is expected given the relatively small variance of the random slope↩︎
Benesh, D. P., & Kalbe, M. (2016). Experimental parasite community ecology: intraspecific variation in a large tapeworm affects community assembly. Journal of Animal Ecology, 85(4), 1004-1013↩︎
fitted values are the predicted values, \(\hat{Y}\)↩︎
the variance is less than that expected by the probability model↩︎
the variance is greater than that expected by the probability model↩︎