Chapter 22 Simulations – Count data (alternatives to a t-test)
library(here)
library(data.table)
library(readxl)
library(MASS)
library(lmPerm)
library(nlme)
library(glmmTMB)
library(emmeans)
library(knitr)
library(kableExtra)
<- here::here here
22.1 Use data similar to Figure 6f from Example 1
The simulated data are modeled to look like the Figure 6f data used in Example 1 of the Violations chapter.
<- "data"
data_folder <- "Exercise reduces inflammatory cell production and cardiovascular inflammation via instruction of hematopoietic progenitor cells"
data_from <- "41591_2019_633_MOESM8_ESM.xlsx"
file_name <- here(data_folder, data_from, file_name)
file_path
# assuming mice are independent and not same mouse used for all three treatment
<- c("Sedentary", "Exercise")
melt_col_names <- read_excel(file_path,
fig6f sheet = "Figure 6f",
range = "A7:B29",
col_names = TRUE) %>%
data.table() %>%
melt(measure.vars = melt_col_names,
variable.name = "treatment",
value.name = "neutrophils") %>%
na.omit() # danger!
<- melt_col_names
treatment_levels := factor(treatment,
fig6f[, treatment levels = treatment_levels)]
# neutrophils is count/10^6
:= round(neutrophils*10^6, 0)]
fig6f[, neutrophil_count
# fig6f[1, neutrophil_count]
#View(fig6f)
<- glmmTMB(neutrophil_count ~ treatment,
m1 data = fig6f,
family = nbinom2(link = "log"))
<- exp(coef(summary(m1))$cond["(Intercept)", "Estimate"])
mu_cn_obs <- sigma(m1) %>%
theta_obs round(1)
# compare with glm.nb, which is used for the simulation
<- glm.nb(neutrophil_count ~ treatment,
m2 data = fig6f)
mu_cn_obs
## [1] 3633140
$theta m2
## [1] 6.607988
22.2 Functions
<- function(n_sim = 1000,
simulator n_levels = c(10, 10),
treatment_levels = c("cn", "tr"),
mu_levels = c(10, 10),
theta_levels = c(1, 1),
sigma_levels = c(1,1),
normal = FALSE,
seed = 1){
set.seed(seed)
<- sum(n_levels)
N
<- rep(mu_levels, n_levels)
mu_sim <- rep(theta_levels, n_levels)
theta_sim <- rep(sigma_levels, n_levels)
sigma_sim
<- c("lm", "gls", "glm_nb", "glm_qp", "lmp", "mww")
method_list <- matrix(as.numeric(NA),
p_values nrow = n_sim,
ncol = length(method_list)) %>%
data.table()
colnames(p_values) <- method_list
<- matrix(as.numeric(NA),
fake_counts nrow = N,
ncol = n_sim)
if(normal == TRUE){
for(j in 1:n_sim){
<- rnorm(N,
fake_counts[, j] mean = mu_sim,
sd = sigma_sim) %>%
round(0)
}else{
}for(j in 1:n_sim){
<- rnegbin(N,
fake_counts[, j] mu = mu_sim,
theta = theta_sim)
}
}
for(j in 1:n_sim){
:= fake_counts[,j]]
fake_data[, count <- lm(count ~ treatment, data = fake_data)
m1 <- gls(count ~ treatment,
m2 data = fake_data,
weights = varIdent(form = ~ 1 | treatment))
<- contrast(emmeans(m2, specs = "treatment"),
m2_pairs method = "revpairwise") %>%
summary()
<- glm.nb(count ~ treatment, data = fake_data)
m3 <- glm(count ~ treatment,
m4 data = fake_data,
family = quasipoisson)
# m3 <- glmmTMB(count ~ treatment,
# data = fake_data,
# family = nbinom2(link = "log"))$cond
# m4 <- glmmTMB(count ~ treatment,
# data = fake_data,
# family = nbinom1(link = "log"))$cond
<- lmp(count ~ treatment,
m5 data = fake_data,
perm = "Exact",
settings = FALSE)
<- wilcox.test(count ~ treatment,
m6 data = fake_data)
:= coef(summary(m1))[2, "Pr(>|t|)"]]
p_values[j, lm := m2_pairs[1, "p.value"]]
p_values[j, gls := coef(summary(m3))[2, "Pr(>|z|)"]]
p_values[j, glm_nb := coef(summary(m4))[2, "Pr(>|t|)"]]
p_values[j, glm_qp := coef(summary(m5))[2, "Pr(Prob)"]]
p_values[j, lmp := m6$p.value]
p_values[j, mww
}
return(p_values)
}
<- function(
binder
p_values,
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim){
<- data.table(
sim_table sim = sim_i,
n_sim = n_sim_i,
n1 = n_levels_sim[1],
n2 = n_levels_sim[2],
n = ifelse(n_levels_sim[1] == n_levels_sim[2],
"=",
"!="),
mu1 = mu_levels_sim[1],
mu2 = mu_levels_sim[2],
type = ifelse(mu_levels_sim[1] == mu_levels_sim[2],
"type 1",
"power"),
theta1 = theta_levels_sim[1],
theta2 = theta_levels_sim[2],
sigma1 = sigma_levels_sim[1],
sigma1 = sigma_levels_sim[2],
normal = normal_sim,
sim_res
)
<- rbind(p_values, sim_table)
p_values return(p_values)
}
22.3 Simulations
<- FALSE
do_it <- data.table(NULL)
p_values <- 0
sim_i = 4000 n_sim_i
22.3.1 Type I, Pseudo-Normal distribution
<- sim_i + 1
sim_i <- c(10, 10)
n_levels_sim <- c(1000, 1000)
mu_levels_sim <- c(NA, NA) # not used
theta_levels_sim <- c(100, 100)
sigma_levels_sim <- TRUE
normal_sim if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.2 Type I, neg binom, equal n
<- sim_i + 1
sim_i <- c(10, 10)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs)
mu_levels_sim <- c(theta_obs, theta_obs)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.3 Type I, neg binom, equal n, small theta
<- sim_i + 1
sim_i <- c(10, 10)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs)
mu_levels_sim <- c(1, 1)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.4 Type I, neg binom, unequal n
<- sim_i + 1
sim_i <- c(12, 8)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs)
mu_levels_sim <- c(theta_obs, theta_obs)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.5 Power, Pseudo-Normal distribution, equal n
<- sim_i + 1
sim_i <- c(10, 10)
n_levels_sim <- c(1000, 1100)
mu_levels_sim <- c(NA, NA) # not used
theta_levels_sim <- c(100, 100)
sigma_levels_sim <- TRUE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)
# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.6 Power, neg binom, equal n
<- sim_i + 1
sim_i <- c(10, 10)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs*.7)
mu_levels_sim <- c(theta_obs, theta_obs)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.7 Power, neg binom, small theta
<- sim_i + 1
sim_i <- c(10, 10)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs*.7)
mu_levels_sim <- c(1, 1)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.8 Power, neg binom, unequal n
<- sim_i + 1
sim_i <- c(12, 8)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs*.7)
mu_levels_sim <- c(theta_obs, theta_obs)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.9 Power, neg binom, unequal n, unequal theta
What if the treatment affects the variance and the mean?
<- sim_i + 1
sim_i <- c(12, 8)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs*.7)
mu_levels_sim <- c(1, theta_obs)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.3.10 Type 1, neg binom, equal n, unequal theta
What if the treatment affects the variance but not the mean?
<- sim_i + 1
sim_i <- c(12, 8)
n_levels_sim <- c(mu_cn_obs, mu_cn_obs)
mu_levels_sim <- c(theta_obs, 1)
theta_levels_sim <- c(NA, NA) # not used
sigma_levels_sim <- FALSE
normal_sim
# fake_data has to be global for emmeans to run inside
# a function
<- c("cn", "tr")
treatment_levels <- data.table(
fake_data treatment = factor(rep(treatment_levels, n_levels_sim),
levels = treatment_levels)
)
if(do_it == TRUE){
<- simulator(
sim_res n_sim = n_sim_i,
n_levels = n_levels_sim,
mu_levels = mu_levels_sim,
theta_levels = theta_levels_sim, # not used
sigma_levels = sigma_levels_sim,
normal = normal_sim
)<- binder(p_values,
p_values
sim_res,
sim_i,
n_levels_sim,
mu_levels_sim,
theta_levels_sim,
sigma_levels_sim,
normal_sim)# apply(sim_res, 2, function(x) sum(x < 0.05)/n_sim_i)
}
22.4 Save it, Read it
<- "sim_data"
sim_data_folder <- "counts_t_test_alternative.Rds"
sim_fn <- here(sim_data_folder, sim_fn)
sim_path if(do_it == TRUE){
saveRDS(p_values, sim_path)
else{
}<- readRDS(sim_path)
p_values }
22.5 Analysis
<- c("lm", "gls", "glm_nb", "glm_qp", "lmp", "mww")
method_list
lapply(.SD,
p_values[, function(x) sum(x < 0.05)/mean(n_sim)),
= method_list,
.SDcols = c("sim", "normal","type", "n", "theta1", "theta2")] %>%
by kable(digits = c(0,0,0,0,2,2,3,3,3,3,3,3)) %>%
kable_styling()
sim | normal | type | n | theta1 | theta2 | lm | gls | glm_nb | glm_qp | lmp | mww |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | TRUE | type 1 | = | 0.053 | 0.050 | 0.076 | 0.052 | 0.053 | 0.046 | ||
2 | FALSE | type 1 | = | 6.6 | 6.6 | 0.043 | 0.041 | 0.075 | 0.044 | 0.046 | 0.039 |
3 | FALSE | type 1 | = | 1.0 | 1.0 | 0.039 | 0.034 | 0.075 | 0.046 | 0.046 | 0.041 |
4 | FALSE | type 1 | != | 6.6 | 6.6 | 0.047 | 0.049 | 0.080 | 0.048 | 0.050 | 0.046 |
5 | TRUE | power | = | 0.573 | 0.567 | 0.653 | 0.572 | 0.571 | 0.528 | ||
6 | FALSE | power | = | 6.6 | 6.6 | 0.488 | 0.476 | 0.590 | 0.494 | 0.499 | 0.440 |
7 | FALSE | power | = | 1.0 | 1.0 | 0.096 | 0.082 | 0.171 | 0.113 | 0.114 | 0.088 |
8 | FALSE | power | != | 6.6 | 6.6 | 0.442 | 0.501 | 0.578 | 0.452 | 0.444 | 0.438 |
9 | FALSE | power | != | 1.0 | 6.6 | 0.034 | 0.082 | 0.144 | 0.054 | 0.041 | 0.045 |
10 | FALSE | type 1 | != | 6.6 | 1.0 | 0.127 | 0.103 | 0.139 | 0.116 | 0.132 | 0.144 |