Using data on voting behavior in European elections from http://www.jkarreth.net/files/full_cses3.dta, estimate a Bayesian logistic regression that predicts whether someone voted in 2007 as a function of their age, gender, education, and income. Present point estimates and measures of uncertainty for the coefficients' posterior distributions in a table. Calculate the predicted probability of voting as a function of age, and present it in graphical form.
If you don’t want to write your own Stan model, there’s one available on my GitHub. Johannes Karreth has helpfully written a function that automatically performs predicted probability calculations on the output of a Bayesian model. Combining the summary() function with kable() can produce a serviceable regression table (an example follows the table below), but I’ve written a function that makes much nicer looking tables with minimal effort.
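For reference, the model itself is just a standard logistic regression with independent priors on the coefficients. The sketch below is illustrative only, not the contents of logit.stan, which may differ in its priors and naming; the data block is written to match the list constructed further down.

# a minimal logistic regression in Stan, written inline as a string; an
# illustrative sketch, not the actual logit.stan file
logit_code <- "
data {
  int<lower=1> N;                // number of respondents
  int<lower=1> K;                // number of columns in the design matrix
  matrix[N, K] X;                // design matrix, including the intercept
  int<lower=0, upper=1> Y[N];    // voted (1) or not (0)
}
parameters {
  vector[K] beta;                // regression coefficients
}
model {
  beta ~ normal(0, 5);           // weakly informative prior (an assumption here)
  Y ~ bernoulli_logit(X * beta); // vectorized logistic likelihood
}
"
# a string like this could be compiled directly with stan(model_code = logit_code, ...)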
library(rstan) # Bayesian inference
rstan_options(auto_write = TRUE) # avoid recompiling model
options(mc.cores = parallel::detectCores()) # parallelizing multiple chains
library(xtable) # table output
# read in data from Johannes Karreth's website and subset to 2007
elections <- rio::import('http://www.jkarreth.net/files/full_cses3.dta')
elections <- elections[grepl('2007', elections$election), ]
# drop NAs in the outcome and predictor variables
elections <- na.omit(elections[, c("voted", "age", "female", "education",
"income")])
# response variable
Y <- elections$voted
# predictors
X <- model.matrix(voted ~ scale(age) + female + as.factor(education)
+ as.factor(income), data = elections)
# create named list of data to pass to Stan
stan_data <- list(Y = Y, X = X, N = nrow(X), K = ncol(X))
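At this point the design matrix should have 14 columns: the intercept, scaled age, the female indicator, seven education dummies, and four income dummies. That count matters below, because the betas get relabeled by position:

# confirm the dimensions Stan will see; K should be 14
dim(X)
colnames(X)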
# download Stan model
download.file('https://raw.githubusercontent.com/jayrobwilliams/JRWmisc/master/logit.stan',
destfile = 'logit.stan')
# fit model
vote_fit <- stan(file = 'logit.stan', data = stan_data, pars = 'beta',
iter = 1000, chains = 4)
# relabel the betas with the design matrix column names, dropping 'as.factor'
names(vote_fit)[1:14] <- gsub('as.factor', '', colnames(X))
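Before putting the results in a table, it's worth confirming that the four chains actually converged; rstan's default summary and trace plots are enough for a quick check:

# posterior summaries with effective sample sizes and R-hat for each coefficient
print(vote_fit, pars = 'beta', probs = c(0.025, 0.975))
# trace plots of the post-warmup draws
traceplot(vote_fit, pars = 'beta')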
# source MCMC regression table function
source('https://raw.githubusercontent.com/jayrobwilliams/RWmisc/master/mcmcreg.R')
# create regression table
mcmcreg(vote_fit, pars = 'beta', format = 'html', caption = 'Bayesian logistic regression.
Point estimates are posterior means.')
|              | Model 1               |
|--------------|-----------------------|
| (Intercept)  | 0.47 [-0.02; 0.98]    |
| scale(age)   | 0.40* [0.35; 0.45]    |
| female       | -0.28* [-0.37; -0.19] |
| (education)2 | 0.57 [-0.03; 1.16]    |
| (education)3 | 0.97* [0.45; 1.47]    |
| (education)4 | 0.52 [-0.01; 1.02]    |
| (education)5 | 0.70* [0.16; 1.19]    |
| (education)6 | 0.53* [0.01; 1.02]    |
| (education)7 | 1.27* [0.71; 1.81]    |
| (education)8 | 1.33* [0.79; 1.86]    |
| (income)2    | 0.40* [0.28; 0.52]    |
| (income)3    | 0.66* [0.52; 0.79]    |
| (income)4    | 0.91* [0.77; 1.06]    |
| (income)5    | 1.10* [0.94; 1.25]    |

* 0 outside 95% credible interval
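For comparison, the summary() and kable() route mentioned above produces a plainer version of the same information; the column selection here is just one reasonable choice:

# serviceable alternative: posterior means and 95% intervals straight from rstan
library(knitr)
post_sum <- summary(vote_fit, pars = 'beta', probs = c(0.025, 0.975))$summary
kable(post_sum[, c('mean', '2.5%', '97.5%')], digits = 2)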
# source predicted probabilities function
source('https://raw.githubusercontent.com/jkarreth/JKmisc/master/MCMC_simcase_probs.R')
# extract post-warmup beta draws from all four chains
vote_betas <- extract(vote_fit, pars = 'beta')$beta
# predicted probabilities across the range of scaled age, holding other variables at their medians
vote_pp <- MCMC_simcase_probs(model_matrix = X, mcmc_out = vote_betas, x_col = 2,
x_range_vec = seq(min(X[, 2]), max(X[, 2]), length.out = 100),
link = 'logit',
lower = .025, upper = .975)
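MCMC_simcase_probs does the work here, but the underlying calculation is straightforward to reproduce by hand: hold every other column of X at its observed median, sweep scaled age across its range, and push each posterior draw of beta through the inverse logit. The sketch below is my own illustration rather than Karreth's code; the column names are just chosen to match what the plotting code below expects.

# by-hand equivalent: medians for all covariates, varying only scaled age
age_seq <- seq(min(X[, 2]), max(X[, 2]), length.out = 100)
X_new <- matrix(apply(X, 2, median), nrow = length(age_seq), ncol = ncol(X),
                byrow = TRUE)
X_new[, 2] <- age_seq                        # vary scaled age, hold the rest fixed
pp_draws <- plogis(X_new %*% t(vote_betas))  # rows: age values, columns: draws
pp_manual <- data.frame(predictor = age_seq,
                        median_pp = apply(pp_draws, 1, median),
                        lower_pp  = apply(pp_draws, 1, quantile, probs = 0.025),
                        upper_pp  = apply(pp_draws, 1, quantile, probs = 0.975))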
# source clean ggplot theme
source('https://raw.githubusercontent.com/jayrobwilliams/RWmisc/master/theme_rw.R')
# plot predicted probabilities, with density plot of observed values
ggplot(data = vote_pp, aes(x = predictor, y = median_pp, ymin = lower_pp, ymax = upper_pp)) +
geom_ribbon(alpha = .5) +
geom_line() +
geom_density(data = elections, aes(x = as.numeric(scale(age))), inherit.aes = F, color = NA,
fill = 'darkblue', alpha = .25) +
coord_cartesian(ylim = c(0,1)) +
labs(x = '(scaled) age', y = 'Pr(voting)') +
theme_rw()