Multilevel Regression with Poststratification

Individual Exercise Solution

Use the simulation approach to generating predicted probabilities to obtain 1000 simulated predicted probabilities for all demographic-state combinations of supporting marriage equality. Use these simulations to generate point estimates and 95% confidence intervals for the proportion of people within each state supporting marriage equality. Use at least 10 simulations.

Hint: the simulate() function is your friend because it has a method for objects of class merMod, which allows it to simulate new responses from the fitted model object – but be sure to account for the random effects in the model. You want to carry out the MRP process multiple times, substituting in a different simulated outcome vector each time. You can accomplish this either with a loop, or through a series of calls to apply() with anonymous functions.

library(foreign) # load .dta files
library(lme4) # multilevel models

# read in megapoll
marriage_data <- read.dta('https://www.princeton.edu/~jkastell/MRP_primer/gay_marriage_megapoll.dta', convert.underscore = TRUE) 


# read in state-level dataset
Statelevel <- read.dta('https://www.princeton.edu/~jkastell/MRP_primer/state_level_update.dta', convert.underscore = TRUE)
Statelevel <- Statelevel[order(Statelevel$sstate.initnum), ]

# read in sensus data
Census <- read.dta('https://www.princeton.edu/~jkastell/MRP_primer/poststratification%202000.dta', convert.underscore = TRUE)
Census <- Census[order(Census$cstate), ]
Census$cstate.initnum <-  match(Census$cstate, Statelevel$sstate)

# from 1 for white males to 6 for hispanic females
marriage_data$race.female <- (marriage_data$female *3) + marriage_data$race.wbh
# from 1 for 18-29 with low edu to 16 for 65+ with high edu
marriage_data$age.edu.cat <- 4 * (marriage_data$age.cat -1) + marriage_data$edu.cat
# proportion of evangelicals in respondent's state
marriage_data$p.evang.full <- Statelevel$p.evang[marriage_data$state.initnum]
# proportion of mormon's in respondent's state
marriage_data$p.mormon.full <-Statelevel$p.mormon[marriage_data$state.initnum]
# combined evangelical + mormom proportions
marriage_data$p.relig.full <- marriage_data$p.evang.full + marriage_data$p.mormon.full
# kerry's % of 2-party vote in respondent's state in 2004
marriage_data$p.kerry.full <- Statelevel$kerry.04[marriage_data$state.initnum]
# same coding as above
Census$crace.female <- (Census$cfemale *3) + Census$crace.WBH 
Census$cage.edu.cat <- 4 * (Census$cage.cat -1) + Census$cedu.cat 
Census$cp.evang.full<-  Statelevel$p.evang[Census$cstate.initnum]
Census$cp.mormon.full <- Statelevel$p.mormon[Census$cstate.initnum]
Census$cp.relig.full <- Census$cp.evang.full + Census$cp.mormon.full
Census$cp.kerry.full <-  Statelevel$kerry.04[Census$cstate.initnum]

individual_model <- glmer(formula = yes.of.all ~ (1 | race.female) + (1 | age.cat)
                          + ( 1 |edu.cat) + (1 | age.edu.cat) + (1 | state) + (1 | region)
                          + (1 | poll) + p.relig.full + p.kerry.full, data = marriage_data,
                          family = binomial(link = 'logit'))

library(arm) # inverse logit function
library(doParallel) # parallelize simulation
library(ggplot2) # plots
library(plotly) # interactive plots

# parallel computing setup
registerDoParallel(makeCluster(parallel::detectCores()))

# simulate responses from model
sim <- simulate(individual_model, nsim = 50, re.form = NULL)

# create dataframe to merge with simulated outcome vectors; account for listwise deletion
sim_dat <- na.omit(marriage_data[, c('race.female', 'age.cat', 'edu.cat', 'age.edu.cat',
                        'state', 'region', 'poll', 'p.relig.full', 'p.kerry.full')])

# check that simulated outcome vectors have same length as dataframe
stopifnot(nrow(sim_dat) == nrow(sim))

# parallel loop for simulations
sim_outcome <- foreach(i = 1:ncol(sim), .packages = c('lme4', 'arm'), .combine = cbind) %dopar% {
  
  # get ith simulated outcome vector
  temp_dat <- data.frame(sim_dat, yes_sim = sim[, i])
  
  # run model on simulated outcome vector
  temp_mod <- glmer(yes_sim ~ (1 | race.female) + (1 | age.cat) + ( 1 |edu.cat)
                    + (1 | age.edu.cat) + (1 | state) + (1 | region) + (1 | poll)
                    + p.relig.full + p.kerry.full, data = temp_dat,
                    family = binomial(link = 'logit'))
  
  # empty vector to hold state random effects
  state_ranefs <- array(NA, c(51, 1))
  
  # set state names as row names
  dimnames(state_ranefs) <- list(c(Statelevel$sstate), 'effect')
  
  # assign state random effects to array while preserving NAs
  for (i in Statelevel$sstate) {
    
    state_ranefs[i, ] <- ranef(temp_mod)$state[i, 1]
    
  }
  
  # set states with missing REs (b/c not in data) to zero
  state_ranefs[, 1][is.na(state_ranefs[, 1])] <- 0
  
  # prediction for each cell in census data
  temp_pred <- invlogit(fixef(temp_mod)["(Intercept)"]
                     + ranef(temp_mod)$race.female[Census$crace.female, 1]
                     + ranef(temp_mod)$age.cat[Census$cage.cat, 1]
                     + ranef(temp_mod)$edu.cat[Census$cedu.cat, 1]
                     + ranef(temp_mod)$age.edu.cat[Census$cage.edu.cat, 1]
                     + state_ranefs[Census$cstate, 1]
                     + ranef(temp_mod)$region[Census$cregion, 1]
                     + (fixef(individual_model)["p.relig.full"] * Census$cp.relig.full)
                     + (fixef(temp_mod)["p.kerry.full"] * Census$cp.kerry.full))
  
  # weight prediction by cell frequency
  temp_wgt <- temp_pred * Census$cpercent.state
  
  # sum proportion of cells supporting in each state and convert to percent
  100 * as.vector(tapply(temp_wgt, Census$cstate, sum))
  
}

# collect mean, .025 and .975 quantiles, and proportion religious
sim_gg <- data.frame(State = Statelevel$sstate,
           Lower = apply(sim_outcome, 1, quantile, probs = .025),
           Mean = apply(sim_outcome, 1, mean),
           Upper = apply(sim_outcome, 1, quantile, probs = .975),
           Religion = Statelevel$p.evang + Statelevel$p.mormon)

# present estimates
sim_gg[, -ncol(sim_gg)]

# plot estimates with uncertainty, colored by religion, label to avoid reorder() in plotly
sim_plot <- ggplot(sim_gg, aes(x = reorder(State, Mean), y = Mean, ymin = Lower,
                               ymax = Upper, color = Religion, label = State)) +
  geom_linerange(alpha = .75, size = 1) +
  geom_point() +
  labs(x = 'State', y = 'Estimated Support') +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x  = element_text(angle = 90),
        legend.position = 'right',
        plot.background = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        panel.border = element_blank())

# interactive plot; ggplotly needs colour, not color
ggplotly(sim_plot, tooltip = c('label', 'ymax', 'y', 'ymin', 'colour'))

Multilevel Regression with Poststratification

Rob Williams

September 20, 2018

Individual Exercise Solution