Reading in the data

library(ggplot2)
library(arm)
srrs2 <- read.table ("srrs2.dat", header=T, sep=",")
mn <- srrs2[srrs2$state=="MN",]
mn$radon <- mn$activity
mn$log.radon <- log (ifelse (mn$radon==0, 0.0001, mn$radon))

Complete pooling

mean(mn$log.radon)
## [1] 1.202073
sd(mn$log.radon)
## [1] 1.021198

Let’s compute the average log-radon for all counties

fit <- lm(log.radon~0+county, data=mn)
s <- summary(fit)
coefs <- as.data.frame(s$coefficients)

Add a column with County nmaes

cleanup_name <- function(s){
  gsub(" ", "", gsub("county", "", s))
}


coefs$county <- as.character(lapply(rownames(s$coefficients), cleanup_name))
ggplot(coefs, aes(x = county, y = Estimate)) +  
  geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`)) + 
  theme(axis.text.x = element_text(angle = 90))

subset_coefs <- coefs[c(2, 10, 22, 30, 36, 40, 50, 55, 60, 65),]
ggplot(subset_coefs, aes(x = county, y = Estimate)) +  
  geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`)) + 
  theme(axis.text.x = element_text(angle = 90))

How many datapoints for Lac Qui Parle?

mn$county_clean <- as.character(lapply(mn$county, cleanup_name))
mn[mn$county_clean=="LACQUIPARLE", c("zip", "log.radon")]
##        zip log.radon
## 5498 56256  2.424803
## 5499 56256  2.772589

Just two datapoints.

What about Anoka?

mn[mn$county_clean=="ANOKA", c("zip", "log.radon")]
##        zip   log.radon
## 5085 55011  1.13140211
## 5086 55014  0.91629073
## 5087 55014  0.40546511
## 5088 55014  0.00000000
## 5089 55014 -0.35667494
## 5090 55014  0.18232156
## 5091 55014  0.18232156
## 5092 55014  0.26236426
## 5093 55014  0.33647224
## 5094 55025 -0.91629073
## 5095 55025  0.09531018
## 5096 55025  1.50407740
## 5097 55025  0.26236426
## 5098 55025  0.74193734
## 5099 55038  1.77495235
## 5100 55070  1.19392247
## 5101 55303  0.58778666
## 5102 55303  1.68639895
## 5103 55303  1.84054963
## 5104 55303  0.64185389
## 5105 55303  1.88706965
## 5106 55303  1.13140211
## 5107 55303  1.91692261
## 5108 55303  1.94591015
## 5109 55303  2.04122033
## 5110 55303  1.64865863
## 5111 55303  1.50407740
## 5112 55303  1.48160454
## 5113 55303  1.02961942
## 5114 55303  2.09186406
## 5115 55303  0.47000363
## 5116 55304  1.43508453
## 5117 55304  1.68639895
## 5118 55304  1.38629436
## 5119 55304  0.83290912
## 5120 55304  1.06471074
## 5121 55304  0.33647224
## 5122 55304  1.19392247
## 5123 55304  1.06471074
## 5124 55304  0.58778666
## 5125 55304 -1.60943791
## 5126 55421  0.87546874
## 5127 55421  0.09531018
## 5128 55432  0.78845736
## 5129 55432 -0.51082562
## 5130 55432  0.53062825
## 5131 55433  1.06471074
## 5132 55433  0.78845736
## 5133 55434  0.53062825
## 5134 55434  0.33647224
## 5135 55434  0.64185389
## 5136 55434  0.58778666

Lots more. In general, larger error bars means smaller sample size. We should trust that estimate less.

Multilvel Model – fake data

Let’s generate county data

mu_a <- 1.1
sigma_a <- .3
sigma_y <- 0.8

n_counties <- 10
county_alphas <- rnorm(n_counties, mu_a, sigma_a)
county_alphas
##  [1] 1.0223914 0.4763150 1.6578417 1.3256806 1.3742734 1.2144768 1.4397130
##  [8] 0.5161406 1.1590883 1.1223003

Now, 10 measurements for county 2 would look like this:

rnorm(10, county_alphas[2], sigma_y)
##  [1]  0.31755287 -0.30276395 -0.10094444  1.56763453  1.46959676  1.22597694
##  [7] -0.87499724  2.39693148 -0.65364121 -0.04306499

Another 15 measurements for county 8 would look like this:

rnorm(15, county_alphas[8], sigma_y)
##  [1]  1.36856904  1.35195827  0.67652686  0.99592914 -0.50312683  0.82775417
##  [7]  0.02079169  2.05401592  0.42529953  2.49011689  0.32098607  2.07001064
## [13]  1.42401532 -0.21382099  0.46099973

Partial Pooling

Let’s now fit the Partial Pooling model

library(lme4)
summary(lmer(log.radon~(1|county), data=mn))
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county)
##    Data: mn
## 
## REML criterion at convergence: 2614.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -10.3224  -0.4459   0.0615   0.5504   2.7445 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  county   (Intercept) 0.1117   0.3342  
##  Residual             0.9461   0.9727  
## Number of obs: 919, groups:  county, 85
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  1.27160    0.05592   22.74

Let’s get the individual intercepts

coef(lmer(log.radon~(1|county), data=mn))
## $county
##                      (Intercept)
## AITKIN                 1.0755314
## ANOKA                  0.8946430
## BECKER                 1.2131991
## BELTRAMI               1.2124937
## BENTON                 1.2654507
## BIG STONE              1.3347448
## BLUE EARTH             1.6688905
## BROWN                  1.3863487
## CARLTON                1.0871734
## CARVER                 0.7660005
## CASS                   1.3196842
## CHIPPEWA               1.4187295
## CHISAGO                1.1750292
## CLAY                   1.5899397
## CLEARWATER             1.1773177
## COOK                   1.1556877
## COTTONWOOD             0.5452748
## CROW WING              1.0780270
## DAKOTA                 1.2900755
## DODGE                  1.4098961
## DOUGLAS                1.4685743
## FARIBAULT              0.9585591
## FILLMORE               1.2273003
## FREEBORN               1.6167304
## GOODHUE                1.6306103
## HENNEPIN               1.2840885
## HOUSTON                1.3800696
## HUBBARD                1.0999706
## ISANTI                 1.2152035
## ITASCA                 1.0761991
## JACKSON                1.5496315
## KANABEC                1.2602714
## KANDIYOHI              1.5251135
## KITTSON                1.2293490
## KOOCHICHING            0.8805625
## LAC QUI PARLE          1.5251251
## LAKE                   0.7823833
## LAKE OF THE WOODS      1.3472564
## LE SUEUR               1.3949768
## LINCOLN                1.5487868
## LYON                   1.5657185
## MAHNOMEN               1.2810368
## MARSHALL               1.2357096
## MARTIN                 1.1206635
## MCLEOD                 0.8300185
## MEEKER                 1.2504431
## MILLE LACS             1.1287125
## MORRISON               1.1667358
## MOWER                  1.4651584
## MURRAY                 1.4006081
## NICOLLET               1.5582081
## NOBLES                 1.4432142
## NORMAN                 1.2033340
## OLMSTED                1.2286032
## OTTER TAIL             1.3086100
## PENNINGTON             1.1013508
## PINE                   1.0105570
## PIPESTONE              1.4028640
## POLK                   1.3004417
## POPE                   1.2730861
## RAMSEY                 1.1289335
## REDWOOD                1.4821937
## RENVILLE               1.3131671
## RICE                   1.5601621
## ROCK                   1.2768552
## ROSEAU                 1.2606360
## SCOTT                  1.4591992
## SHERBURNE              1.1833959
## SIBLEY                 1.2622484
## STEARNS                1.3499980
## STEELE                 1.4385272
## STEVENS                1.3709689
## ST LOUIS               0.8024961
## SWIFT                  1.1803136
## TODD                   1.3270361
## TRAVERSE               1.4501858
## WABASHA                1.4739400
## WADENA                 1.1698370
## WASECA                 1.0031416
## WASHINGTON             1.2543716
## WATONWAN               1.5220740
## WILKIN                 1.3728133
## WINONA                 1.3836476
## WRIGHT                 1.4643536
## YELLOW MEDICINE        1.2553450
## 
## attr(,"class")
## [1] "coef.mer"

Here, we are saying that we want to fit an individual intercept for each county, and each individual intercept for each county is modelled: that is, we are assuming that \(\alpha_j~N(\mu_\alpha, \sigma_\alpha)\).

Let’s look at Lac Qui Parle County specifically

#Partial pooling
coef(lmer(log.radon~(1|county), data=mn))$county["LAC QUI PARLE       ",]
## [1] 1.525125
#No-pooling
as.data.frame(lm(log.radon~county, data=mn)$coefficients)["countyLAC QUI PARLE       ",]
## [1] 1.938289
#Complete pooling  
lm(log.radon~1, data=mn)
## 
## Call:
## lm(formula = log.radon ~ 1, data = mn)
## 
## Coefficients:
## (Intercept)  
##       1.202

As you can see, partial pooling helps move the estimate towards the mean, by using information about the other counties.

Adding predictors

Let’s add the floor predictor

lmer(log.radon~(1|county) + floor, data=mn) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county) + floor
##    Data: mn
## REML criterion at convergence: 2533.209
## Random effects:
##  Groups   Name        Std.Dev.
##  county   (Intercept) 0.3389  
##  Residual             0.9275  
## Number of obs: 919, groups:  county, 85
## Fixed Effects:
## (Intercept)        floor  
##      1.4406      -0.8051

Let’s fit a different intercept for each county as well.

lmer(log.radon~floor+(floor|county) , data=mn) 
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00234467 (tol = 0.002, component 1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ floor + (floor | county)
##    Data: mn
## REML criterion at convergence: 2518.887
## Random effects:
##  Groups   Name        Std.Dev. Corr 
##  county   (Intercept) 0.3026        
##           floor       0.6127   -0.01
##  Residual             0.9053        
## Number of obs: 919, groups:  county, 85
## Fixed Effects:
## (Intercept)        floor  
##      1.4416      -0.7672  
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings

Let’s get the modelled/random effects

fit<-lmer(log.radon~floor+(floor|county) , data=mn) 
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00234467 (tol = 0.002, component 1)
ranef(fit)
## $county
##                       (Intercept)         floor
## AITKIN               -0.189868218  0.0978208813
## ANOKA                -0.466522074 -0.2955024893
## BECKER                0.017337531  0.0722466305
## BELTRAMI              0.063493857 -0.0120137008
## BENTON               -0.009274453  0.1256405338
## BIG STONE             0.017917212 -0.0003228185
## BLUE EARTH            0.299860106  0.5875348213
## BROWN                 0.152206785  0.1537969837
## CARLTON              -0.251511955  0.4114592692
## CARVER               -0.146573103 -1.2642108290
## CASS                 -0.014511592  0.0002614586
## CHIPPEWA              0.089134224 -0.0016059515
## CHISAGO              -0.161678154  0.0029129920
## CLAY                  0.344275617 -0.0150516989
## CLEARWATER           -0.031529881  0.0427270937
## COOK                 -0.141847202  0.0025556932
## COTTONWOOD           -0.250350173 -1.3793442116
## CROW WING            -0.217373344  0.2808760672
## DAKOTA               -0.081562059 -0.1123372099
## DODGE                 0.090031148 -0.0016221116
## DOUGLAS               0.140288523  0.1533408427
## FARIBAULT            -0.318048627 -0.0255986537
## FILLMORE             -0.010889012  0.0795418459
## FREEBORN              0.322350851  0.1207262407
## GOODHUE               0.265223831  0.5556027445
## HENNEPIN             -0.063769475 -0.0660968135
## HOUSTON               0.114066611  0.1854159327
## HUBBARD              -0.075086718  0.0616980430
## ISANTI               -0.096792541  0.0017439332
## ITASCA               -0.284408299  0.0051242488
## JACKSON               0.207474269 -0.0037381109
## KANABEC              -0.063413972  0.0011425439
## KANDIYOHI             0.191550448 -0.0034512079
## KITTSON               0.047037383 -0.0110027864
## KOOCHICHING          -0.278264275  0.0665504915
## LAC QUI PARLE         0.236476436  0.4726717941
## LAKE                 -0.520886562  0.0406289793
## LAKE OF THE WOODS     0.102447333  0.2308231571
## LE SUEUR              0.097734410  0.2070537767
## LINCOLN               0.265949548  0.0948547112
## LYON                  0.231162859  0.3194433602
## MAHNOMEN             -0.008103843  0.0001460088
## MARSHALL              0.250042201 -0.5576289174
## MARTIN               -0.148031273 -0.3812822183
## MCLEOD               -0.172160367 -1.1988106841
## MEEKER               -0.081355324  0.0014657973
## MILLE LACS           -0.058623765 -0.4104742313
## MORRISON             -0.155187350  0.1839319531
## MOWER                 0.189077283 -0.3291098195
## MURRAY                0.105663551 -0.0019037642
## NICOLLET              0.223410742 -0.0040252420
## NOBLES                0.122002663 -0.0021981496
## NORMAN               -0.039527520 -0.0513077393
## OLMSTED              -0.061090478 -0.3323114622
## OTTER TAIL            0.050142953  0.2306127096
## PENNINGTON           -0.029341965 -0.2831844293
## PINE                 -0.264776240 -0.0681376206
## PIPESTONE             0.116369012  0.2083615420
## POLK                  0.105430025 -0.0743598530
## POPE                 -0.029626885  0.0005337943
## RAMSEY               -0.254317298  0.4942659675
## REDWOOD               0.145770447  0.6924512898
## RENVILLE              0.032732557  0.3296758369
## RICE                  0.226850222 -0.0109053897
## ROCK                 -0.026024042  0.0004688811
## ROSEAU                0.087674122  0.1864157333
## SCOTT                 0.120872681  0.5527192804
## SHERBURNE            -0.165928216  0.0029895663
## SIBLEY               -0.061510706  0.0011082523
## STEARNS               0.057490029 -0.1262168013
## STEELE                0.072959501 -0.0013145279
## STEVENS               0.063938624 -0.0011519967
## ST LOUIS             -0.544177651  0.1346676463
## SWIFT                -0.140389667  0.0025294325
## TODD                  0.045475974  0.3361651505
## TRAVERSE              0.173345358  0.0660142226
## WABASHA               0.185489348 -0.2424861106
## WADENA               -0.032560500 -0.1137740875
## WASECA               -0.238292362 -0.1671594944
## WASHINGTON           -0.071420389 -0.1982006344
## WATONWAN              0.234880130  0.5342868519
## WILKIN                0.079217874 -0.0014272864
## WINONA                0.196036772 -0.5944354112
## WRIGHT                0.122301968  0.0098601369
## YELLOW MEDICINE      -0.046585485  0.0008393412
## 
## with conditional variances for "county"

Now, let’s do prediction. Suppose we want to make a prediction for a new measurement in Lac Qui Parle county.

fit<-lmer(log.radon~floor+(floor|county) , data=mn) 
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00234467 (tol = 0.002, component 1)
sigma.y.hat <- sigma.hat(fit)$sigma$data
coef.hat <- as.matrix(coef(fit)$county["LAC QUI PARLE       ",])

x_new <- 1  #first floor
y_new <- rnorm(1000, coef.hat %*% c(1, x_new), sigma.y.hat  )
qplot(y_new)

What we got is what the log-radon measurements could possibly look like in Lac Qui Parle county. What about the radon measurements?

qplot(exp(y_new))

Let’s get a 95% predictive interval

quantile(y_new, c(0.025,  0.975))
##       2.5%      97.5% 
## -0.2687799  3.1985124
quantile(exp(y_new), c(0.025,  0.975))
##       2.5%      97.5% 
##  0.7643115 24.4960738

In our simulation, 95% of the data fell within the above interval.

Now, let’s predict a new observation in a new county that we haven’t seen.

sigma_a <- 0.3487
sigma_b <- 0.3436
rho <- -0.34
Sigma <-  matrix(c(sigma_a^2           , rho*sigma_a*sigma_b, 
                   rho*sigma_a*sigma_b , sigma_b^2),
   nrow=2,       
   ncol=2,       
   byrow = TRUE)   

mu_a <- 1.4628
mu_b <- -0.6811
mu <- matrix(c(mu_a,
               mu_b),
               nrow=2,
               byrow=TRUE)
             
sigma_y <- 0.7462

random_params <- mvrnorm(n = 1000, mu, Sigma)


x_new = 1  # first floor

y_new <- rnorm(1000, mean=random_params %*% c(1, x_new), sd=sigma_y)
qplot(y_new)

Let’s now construct a predictive interval again

quantile(y_new, c(0.025,  0.975))
##      2.5%     97.5% 
## -0.816665  2.443631

As you can see, the predictive interval is wider for an unknown county (of course).