library(ggplot2)
library(arm)
srrs2 <- read.table ("srrs2.dat", header=T, sep=",")
mn <- srrs2[srrs2$state=="MN",]
mn$radon <- mn$activity
mn$log.radon <- log (ifelse (mn$radon==0, 0.0001, mn$radon))
mean(mn$log.radon)
## [1] 1.202073
sd(mn$log.radon)
## [1] 1.021198
Let’s compute the average log-radon for all counties
fit <- lm(log.radon~0+county, data=mn)
s <- summary(fit)
coefs <- as.data.frame(s$coefficients)
Add a column with County nmaes
cleanup_name <- function(s){
gsub(" ", "", gsub("county", "", s))
}
coefs$county <- as.character(lapply(rownames(s$coefficients), cleanup_name))
ggplot(coefs, aes(x = county, y = Estimate)) +
geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`)) +
theme(axis.text.x = element_text(angle = 90))
subset_coefs <- coefs[c(2, 10, 22, 30, 36, 40, 50, 55, 60, 65),]
ggplot(subset_coefs, aes(x = county, y = Estimate)) +
geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`)) +
theme(axis.text.x = element_text(angle = 90))
How many datapoints for Lac Qui Parle?
mn$county_clean <- as.character(lapply(mn$county, cleanup_name))
mn[mn$county_clean=="LACQUIPARLE", c("zip", "log.radon")]
## zip log.radon
## 5498 56256 2.424803
## 5499 56256 2.772589
Just two datapoints.
What about Anoka?
mn[mn$county_clean=="ANOKA", c("zip", "log.radon")]
## zip log.radon
## 5085 55011 1.13140211
## 5086 55014 0.91629073
## 5087 55014 0.40546511
## 5088 55014 0.00000000
## 5089 55014 -0.35667494
## 5090 55014 0.18232156
## 5091 55014 0.18232156
## 5092 55014 0.26236426
## 5093 55014 0.33647224
## 5094 55025 -0.91629073
## 5095 55025 0.09531018
## 5096 55025 1.50407740
## 5097 55025 0.26236426
## 5098 55025 0.74193734
## 5099 55038 1.77495235
## 5100 55070 1.19392247
## 5101 55303 0.58778666
## 5102 55303 1.68639895
## 5103 55303 1.84054963
## 5104 55303 0.64185389
## 5105 55303 1.88706965
## 5106 55303 1.13140211
## 5107 55303 1.91692261
## 5108 55303 1.94591015
## 5109 55303 2.04122033
## 5110 55303 1.64865863
## 5111 55303 1.50407740
## 5112 55303 1.48160454
## 5113 55303 1.02961942
## 5114 55303 2.09186406
## 5115 55303 0.47000363
## 5116 55304 1.43508453
## 5117 55304 1.68639895
## 5118 55304 1.38629436
## 5119 55304 0.83290912
## 5120 55304 1.06471074
## 5121 55304 0.33647224
## 5122 55304 1.19392247
## 5123 55304 1.06471074
## 5124 55304 0.58778666
## 5125 55304 -1.60943791
## 5126 55421 0.87546874
## 5127 55421 0.09531018
## 5128 55432 0.78845736
## 5129 55432 -0.51082562
## 5130 55432 0.53062825
## 5131 55433 1.06471074
## 5132 55433 0.78845736
## 5133 55434 0.53062825
## 5134 55434 0.33647224
## 5135 55434 0.64185389
## 5136 55434 0.58778666
Lots more. In general, larger error bars means smaller sample size. We should trust that estimate less.
Let’s generate county data
mu_a <- 1.1
sigma_a <- .3
sigma_y <- 0.8
n_counties <- 10
county_alphas <- rnorm(n_counties, mu_a, sigma_a)
county_alphas
## [1] 1.0223914 0.4763150 1.6578417 1.3256806 1.3742734 1.2144768 1.4397130
## [8] 0.5161406 1.1590883 1.1223003
Now, 10 measurements for county 2 would look like this:
rnorm(10, county_alphas[2], sigma_y)
## [1] 0.31755287 -0.30276395 -0.10094444 1.56763453 1.46959676 1.22597694
## [7] -0.87499724 2.39693148 -0.65364121 -0.04306499
Another 15 measurements for county 8 would look like this:
rnorm(15, county_alphas[8], sigma_y)
## [1] 1.36856904 1.35195827 0.67652686 0.99592914 -0.50312683 0.82775417
## [7] 0.02079169 2.05401592 0.42529953 2.49011689 0.32098607 2.07001064
## [13] 1.42401532 -0.21382099 0.46099973
Let’s now fit the Partial Pooling model
library(lme4)
summary(lmer(log.radon~(1|county), data=mn))
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county)
## Data: mn
##
## REML criterion at convergence: 2614.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -10.3224 -0.4459 0.0615 0.5504 2.7445
##
## Random effects:
## Groups Name Variance Std.Dev.
## county (Intercept) 0.1117 0.3342
## Residual 0.9461 0.9727
## Number of obs: 919, groups: county, 85
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1.27160 0.05592 22.74
Let’s get the individual intercepts
coef(lmer(log.radon~(1|county), data=mn))
## $county
## (Intercept)
## AITKIN 1.0755314
## ANOKA 0.8946430
## BECKER 1.2131991
## BELTRAMI 1.2124937
## BENTON 1.2654507
## BIG STONE 1.3347448
## BLUE EARTH 1.6688905
## BROWN 1.3863487
## CARLTON 1.0871734
## CARVER 0.7660005
## CASS 1.3196842
## CHIPPEWA 1.4187295
## CHISAGO 1.1750292
## CLAY 1.5899397
## CLEARWATER 1.1773177
## COOK 1.1556877
## COTTONWOOD 0.5452748
## CROW WING 1.0780270
## DAKOTA 1.2900755
## DODGE 1.4098961
## DOUGLAS 1.4685743
## FARIBAULT 0.9585591
## FILLMORE 1.2273003
## FREEBORN 1.6167304
## GOODHUE 1.6306103
## HENNEPIN 1.2840885
## HOUSTON 1.3800696
## HUBBARD 1.0999706
## ISANTI 1.2152035
## ITASCA 1.0761991
## JACKSON 1.5496315
## KANABEC 1.2602714
## KANDIYOHI 1.5251135
## KITTSON 1.2293490
## KOOCHICHING 0.8805625
## LAC QUI PARLE 1.5251251
## LAKE 0.7823833
## LAKE OF THE WOODS 1.3472564
## LE SUEUR 1.3949768
## LINCOLN 1.5487868
## LYON 1.5657185
## MAHNOMEN 1.2810368
## MARSHALL 1.2357096
## MARTIN 1.1206635
## MCLEOD 0.8300185
## MEEKER 1.2504431
## MILLE LACS 1.1287125
## MORRISON 1.1667358
## MOWER 1.4651584
## MURRAY 1.4006081
## NICOLLET 1.5582081
## NOBLES 1.4432142
## NORMAN 1.2033340
## OLMSTED 1.2286032
## OTTER TAIL 1.3086100
## PENNINGTON 1.1013508
## PINE 1.0105570
## PIPESTONE 1.4028640
## POLK 1.3004417
## POPE 1.2730861
## RAMSEY 1.1289335
## REDWOOD 1.4821937
## RENVILLE 1.3131671
## RICE 1.5601621
## ROCK 1.2768552
## ROSEAU 1.2606360
## SCOTT 1.4591992
## SHERBURNE 1.1833959
## SIBLEY 1.2622484
## STEARNS 1.3499980
## STEELE 1.4385272
## STEVENS 1.3709689
## ST LOUIS 0.8024961
## SWIFT 1.1803136
## TODD 1.3270361
## TRAVERSE 1.4501858
## WABASHA 1.4739400
## WADENA 1.1698370
## WASECA 1.0031416
## WASHINGTON 1.2543716
## WATONWAN 1.5220740
## WILKIN 1.3728133
## WINONA 1.3836476
## WRIGHT 1.4643536
## YELLOW MEDICINE 1.2553450
##
## attr(,"class")
## [1] "coef.mer"
Here, we are saying that we want to fit an individual intercept for each county, and each individual intercept for each county is modelled: that is, we are assuming that \(\alpha_j~N(\mu_\alpha, \sigma_\alpha)\).
Let’s look at Lac Qui Parle County specifically
#Partial pooling
coef(lmer(log.radon~(1|county), data=mn))$county["LAC QUI PARLE ",]
## [1] 1.525125
#No-pooling
as.data.frame(lm(log.radon~county, data=mn)$coefficients)["countyLAC QUI PARLE ",]
## [1] 1.938289
#Complete pooling
lm(log.radon~1, data=mn)
##
## Call:
## lm(formula = log.radon ~ 1, data = mn)
##
## Coefficients:
## (Intercept)
## 1.202
As you can see, partial pooling helps move the estimate towards the mean, by using information about the other counties.
Let’s add the floor predictor
lmer(log.radon~(1|county) + floor, data=mn)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ (1 | county) + floor
## Data: mn
## REML criterion at convergence: 2533.209
## Random effects:
## Groups Name Std.Dev.
## county (Intercept) 0.3389
## Residual 0.9275
## Number of obs: 919, groups: county, 85
## Fixed Effects:
## (Intercept) floor
## 1.4406 -0.8051
Let’s fit a different intercept for each county as well.
lmer(log.radon~floor+(floor|county) , data=mn)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00234467 (tol = 0.002, component 1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: log.radon ~ floor + (floor | county)
## Data: mn
## REML criterion at convergence: 2518.887
## Random effects:
## Groups Name Std.Dev. Corr
## county (Intercept) 0.3026
## floor 0.6127 -0.01
## Residual 0.9053
## Number of obs: 919, groups: county, 85
## Fixed Effects:
## (Intercept) floor
## 1.4416 -0.7672
## convergence code 0; 1 optimizer warnings; 0 lme4 warnings
Let’s get the modelled/random effects
fit<-lmer(log.radon~floor+(floor|county) , data=mn)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00234467 (tol = 0.002, component 1)
ranef(fit)
## $county
## (Intercept) floor
## AITKIN -0.189868218 0.0978208813
## ANOKA -0.466522074 -0.2955024893
## BECKER 0.017337531 0.0722466305
## BELTRAMI 0.063493857 -0.0120137008
## BENTON -0.009274453 0.1256405338
## BIG STONE 0.017917212 -0.0003228185
## BLUE EARTH 0.299860106 0.5875348213
## BROWN 0.152206785 0.1537969837
## CARLTON -0.251511955 0.4114592692
## CARVER -0.146573103 -1.2642108290
## CASS -0.014511592 0.0002614586
## CHIPPEWA 0.089134224 -0.0016059515
## CHISAGO -0.161678154 0.0029129920
## CLAY 0.344275617 -0.0150516989
## CLEARWATER -0.031529881 0.0427270937
## COOK -0.141847202 0.0025556932
## COTTONWOOD -0.250350173 -1.3793442116
## CROW WING -0.217373344 0.2808760672
## DAKOTA -0.081562059 -0.1123372099
## DODGE 0.090031148 -0.0016221116
## DOUGLAS 0.140288523 0.1533408427
## FARIBAULT -0.318048627 -0.0255986537
## FILLMORE -0.010889012 0.0795418459
## FREEBORN 0.322350851 0.1207262407
## GOODHUE 0.265223831 0.5556027445
## HENNEPIN -0.063769475 -0.0660968135
## HOUSTON 0.114066611 0.1854159327
## HUBBARD -0.075086718 0.0616980430
## ISANTI -0.096792541 0.0017439332
## ITASCA -0.284408299 0.0051242488
## JACKSON 0.207474269 -0.0037381109
## KANABEC -0.063413972 0.0011425439
## KANDIYOHI 0.191550448 -0.0034512079
## KITTSON 0.047037383 -0.0110027864
## KOOCHICHING -0.278264275 0.0665504915
## LAC QUI PARLE 0.236476436 0.4726717941
## LAKE -0.520886562 0.0406289793
## LAKE OF THE WOODS 0.102447333 0.2308231571
## LE SUEUR 0.097734410 0.2070537767
## LINCOLN 0.265949548 0.0948547112
## LYON 0.231162859 0.3194433602
## MAHNOMEN -0.008103843 0.0001460088
## MARSHALL 0.250042201 -0.5576289174
## MARTIN -0.148031273 -0.3812822183
## MCLEOD -0.172160367 -1.1988106841
## MEEKER -0.081355324 0.0014657973
## MILLE LACS -0.058623765 -0.4104742313
## MORRISON -0.155187350 0.1839319531
## MOWER 0.189077283 -0.3291098195
## MURRAY 0.105663551 -0.0019037642
## NICOLLET 0.223410742 -0.0040252420
## NOBLES 0.122002663 -0.0021981496
## NORMAN -0.039527520 -0.0513077393
## OLMSTED -0.061090478 -0.3323114622
## OTTER TAIL 0.050142953 0.2306127096
## PENNINGTON -0.029341965 -0.2831844293
## PINE -0.264776240 -0.0681376206
## PIPESTONE 0.116369012 0.2083615420
## POLK 0.105430025 -0.0743598530
## POPE -0.029626885 0.0005337943
## RAMSEY -0.254317298 0.4942659675
## REDWOOD 0.145770447 0.6924512898
## RENVILLE 0.032732557 0.3296758369
## RICE 0.226850222 -0.0109053897
## ROCK -0.026024042 0.0004688811
## ROSEAU 0.087674122 0.1864157333
## SCOTT 0.120872681 0.5527192804
## SHERBURNE -0.165928216 0.0029895663
## SIBLEY -0.061510706 0.0011082523
## STEARNS 0.057490029 -0.1262168013
## STEELE 0.072959501 -0.0013145279
## STEVENS 0.063938624 -0.0011519967
## ST LOUIS -0.544177651 0.1346676463
## SWIFT -0.140389667 0.0025294325
## TODD 0.045475974 0.3361651505
## TRAVERSE 0.173345358 0.0660142226
## WABASHA 0.185489348 -0.2424861106
## WADENA -0.032560500 -0.1137740875
## WASECA -0.238292362 -0.1671594944
## WASHINGTON -0.071420389 -0.1982006344
## WATONWAN 0.234880130 0.5342868519
## WILKIN 0.079217874 -0.0014272864
## WINONA 0.196036772 -0.5944354112
## WRIGHT 0.122301968 0.0098601369
## YELLOW MEDICINE -0.046585485 0.0008393412
##
## with conditional variances for "county"
Now, let’s do prediction. Suppose we want to make a prediction for a new measurement in Lac Qui Parle county.
fit<-lmer(log.radon~floor+(floor|county) , data=mn)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00234467 (tol = 0.002, component 1)
sigma.y.hat <- sigma.hat(fit)$sigma$data
coef.hat <- as.matrix(coef(fit)$county["LAC QUI PARLE ",])
x_new <- 1 #first floor
y_new <- rnorm(1000, coef.hat %*% c(1, x_new), sigma.y.hat )
qplot(y_new)
What we got is what the log-radon measurements could possibly look like in Lac Qui Parle county. What about the radon measurements?
qplot(exp(y_new))
Let’s get a 95% predictive interval
quantile(y_new, c(0.025, 0.975))
## 2.5% 97.5%
## -0.2687799 3.1985124
quantile(exp(y_new), c(0.025, 0.975))
## 2.5% 97.5%
## 0.7643115 24.4960738
In our simulation, 95% of the data fell within the above interval.
Now, let’s predict a new observation in a new county that we haven’t seen.
sigma_a <- 0.3487
sigma_b <- 0.3436
rho <- -0.34
Sigma <- matrix(c(sigma_a^2 , rho*sigma_a*sigma_b,
rho*sigma_a*sigma_b , sigma_b^2),
nrow=2,
ncol=2,
byrow = TRUE)
mu_a <- 1.4628
mu_b <- -0.6811
mu <- matrix(c(mu_a,
mu_b),
nrow=2,
byrow=TRUE)
sigma_y <- 0.7462
random_params <- mvrnorm(n = 1000, mu, Sigma)
x_new = 1 # first floor
y_new <- rnorm(1000, mean=random_params %*% c(1, x_new), sd=sigma_y)
qplot(y_new)
Let’s now construct a predictive interval again
quantile(y_new, c(0.025, 0.975))
## 2.5% 97.5%
## -0.816665 2.443631
As you can see, the predictive interval is wider for an unknown county (of course).