DPI R Bootcamp
Jared Knowles
In this lesson we hope to learn:
In this tutorial we will use a number of datasets of different types:
stulong
: student-level assessment and demographics data (simulated and research ready)midwest_schools.csv
: aggregate school level test score averages from a large Midwest stateload("data/midwest_schools.rda")
head(midsch[, 1:12])
## district_id school_id subject grade n1 ss1 n2 ss2 predicted
## 1 14 130 math 4 44 433.1 40 463.0 468.7
## 2 70 20 math 4 18 443.0 20 477.2 476.5
## 3 112 80 math 4 86 445.4 94 472.6 478.4
## 4 119 50 math 4 95 427.1 94 460.7 464.1
## 5 147 60 math 4 27 424.2 27 458.7 461.8
## 6 147 125 math 4 17 423.5 26 463.1 461.2
## residuals resid_z resid_t
## 1 -5.7446 -0.59190 -0.59171
## 2 0.7235 0.07456 0.07452
## 3 -5.7509 -0.59267 -0.59248
## 4 -3.3586 -0.34606 -0.34591
## 5 -3.0937 -0.31877 -0.31863
## 6 1.8530 0.19094 0.19085
table(midsch$test_year, midsch$grade)
##
## 4 5 6 7 8
## 2007 1150 1094 472 638 734
## 2008 1204 1146 462 588 692
## 2009 1173 1092 434 592 668
## 2010 1120 1090 428 610 686
## 2011 1126 1060 420 618 688
length(unique(midsch$district_id))
## [1] 357
length(unique(midsch$school_id))
## [1] 247
table(midsch$subject, midsch$grade)
##
## 4 5 6 7 8
## math 2886 2741 1108 1523 1734
## read 2887 2741 1108 1523 1734
table(midsch$district_id,midsch$grade)
library(ggplot2)
qplot(ss1, ss2, data = midsch, alpha = I(0.07)) + theme_dpi() + geom_smooth() +
geom_smooth(method = "lm", se = FALSE, color = "purple")
data(mtcars) # load the data from R
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
shapiro.test(mtcars$mpg)
##
## Shapiro-Wilk normality test
##
## data: mtcars$mpg
## W = 0.9476, p-value = 0.1229
shapiro.test(mtcars$hp)
##
## Shapiro-Wilk normality test
##
## data: mtcars$hp
## W = 0.9334, p-value = 0.04881
mpg
variable thenmean(mtcars$mpg)
## [1] 20.09
t.test(mtcars$mpg, mu = 18, alternative = "greater")
##
## One Sample t-test
##
## data: mtcars$mpg
## t = 1.962, df = 31, p-value = 0.02938
## alternative hypothesis: true mean is greater than 18
## 95 percent confidence interval:
## 18.28 Inf
## sample estimates:
## mean of x
## 20.09
t.test(mtcars$mpg, mu = 22, alternative = "less")
##
## One Sample t-test
##
## data: mtcars$mpg
## t = -1.792, df = 31, p-value = 0.04144
## alternative hypothesis: true mean is less than 22
## 95 percent confidence interval:
## -Inf 21.9
## sample estimates:
## mean of x
## 20.09
1.96224703002802, 31, 0.0587642243155615, c(17.9176785087462, 22.2635714912538), 20.090625, 18, two.sided, One Sample t-test, mtcars$mpg
test?t.test(mpg ~ am, data = mtcars)
##
## Welch Two Sample t-test
##
## data: mpg by am
## t = -3.767, df = 18.33, p-value = 0.001374
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -11.28 -3.21
## sample estimates:
## mean in group 0 mean in group 1
## 17.15 24.39
428, NULL, 0.00222462952316261, 102, two.sided, Wilcoxon signed rank test with continuity correction, mtcars$hp
176, NULL, 0.0457013160663359, 0, two.sided, Wilcoxon rank sum test with continuity correction, hp by am
aspirin <- matrix(c(189, 104, 10845, 10933), ncol = 2, dimnames = list(c("Placebo",
"Aspirin"), c("MI", "No MI")))
aspirin
## MI No MI
## Placebo 189 10845
## Aspirin 104 10933
chisq.test(aspirin, correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: aspirin
## X-squared = 25.01, df = 1, p-value = 5.692e-07
fisher.test(aspirin)
##
## Fisher's Exact Test for Count Data
##
## data: aspirin
## p-value = 5.033e-07
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.432 2.354
## sample estimates:
## odds ratio
## 1.832
test_year
, grade
, and subject
?nrow(unique(midsch[, c(3, 4, 14)]))
## [1] 50
5th grade, 2011, math scores
midsch_sub <- subset(midsch, midsch$grade == 5 & midsch$test_year == 2011 &
midsch$subject == "math")
midsch_sub
?my_mod<-lm(ss2~ss1,data=midsch_sub)
lm
function~
character divides the dependent variable ss2
from the independent variable ss1
my_mod<-
data
means we don’t have to write: lm(midsch_sub$ss2~midsch_sub$ss1)
ss_mod <- lm(ss2 ~ ss1, data = midsch_sub)
summary(ss_mod)
##
## Call:
## lm(formula = ss2 ~ ss1, data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.36 -7.60 -0.42 6.49 58.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.1687 11.3446 -0.46 0.65
## ss1 1.0644 0.0242 44.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.2 on 528 degrees of freedom
## Multiple R-squared: 0.786, Adjusted R-squared: 0.785
## F-statistic: 1.94e+03 on 1 and 528 DF, p-value: <2e-16
objects(ss_mod)
## [1] "assign" "call" "coefficients" "df.residual"
## [5] "effects" "fitted.values" "model" "qr"
## [9] "rank" "residuals" "terms" "xlevels"
coefficients
fitted.values
and call
qplot(n2, ss2 - ss1, data = midsch, alpha = I(0.1)) + theme_dpi() + geom_smooth()
ssN1_mod <- lm(ss2 ~ ss1 + n1, data = midsch_sub)
summary(ssN1_mod)
##
## Call:
## lm(formula = ss2 ~ ss1 + n1, data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.39 -7.73 -0.52 6.42 59.67
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6849 11.7688 0.14 0.886
## ss1 1.0450 0.0258 40.49 <2e-16 ***
## n1 0.0406 0.0193 2.10 0.036 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.2 on 527 degrees of freedom
## Multiple R-squared: 0.787, Adjusted R-squared: 0.787
## F-statistic: 976 on 2 and 527 DF, p-value: <2e-16
ssN2_mod <- lm(ss2 ~ ss1 + n2, data = midsch_sub)
summary(ssN2_mod)
##
## Call:
## lm(formula = ss2 ~ ss1 + n2, data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.60 -7.62 -0.53 6.52 59.64
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7971 11.8544 0.15 0.88
## ss1 1.0450 0.0260 40.12 <2e-16 ***
## n2 0.0377 0.0192 1.97 0.05 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.2 on 527 degrees of freedom
## Multiple R-squared: 0.787, Adjusted R-squared: 0.786
## F-statistic: 975 on 2 and 527 DF, p-value: <2e-16
anova(ss_mod, ssN1_mod, ssN2_mod)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1
## Model 2: ss2 ~ ss1 + n1
## Model 3: ss2 ~ ss1 + n2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 528 66239
## 2 527 65688 1 551 4.42 0.036 *
## 3 527 65755 0 -67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(ssN2_mod)
## [1] 4067
AIC(ssN1_mod)
## [1] 4067
n1
and n2
but either improves model fit over the model without itlibrary(lmtest)
resettest(ss_mod, power = 2:4)
##
## RESET test
##
## data: ss_mod
## RESET = 2.642, df1 = 3, df2 = 525, p-value = 0.04866
raintest(ss2 ~ ss1, fraction = 0.5, order.by = ~ss1, data = midsch_sub)
##
## Rainbow test
##
## data: ss2 ~ ss1
## Rain = 1.402, df1 = 265, df2 = 263, p-value = 0.003105
harvtest(ss2 ~ ss1, order.by = ~ss1, data = midsch_sub)
##
## Harvey-Collier test
##
## data: ss2 ~ ss1
## HC = 2.734, df = 527, p-value = 0.006462
ss_poly <- lm(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), data = midsch_sub)
summary(ss_poly)
##
## Call:
## lm(formula = ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), data = midsch_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.89 -6.92 -0.20 6.76 59.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.61e+03 3.61e+04 0.07 0.94
## ss1 -8.72e+00 3.18e+02 -0.03 0.98
## I(ss1^2) -9.51e-03 1.05e+00 -0.01 0.99
## I(ss1^3) 7.21e-05 1.54e-03 0.05 0.96
## I(ss1^4) -6.98e-08 8.42e-07 -0.08 0.93
##
## Residual standard error: 11.1 on 525 degrees of freedom
## Multiple R-squared: 0.789, Adjusted R-squared: 0.787
## F-statistic: 490 on 4 and 525 DF, p-value: <2e-16
anova(ss_mod, ss_poly)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1
## Model 2: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 528 66239
## 2 525 65253 3 985 2.64 0.049 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(ss_poly, power = 2:4)
##
## RESET test
##
## data: ss_poly
## RESET = 1.562, df1 = 3, df2 = 522, p-value = 0.1976
raintest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), fraction = 0.5, order.by = ~ss1,
data = midsch_sub)
##
## Rainbow test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## Rain = 1.392, df1 = 265, df2 = 260, p-value = 0.003804
harvtest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4), order.by = ~ss1, data = midsch_sub)
##
## Harvey-Collier test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## HC = NA, df = 524, p-value = NA
ss_polyn <- lm(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, data = midsch_sub)
anova(ss_mod, ssN2_mod, ss_poly, ss_polyn)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1
## Model 2: ss2 ~ ss1 + n2
## Model 3: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4)
## Model 4: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 528 66239
## 2 527 65755 1 483 3.91 0.049 *
## 3 525 65253 2 502 2.03 0.133
## 4 524 64842 1 411 3.32 0.069 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(ss_polyn, power = 2:4)
##
## RESET test
##
## data: ss_polyn
## RESET = 2.485, df1 = 3, df2 = 521, p-value = 0.05991
raintest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, fraction = 0.5, order.by = ~ss1,
data = midsch_sub)
##
## Rainbow test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2
## Rain = 1.381, df1 = 265, df2 = 259, p-value = 0.004606
harvtest(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, order.by = ~ss1, data = midsch_sub)
##
## Harvey-Collier test
##
## data: ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2
## HC = NA, df = 523, p-value = NA
library(quantreg)
ss_quant <- rq(ss2 ~ ss1, tau = c(seq(0.1, 0.9, 0.1)), data = midsch_sub)
plot(summary(ss_quant, se = "boot", method = "wild"))
ss_quant
shows that in the lower quantiles the coefficients for the intercept and ss1
fall outside the confidence interval around the base coefficientss_quant2 <- rq(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, tau = c(seq(0.1,
0.9, 0.1)), data = midsch_sub)
plot(summary(ss_quant2, se = "boot", method = "wild"))
ss_quant3 <- rq(ss2 ~ ss1, tau = -1, data = midsch_sub)
qplot(ss_quant3$sol[1, ], ss_quant3$sol[5, ], geom = "line", main = "Continuous Quantiles") +
theme_dpi() + xlab("Quantile") + ylab(expression(beta)) + geom_hline(yintercept = coef(summary(ss_mod))[2,
1]) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] + (2 * coef(summary(ss_mod))[2,
2]), linetype = 3) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] -
(2 * coef(summary(ss_mod))[2, 2]), linetype = 3)
ss_quant4 <- rq(ss2 ~ ss1 + I(ss1^2) + I(ss1^3) + I(ss1^4) + n2, tau = -1, data = midsch_sub)
qplot(ss_quant4$sol[1, ], ss_quant4$sol[5, ], geom = "line", main = "Continuous Quantiles") +
theme_dpi() + xlab("Quantile") + ylab(expression(beta)) + geom_hline(yintercept = coef(summary(ss_mod))[2,
1]) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] + (2 * coef(summary(ss_mod))[2,
2]), linetype = 3) + geom_hline(yintercept = coef(summary(ss_mod))[2, 1] -
(2 * coef(summary(ss_mod))[2, 2]), linetype = 3)
dlply
library(plyr)
midsch$id <- interaction(midsch$test_year, midsch$grade, midsch$subject)
mods <- dlply(midsch, .(id), lm, formula = ss2 ~ ss1)
objects(mods)[1:10]
## [1] "2007.4.math" "2007.4.read" "2007.5.math" "2007.5.read" "2007.6.math"
## [6] "2007.6.read" "2007.7.math" "2007.7.read" "2007.8.math" "2007.8.read"
mytest <- llply(mods, function(x) resettest(x, power = 2:4))
mytest[[1]]
##
## RESET test
##
## data: x
## RESET = 2.499, df1 = 3, df2 = 570, p-value = 0.05876
mytest[[2]]
##
## RESET test
##
## data: x
## RESET = 0.8864, df1 = 3, df2 = 597, p-value = 0.4478
a1 <- qplot(id, residmean, data = ddply(midsch, .(id), summarize, residmean = mean(residuals)),
geom = "bar", main = "Provided Residuals") + theme_dpi() + opts(axis.text.x = theme_blank(),
axis.ticks = theme_blank()) + ylab("Mean of Residuals") + xlab("Model") +
geom_text(aes(x = 12, y = 0.3), label = "SD of Residuals = 9")
a2 <- qplot(id, V1, data = ldply(mods, function(x) mean(x$residuals)), geom = "bar",
main = "Replication Models") + theme_dpi() + opts(axis.text.x = theme_blank(),
axis.ticks = theme_blank()) + ylab("Mean of Residuals") + xlab("Model") +
geom_text(aes(x = 7, y = 0.3), label = paste("SD =", round(mean(ldply(mods,
function(x) sd(x$residuals))$V1), 2)))
grid.arrange(a1, a2, main = "Comparing Replication and Provided Residual Means by Model")
qplot(residuals, data = midsch, geom = "density") + stat_function(fun = dnorm,
aes(colour = "Normal")) + geom_histogram(aes(y = ..density..), alpha = I(0.4)) +
geom_line(aes(y = ..density.., colour = "Empirical"), stat = "density") +
scale_colour_manual(name = "Density", values = c("red", "blue")) + opts(legend.position = c(0.85,
0.85)) + theme_dpi()
b <- 2 * rnorm(5000)
c <- b + runif(5000)
dem <- lm(c ~ b)
a1 <- qplot(midsch$ss1, abs(midsch$residuals), main = "Residual Plot of Replication Data",
geom = "point", alpha = I(0.1)) + geom_smooth(method = "lm", se = TRUE) +
xlab("SS1") + ylab("Residuals") + geom_smooth(se = FALSE) + ylim(c(0, 50)) +
theme_dpi()
a2 <- qplot(b, abs(lm(c ~ b)$residuals), main = "Well Specified OLS", alpha = I(0.3)) +
theme_dpi() + geom_smooth()
grid.arrange(a1, a2, ncol = 2)
bptest(ss_mod)
##
## studentized Breusch-Pagan test
##
## data: ss_mod
## BP = 7.499, df = 1, p-value = 0.006172
gqtest(ss_mod)
##
## Goldfeld-Quandt test
##
## data: ss_mod
## GQ = 0.8302, df1 = 263, df2 = 263, p-value = 0.9341
lm
objectggplot2
we can run something called fortify
on our linear model to get a data frame that tells us a lot of diagnostics about each observationdamodel <- fortify(ss_mod)
summary(damodel)
## ss2 ss1 .hat .sigma
## Min. :416 Min. :392 Min. :0.00189 Min. :10.9
## 1st Qu.:478 1st Qu.:457 1st Qu.:0.00207 1st Qu.:11.2
## Median :495 Median :471 Median :0.00275 Median :11.2
## Mean :494 Mean :468 Mean :0.00377 Mean :11.2
## 3rd Qu.:510 3rd Qu.:483 3rd Qu.:0.00416 3rd Qu.:11.2
## Max. :560 Max. :511 Max. :0.02920 Max. :11.2
## .cooksd .fitted .resid .stdresid
## Min. :0.00000 Min. :412 Min. :-46.36 Min. :-4.148
## 1st Qu.:0.00015 1st Qu.:481 1st Qu.: -7.60 1st Qu.:-0.680
## Median :0.00062 Median :496 Median : -0.42 Median :-0.038
## Mean :0.00225 Mean :494 Mean : 0.00 Mean : 0.000
## 3rd Qu.:0.00179 3rd Qu.:509 3rd Qu.: 6.49 3rd Qu.: 0.581
## Max. :0.06596 Max. :539 Max. : 58.36 Max. : 5.218
dv
iv
.hat
.sigma
.cooksd
.fitted
.resid
and .stdresid
.fitted
is the prediction from our model.resid
= dv
- .fitted
.stdresid
= normalized .resid
.sigma
= estimate of residual standard deviation when observation is dropped from the model.hat
is more obscure, but is a measure of the influence an individual observation has on overall model fita <- rnorm(500)
b <- runif(500)
c <- a + b
goodsim <- lm(c ~ a)
goodsim_a <- fortify(goodsim)
qplot(c, .hat, data = goodsim_a) + theme_dpi() + geom_smooth(se = FALSE)
qplot(ss2, .hat, data = damodel) + theme_dpi() + geom_smooth(se = FALSE)
a <- qplot(c, .hat, data = goodsim_a) + theme_dpi() + geom_smooth(se = FALSE)
b <- qplot(ss2, .hat, data = damodel) + theme_dpi() + geom_smooth(se = FALSE)
grid.arrange(a, b, ncol = 2)
hat
value are what we call “high leverage” observations, and on their own are not bad–in fact our good model has lots of themqplot(ss2, .hat, data = damodel) + theme_dpi() + geom_smooth(se = FALSE) + geom_hline(yintercept = 3 *
mean(damodel$.hat), color = I("red"), size = I(1.1))
infobs <- which(apply(influence.measures(ss_mod)$is.inf, 1, any))
ssdata <- cbind(fortify(ss_mod), midsch_sub)
ssdata$id3 <- paste(ssdata$district_id, ssdata$school_id, sep = ".")
noinf <- lm(ss2 ~ ss1, data = midsch_sub[-infobs, ])
noinff <- fortify(noinf)
qplot(ss1, ss2, data = ssdata, alpha = I(0.5)) + geom_line(aes(ss1, .fitted,
group = 1), data = ssdata, size = I(1.02)) + geom_line(aes(x = ss1, y = .fitted,
group = 1), data = noinff, linetype = 6, size = 1.1, color = "blue") + theme_dpi() +
xlab("SS1") + ylab("Y")
my_megamod <- lm(ss2 ~ ss1 + grade + test_year + subject, data = midsch)
summary(my_megamod)
##
## Call:
## lm(formula = ss2 ~ ss1 + grade + test_year + subject, data = midsch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.58 -6.38 0.69 6.93 62.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 415.92105 108.01823 3.85 0.00012 ***
## ss1 0.89548 0.00321 278.85 < 2e-16 ***
## grade -0.72909 0.08014 -9.10 < 2e-16 ***
## test_year -0.16754 0.05380 -3.11 0.00185 **
## subjectread -11.53144 0.15245 -75.64 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.7 on 19980 degrees of freedom
## Multiple R-squared: 0.9, Adjusted R-squared: 0.9
## F-statistic: 4.5e+04 on 4 and 19980 DF, p-value: <2e-16
my_megamod2 <- lm(ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) + subject,
data = midsch)
summary(my_megamod2)
##
## Call:
## lm(formula = ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) +
## subject, data = midsch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -77.43 -5.78 0.36 6.18 60.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.93813 1.35590 53.79 < 2e-16 ***
## ss1 0.91197 0.00306 298.17 < 2e-16 ***
## as.factor(grade)5 -8.39756 0.20701 -40.57 < 2e-16 ***
## as.factor(grade)6 -0.69535 0.27917 -2.49 0.013 *
## as.factor(grade)7 -2.92812 0.29120 -10.06 < 2e-16 ***
## as.factor(grade)8 -7.64546 0.32318 -23.66 < 2e-16 ***
## as.factor(test_year)2008 -3.08623 0.22493 -13.72 < 2e-16 ***
## as.factor(test_year)2009 -0.46178 0.22667 -2.04 0.042 *
## as.factor(test_year)2010 -1.86967 0.22716 -8.23 < 2e-16 ***
## as.factor(test_year)2011 -1.49652 0.22769 -6.57 5.1e-11 ***
## subjectread -11.59171 0.14416 -80.41 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 19974 degrees of freedom
## Multiple R-squared: 0.911, Adjusted R-squared: 0.911
## F-statistic: 2.04e+04 on 10 and 19974 DF, p-value: <2e-16
anova(my_megamod, my_megamod2)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1 + grade + test_year + subject
## Model 2: ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) + subject
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 19980 2306425
## 2 19974 2061668 6 244757 395 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
*
megamodeli <- lm(ss2 ~ ss1 + as.factor(grade) + subject * factor(test_year),
data = midsch)
summary(megamodeli)
##
## Call:
## lm(formula = ss2 ~ ss1 + as.factor(grade) + subject * factor(test_year),
## data = midsch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.38 -5.74 0.32 6.18 60.86
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.82489 1.35266 53.84 < 2e-16
## ss1 0.91331 0.00305 299.53 < 2e-16
## as.factor(grade)5 -8.43203 0.20601 -40.93 < 2e-16
## as.factor(grade)6 -0.74629 0.27784 -2.69 0.0072
## as.factor(grade)7 -3.00776 0.28994 -10.37 < 2e-16
## as.factor(grade)8 -7.74983 0.32188 -24.08 < 2e-16
## subjectread -12.54107 0.31707 -39.55 < 2e-16
## factor(test_year)2008 -4.46270 0.31648 -14.10 < 2e-16
## factor(test_year)2009 0.26387 0.31898 0.83 0.4081
## factor(test_year)2010 -1.58019 0.31991 -4.94 7.9e-07
## factor(test_year)2011 -3.51305 0.32088 -10.95 < 2e-16
## subjectread:factor(test_year)2008 2.74390 0.44713 6.14 8.6e-10
## subjectread:factor(test_year)2009 -1.45665 0.45085 -3.23 0.0012
## subjectread:factor(test_year)2010 -0.58815 0.45192 -1.30 0.1931
## subjectread:factor(test_year)2011 4.02058 0.45290 8.88 < 2e-16
##
## (Intercept) ***
## ss1 ***
## as.factor(grade)5 ***
## as.factor(grade)6 **
## as.factor(grade)7 ***
## as.factor(grade)8 ***
## subjectread ***
## factor(test_year)2008 ***
## factor(test_year)2009
## factor(test_year)2010 ***
## factor(test_year)2011 ***
## subjectread:factor(test_year)2008 ***
## subjectread:factor(test_year)2009 **
## subjectread:factor(test_year)2010
## subjectread:factor(test_year)2011 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.1 on 19970 degrees of freedom
## Multiple R-squared: 0.912, Adjusted R-squared: 0.912
## F-statistic: 1.47e+04 on 14 and 19970 DF, p-value: <2e-16
:
in this case only includes the interactions, but not the main effects in every combinationmegamodelii <- lm(ss2 ~ ss1 + as.factor(grade) + subject:factor(test_year),
data = midsch)
summary(megamodelii)
##
## Call:
## lm(formula = ss2 ~ ss1 + as.factor(grade) + subject:factor(test_year),
## data = midsch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.38 -5.74 0.32 6.18 60.86
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.79135 1.37799 44.12 < 2e-16 ***
## ss1 0.91331 0.00305 299.53 < 2e-16 ***
## as.factor(grade)5 -8.43203 0.20601 -40.93 < 2e-16 ***
## as.factor(grade)6 -0.74629 0.27784 -2.69 0.0072 **
## as.factor(grade)7 -3.00776 0.28994 -10.37 < 2e-16 ***
## as.factor(grade)8 -7.74983 0.32188 -24.08 < 2e-16 ***
## subjectmath:factor(test_year)2007 12.03354 0.32069 37.52 < 2e-16 ***
## subjectread:factor(test_year)2007 -0.50753 0.31972 -1.59 0.1124
## subjectmath:factor(test_year)2008 7.57083 0.31980 23.67 < 2e-16 ***
## subjectread:factor(test_year)2008 -2.22633 0.31968 -6.96 3.4e-12 ***
## subjectmath:factor(test_year)2009 12.29741 0.32256 38.12 < 2e-16 ***
## subjectread:factor(test_year)2009 -1.70032 0.32223 -5.28 1.3e-07 ***
## subjectmath:factor(test_year)2010 10.45335 0.32279 32.38 < 2e-16 ***
## subjectread:factor(test_year)2010 -2.67587 0.32275 -8.29 < 2e-16 ***
## subjectmath:factor(test_year)2011 8.52049 0.32321 26.36 < 2e-16 ***
## subjectread:factor(test_year)2011 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.1 on 19970 degrees of freedom
## Multiple R-squared: 0.912, Adjusted R-squared: 0.912
## F-statistic: 1.47e+04 on 14 and 19970 DF, p-value: <2e-16
anova(my_megamod, my_megamod2, megamodelii, megamodeli)
## Analysis of Variance Table
##
## Model 1: ss2 ~ ss1 + grade + test_year + subject
## Model 2: ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) + subject
## Model 3: ss2 ~ ss1 + as.factor(grade) + subject:factor(test_year)
## Model 4: ss2 ~ ss1 + as.factor(grade) + subject * factor(test_year)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 19980 2306425
## 2 19974 2061668 6 244757 399.3 <2e-16 ***
## 3 19970 2040193 4 21475 52.5 <2e-16 ***
## 4 19970 2040193 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
badidea <- lm(ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) + subject +
factor(district_id), data = midsch)
summary(badidea)
##
## Call:
## lm(formula = ss2 ~ ss1 + as.factor(grade) + as.factor(test_year) +
## subject + factor(district_id), data = midsch)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.49 -5.48 -0.01 5.47 62.79
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 148.98751 3.81658 39.04 < 2e-16 ***
## ss1 0.74205 0.00423 175.31 < 2e-16 ***
## as.factor(grade)5 -3.85890 0.21074 -18.31 < 2e-16 ***
## as.factor(grade)6 6.60333 0.30341 21.76 < 2e-16 ***
## as.factor(grade)7 7.73675 0.33706 22.95 < 2e-16 ***
## as.factor(grade)8 5.69441 0.38978 14.61 < 2e-16 ***
## as.factor(test_year)2008 -2.59695 0.21338 -12.17 < 2e-16 ***
## as.factor(test_year)2009 0.17251 0.21602 0.80 0.42453
## as.factor(test_year)2010 -0.84928 0.21778 -3.90 9.7e-05 ***
## as.factor(test_year)2011 -0.45301 0.21753 -2.08 0.03731 *
## subjectread -10.97461 0.13420 -81.78 < 2e-16 ***
## factor(district_id)14 -5.89212 3.61105 -1.63 0.10276
## factor(district_id)70 1.89458 4.47156 0.42 0.67179
## factor(district_id)84 -1.31629 4.71604 -0.28 0.78016
## factor(district_id)91 2.99628 4.17978 0.72 0.47347
## factor(district_id)112 0.30500 3.65142 0.08 0.93343
## factor(district_id)119 3.35189 3.68485 0.91 0.36302
## factor(district_id)126 -2.41477 5.77471 -0.42 0.67583
## factor(district_id)140 -1.61203 3.62384 -0.44 0.65644
## factor(district_id)147 1.75132 3.36281 0.52 0.60252
## factor(district_id)154 -1.94406 3.58967 -0.54 0.58812
## factor(district_id)161 3.79405 5.77435 0.66 0.51116
## factor(district_id)170 -0.95470 3.54819 -0.27 0.78788
## factor(district_id)182 4.67066 3.56335 1.31 0.18996
## factor(district_id)203 0.95483 4.30520 0.22 0.82448
## factor(district_id)217 -2.14888 5.77247 -0.37 0.70970
## factor(district_id)231 1.32681 3.84933 0.34 0.73033
## factor(district_id)238 0.99708 3.81210 0.26 0.79367
## factor(district_id)280 -1.67854 3.49559 -0.48 0.63110
## factor(district_id)308 -4.03393 3.66661 -1.10 0.27127
## factor(district_id)336 0.01280 3.49218 0.00 0.99708
## factor(district_id)350 5.40344 5.09415 1.06 0.28883
## factor(district_id)413 -4.40799 3.38955 -1.30 0.19346
## factor(district_id)422 -1.68519 3.68427 -0.46 0.64739
## factor(district_id)427 19.33171 7.45405 2.59 0.00951 **
## factor(district_id)434 -1.73695 3.65076 -0.48 0.63424
## factor(district_id)469 3.75988 4.71376 0.80 0.42509
## factor(district_id)476 -4.06442 3.72603 -1.09 0.27537
## factor(district_id)485 2.61823 4.30459 0.61 0.54303
## factor(district_id)497 -3.12815 5.09092 -0.61 0.53892
## factor(district_id)602 -1.92506 3.94358 -0.49 0.62545
## factor(district_id)609 -0.43771 4.71448 -0.09 0.92603
## factor(district_id)616 1.28019 3.94472 0.32 0.74554
## factor(district_id)623 3.18066 4.30363 0.74 0.45988
## factor(district_id)637 -0.55437 7.45378 -0.07 0.94071
## factor(district_id)657 12.53789 4.47516 2.80 0.00509 **
## factor(district_id)658 -1.00424 4.30261 -0.23 0.81545
## factor(district_id)665 -1.05043 3.58996 -0.29 0.76983
## factor(district_id)700 -0.79929 3.84847 -0.21 0.83547
## factor(district_id)714 7.22362 3.45606 2.09 0.03662 *
## factor(district_id)721 0.24070 3.65129 0.07 0.94744
## factor(district_id)735 -1.14845 4.08171 -0.28 0.77843
## factor(district_id)777 -1.51765 3.61159 -0.42 0.67433
## factor(district_id)840 7.54670 7.45362 1.01 0.31132
## factor(district_id)870 1.36074 4.17983 0.33 0.74477
## factor(district_id)896 -0.29208 4.71412 -0.06 0.95060
## factor(district_id)903 -1.34798 3.81176 -0.35 0.72361
## factor(district_id)910 -3.96702 4.00659 -0.99 0.32213
## factor(district_id)994 9.29289 5.77557 1.61 0.10763
## factor(district_id)1015 6.43248 3.62528 1.77 0.07602 .
## factor(district_id)1029 6.11486 4.47180 1.37 0.17151
## factor(district_id)1078 1.14949 3.84857 0.30 0.76519
## factor(district_id)1085 3.77354 3.94367 0.96 0.33865
## factor(district_id)1092 -0.18947 3.48134 -0.05 0.95660
## factor(district_id)1134 -2.73460 3.66722 -0.75 0.45587
## factor(district_id)1141 -4.27890 3.66731 -1.17 0.24332
## factor(district_id)1162 5.02019 5.09040 0.99 0.32404
## factor(district_id)1176 1.51987 3.75060 0.41 0.68531
## factor(district_id)1183 1.93317 3.72664 0.52 0.60394
## factor(district_id)1218 -1.35219 4.71286 -0.29 0.77418
## factor(district_id)1232 1.50135 4.71616 0.32 0.75023
## factor(district_id)1253 -4.08712 3.49242 -1.17 0.24190
## factor(district_id)1260 -1.79921 3.70474 -0.49 0.62722
## factor(district_id)1295 -0.65535 4.47422 -0.15 0.88355
## factor(district_id)1309 2.55693 4.17909 0.61 0.54065
## factor(district_id)1316 3.67797 3.65186 1.01 0.31387
## factor(district_id)1376 6.55185 3.56432 1.84 0.06605 .
## factor(district_id)1380 -7.86521 3.49585 -2.25 0.02447 *
## factor(district_id)1407 2.30171 4.30266 0.53 0.59269
## factor(district_id)1414 5.36948 3.62488 1.48 0.13855
## factor(district_id)1428 2.20112 4.08271 0.54 0.58980
## factor(district_id)1449 10.67951 7.45333 1.43 0.15192
## factor(district_id)1499 -2.28246 4.18016 -0.55 0.58506
## factor(district_id)1526 -0.69238 3.62326 -0.19 0.84846
## factor(district_id)1540 2.69523 3.65116 0.74 0.46041
## factor(district_id)1554 1.37737 3.38333 0.41 0.68394
## factor(district_id)1568 0.77716 3.65077 0.21 0.83143
## factor(district_id)1600 5.29397 7.45430 0.71 0.47760
## factor(district_id)1631 2.77177 4.47192 0.62 0.53539
## factor(district_id)1638 2.45280 3.51370 0.70 0.48514
## factor(district_id)1645 4.41788 3.89267 1.13 0.25642
## factor(district_id)1659 2.53142 3.77974 0.67 0.50304
## factor(district_id)1673 -7.60574 5.77207 -1.32 0.18763
## factor(district_id)1687 3.52457 3.59087 0.98 0.32634
## factor(district_id)1694 -0.18870 3.68475 -0.05 0.95916
## factor(district_id)1729 -0.84508 5.77264 -0.15 0.88361
## factor(district_id)1813 -7.13497 5.09373 -1.40 0.16131
## factor(district_id)1848 -10.70569 4.47481 -2.39 0.01675 *
## factor(district_id)1855 -2.50834 5.77242 -0.43 0.66390
## factor(district_id)1862 -0.37137 3.41120 -0.11 0.91331
## factor(district_id)1883 2.20178 3.68386 0.60 0.55006
## factor(district_id)1890 5.96119 4.47214 1.33 0.18256
## factor(district_id)1897 4.40600 3.62546 1.22 0.22427
## factor(district_id)1900 4.66362 3.52007 1.32 0.18523
## factor(district_id)1939 -2.29608 5.09149 -0.45 0.65202
## factor(district_id)1945 0.52588 3.58011 0.15 0.88322
## factor(district_id)1953 2.08524 3.77912 0.55 0.58111
## factor(district_id)2009 -0.54325 4.30303 -0.13 0.89954
## factor(district_id)2044 4.81875 5.09405 0.95 0.34418
## factor(district_id)2051 -1.05207 3.70471 -0.28 0.77643
## factor(district_id)2058 5.48442 3.60238 1.52 0.12791
## factor(district_id)2128 3.17750 5.09021 0.62 0.53248
## factor(district_id)2142 4.70263 3.81229 1.23 0.21739
## factor(district_id)2184 -2.01049 3.68504 -0.55 0.58536
## factor(district_id)2205 5.24323 4.71439 1.11 0.26608
## factor(district_id)2212 -7.82371 5.77328 -1.36 0.17538
## factor(district_id)2217 3.29519 3.77999 0.87 0.38336
## factor(district_id)2226 -8.82873 4.08243 -2.16 0.03058 *
## factor(district_id)2233 1.27707 3.72649 0.34 0.73183
## factor(district_id)2240 -5.08814 4.71348 -1.08 0.28038
## factor(district_id)2289 -1.41013 3.35904 -0.42 0.67463
## factor(district_id)2296 4.88177 3.52538 1.38 0.16614
## factor(district_id)2303 -2.15791 3.51330 -0.61 0.53908
## factor(district_id)2310 0.57489 5.09356 0.11 0.91014
## factor(district_id)2420 6.15664 3.52477 1.75 0.08071 .
## factor(district_id)2422 -2.71442 3.72648 -0.73 0.46637
## factor(district_id)2443 2.17936 3.65091 0.60 0.55056
## factor(district_id)2460 4.63823 3.62429 1.28 0.20064
## factor(district_id)2478 -0.07777 3.65072 -0.02 0.98300
## factor(district_id)2485 -0.32718 4.47159 -0.07 0.94167
## factor(district_id)2523 2.10726 4.08129 0.52 0.60563
## factor(district_id)2527 2.19518 4.17909 0.53 0.59940
## factor(district_id)2534 4.39408 7.45429 0.59 0.55555
## factor(district_id)2541 -7.08520 5.09390 -1.39 0.16427
## factor(district_id)2562 2.36758 3.50406 0.68 0.49926
## factor(district_id)2576 -0.25670 3.89144 -0.07 0.94741
## factor(district_id)2583 3.22269 3.49641 0.92 0.35669
## factor(district_id)2604 4.19445 3.58169 1.17 0.24158
## factor(district_id)2605 4.42663 4.30389 1.03 0.30372
## factor(district_id)2611 5.74927 3.47547 1.65 0.09809 .
## factor(district_id)2618 -3.66167 4.17895 -0.88 0.38092
## factor(district_id)2632 4.82899 4.47407 1.08 0.28045
## factor(district_id)2646 3.15504 4.47427 0.71 0.48072
## factor(district_id)2695 1.17008 3.38716 0.35 0.72976
## factor(district_id)2702 -0.10449 3.72627 -0.03 0.97763
## factor(district_id)2730 -3.09343 4.30366 -0.72 0.47228
## factor(district_id)2737 5.32855 4.47168 1.19 0.23342
## factor(district_id)2744 -4.60344 3.77878 -1.22 0.22315
## factor(district_id)2758 -1.76045 3.61111 -0.49 0.62590
## factor(district_id)2793 -0.56062 3.35586 -0.17 0.86733
## factor(district_id)2800 -0.06760 3.65091 -0.02 0.98523
## factor(district_id)2814 3.62502 4.71318 0.77 0.44183
## factor(district_id)2828 1.44955 3.75220 0.39 0.69926
## factor(district_id)2835 6.56438 3.62457 1.81 0.07014 .
## factor(district_id)2842 10.16507 5.09333 2.00 0.04597 *
## factor(district_id)2849 -1.56283 3.41139 -0.46 0.64687
## factor(district_id)2856 -2.41869 3.75122 -0.64 0.51908
## factor(district_id)2863 0.75008 7.45383 0.10 0.91985
## factor(district_id)2885 -2.90480 3.50402 -0.83 0.40712
## factor(district_id)2898 1.42446 3.75087 0.38 0.70412
## factor(district_id)2912 1.41648 4.71501 0.30 0.76386
## factor(district_id)2940 -7.94843 4.71462 -1.69 0.09183 .
## factor(district_id)2961 7.85789 4.71429 1.67 0.09557 .
## factor(district_id)3087 -4.01147 4.71609 -0.85 0.39501
## factor(district_id)3129 0.38014 3.65092 0.10 0.91707
## factor(district_id)3150 1.97300 3.84836 0.51 0.60818
## factor(district_id)3171 2.00136 4.71435 0.42 0.67119
## factor(district_id)3206 10.75595 7.45256 1.44 0.14896
## factor(district_id)3220 2.40836 3.72648 0.65 0.51810
## factor(district_id)3269 -1.52719 3.35200 -0.46 0.64868
## factor(district_id)3290 -1.75732 3.42052 -0.51 0.60743
## factor(district_id)3297 -0.83698 3.94285 -0.21 0.83189
## factor(district_id)3304 5.20597 4.47435 1.16 0.24464
## factor(district_id)3311 -1.18911 3.72686 -0.32 0.74968
## factor(district_id)3318 -1.50459 5.09382 -0.30 0.76771
## factor(district_id)3325 -3.15125 5.77539 -0.55 0.58532
## factor(district_id)3332 -0.52079 3.72640 -0.14 0.88885
## factor(district_id)3339 5.17735 3.46511 1.49 0.13516
## factor(district_id)3360 0.00374 3.59970 0.00 0.99917
## factor(district_id)3367 -0.24629 3.58983 -0.07 0.94530
## factor(district_id)3381 1.58885 3.68557 0.43 0.66640
## factor(district_id)3409 -0.60769 3.66741 -0.17 0.86839
## factor(district_id)3427 -2.25874 4.47161 -0.51 0.61347
## factor(district_id)3428 5.82348 5.77326 1.01 0.31313
## factor(district_id)3430 -5.60499 3.47808 -1.61 0.10708
## factor(district_id)3434 -8.43723 3.72691 -2.26 0.02359 *
## factor(district_id)3437 4.23683 3.54239 1.20 0.23170
## factor(district_id)3444 -2.75734 3.53509 -0.78 0.43541
## factor(district_id)3479 8.07864 3.49078 2.31 0.02066 *
## factor(district_id)3484 -6.21708 4.47217 -1.39 0.16449
## factor(district_id)3500 1.48277 3.58078 0.41 0.67881
## factor(district_id)3510 11.97253 3.75368 3.19 0.00143 **
## factor(district_id)3528 8.12939 3.72929 2.18 0.02928 *
## factor(district_id)3549 6.71485 3.42833 1.96 0.05017 .
## factor(district_id)3612 0.58480 3.78012 0.15 0.87706
## factor(district_id)3619 -11.40273 3.33859 -3.42 0.00064 ***
## factor(district_id)3640 -1.95526 4.47171 -0.44 0.66193
## factor(district_id)3654 0.16701 4.47190 0.04 0.97021
## factor(district_id)3661 0.98847 5.77469 0.17 0.86409
## factor(district_id)3668 5.59218 5.77236 0.97 0.33266
## factor(district_id)3675 4.46823 3.66813 1.22 0.22319
## factor(district_id)3682 1.05085 3.75157 0.28 0.77940
## factor(district_id)3689 -0.81978 4.17866 -0.20 0.84447
## factor(district_id)3696 0.61941 5.77433 0.11 0.91458
## factor(district_id)3787 -1.82025 3.65125 -0.50 0.61812
## factor(district_id)3794 3.50740 3.65132 0.96 0.33677
## factor(district_id)3822 6.69083 3.49384 1.92 0.05550 .
## factor(district_id)3857 3.63767 3.51487 1.03 0.30071
## factor(district_id)3862 9.15740 3.65499 2.51 0.01224 *
## factor(district_id)3871 -3.21138 3.72616 -0.86 0.38878
## factor(district_id)3892 3.30407 3.42625 0.96 0.33489
## factor(district_id)3899 -2.46155 4.08164 -0.60 0.54646
## factor(district_id)3906 -3.12682 3.65131 -0.86 0.39181
## factor(district_id)3913 -1.24485 4.47425 -0.28 0.78084
## factor(district_id)3925 5.25080 3.48699 1.51 0.13213
## factor(district_id)3934 4.35825 5.09372 0.86 0.39222
## factor(district_id)3941 -0.21718 4.08117 -0.05 0.95756
## factor(district_id)3948 -4.38526 3.81242 -1.15 0.25005
## factor(district_id)3955 -0.73287 3.63626 -0.20 0.84028
## factor(district_id)3962 0.49831 3.61097 0.14 0.89024
## factor(district_id)3969 -1.32823 5.09142 -0.26 0.79419
## factor(district_id)3983 -3.93047 3.68462 -1.07 0.28611
## factor(district_id)3990 -8.58757 5.09367 -1.69 0.09183 .
## factor(district_id)4011 1.01859 4.30367 0.24 0.81291
## factor(district_id)4018 -0.46530 3.43564 -0.14 0.89227
## factor(district_id)4060 4.70264 3.48238 1.35 0.17690
## factor(district_id)4067 -2.17809 3.68491 -0.59 0.55447
## factor(district_id)4074 -1.54639 3.75127 -0.41 0.68017
## factor(district_id)4088 -5.26138 3.77880 -1.39 0.16383
## factor(district_id)4095 3.15058 3.51345 0.90 0.36988
## factor(district_id)4144 2.90557 3.75214 0.77 0.43872
## factor(district_id)4151 0.82096 3.66734 0.22 0.82287
## factor(district_id)4165 -0.15343 3.77881 -0.04 0.96761
## factor(district_id)4179 0.88401 3.38827 0.26 0.79417
## factor(district_id)4186 -4.82044 4.71338 -1.02 0.30646
## factor(district_id)4207 2.35274 5.09413 0.46 0.64419
## factor(district_id)4221 0.11081 7.45369 0.01 0.98814
## factor(district_id)4228 -0.55538 3.81152 -0.15 0.88415
## factor(district_id)4235 6.42321 3.65247 1.76 0.07866 .
## factor(district_id)4270 -2.05746 4.08349 -0.50 0.61437
## factor(district_id)4305 -0.84025 3.72717 -0.23 0.82164
## factor(district_id)4312 4.54308 3.75327 1.21 0.22613
## factor(district_id)4330 -3.29387 5.09215 -0.65 0.51773
## factor(district_id)4347 3.15451 4.71379 0.67 0.50337
## factor(district_id)4368 -2.06023 3.94404 -0.52 0.60142
## factor(district_id)4375 -4.94303 7.45176 -0.66 0.50712
## factor(district_id)4389 3.52896 3.81179 0.93 0.35456
## factor(district_id)4459 -1.21356 4.71317 -0.26 0.79681
## factor(district_id)4473 0.48775 4.00598 0.12 0.90309
## factor(district_id)4501 2.66716 3.48896 0.76 0.44460
## factor(district_id)4515 1.68297 3.65202 0.46 0.64492
## factor(district_id)4529 -2.79105 4.71634 -0.59 0.55400
## factor(district_id)4536 -1.22878 3.94418 -0.31 0.75539
## factor(district_id)4543 -3.70921 3.94404 -0.94 0.34699
## factor(district_id)4571 6.56348 4.17899 1.57 0.11629
## factor(district_id)4578 -0.34842 3.89196 -0.09 0.92867
## factor(district_id)4606 -5.12270 4.47217 -1.15 0.25203
## factor(district_id)4613 3.71168 3.58992 1.03 0.30119
## factor(district_id)4620 -5.72089 3.36191 -1.70 0.08883 .
## factor(district_id)4627 0.42263 3.59009 0.12 0.90629
## factor(district_id)4634 0.53901 3.94257 0.14 0.89126
## factor(district_id)4641 2.86318 3.84934 0.74 0.45700
## factor(district_id)4686 -1.31770 4.47447 -0.29 0.76838
## factor(district_id)4690 4.97604 3.94520 1.26 0.20722
## factor(district_id)4753 -2.89608 3.56281 -0.81 0.41630
## factor(district_id)4760 -0.49195 5.09203 -0.10 0.92304
## factor(district_id)4781 -2.04888 3.77917 -0.54 0.58772
## factor(district_id)4802 1.20556 3.63632 0.33 0.74025
## factor(district_id)4843 2.64340 4.47498 0.59 0.55472
## factor(district_id)4851 -0.02014 3.72589 -0.01 0.99569
## factor(district_id)4872 3.02402 3.70453 0.82 0.41434
## factor(district_id)4893 2.42670 3.61136 0.67 0.50162
## factor(district_id)4956 -1.02238 5.77526 -0.18 0.85949
## factor(district_id)4963 0.09122 5.77239 0.02 0.98739
## factor(district_id)4970 1.84765 3.46417 0.53 0.59379
## factor(district_id)4998 0.33248 3.75226 0.09 0.92939
## factor(district_id)5019 1.99548 3.65133 0.55 0.58472
## factor(district_id)5026 -2.27390 3.65125 -0.62 0.53344
## factor(district_id)5068 0.76529 3.58969 0.21 0.83118
## factor(district_id)5100 2.39017 3.65142 0.65 0.51274
## factor(district_id)5138 -0.12093 3.68491 -0.03 0.97382
## factor(district_id)5264 -1.97448 3.57126 -0.55 0.58035
## factor(district_id)5271 -2.39441 3.39648 -0.70 0.48084
## factor(district_id)5278 0.02451 3.70488 0.01 0.99472
## factor(district_id)5306 0.08681 4.17874 0.02 0.98343
## factor(district_id)5348 1.60339 4.47423 0.36 0.72008
## factor(district_id)5355 7.44106 3.53756 2.10 0.03544 *
## factor(district_id)5362 -10.94444 5.77429 -1.90 0.05806 .
## factor(district_id)5369 -3.42761 3.94394 -0.87 0.38481
## factor(district_id)5390 4.57097 3.68549 1.24 0.21489
## factor(district_id)5432 -1.67459 3.66743 -0.46 0.64795
## factor(district_id)5439 0.22307 3.51339 0.06 0.94938
## factor(district_id)5457 3.90093 4.71348 0.83 0.40790
## factor(district_id)5460 -1.62756 3.75091 -0.43 0.66436
## factor(district_id)5467 -4.42693 4.71391 -0.94 0.34768
## factor(district_id)5474 -1.44116 3.65127 -0.39 0.69307
## factor(district_id)5523 0.43580 4.71383 0.09 0.92634
## factor(district_id)5593 3.46994 3.77893 0.92 0.35851
## factor(district_id)5607 2.12929 3.39691 0.63 0.53078
## factor(district_id)5614 7.36388 5.09060 1.45 0.14804
## factor(district_id)5621 2.28187 3.63714 0.63 0.53042
## factor(district_id)5628 -4.38937 5.09158 -0.86 0.38865
## factor(district_id)5642 1.17074 3.68471 0.32 0.75069
## factor(district_id)5656 3.25016 3.42249 0.95 0.34230
## factor(district_id)5663 -2.10336 3.51346 -0.60 0.54941
## factor(district_id)5740 1.62524 5.77432 0.28 0.77836
## factor(district_id)5747 -2.07069 3.48823 -0.59 0.55277
## factor(district_id)5754 -0.82441 3.65073 -0.23 0.82134
## factor(district_id)5757 1.25904 4.47154 0.28 0.77828
## factor(district_id)5810 0.59861 4.47433 0.13 0.89357
## factor(district_id)5817 0.19100 3.58970 0.05 0.95757
## factor(district_id)5824 -1.87195 3.75177 -0.50 0.61782
## factor(district_id)5859 -0.37260 3.58995 -0.10 0.91734
## factor(district_id)5866 -1.26506 3.89199 -0.33 0.74515
## factor(district_id)5901 4.99285 3.43009 1.46 0.14552
## factor(district_id)5985 0.48091 3.70479 0.13 0.89672
## factor(district_id)6022 0.59604 3.75057 0.16 0.87373
## factor(district_id)6027 3.73754 4.47270 0.84 0.40337
## factor(district_id)6069 6.75909 7.45406 0.91 0.36454
## factor(district_id)6113 3.01917 3.54343 0.85 0.39420
## factor(district_id)6118 2.15589 4.17919 0.52 0.60596
## factor(district_id)6125 -3.01876 3.50402 -0.86 0.38897
## factor(district_id)6174 -0.84132 3.38023 -0.25 0.80345
## factor(district_id)6181 4.72504 3.68737 1.28 0.20006
## factor(district_id)6195 1.50596 3.65099 0.41 0.67999
## factor(district_id)6216 -0.28851 3.84864 -0.07 0.94024
## factor(district_id)6223 -0.98643 3.40467 -0.29 0.77203
## factor(district_id)6237 -1.21340 3.65132 -0.33 0.73965
## factor(district_id)6244 5.09525 3.46210 1.47 0.14111
## factor(district_id)6251 -7.39975 7.45382 -0.99 0.32085
## factor(district_id)6293 -4.72362 5.77298 -0.82 0.41324
## factor(district_id)6300 -1.23379 3.39171 -0.36 0.71604
## factor(district_id)6307 3.35074 3.42736 0.98 0.32826
## factor(district_id)6321 2.70223 4.30224 0.63 0.52995
## factor(district_id)6328 2.52815 3.58970 0.70 0.48127
## factor(district_id)6335 -5.87878 4.00720 -1.47 0.14238
## factor(district_id)6354 4.41717 5.77437 0.76 0.44430
## factor(district_id)6370 1.61430 3.68449 0.44 0.66129
## factor(district_id)6384 3.15642 3.94354 0.80 0.42349
## factor(district_id)6410 4.04696 4.71351 0.86 0.39058
## factor(district_id)6412 -1.06433 4.47409 -0.24 0.81197
## factor(district_id)6419 7.85542 3.60295 2.18 0.02925 *
## factor(district_id)6426 -4.29733 4.71493 -0.91 0.36208
## factor(district_id)6461 -1.73587 3.72576 -0.47 0.64128
## factor(district_id)6470 5.11143 3.58076 1.43 0.15346
## factor(district_id)6475 3.70838 4.71326 0.79 0.43141
## factor(district_id)6482 6.38815 3.94488 1.62 0.10539
## factor(district_id)6608 1.08436 3.81156 0.28 0.77604
## factor(district_id)6678 -1.38829 3.62290 -0.38 0.70158
## factor(district_id)6685 0.95113 3.41806 0.28 0.78081
## factor(district_id)6692 0.54324 3.65124 0.15 0.88173
## factor(district_id)6713 5.17383 5.09167 1.02 0.30958
## factor(district_id)6720 0.05296 3.75167 0.01 0.98874
## factor(district_id)6734 2.26972 3.94448 0.58 0.56502
## factor(district_id)6748 0.91425 3.94414 0.23 0.81670
## factor(district_id)8101 12.82290 7.45221 1.72 0.08532 .
## factor(district_id)8105 -7.72771 3.59165 -2.15 0.03144 *
## factor(district_id)8106 -9.86535 3.59567 -2.74 0.00608 **
## factor(district_id)8108 -9.30716 3.59433 -2.59 0.00962 **
## factor(district_id)8109 -12.35445 3.65396 -3.38 0.00072 ***
## factor(district_id)8110 -7.54074 3.59052 -2.10 0.03573 *
## factor(district_id)8111 -2.87774 3.59103 -0.80 0.42293
## factor(district_id)8112 -16.47665 3.95843 -4.16 3.2e-05 ***
## factor(district_id)8113 -2.60778 3.94245 -0.66 0.50832
## factor(district_id)8114 -5.19282 4.71499 -1.10 0.27076
## factor(district_id)8121 -2.72249 4.00629 -0.68 0.49679
## factor(district_id)8122 -17.15444 5.77587 -2.97 0.00298 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.42 on 19618 degrees of freedom
## Multiple R-squared: 0.925, Adjusted R-squared: 0.923
## F-statistic: 657 on 366 and 19618 DF, p-value: <2e-16
sandwich
package, which allows us to manipulate the variance-covariance matrix and adjust our estimates of standard errors appropriately to correct for non-independencelibrary(sandwich)
vcovHC(ss_mod, type = "HC4")
## (Intercept) ss1
## (Intercept) 182.3905 -0.3858415
## ss1 -0.3858 0.0008173
gls.correct <- function(lm) {
require(sandwich)
rob <- t(sapply(c("const", "HC0", "HC1", "HC2", "HC3", "HC4", "HC5"), function(x) sqrt(diag(vcovHC(lm,
type = x)))))
a <- c(NA, (rob[2, 1] - rob[1, 1])/rob[1, 1], (rob[3, 1] - rob[1, 1])/rob[1,
1], (rob[4, 1] - rob[1, 1])/rob[1, 1], (rob[5, 1] - rob[1, 1])/rob[1,
1], (rob[6, 1] - rob[1, 1])/rob[1, 1], (rob[7, 1] - rob[1, 1])/rob[1,
1])
rob <- cbind(rob, round(a * 100, 2))
a <- c(NA, (rob[2, 2] - rob[1, 2])/rob[1, 2], (rob[3, 2] - rob[1, 2])/rob[1,
2], (rob[4, 2] - rob[1, 2])/rob[1, 2], (rob[5, 2] - rob[1, 2])/rob[1,
2], (rob[6, 2] - rob[1, 2])/rob[1, 2], (rob[7, 2] - rob[1, 2])/rob[1,
2])
rob <- cbind(rob, round(a * 100, 2))
rob <- as.data.frame(rob)
names(rob) <- c("alpha", "beta", "alpha.pchange", "beta.pchange")
rob$id2 <- rownames(rob)
rob
}
gls.correct(ss_mod)
a <- coef(ssN1_mod)
xdat <- seq(min(midsch_sub$ss1), max(midsch_sub$ss1), length.out = 20)
ydat <- xdat * a[2] + a[1] + (median(midsch_sub$n1) * a[3])
ydatsmall <- xdat * a[2] + a[1] + (min(midsch_sub$n1) * a[3])
ydatbig <- xdat * a[2] + a[1] + (max(midsch_sub$n1) * a[3])
myx <- rep(xdat, 3)
myy <- c(ydat, ydatsmall, ydatbig)
mymod <- rep(c("mean", "low", "high"), each = length(xdat))
newdat <- data.frame(x = myx, y = myy, type = mymod)
ggplot(newdat, aes(x = x, y = y, color = mymod)) + geom_line(aes(linetype = mymod),
size = I(0.8)) + coord_cartesian() + theme_dpi()
b <- sqrt(diag(vcov(ssN1_mod)))
mycoef <- data.frame(var = names(b), y = a, se = b)
ggplot(mycoef[2:3, ], aes(x = var, y = y, ymin = y - 2 * se, ymax = y + 2 *
se)) + geom_pointrange() + theme_dpi() + geom_hline(yintercept = 0, size = I(1.1),
color = I("red"))
Test our new megamodel for heteroskedacticty. What do you find?
Does megamodel2
exhibit non-linearity? Can you fit a quantile regression model of this?
Can you simulate a model with multiple variables? What does it look like?
Bonus: Write better code for the plot with different lines for different values of variable 2.
It is good to include the session info, e.g. this document is produced with knitr version 0.8
. Here is my session info:
print(sessionInfo(), locale = FALSE)
## R version 2.15.2 (2012-10-26)
## Platform: i386-w64-mingw32/i386 (32-bit)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] sandwich_2.2-9 quantreg_4.91 SparseM_0.96 mgcv_1.7-22
## [5] eeptools_0.1 mapproj_1.1-8.3 maps_2.2-6 proto_0.3-9.2
## [9] stringr_0.6.1 plyr_1.7.1 ggplot2_0.9.2.1 lmtest_0.9-30
## [13] zoo_1.7-9 knitr_0.8
##
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8 colorspace_1.2-0 dichromat_1.2-4
## [4] digest_0.5.2 evaluate_0.4.2 formatR_0.6
## [7] gtable_0.1.1 labeling_0.1 lattice_0.20-10
## [10] MASS_7.3-22 Matrix_1.0-10 memoise_0.1
## [13] munsell_0.4 nlme_3.1-105 RColorBrewer_1.0-5
## [16] reshape2_1.2.1 scales_0.2.2 tools_2.15.1
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.