DPI R Bootcamp
Jared Knowles
In this lesson we hope to learn:  Coding Style  Write a `for’ loop  Write a basic function  Optimize R with parallelization  Mixed effect models  Data mining with R  Animations  Git, GitHub, and addon packages

# Loop to calculate number of students per grade
nstudents < rep(NA, 6)
for (i in unique(df$grade)) {
nstudents[[i  2]] < length(df$stuid[df$grade == i])
}
nstudents
## [1] 500 400 500 400 500 400
summary(factor(df$grade))
## 3 4 5 6 7 8
## 500 400 500 400 500 400
A = matrix(as.numeric(1:1e+05))
system.time({
Sum = 0
for (i in seq_along(A)) {
Sum = Sum + A[[i]]
}
Sum
})
## user system elapsed
## 0.11 0.00 0.11
system.time({
sum(A)
})
## user system elapsed
## 0 0 0
rm(A)
print(myfunction)
print(mean) #bytecode, we can't see it
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x0ea57bd8>
## <environment: namespace:base>
print(order)
## function (..., na.last = TRUE, decreasing = FALSE)
## {
## z < list(...)
## if (any(unlist(lapply(z, is.object)))) {
## z < lapply(z, function(x) if (is.object(x))
## xtfrm(x)
## else x)
## if (!is.na(na.last))
## return(do.call("order", c(z, na.last = na.last, decreasing = decreasing)))
## }
## else if (!is.na(na.last))
## return(.Internal(order(na.last, decreasing, ...)))
## if (any(diff(l.z < vapply(z, length, 1L)) != 0L))
## stop("argument lengths differ")
## ans < vapply(z, is.na, rep.int(NA, l.z[1L]))
## ok < if (is.matrix(ans))
## !apply(ans, 1, any)
## else !any(ans)
## if (all(!ok))
## return(integer())
## z[[1L]][!ok] < NA
## ans < do.call("order", c(z, decreasing = decreasing))
## keep < seq_along(ok)[ok]
## ans[ans %in% keep]
## }
## <bytecode: 0x0deed500>
## <environment: namespace:base>
x<as.character(x)
is really obnoxiousdefac < function(x) {
# assign function a name, and list its arguments
x < as.character(x) # what does function do?
x # last line is output of function
}
a < factor(letters)
summary(a)
summary(defac(a))
summary(as.character(a))
foo
which has both a character and a missing value?extractN < function(x) {
x < suppressWarnings(as.numeric(x))
# ignore warnings because we don't care
x < x[!is.na(x)]
x
}
extractN(foo)
A < extractN(foo)
lme4
package to do this work, or in a Bayesian framework (fancy) we can use winBUGS or JAGSlme4
by Doug Bates (a retired UWMadison Professor of Statistics)mymod_me<lmer(readSS~factor(grade)+factor(race)+female+disab+ell+(1dist)+(1stuid),data=df)
(1dist)
denote a random effect. In this case we are measuring a random effect for the variable dist
in our data.lm
formula (by designlibrary(lme4)
mymod_me < lmer(readSS ~ factor(grade) + factor(race) + female + disab + ell +
(1  dist) + (1  stuid), data = df)
print(mymod_me, correlation = FALSE)
## Linear mixed model fit by REML
## Formula: readSS ~ factor(grade) + factor(race) + female + disab + ell + (1  dist) + (1  stuid)
## Data: df
## AIC BIC logLik deviance REMLdev
## 30731 30826 15350 30774 30699
## Random effects:
## Groups Name Variance Std.Dev.
## stuid (Intercept) 10626 103.1
## dist (Intercept) 487 22.1
## Residual 1564 39.5
## Number of obs: 2700, groups: stuid, 1200; dist, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 378.24 31.03 12.2
## factor(grade)4 62.72 3.05 20.6
## factor(grade)5 124.67 3.28 38.0
## factor(grade)6 182.67 3.99 45.8
## factor(grade)7 236.64 4.20 56.4
## factor(grade)8 301.41 4.74 63.5
## factor(race)B 61.87 28.05 2.2
## factor(race)H 34.29 27.54 1.2
## factor(race)I 6.09 43.93 0.1
## factor(race)W 10.37 28.00 0.4
## female 8.76 6.21 1.4
## disab 5.26 8.50 0.6
## ell 18.81 13.10 1.4
mymod < lm(readSS ~ factor(grade) + factor(race) + female + disab + ell, data = df)
qplot(readSS, predict(mymod), data = df, alpha = I(0.3), color = I("blue")) +
geom_point(aes(x = df$readSS, y = fitted(mymod_me)), alpha = I(0.4), color = "dark green") +
theme_dpi() + xlab("Observed") + ylab("Predicted") + geom_text(aes(x = 370,
y = 700), label = "Green is Results of the \n Mixed Model")
caret
provides best in class data mining tools to uscaret
package looks likelibrary(caret)
# Set aside test set
testset < sample(unique(df$stuid), 500)
df$case < 0
df$case[df$stuid %in% testset] < 1
# Draw a training set of data (random subset of students)
training < subset(df, case == 0)
testing < subset(df, case == 1)
training < training[, c(3, 6:16, 21, 22, 28, 29, 30)] # subset vars
trainX < training[, 1:15]
refac < function(x) as.factor(as.character(x))
trainX$stuid < refac(trainX$stuid)
trainX$dist < refac(trainX$dist)
trainX$year < refac(trainX$year)
# Parameters
ctrl < trainControl(method = "repeatedcv", number = 7, repeats = 3, summaryFunction = defaultSummary)
# Search grid
grid < expand.grid(.interaction.depth = seq(2, 6, by = 1), .n.trees = seq(200,
700, by = 100), .shrinkage = c(0.05, 0.1))
# Boosted tree search
gbmTune < train(x = trainX, y = training$mathSS, method = "gbm", metric = "RMSE",
trControl = ctrl, tuneGrid = grid, verbose = FALSE)
# gbmPred<predict(gbmTune,testing[,names(trainX)])
plot(gbmTune)
n < 10000
rep < 5
# tLoop < replicate(rep, system.time( integLoop(func, xint, yint, n) ))
# summary(tLoop[3,])
tVec < replicate(rep, system.time(integVec(func, xint, yint, n)))
summary(tVec[3, ])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.004 0.000 0.020
tApply < replicate(rep, system.time(integApply(func, xint, yint, n)))
summary(tApply[3, ])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.96 5.97 5.98 6.01 6.06 6.08
# 2 Core Cluster
library(snow)
c1 < makeCluster(c("localhost", "localhost"), type = "SOCK")
tSnow1 < replicate(rep, system.time(integSnow(c1, func, xint, yint, n)))
summary(tSnow1[3])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.75 8.75 8.75 8.75 8.75 8.75
stopCluster(c1)
git
and GitHub can helpIt is good to include the session info, e.g. this document is produced with knitr version 0.8
. Here is my session info:
print(sessionInfo(), locale = FALSE)
## R version 2.15.2 (20121026)
## Platform: i386w64mingw32/i386 (32bit)
##
## attached base packages:
## [1] splines grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] snow_0.310 gbm_1.63.2 survival_2.3614 caret_5.15044
## [5] foreach_1.4.0 cluster_1.14.3 reshape_0.8.4 lme4_0.9999990
## [9] Matrix_1.010 lattice_0.2010 xtable_1.70 gridExtra_0.9.1
## [13] sandwich_2.29 quantreg_4.91 SparseM_0.96 mgcv_1.722
## [17] eeptools_0.1 mapproj_1.18.3 maps_2.26 proto_0.39.2
## [21] stringr_0.6.1 plyr_1.7.1 ggplot2_0.9.2.1 lmtest_0.930
## [25] zoo_1.79 knitr_0.8
##
## loaded via a namespace (and not attached):
## [1] codetools_0.28 colorspace_1.20 compiler_2.15.2
## [4] dichromat_1.24 digest_0.5.2 evaluate_0.4.2
## [7] formatR_0.6 gtable_0.1.1 iterators_1.0.6
## [10] labeling_0.1 MASS_7.322 memoise_0.1
## [13] munsell_0.4 nlme_3.1105 RColorBrewer_1.05
## [16] reshape2_1.2.1 scales_0.2.2 stats4_2.15.1
## [19] tools_2.15.1
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.