DPI R Bootcamp
Jared Knowles
In this lesson we hope to learn about:
load("data/Student_Attributes.rda")
head(stuatt[, 1:4], 7)
## sid school_year male race_ethnicity
## 1 1 2004 1 B
## 2 1 2005 1 H
## 3 1 2006 1 H
## 4 1 2007 1 H
## 5 2 2006 0 W
## 6 2 2007 0 B
## 7 3 2006 1 H
first_9th_school_year_reported
variablestuatt$first_9th_year_reported <- NULL
NULL
, another R quirklength(unique(stuatt$sid))
## [1] 21803
length(unique(stuatt$sid, stuatt$male))
## [1] 21806
testuniqueness <- function(id, group) {
length(unique(id)) == length(unique(id, group))
} # Need better varname and some optimization to the speed of this code
testuniqueness(stuatt$sid, stuatt$male)
## [1] FALSE
testuniqueness(stuatt$sid, stuatt$race_ethnicity)
## [1] FALSE
testuniqueness(stuatt$sid, stuatt$birth_date)
## [1] FALSE
stuatt[17:21, 1:3]
## sid school_year male
## 17 7 2004 1
## 18 7 2005 1
## 19 7 2006 1
## 20 7 2007 0
## 21 7 2008 1
plyr
strategy we learned in Tutorial 3library(plyr)
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)))
table(sturow$nvals_gender)
##
## 1 2
## 21799 4
# A function to find the most frequent value
library(eeptools)
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)),
gender_mode = statamode(male), gender_recent = tail(male, 1))
head(sturow[7:10, ])
## sid nvals_gender gender_mode gender_recent
## 7 7 2 1 1
## 8 8 1 1 1
## 9 9 1 1 1
## 10 10 1 1 1
stuatt
and sturow
and we need to replace some values from stuatt
with some values from sturow
merge
to the rescue!merge
our two data objects into a temporary data object called tempdf
tempdf <- merge(stuatt, sturow) # R finds the linking variable already
head(tempdf[17:21, c(1, 2, 3, 10, 11)])
## sid school_year male nvals_gender gender_mode
## 17 7 2004 1 2 1
## 18 7 2005 1 2 1
## 19 7 2006 1 2 1
## 20 7 2007 0 2 1
## 21 7 2008 1 2 1
print(subset(tempdf[, c(1, 2, 3, 10, 11)], sid == 12506))
## sid school_year male nvals_gender gender_mode
## 50064 12506 2004 1 2 .
## 50065 12506 2005 0 2 .
print(subset(tempdf[, c(1, 2, 3, 10, 11, 12)], sid == 12506))
## sid school_year male nvals_gender gender_mode gender_recent
## 50064 12506 2004 1 2 . 0
## 50065 12506 2005 0 2 . 0
gender_recent
variable when there is not gender_mode
that is validrecoding
our variable!tempdf$male
to be the same as tempdf$gender_mode
tempdf$male
is now a “.” indicating no modal category exists, we assign tempdf$gender_recent
to be tempdf$male
testuniqueness(tempdf$id,tempdf$male)
to check if it workedtempdf$male <- tempdf$gender_mode
tempdf$male[tempdf$male == "."] <- tempdf$gender_recent[tempdf$male == "."]
# we have to put the filter on both sides of the assignment operator
testuniqueness(tempdf$id, tempdf$male)
## [1] TRUE
rm(sturow)
stuatt <- tempdf
stuatt$nvals_gender <- NULL
stuatt$gender_mode <- NULL
stuatt$gender_recent <- NULL
# or just run stuatt<-tempdf[,1:9]
rm(tempdf)
race_ethnicity
variable to numeric and add labels to itfactor
variable type like R can, and rely on numeric coding schemesmale
variable as a factor with values M
and F
summary(stuatt$race_ethnicity)
## A B H M/O W NA's
## 7303 25321 30444 2809 20528 1129
length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
summary(stuatt$race_ethnicity)
length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
## [1] 1129
stuatt$race_ethnicity <- as.character(stuatt$race_ethnicity)
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
stuatt$race_ethnicity <- factor(stuatt$race_ethnicity)
summary(stuatt$race_ethnicity)
## A AI B H M/O W
## 7303 1129 25321 30444 2809 20528
stuatt[7:9, c("sid", "school_year", "race_ethnicity")]
## sid school_year race_ethnicity
## 7 3 2006 H
## 8 3 2006 B
## 9 3 2007 B
nvals <- ddply(stuatt, .(sid, school_year), summarize, nvals_race = length(unique(race_ethnicity)),
tmphispanic = length(which(race_ethnicity == "H")))
tempdf <- merge(stuatt, nvals)
# Clean up
rm(nvals)
# Recode race_ethnicity
tempdf$race2 <- tempdf$race_ethnicity
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic == 1] <- "H"
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic != 1] <- "M/O"
tempdf$race_ethnicity <- tempdf$race2
# Clean up by removing old variables
tempdf$race2 <- NULL
tempdf$nvals_race <- NULL
tempdf$tmphispanic <- NULL
# Resort our result
tempdf <- tempdf[order(tempdf$sid, tempdf$school_year), ]
## sid school_year race_ethnicity
## 56201 3 2006 H
## 56202 3 2006 H
## 81064 8552 2005 W
## 81065 8552 2006 M/O
## 81066 8552 2006 M/O
## 6162 11382 2005 H
## 6163 11382 2005 H
## 6164 11382 2006 H
## sid school_year race_ethnicity
## 7 3 2006 H
## 8 3 2006 B
## 34290 8552 2005 W
## 34291 8552 2006 A
## 34292 8552 2006 W
## 45674 11382 2005 H
## 45675 11382 2005 M/O
## 45676 11382 2006 H
stuatt <- tempdf
rm(tempdf)
# Stupid hack workaround of ddply bug when running too many of these
# sequentially
ddply_race <- function(x, y, z) {
NewColName <- "race_ethnicity"
z <- ddply(x, .(y, z), .fun = function(xx, col) {
c(nvals_race = length(unique(xx[, col])))
}, NewColName)
z$sid <- z$y
z$school_year <- z$z
z$y <- NULL
z$z <- NULL
return(z)
}
nvals <- ddply_race(stuatt, stuatt$sid, stuatt$school_year)
tempdf <- merge(stuatt, nvals)
tempdf$temp_ishispanic <- NA
tempdf$temp_ishispanic[tempdf$race_ethnicity == "H" & tempdf$nvals_race > 1] <- 1
head(stuatt[, c("sid", "school_year", "race_ethnicity")])
## sid school_year race_ethnicity
## 1 1 2004 B
## 2 1 2005 H
## 3 1 2006 H
## 4 1 2007 H
## 44618 2 2006 W
## 44619 2 2007 B
tempdf <- ddply(stuatt, .(sid), summarize, var_temp = statamode(race_ethnicity),
nvals = length(unique(race_ethnicity)), most_recent_year = max(school_year),
most_recent_var = tail(race_ethnicity, 1))
tempdf$race2[tempdf$var_temp != "."] <- tempdf$var_temp[tempdf$var_temp != "."]
tempdf$race2[tempdf$var_temp == "."] <- paste(tempdf$most_recent_var[tempdf$var_temp ==
"."])
tempdf <- merge(stuatt, tempdf)
head(tempdf[, c(1, 2, 4, 14)], 7)
## sid school_year race_ethnicity race2
## 1 1 2004 B H
## 2 1 2005 H H
## 3 1 2006 H H
## 4 1 2007 H H
## 5 2 2006 W B
## 6 2 2007 B B
## 7 3 2006 H H
summarize
in the ddply
call in this situationtask1 <- function(df, id, year, var) {
require(plyr)
mdf <- eval(parse(text = paste("ddply(", df, ",.(", id, "),summarize,\nvar_temp=statamode(",
var, "),\nnvals=length(unique(", var, ")),most_recent_year=max(", year,
"),\nmost_recent_var=tail(", var, ",1))", sep = "")))
mdf$var2[mdf$var_temp != "."] <- mdf$var_temp[mdf$var_temp != "."]
mdf$var2[mdf$var_temp == "."] <- as.character(mdf$most_recent_var[mdf$var_temp ==
"."])
ndf <- eval(parse(text = paste("merge(", df, ",mdf)", sep = "")))
rm(mdf)
return(ndf)
}
# Note data must be sorted
tempdf <- task1(stuatt, stuatt$sid, stuatt$school_year, stuatt$race_ethnicity)
Sort df
on measerr
and mathss
. What are the highest 5 values of each.
Complete the next part of the SDP toolkit!
It is good to include the session info, e.g. this document is produced with knitr version 0.8
. Here is my session info:
print(sessionInfo(), locale = FALSE)
## R version 2.15.2 (2012-10-26)
## Platform: i386-w64-mingw32/i386 (32-bit)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] eeptools_0.1 mapproj_1.1-8.3 maps_2.2-6 proto_0.3-9.2
## [5] stringr_0.6.1 plyr_1.7.1 ggplot2_0.9.2.1 lmtest_0.9-30
## [9] zoo_1.7-9 knitr_0.8
##
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8 colorspace_1.2-0 dichromat_1.2-4
## [4] digest_0.5.2 evaluate_0.4.2 formatR_0.6
## [7] gtable_0.1.1 labeling_0.1 lattice_0.20-10
## [10] MASS_7.3-22 memoise_0.1 munsell_0.4
## [13] RColorBrewer_1.0-5 reshape2_1.2.1 scales_0.2.2
## [16] tools_2.15.1
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.