% Tutorial 4: Cleaning and Merging Data % DPI R Bootcamp % Jared Knowles

Overview

In this lesson we hope to learn about:

Data Setup

load("data/Student_Attributes.rda")
head(stuatt[, 1:4], 7)
  sid school_year male race_ethnicity
1   1        2004    1              B
2   1        2005    1              H
3   1        2006    1              H
4   1        2007    1              H
5   2        2006    0              W
6   2        2007    0              B
7   3        2006    1              H

How can R help correct this?

Strategic Data Project

  1. Policy and management decisions can directly influence schools' and teachers' ability to improve student achievement
  2. Valid and reliable data analysis significantly improves the quality of decision making

Toolkit - Data Cleaning

  1. How to identify essential data elements for analyzing student achievement
  2. Clean, check, and build variables in the dataset
  3. Connect relevant datasets from different soruces
  4. Analyze datasets
  5. Adopt coding best practices to facilitate shared and replicable data analysis

SDP Task 1 Student Attributes Intro

stuatt$first_9th_year_reported <- NULL

SDP Task 1 - Step 1: Consistent Gender

length(unique(stuatt$sid))
[1] 21803
length(unique(stuatt$sid, stuatt$male))
[1] 21806

Testing Uniqueness

testuniqueness <- function(id, group) {
    length(unique(id)) == length(unique(id, group))
}  # Need better varname and some optimization to the speed of this code
testuniqueness(stuatt$sid, stuatt$male)
[1] FALSE
testuniqueness(stuatt$sid, stuatt$race_ethnicity)
[1] FALSE
testuniqueness(stuatt$sid, stuatt$birth_date)
[1] FALSE

Where is the data messy?

stuatt[17:21, 1:3]
   sid school_year male
17   7        2004    1
18   7        2005    1
19   7        2006    1
20   7        2007    0
21   7        2008    1

Unifying Consistent Gender Values

library(plyr)
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)))
table(sturow$nvals_gender)

    1     2 
21799     4 

Fixing the pesky observations

# A function to find the most frequent value
library(eeptools)
sturow <- ddply(stuatt, .(sid), summarize, nvals_gender = length(unique(male)), 
    gender_mode = statamode(male), gender_recent = tail(male, 1))
head(sturow[7:10, ])
   sid nvals_gender gender_mode gender_recent
7    7            2           1             1
8    8            1           1             1
9    9            1           1             1
10  10            1           1             1

Fixing observations II

tempdf <- merge(stuatt, sturow)  # R finds the linking variable already
head(tempdf[17:21, c(1, 2, 3, 10, 11)])
   sid school_year male nvals_gender gender_mode
17   7        2004    1            2           1
18   7        2005    1            2           1
19   7        2006    1            2           1
20   7        2007    0            2           1
21   7        2008    1            2           1
print(subset(tempdf[, c(1, 2, 3, 10, 11)], sid == 12506))
        sid school_year male nvals_gender gender_mode
50064 12506        2004    1            2           .
50065 12506        2005    0            2           .

Fixing where the mode does not work

print(subset(tempdf[, c(1, 2, 3, 10, 11, 12)], sid == 12506))
        sid school_year male nvals_gender gender_mode gender_recent
50064 12506        2004    1            2           .             0
50065 12506        2005    0            2           .             0

Recode Gender

Results

tempdf$male <- tempdf$gender_mode
tempdf$male[tempdf$male == "."] <- tempdf$gender_recent[tempdf$male == "."]
# we have to put the filter on both sides of the assignment operator
testuniqueness(tempdf$id, tempdf$male)
[1] TRUE
rm(sturow)
stuatt <- tempdf
stuatt$nvals_gender <- NULL
stuatt$gender_mode <- NULL
stuatt$gender_recent <- NULL
# or just run stuatt<-tempdf[,1:9]
rm(tempdf)

Create a consistent race and ethnicity indicator

A Note About Variable Types

Recoding Race

summary(stuatt$race_ethnicity)
    A     B     H   M/O     W  NA's 
 7303 25321 30444  2809 20528  1129 
length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
summary(stuatt$race_ethnicity)

Correct conversion

length(stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)])
[1] 1129
stuatt$race_ethnicity <- as.character(stuatt$race_ethnicity)
stuatt$race_ethnicity[is.na(stuatt$race_ethnicity)] <- "AI"
stuatt$race_ethnicity <- factor(stuatt$race_ethnicity)
summary(stuatt$race_ethnicity)
    A    AI     B     H   M/O     W 
 7303  1129 25321 30444  2809 20528 

Inconsistency Within Years

stuatt[7:9, c("sid", "school_year", "race_ethnicity")]
  sid school_year race_ethnicity
7   3        2006              H
8   3        2006              B
9   3        2007              B

Business Rule

Let's calculate the number of values per year

nvals <- ddply(stuatt, .(sid, school_year), summarize, nvals_race = length(unique(race_ethnicity)), 
    tmphispanic = length(which(race_ethnicity == "H")))
tempdf <- merge(stuatt, nvals)
# Clean up
rm(nvals)
# Recode race_ethnicity
tempdf$race2 <- tempdf$race_ethnicity
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic == 1] <- "H"
tempdf$race2[tempdf$nvals_race > 1 & tempdf$tmphispanic != 1] <- "M/O"
tempdf$race_ethnicity <- tempdf$race2

# Clean up by removing old variables
tempdf$race2 <- NULL
tempdf$nvals_race <- NULL
tempdf$tmphispanic <- NULL
# Resort our result
tempdf <- tempdf[order(tempdf$sid, tempdf$school_year), ]

Compare them

        sid school_year race_ethnicity
56201     3        2006              H
56202     3        2006              H
81064  8552        2005              W
81065  8552        2006            M/O
81066  8552        2006            M/O
6162  11382        2005              H
6163  11382        2005              H
6164  11382        2006              H
        sid school_year race_ethnicity
7         3        2006              H
8         3        2006              B
34290  8552        2005              W
34291  8552        2006              A
34292  8552        2006              W
45674 11382        2005              H
45675 11382        2005            M/O
45676 11382        2006              H

OK

stuatt <- tempdf
rm(tempdf)

Break in Case of Emergency

# Stupid hack workaround of ddply bug when running too many of these
# sequentially
ddply_race <- function(x, y, z) {
    NewColName <- "race_ethnicity"
    z <- ddply(x, .(y, z), .fun = function(xx, col) {
        c(nvals_race = length(unique(xx[, col])))
    }, NewColName)
    z$sid <- z$y
    z$school_year <- z$z
    z$y <- NULL
    z$z <- NULL
    return(z)
}

nvals <- ddply_race(stuatt, stuatt$sid, stuatt$school_year)
tempdf <- merge(stuatt, nvals)
tempdf$temp_ishispanic <- NA
tempdf$temp_ishispanic[tempdf$race_ethnicity == "H" & tempdf$nvals_race > 1] <- 1

Inconsistency across years

head(stuatt[, c("sid", "school_year", "race_ethnicity")])
      sid school_year race_ethnicity
1       1        2004              B
2       1        2005              H
3       1        2006              H
4       1        2007              H
44618   2        2006              W
44619   2        2007              B

So…

What do we do?

Try it on your own

Remember, this is tough stuff, so feel free to ask for help!

Answer

tempdf <- ddply(stuatt, .(sid), summarize, var_temp = statamode(race_ethnicity), 
    nvals = length(unique(race_ethnicity)), most_recent_year = max(school_year), 
    most_recent_var = tail(race_ethnicity, 1))

tempdf$race2[tempdf$var_temp != "."] <- tempdf$var_temp[tempdf$var_temp != "."]
tempdf$race2[tempdf$var_temp == "."] <- paste(tempdf$most_recent_var[tempdf$var_temp == 
    "."])

tempdf <- merge(stuatt, tempdf)
head(tempdf[, c(1, 2, 4, 14)], 7)
  sid school_year race_ethnicity race2
1   1        2004              B     H
2   1        2005              H     H
3   1        2006              H     H
4   1        2007              H     H
5   2        2006              W     B
6   2        2007              B     B
7   3        2006              H     H

A Faster Way

task1 <- function(df, id, year, var) {
    require(plyr)
    mdf <- eval(parse(text = paste("ddply(", df, ",.(", id, "),summarize,\nvar_temp=statamode(", 
        var, "),\nnvals=length(unique(", var, ")),most_recent_year=max(", year, 
        "),\nmost_recent_var=tail(", var, ",1))", sep = "")))
    mdf$var2[mdf$var_temp != "."] <- mdf$var_temp[mdf$var_temp != "."]
    mdf$var2[mdf$var_temp == "."] <- as.character(mdf$most_recent_var[mdf$var_temp == 
        "."])
    ndf <- eval(parse(text = paste("merge(", df, ",mdf)", sep = "")))
    rm(mdf)
    return(ndf)
}
# Note data must be sorted
tempdf <- task1(stuatt, stuatt$sid, stuatt$school_year, stuatt$race_ethnicity)

Other References

Session Info

It is good to include the session info, e.g. this document is produced with knitr version 0.9.6. Here is my session info:

print(sessionInfo(), locale = FALSE)
R version 2.15.2 (2012-10-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] sandwich_2.2-9  quantreg_4.94   SparseM_0.96    gridExtra_0.9.1
 [5] mgcv_1.7-22     eeptools_0.1    mapproj_1.2-0   maps_2.3-0     
 [9] proto_0.3-10    plyr_1.8        stringr_0.6.2   ggplot2_0.9.3  
[13] lmtest_0.9-30   zoo_1.7-9       knitr_0.9.6    

loaded via a namespace (and not attached):
 [1] codetools_0.2-8    colorspace_1.2-0   dichromat_1.2-4   
 [4] digest_0.6.0       evaluate_0.4.3     formatR_0.7       
 [7] gtable_0.1.2       labeling_0.1       lattice_0.20-10   
[10] MASS_7.3-22        Matrix_1.0-10      munsell_0.4       
[13] nlme_3.1-106       RColorBrewer_1.0-5 reshape2_1.2.2    
[16] scales_0.2.3       tools_2.15.2      

Attribution and License

Public Domain Mark
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.