Tutorial 3: Manipulating Data in R

DPI R Bootcamp

Jared Knowles

Overview

In this lesson we hope to learn:

Again, read in our dataset

# Set working directory to the tutorial directory In RStudio can do
# this in 'Tools' tab
setwd("~/GitHub/r_tutorial_ed")
# Load some data
load("data/smalldata.rda")
# Note if we don't assign data to 'df' R just prints contents of
# table

Aggregation

table(df$grade, df$year)
   
    2000 2001 2002
  3  200  100  200
  4  100  200  100
  5  200  100  200
  6  100  200  100
  7  200  100  200
  8  100  200  100

Aggregation can be more complex

table(df$year, df$race)
      
         A   B   H   I   W
  2000  16 370  93   7 414
  2001  16 370  93   7 414
  2002  16 370  93   7 414

More complicated still

with(df[df$grade == 3, ], {
    table(year, race)
})
      race
year     A   B   H   I   W
  2000   4  78  22   4  92
  2001   1  44   8   2  45
  2002   0  74  20   1 105
df2 <- subset(df, grade == 3)
table(df2$year, df2$race)
      
         A   B   H   I   W
  2000   4  78  22   4  92
  2001   1  44   8   2  45
  2002   0  74  20   1 105
rm(df2)

Quick exercise

Answer

with(df[df$race=="B",],{table(year,grade)})
      grade
year    3  4  5  6  7  8
  2000 78 48 87 39 74 44
  2001 44 78 48 87 39 74
  2002 74 44 78 48 87 39

Tables cont.

table(df$year, df$proflvl)
      
       advanced basic below basic proficient
  2000       56   313         143        388
  2001      229   183          64        424
  2002      503    27           3        367
table(df$race, df$proflvl)
   
    advanced basic below basic proficient
  A       19     7           3         19
  B      160   302         162        486
  H       54    76          33        116
  I        7     4           1          9
  W      548   134          11        549

Proportional Tables

prop.table(table(df$race, df$proflvl))
   
     advanced     basic below basic proficient
  A 0.0070370 0.0025926   0.0011111  0.0070370
  B 0.0592593 0.1118519   0.0600000  0.1800000
  H 0.0200000 0.0281481   0.0122222  0.0429630
  I 0.0025926 0.0014815   0.0003704  0.0033333
  W 0.2029630 0.0496296   0.0040741  0.2033333

Try number 2

round(prop.table(table(df$race, df$proflvl), 1), digits = 3)
   
    advanced basic below basic proficient
  A    0.396 0.146       0.062      0.396
  B    0.144 0.272       0.146      0.438
  H    0.194 0.272       0.118      0.416
  I    0.333 0.190       0.048      0.429
  W    0.441 0.108       0.009      0.442

Checking Understanding

Aggregating Data

# Reading Scores by Race
aggregate(readSS ~ race, FUN = mean, data = df)
  race readSS
1    A  508.7
2    B  460.2
3    H  473.2
4    I  485.2
5    W  533.2

Aggregate (II)

aggregate(cbind(readSS, mathSS) ~ race, data = df, mean)
  race readSS mathSS
1    A  508.7  477.9
2    B  460.2  442.5
3    H  473.2  442.7
4    I  485.2  455.9
5    W  533.2  529.8
head(aggregate(cbind(readSS, mathSS) ~ race + grade, data = df, mean), 
    8)
  race grade readSS mathSS
1    A     3  397.8  454.8
2    B     3  409.8  371.6
3    H     3  417.7  364.2
4    I     3  407.6  449.3
5    W     3  481.1  450.7
6    A     4  456.0  438.2
7    B     4  426.9  408.1
8    H     4  418.8  404.6

Crosstabs

ag <- aggregate(readSS ~ race + grade, data = df, mean)
xtabs(readSS ~ ., data = ag)
    grade
race     3     4     5     6     7     8
   A 397.8 456.0 479.1 539.5 600.4 605.3
   B 409.8 426.9 447.6 470.9 492.3 523.5
   H 417.7 418.8 481.2 489.1 500.3 534.2
   I 407.6 531.1 547.6   0.0 405.5 518.0
   W 481.1 498.5 517.1 546.6 565.2 596.1
ftable(xtabs(readSS ~ ., data = ag))
     grade     3     4     5     6     7     8
race                                          
A          397.8 456.0 479.1 539.5 600.4 605.3
B          409.8 426.9 447.6 470.9 492.3 523.5
H          417.7 418.8 481.2 489.1 500.3 534.2
I          407.6 531.1 547.6   0.0 405.5 518.0
W          481.1 498.5 517.1 546.6 565.2 596.1

Check your work

Answer II

aggregate(cbind(readSS, mathSS) ~ disab + grade, data = df, mean)
   disab grade readSS mathSS
1      0     3  449.9  418.3
2      1     3  421.1  376.3
3      0     4  464.0  454.2
4      1     4  438.2  425.1
5      0     5  484.9  470.2
6      1     5  475.1  431.0
7      0     6  511.7  507.9
8      1     6  481.8  476.9
9      0     7  532.0  532.0
10     1     7  516.1  474.3
11     0     8  567.6  567.7
12     1     8  518.8  534.1

School Means

z <- aggregate(readSS ~ dist, FUN = mean, data = df)
z
  dist readSS
1  205  496.5
2  402  500.5
3  495  491.6

Aggregate Isn't Enough

The Logic of plyr

An Aside about Split-Apply-Combine

The logic of plyr

How plyr works on dataframes

Using plyr

plyr in Action

  library(plyr)
myag<-ddply(df, .(dist,grade),summarize,
            mean_read=mean(readSS,na.rm=T),
            mean_math=mean(mathSS,na.rm=T),
            sd_read=sd(readSS,na.rm=T),
            sd_math=sd(mathSS,na.rm=T),
            count_read=length(readSS),
            count_math=length(mathSS))

Results

head(myag)
  dist grade mean_read mean_math sd_read sd_math count_read
1  205     3     451.7     406.1   93.52   72.45        200
2  205     4     438.9     459.9   77.76   79.10        100
3  205     5     487.9     462.6   85.30   75.10        200
4  205     6     514.7     526.8   76.83   66.04        100
5  205     7     530.0     521.5   84.82   74.85        200
6  205     8     575.5     581.2   79.58   83.45        100
  count_math
1        200
2        100
3        200
4        100
5        200
6        100

More plyr

myag<-ddply(df, .(dist,grade),summarize,
            mean_read=mean(readSS,na.rm=T),
            mean_math=mean(mathSS,na.rm=T),
            sd_read=sd(readSS,na.rm=T),
            sd_math=sd(mathSS,na.rm=T),
            count_read=length(readSS),
            count_math=length(mathSS),
            count_black=length(race[race=='B']),
            per_black=length(race[race=='B'])/length(readSS))
summary(myag[,7:10])            
   count_read    count_math   count_black     per_black    
 Min.   :100   Min.   :100   Min.   :36.0   Min.   :0.360  
 1st Qu.:100   1st Qu.:100   1st Qu.:41.2   1st Qu.:0.386  
 Median :150   Median :150   Median :60.5   Median :0.412  
 Mean   :150   Mean   :150   Mean   :61.7   Mean   :0.411  
 3rd Qu.:200   3rd Qu.:200   3rd Qu.:82.0   3rd Qu.:0.439  
 Max.   :200   Max.   :200   Max.   :92.0   Max.   :0.480  

Note for SQL Junkies

Quick Exercises in ddply

Answer III

myag2<-ddply(df, .(dist,grade,ell),summarize,
            mean_read=mean(readSS,na.rm=T),
            mean_math=mean(mathSS,na.rm=T),
            sd_read=sd(readSS,na.rm=T),
            sd_math=sd(mathSS,na.rm=T),
            count_read=length(readSS),
            count_math=length(mathSS),
            count_black=length(race[race=='B']),
            per_black=length(race[race=='B'])/length(readSS))
subset(myag2,ell==1&grade==4)        
   dist grade ell mean_read mean_math sd_read sd_math count_read
4   205     4   1     403.0     392.9   64.52   39.09         16
16  402     4   1     443.1     388.7   79.52   53.28         29
28  495     4   1     408.8     431.9   77.47   70.77         13
   count_math count_black per_black
4          16           2   0.12500
16         29           6   0.20690
28         13           1   0.07692

Sorting

df.badsort <- order(df$readSS, df$mathSS)
head(df.badsort)
[1]  106 1026    2   56  122  118

Correct Example

df.sort <- df[order(df$readSS, df$mathSS, df$attday), ]
head(df[, c(3, 23, 29, 30)])
   stuid attday readSS mathSS
1 149995    180  357.3  387.3
2  13495    180  263.9  302.6
3 106495    160  369.7  365.5
4  45205    168  346.6  344.5
5 142705    156  373.1  441.2
6  14995    157  436.8  463.4
head(df.sort[, c(3, 23, 29, 30)])
      stuid attday readSS mathSS
106  106705    160  251.5  277.0
1026  80995    176  263.2  377.8
2     13495    180  263.9  302.6
56   122402    180  264.3  271.7
122   79705    168  266.4  318.7
118   40495    173  266.9  275.0

Let's clean it up a bit more

head(df[with(df, order(-readSS, -attday)), c(3, 23, 29, 30)])
      stuid attday readSS mathSS
1631 145205    137  833.2  828.4
1462 107705    180  773.3  746.6
2252 122902    180  744.0  621.6
2341  44902    175  741.7  676.3
1482 134705    180  739.2  705.4
1630  14495    162  738.9  758.2

About sorting

M <- matrix(c(1, 2, 2, 2, 3, 6, 4, 5), 4, 2, byrow = FALSE, dimnames = list(NULL, 
    c("a", "b")))
M[order(M[, "a"], -M[, "b"]), ]
     a b
[1,] 1 3
[2,] 2 6
[3,] 2 5
[4,] 2 4

About Sorting

mytab <- table(df$grade, df$year)
mytab[order(mytab[, 1]), ]
   
    2000 2001 2002
  4  100  200  100
  6  100  200  100
  8  100  200  100
  3  200  100  200
  5  200  100  200
  7  200  100  200
mytab[order(mytab[, 2]), ]
   
    2000 2001 2002
  3  200  100  200
  5  200  100  200
  7  200  100  200
  4  100  200  100
  6  100  200  100
  8  100  200  100

Filtering Data

Basic Filtering a Column

# Gives all rows that meet this requirement
df[df$readSS > 800, ]
           X school  stuid grade schid dist white black hisp indian
1631 1281061    852 145205     8    15  205     1     0    0      0
     asian econ female ell disab sch_fay dist_fay luck ability
1631     0    0      1   0     0       0        0    0   108.3
     measerr teachq year attday schoolscore district schoolhigh
1631   6.325  155.7 2001    137       227.7       19          0
     schoolavg schoollow readSS mathSS  proflvl race
1631         1         0  833.2  828.4 advanced    W
df$grade[df$mathSS > 800]
[1] 8
# Gives all values of grade that meet this requirement

Multiple filters

df$grade[df$black == 1 & df$readSS > 650]
 [1] 8 7 8 6 6 7 8 7 8 8 8 4

Using filters to assign values

myag$spread <- NA  # create variable
myag$spread[myag$sd_read < 75] <- "low"
myag$spread[myag$sd_read > 75] <- "high"
myag$spread <- as.factor(myag$spread)
summary(myag$spread)
high  low 
  15    3 

How does it work?

myag$spread <- NA  # create variable
myag$spread[myag$sd_read < 75] <- "low"
myag$spread[myag$sd_read > 75] <- "high"
myag$spread <- as.factor(myag$spread)
Grade Score Range Code
3 >425 "Hi"
4 >450 "Hi"
5 >475 "Hi"
6 >500 "Hi"
7 >525 "Hi"
8 >575 "Hi"

Results

myag$schoolperf <- "lo"
myag$schoolperf[myag$grade == 3 & myag$mean_math > 425] <- "hi"
myag$schoolperf[myag$grade == 4 & myag$mean_math > 450] <- "hi"
myag$schoolperf[myag$grade == 5 & myag$mean_math > 475] <- "hi"
myag$schoolperf[myag$grade == 6 & myag$mean_math > 500] <- "hi"
myag$schoolperf[myag$grade == 7 & myag$mean_math > 525] <- "hi"
myag$schoolperf[myag$grade == 8 & myag$mean_math > 575] <- "hi"
myag$schoolperf <- as.factor(myag$schoolperf)
summary(myag$schoolperf)
hi lo 
 9  9 
table(myag$dist, myag$schoolperf)
     
      hi lo
  205  3  3
  402  3  3
  495  3  3

Let's replace data

myag$mean_read[myag$dist == 6 & myag$grade == 3] <- NA
head(myag[, 1:4], 2)
  dist grade mean_read mean_math
1  205     3     451.7     406.1
2  205     4     438.9     459.9
myag$mean_read[myag$dist == 6 & myag$grade == 3] <- myag$mean_read[myag$dist == 
    6 & myag$grade == 4]
head(myag[, 1:4], 2)
  dist grade mean_read mean_math
1  205     3     451.7     406.1
2  205     4     438.9     459.9

Why do NAs matter so much?

myag$mean_read[myag$grade == 3] <- NA
head(myag[order(myag$grade), 1:4])
   dist grade mean_read mean_math
1   205     3        NA     406.1
7   402     3        NA     431.9
13  495     3        NA     405.5
2   205     4     438.9     459.9
8   402     4     474.9     432.8
14  495     4     447.8     469.1

NAs II

mean(myag$mean_math)
[1] 490.7
mean(myag$mean_read)
[1] NA
mean(myag$mean_math, na.rm = T)
[1] 490.7
mean(myag$mean_read, na.rm = T)
[1] 507.5

Beyond the Mean

length(myag$dist[myag$mean_read < 500])
[1] 10
head(myag$mean_read[myag$mean_read < 500])
[1]    NA 438.9 487.9    NA 474.9 472.5
badvar <- myag$mean_read + myag$sd_read
summary(badvar)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    517     566     592     587     614     655       3 

So we need to filter NAs explicitly

myag$sd_read[myag$count_read < 100 & myag$mean_read < 550] <- NA
length(myag$mean_read[myag$mean_read < 550])
[1] 16
length(myag$mean_read[myag$mean_read < 550 & !is.na(myag$mean_read)])
[1] 13

Merging Data

Merging Data II

names(myag)
 [1] "dist"        "grade"       "mean_read"   "mean_math"  
 [5] "sd_read"     "sd_math"     "count_read"  "count_math" 
 [9] "count_black" "per_black"   "spread"      "schoolperf" 
names(df[, c(2, 3, 4, 6)])
[1] "school" "stuid"  "grade"  "dist"  

Merge Options

simple_merge <- merge(df, myag)
names(simple_merge)
 [1] "grade"       "dist"        "X"           "school"     
 [5] "stuid"       "schid"       "white"       "black"      
 [9] "hisp"        "indian"      "asian"       "econ"       
[13] "female"      "ell"         "disab"       "sch_fay"    
[17] "dist_fay"    "luck"        "ability"     "measerr"    
[21] "teachq"      "year"        "attday"      "schoolscore"
[25] "district"    "schoolhigh"  "schoolavg"   "schoollow"  
[29] "readSS"      "mathSS"      "proflvl"     "race"       
[33] "mean_read"   "mean_math"   "sd_read"     "sd_math"    
[37] "count_read"  "count_math"  "count_black" "per_black"  
[41] "spread"      "schoolperf" 

Merge Options

Reshaping Data

head(df[, 1:10], 3)
    X school  stuid grade schid dist white black hisp indian
1  44      1 149995     3   105  495     0     1    0      0
2  53      1  13495     3    45  495     0     1    0      0
3 116      1 106495     3    45  495     0     1    0      0
head(widedf[, c(1, 28:40)], 3)
   stuid readSS.2000 mathSS.2000 proflvl.2000 race.2000  X.2001
1 149995       357.3       387.3        basic         B  441000
2  13495       263.9       302.6  below basic         B  531000
3 106495       369.7       365.5        basic         B 1161000
  school.2001 grade.2001 schid.2001 dist.2001 white.2001 black.2001
1           1          4        105       495          0          1
2           1          4         45       495          0          1
3           1          4         45       495          0          1
  hisp.2001 indian.2001
1         0           0
2         0           0
3         0           0

Wide Data v. Long Data

The reshape Function

Deconstructing reshape

widedf <- reshape(df, timevar = "year", idvar = "stuid", direction = "wide")

What about Wide to Long?

longdf <- reshape(widedf, idvar = "stuid", timevar = "year", varying = names(widedf[, 
    2:91]), direction = "long", sep = ".")

Subsetting Data

g4 <- subset(df, grade == 4)
dim(g4)
[1] 400  32
g4_b <- df[df$grade == 4, ]
identical(g4, g4_b)
[1] TRUE

That's it

Exercises

  1. Say we are unhappy about attributing the school/grade mean score across years to student-year observations like we did in this lesson. Let's fix it by first aggregating our student data frame to a school/grade/year data frame, and second by merging that new data frame with our student level data.

  2. Sort the student-level data frame on attday and ability in descending order.

  3. Find the highest proportion of black students in any school/grade/year combination.

Other References

Session Info

It is good to include the session info, e.g. this document is produced with knitr version 0.9.6. Here is my session info:

print(sessionInfo(), locale = FALSE)
R version 2.15.2 (2012-10-26)
Platform: x86_64-w64-mingw32/x64 (64-bit)

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets 
[7] methods   base     

other attached packages:
 [1] sandwich_2.2-9  quantreg_4.94   SparseM_0.96    gridExtra_0.9.1
 [5] mgcv_1.7-22     eeptools_0.1    mapproj_1.2-0   maps_2.3-0     
 [9] proto_0.3-10    plyr_1.8        stringr_0.6.2   ggplot2_0.9.3  
[13] lmtest_0.9-30   zoo_1.7-9       knitr_0.9.6    

loaded via a namespace (and not attached):
 [1] colorspace_1.2-0   dichromat_1.2-4    digest_0.6.0      
 [4] evaluate_0.4.3     formatR_0.7        gtable_0.1.2      
 [7] labeling_0.1       lattice_0.20-10    MASS_7.3-22       
[10] Matrix_1.0-10      munsell_0.4        nlme_3.1-106      
[13] RColorBrewer_1.0-5 reshape2_1.2.2     scales_0.2.3      
[16] tools_2.15.2      

Attribution and License

Public Domain Mark
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.