---
title: "Tidy Data"
author: "JJB + Course"
date: "06/28/2018"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Pipe Operator

## Example: Piping Operator

```{r}
# install.packages("magrittr")
library("magrittr")

4 %>%     # Take the number four and, then
  sqrt()  # find the square root

# Same as
# sqrt(4)

c(7, 42, 1, 25) %>%  # Combine four elements and, then
  log() %>%          # take the natural log and, then
  round(2) %>%       # round to the second decimal and, then
  diff()             # take the difference between consecutive elements

# Same as
# diff(round(log(c(7,42,1,25)), 2))
```

## Example: Multi-step forms

```{r}
# Embedded / Nested Functions
set.seed(821)
mean(rnorm(10))

# Piped
# install.packages("magrittr")
library("magrittr")

set.seed(821)
rnorm(10) %>%  # Generate 10 random values from a normal and, then
  mean()       # take the mean.

# Temporary Intermediate Variables
# (the seed is reset before each form so all three use the same draws)
set.seed(821)
rand_nums = rnorm(10)
mean_nums = mean(rand_nums)
```

## Example: Inside-out form of Pipe

**Embedded/Nested** Function Calls

```{r, eval = FALSE}
pickup(                           # Step 5
  goto(                           # Step 4
    order(                        # Step 3
      store(                      # Step 2
        drink("Java Chip Frap"),  # Step 1
        loc = "Green St.")
    )
  )
)
```

**Piped**

```{r, eval = FALSE}
"Java Chip Frap" %>%
  drink() %>%                   # Step 1
  store(loc = "Green St.") %>%  # Step 2
  order() %>%                   # Step 3
  goto() %>%                    # Step 4
  pickup()                      # Step 5
```

## Example: Pipe to Other Arguments

```{r}
# Subtraction is not commutative, so the argument position the pipe feeds
# is visible in the result.
myfunc = function(x, y) {
  x - y
}

x = 3; y = 1

myfunc(x, y)
x %>% myfunc(., y)  # Default
myfunc(3, 1)

# Change position of the pipe location
y %>% myfunc(x, .)  # Pipe to second argument; same as myfunc(x, y)
# x %>% myfunc(y, .) # Pipe to second argument
myfunc(y, x)        # Nested form of the commented-out pipe above: the
                    # arguments are swapped, so the sign of the result flips
```

### Exercise: Working with Pipes

Make the following pipeable

```{r}
# install.packages("dplyr")
library("dplyr")

tail(filter(iris, Petal.Width > mean(Petal.Width)))

tail(
  filter(iris,
         Petal.Width > mean(
           Petal.Width)
  )
)

# How can we transform this embedded code statement?
tail(
  filter(iris,
         Petal.Width > mean(Petal.Width)
  )
)

# How many functions exist?

iris %>%                                       # Take the data set iris AND THEN
  filter(Petal.Width > mean(Petal.Width)) %>%  # filter the data to match a criterion AND THEN
  tail()                                       # take the last 6 observations of the dataset
```

Write a pipe that provides the sqrt of 2+2

```{r}
sqrt(2 + 2)

# Naive approach
2 + 2 %>% sqrt()

# Logic error: %>% binds more tightly than +, so only the second 2 is piped
2 + { 2 %>% sqrt() }

2 %>% sqrt()

# Parentheses force the sum to be evaluated before it is piped
(2 + 2) %>% sqrt()

# Advanced don't worry about for now.
2 %>% `+`(2) %>% sqrt()
```

Create another pipe that transforms two strings into one upper case string.

```{r}
a = "stat 385 is evolving"
b = "My pokemon is evolving faster..."

a %>%
  paste(b) %>%
  toupper()
```

## Example: Enrollment Untidy to Tidy

```{r}
# install.packages("tidyr")
library("tidyr")

# Untidy Data
enrolled_fa17 = data.frame(
  undergrads = c(18345, 15267, 12),
  profs      = c(352, 640, 0),
  grads      = c(7173, 6028, 9),
  gender     = c("Men", "Women", "Unknown")
)

# Why do we need to include gender inside the data.frame?

# Tidy the data
enrolled_fa17_tidy = gather(
  enrolled_fa17,
  key = "Year",        # What the key is
  value = "Enrolled",  # Specify what the _third variable_ should be
  undergrads:grads     # Take variables from undergrads to grads,
)                      # similar to seq(from, to)

enrolled_fa17_tidy
```

## Example: Alternative Meanings - ChickWeights

```{r}
# install.packages("tidyr")
library("tidyr")

# "Long"-form or "Tidy Data"
head(ChickWeight)

# "Wide"-form or "Messy data"
ChickWeight_wide = ChickWeight %>%
  spread(Time, weight)

# Check data
head(ChickWeight_wide)

# Recover "long"-form or "Tidy Data"
ChickWeight_long = ChickWeight_wide %>%
  gather(key = Time,      # Key for the key/value pairing
         value = weight,  # Column for measurements
         `0`:`21`)        # Column selection

# Check data
head(ChickWeight_long)
```

## Example: Alternative Meanings - Science!

```{r}
# Load in "Wide Data"
# Each row of the text block is one subject; the first row is the header.
experiment = read.table(header=TRUE, text='
 subject sex control    a    b
      S1   F     4.2  4.1  2.2
      S2   M     5.9  7.2  6.8
      S3   M     9.1  9.8 10.2
      S5   F     2.1 23.5  5.2
')

# Show wide-experiment data
experiment
```

```{r}
# Convert data to long format:
# the columns control, a and b are folded into condition/measurement pairs
experiment_long = gather(experiment, condition, measurement, control:b)

# Example of Long Format
head(experiment_long)
```

### Exercise: Making a Data Set Messy

Make the `mtcars` data set messy by converting it to:

```
#         model type   value
# 1 AMC Javelin  mpg  15.200
# 2 AMC Javelin  cyl   8.000
# 3 AMC Javelin disp 304.000
# 4 AMC Javelin   hp 150.000
# 5 AMC Javelin drat   3.150
# 6 AMC Javelin   wt   3.435
```

```{r, eval = FALSE}
library("tidyr")

rownames(mtcars)

# Move the rowname to a variable name inside the data set.
mtcars$model = rownames(mtcars)

head(mtcars)

# Fold every measurement column (mpg through carb) into type/value pairs,
# keeping model as the identifier
mtcars_long = mtcars %>%
  gather(key = "type", value = "value", mpg:carb)

# In IDE data viewer
# View(mtcars_long)

# Search for only the AMC Javelin entries in Model
head(mtcars_long$model == "AMC Javelin")

# Subset from the data.frame only AMC Javelin observations with all variables
# contained within the data.frame
head(mtcars_long[mtcars_long$model == "AMC Javelin", ])
```

Now, fix the data by converting it back.

```{r, eval = FALSE}
# Spread the type/value pairs back out into one column per measurement
mtcars_wide = mtcars_long %>%
  spread(key = "type", value = "value")

head(mtcars_wide)
```

## Example: Splitting Values - Location Data

```{r breakdown_loc}
cities = data.frame(stringsAsFactors=FALSE,
  city = c("Houston", "Miami", "Atlanta", "Chicago", "Los Angeles",
           "Washington, D.C.", "New York"),
  loc = c("29.81997438, -95.33997929", "25.7876107, -80.22410608",
          "33.83001385, -84.39994938", "41.82999066, -87.75005497",
          "33.98997825, -118.1799805", "38.89954938, -77.00941858",
          "40.74997906, -73.98001693"),
  pop = c(4053287, 2983947, 2464454, 5915976, 8097410, 2445216.5,
          13524139),
  iso3 = c("USA", "USA", "USA", "USA", "USA", "USA", "USA"),
  province = c("Texas", "Florida", "Georgia", "Illinois", "California",
               "District of Columbia", "New York")
)

# Split on the full ", " delimiter so that lng does not keep a leading space
cities_split = cities %>%
  separate(loc, c("lat", "lng"), sep = ", ")

cities_split
```

## Example: Uniting Values - Location Data

```{r combine_locs, dependson = "breakdown_loc"}
# Re-join with the same ", " delimiter used to split, recovering the
# original loc strings
cities_split %>%
  unite(loc, c("lat", "lng"), sep = ", ")
```

### Exercise: Tidying WHO data

```{r, eval = FALSE}
who = tidyr::who
View(who)
head(who)

library("ggplot2")
library("stringr")

colnames(who)

# Convert from a combined value to a split value.
colnames(who) = str_replace_all(colnames(who),
                                pattern = "newrel",
                                replacement = "new_rel")

# Check to see that column names are all in alignment e.g. new_sp_mXXXX
colnames(who)

# Matching codes
# newrel => new_rel

who %>%
  gather(key = "key",              # Key is where we should fold values under
         value = "cases",          # Value is where the observation is
         new_sp_m014:new_rel_f65,  # Specifying the range of column names to transform
         na.rm = TRUE) %>%         # We remove any case that has a missing value
  separate(key, c("new", "type", "sexage")) %>%
  separate(sexage, c("sex", "age"), sep = 1) %>%  # numeric sep splits by position
  {
    ggplot(., aes(year, cases, color = country)) +  # plus is not a pipe! Remember layering
      geom_jitter() +
      theme(legend.position = "none")
  }
```