# Day 2

# You can set options for how RStudio looks using Tools > Global Options

# Reviewing the Day 1 Content ---------------------------------------------

# Challenge 2:
elephant1_kg <- 3492
elephant2_lb <- 7757

# We need both weights in the same unit before comparing them.
# 1 lb is 0.453592 kg, so make a new elephant2_kg.
elephant2_kg <- elephant2_lb * 0.453592

# now compare which weighs more:
elephant1_kg > elephant2_kg

# DAY 2: Installing Packages ----------------------------------------------

# Install once, from the console. "tidyverse" is installed (rather than only
# "tidyr") because library(tidyverse) is what this script loads below.
install.packages(c("gapminder", "tidyverse"))

# You can check your CRAN mirror (where packages are downloaded from) using
# Tools > Global Options > Packages. Click "Change" at the top to pick a
# different mirror.

library(tidyverse)
library(gapminder)

# DATA CLASSES ------------------------------------------------------------

# the main types we will mostly deal with are:
## integer
## numeric
## character
## logical (TRUE/FALSE)
## factor

# Vectors are the workhorse of R; they can be numeric or character

# numeric
weight_g <- c(50, 60, 65, 82)
weight_g

# character
animals <- c("mouse", "rat", "dog")
animals

# use different functions to assess data type and shape

# look at length
length(weight_g)

# class
class(animals)

# str (structure)
str(animals)

# append a value to an existing vector
weight_g <- c(weight_g, 90)

# combining types in one vector coerces everything to a single type
mixed <- c(animals, weight_g)

# spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned
logical <- c(TRUE, FALSE, TRUE, TRUE, TRUE, TRUE)

mixed2 <- c(weight_g, logical)

# functions to convert between data types
as.character(logical)
as.numeric(animals) # words cannot become numbers: NA, with a warning
as.numeric(logical) # how to convert logical to numeric
as.factor(animals)
as.integer(weight_g)

# FACTORS -----------------------------------------------------------------

# a factor stores integer codes together with character labels (its levels)
animal_factors <- as.factor(animals)
as.integer(animal_factors)
levels(animal_factors)
nlevels(animal_factors)

# factors can be built from character values, including number-like codes
hucs <- c("08123", "09234", "07222")
as.factor(hucs)

# an example of how to re-order your factors
lomedhi <- as.factor(c("low", "med", "high"))
levels(lomedhi)

# notice it is spelling sensitive: "hi" becomes its own level
lomedhi <- as.factor(c("low", "med", "high", "med", "hi"))
levels(lomedhi)

# we nested a vector (c()) inside a function (as.factor()) to create a new
# factor vector.
levels(lomedhi)

# tell R the order you want the "levels" to occur in
lomedhi <- factor(x = lomedhi, levels = c("low", "med", "high"))

# notice it is now re-ordered as preferred
levels(lomedhi)

# You can list whatever levels you want, but any value in the vector that
# does not match one of the listed levels becomes NA, and a listed level with
# no matching value simply has a count of zero.
lomedhi <- factor(x = lomedhi, levels = c("low", "med", "High"))
summary(lomedhi)

# dplyr -------------------------------------------------------------------

# first let's look at subsetting in base R

# we need to grab the dataset from yesterday and load it
load(url("https://mikoontz.github.io/data-carpentry-week/data/continents.RDA"))

# find total land area in continents with at least 10% of worlds pop
df <- continents[continents$percent_total_pop > 10, ]
df
sum(df$area_km2)

# loading libraries
library(gapminder)
library(tidyverse)

class(gapminder)
str(gapminder)
head(gapminder)

gapminder <- tbl_df(gapminder) # notice deprecation note
gapminder <- tibble::as_tibble(gapminder)

# dplyr VERBS -------------------------------------------------------------

# select (works to subset to columns)
# filter (works to subset to rows)
# arrange (sort data in order you prefer)
# mutate (make new columns in same dataframe)
# summarize (summarize groups of data)

# how many unique countries?
gapminder$country %>% unique()

### FILTER

# the data goes in first, then the condition the rows must satisfy
gapminder %>%
  filter(country == "United States")

# "|" keeps rows that meet one condition OR the other
gapminder %>%
  filter(country == "Afghanistan" | country == "United States")

# "&" demands both conditions at once; no row can be two countries, so
# these conditions conflict
gapminder %>%
  filter(country == "Afghanistan" & country == "United States")

gapminder %>%
  filter(country == "United States", year > 2000)

### SELECT

# pick out 2 columns
gapminder %>%
  select(country, lifeExp)

# select and rename in one step
gapminder %>%
  select(ThePlace = country, HowLongTheyLive = lifeExp)

# rename() keeps every column and only changes the names
gapminder %>%
  rename(ThePlace = country, HowLongTheyLive = lifeExp)

# assign with "<-" to keep the result as a new data frame
df1 <- gapminder %>%
  select(ThePlace = country, HowLongTheyLive = lifeExp)

# drop some columns and keep all the others
df2 <- gapminder %>%
  select(-c(pop, continent))

# helper functions can pick columns by pattern
df3 <- gapminder %>%
  select(starts_with("c"))

# move or rearrange column order
df4 <- gapminder %>%
  select(pop, everything())

# the continents challenge again, this time with dplyr:
# "find total land area in continents with at least 10% of worlds pop"
totlandarea <- sum(filter(continents, percent_total_pop > 10)$area_km2)

## PIPES
# ctrl + shift + m == "%>%"

totlandarea <- continents %>%
  filter(percent_total_pop > 10) %>%
  select(area_km2) %>%
  sum()

### MUTATE

# add a column (a new variable)
gapminder %>%
  mutate(total_gdp = gdpPercap * pop)

df5 <- gapminder %>%
  mutate(total_gdp = gdpPercap * pop) %>%
  select(country, total_gdp) %>%
  filter(total_gdp > 1.868e+11) %>%
  arrange(total_gdp)

df5 %>% glimpse() # this is like "str()" but in dplyr
df5 %>% head()

# view by descending value of total_gdp
df5 %>%
  arrange(desc(total_gdp)) %>%
  head()

### SUMMARIZE

gapminder %>%
  filter(country == "United States") %>%
  group_by(year, country) %>%
  summarize(
    mean_gdp = mean(gdpPercap),
    median_gdp = median(gdpPercap)
  )

# Challenge Part I:
# Reload packages in case we closed R
library(tidyverse)
library(gapminder)

# Split, apply, combine strategy of data summarization
glimpse(gapminder)

# We use mutate to add a column to a data frame by
# using element-by-element operations (add a column,
# but don't change the number of rows)
withTotGDP <- gapminder %>%
  mutate(totGDP = pop * gdpPercap)

glimpse(withTotGDP)

# Split, apply, combine
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  arrange(year)

# What happens if we don't do a group_by?
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  summarize(varGDP = var(totGDP))

# How might we pull out just the first year and the last
# year from the summarized data frame?
# (year is a number, so match it against numbers rather than strings)
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952L, 2007L))

# How to compare the two values we just summarized
# and subsetted to?
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952L, 2007L)) %>%
  summarize(diff(varGDP))

# Grab the first row using slice(1)
firstGDP <- gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952L, 2007L)) %>%
  select(varGDP) %>%
  slice(1)

# ... and the second row using slice(2)
lastGDP <- gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952L, 2007L)) %>%
  select(varGDP) %>%
  slice(2)

# Subtract the two values we just saved!
lastGDP - firstGDP

# Can we reference elements in a tibble using square
# bracket notation
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  .[1, 1]

# Here's what the period does '.'
testDF <- gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP))

# testDF has only two columns (year, varGDP), so index column 1; asking for
# a third column would be an error.
testDF %>% .[1, 1]

# ... is equivalent to
testDF[1, 1]

# keep only the summarized column
testDF <- gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  select(varGDP)

# The answer to Challenge 2
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(
    var_totGDP = var(totGDP),
    var_perCapGDP = var(gdpPercap)
  )

# Data visualization with ggplot2 -----------------------------------------

# Motivating example!
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# We want to think about 'mapping' our data to
# visual features of a plot. We do that by using
# the aes() [we are specifying the aesthetic of
# a particular visual feature on a plot]
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp))

# Let's add our first geom_
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Modify the code that we wrote above so that we can
# visualize how life expectancy has changed over time.
ggplot(gapminder, aes(x = year, y = lifeExp)) +
  geom_point()

# bonus! color the points by the continent that they
# are on
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_point()

# Can we put an aes() function call within the geom_point()? Let's try.

# Add a line geometry to our ggplot
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_line()

# Let's use different aesthetics for different geometries
# by adding a line for each country
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_line(aes(group = country))

# What happens when we put the grouping variable at the
# top level of the aes() hierarchy?
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_point() +
  geom_line(aes(color = continent))

# Rearrange order to put points on top of the lines
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line(aes(color = continent)) +
  geom_point()

# How can we plot a continuous variable versus a categorical variable?
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot()

# Challenge: using violin plots
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_violin()

# Filling each violin plot by continent
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_violin(aes(fill = continent))

# what if we put aes(fill = continent) in the beginning?
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp, fill = continent)) +
  geom_violin()

# color versus fill for violin plots
# what if we map color (instead of fill) to continent in the beginning?
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp, color = continent)) +
  geom_violin()

# both color and fill
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp, color = continent, fill = continent)) +
  geom_violin()

# What happens if we set color to a fixed value outside of aes()?
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp, fill = continent)) +
  geom_violin(color = "red")

# Get rid of the border?
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp, fill = continent)) +
  geom_violin(color = NA)

# Why doesn't this work? fill = continent is trying to
# map data to a visual feature, so it must go in a call
# to the aes() function.
# This call fails on purpose; try() reports the error message but lets the
# rest of the script keep running when the whole file is sourced.
try(
  gapminder %>%
    ggplot(aes(x = continent, y = lifeExp)) +
    geom_violin(color = "red", fill = continent)
)

# fixed!
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_violin(color = "red", aes(fill = continent))

# Beginning to adjust the scales
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# That plot is hard to read: a huge share of the points is
# crowded together at the low end of gdpPercap
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

# Add a best fit line to the plot; the default best fit
# line is a gam()
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth()

# Fit a linear model with ordinary least squares regression instead
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(method = "lm")

# Challenge:
# This works, but I went over 20 characters.
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(method = "lm", aes(group = continent))

# Where else can we put the group aesthetic
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(method = "lm")

# A single best fit line, with only the points colored by continent
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm")