# Day 3
# Challenge 2
# Load any libraries you may need
# Subset the gapminder data to only Oceania countries post-1980.
# Remove the continent column
# Make a scatter plot of gdpPercap vs. population colored by country
# Advanced
# How would you determine the median population for the American
# countries between 1970 and 1980?

library(tidyverse)
library(gapminder)

# The FIRST thing that I do before I start working with a
# dataset is to remind myself what it looks like.
glimpse(gapminder)

# When I read "only Oceania countries post-1980", I am thinking
# that I want specific rows that meet these 2 criteria (1st: rows
# where the continent column is "Oceania", 2nd: rows where the
# year is greater than 1980)
gapminder %>%
  filter(continent == "Oceania" & year > 1980)

# When I read that I want to "remove the continent column", I
# think that we need to operate on the variables in our data
# frame using the select() function

# This code will remove the year column. Modify it to remove
# the continent column.
gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-year)

# I'm going to save the manipulated data frame as a new variable,
# but you don't necessarily need to if you pipe it right to a call
# to ggplot()
modernOceania <- gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-continent)

glimpse(modernOceania)

# Break the plotting effort into small pieces and
# get each of them to work before adding complexity
ggplot(modernOceania, aes(x = pop, y = gdpPercap)) +
  geom_point()

# Now let's color by country
ggplot(modernOceania, aes(x = pop, y = gdpPercap, color = country)) +
  geom_point()

# What happens if we assign a plot to a variable?
# We've assigned the plot itself to a new variable.
# Build the Oceania scatter plot and store the ggplot object itself;
# after this assignment modernOceania holds a plot.
modernOceania <- gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-continent) %>%
  ggplot(aes(x = pop, y = gdpPercap, color = country)) +
  geom_point()

# Run the variable name to see the plot
modernOceania

# Add to the plot
modernOceania + geom_smooth(method = "lm")

# Layers get put on a plot in the order you specify.
# geom_smooth first, then geom_point
gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-continent) %>%
  ggplot(aes(x = pop, y = gdpPercap, color = country)) +
  geom_smooth(method = "lm") +
  geom_point()

# How to make visual features of the plot fixed values
# First example: country mapped to point colors
gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-continent) %>%
  ggplot() +
  geom_point(aes(x = pop, y = gdpPercap, color = country))

# Contrast that with a fixed value of point colors
gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-continent) %>%
  ggplot() +
  geom_point(aes(x = pop, y = gdpPercap), color = "red")

# Mixing both: fill is mapped to country inside aes(), while the outline
# color, point shape (pch = 21) and size are fixed values outside aes().
gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-continent) %>%
  ggplot() +
  geom_point(
    aes(x = pop, y = gdpPercap, fill = country),
    color = "red",
    pch = 21,
    size = 3
  ) +
  scale_x_log10()

# Adjusting ggplot themes -------------------------------------------------

# This is the plot we generated at the end of the
# day yesterday
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm")

# Use the theme_ layer
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  theme_bw()

# Custom themes to tweak a plot however you like.
# Here we removed grid lines
# mytheme restyles the legend title and blanks both the major and minor
# panel grid lines; it is added to a plot like any other layer.
mytheme <- theme(
  legend.title = element_text(colour = "steelblue", size = rel(2)),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank()
)

gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  mytheme

# Extra themes from another package. Be sure to install.packages("ggthemes")
# first.
library(ggthemes)

gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  theme_excel()

# Briefly introduce a color palette
# Viridis. Be sure to install.packages("viridis")
library(viridis)

gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  scale_color_viridis(discrete = TRUE)

# Multi-panel plots -------------------------------------------------------

# Originally, we plotted life expectancy over time
# for all the countries
ggplot(gapminder, aes(x = year, y = lifeExp)) +
  geom_line()

# That is a lot of messy lines and it is hard to
# make sense of it at all.
# One line per country (group = country), colored by continent
ggplot(gapminder, aes(x = year, y = lifeExp, group = country)) +
  geom_line(aes(color = continent))

# Let's plot the relationship between life expectancy
# and time for each country (that is, one line per
# country), but put each continent in its own
# facet (that is, a sub-plot)
ggplot(gapminder, aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  facet_wrap(~ continent)

# Let's save a plot to our computer
lifeExpVStime <- ggplot(
  gapminder,
  aes(x = year, y = lifeExp, group = country)
) +
  geom_line(aes(color = continent)) +
  facet_wrap(~ continent)

# Make sure the output directory exists before writing into it, so the
# ggsave() calls below do not depend on "figures/" having been created
# by hand.
if (!dir.exists("figures")) {
  dir.create("figures")
}

ggsave(
  filename = "lifeExpVStime.pdf",
  plot = lifeExpVStime,
  path = "figures/",
  device = "pdf"
)

# You can also add the whole file path as the
# filename argument, then you don't have to
# specify the path argument
ggsave(filename = "figures/lifeExpVStime.pdf", plot = lifeExpVStime)

?ggsave

# Data import -------------------------------------------------------------

# Use the read.csv() function to import a .csv file into
# R
read.csv("data/species.csv")

# Remember to assign the result of calling the read.csv()
# function to a new R object
species <- read.csv("data/species.csv")

str(species)
glimpse(species)

# The view tab will update what the object looks like
# IF you have reassigned the object
species %>%
  select(taxa)

species <- species %>%
  select(taxa)

?read.csv2

# What if the data are *almost* tidy?
blood <- read.csv("data/wide_eg.csv")
head(blood)

# A flexible way to read in data is to use read.table()
# skip = 2 skips the first two lines of the file before reading the header.
blood <- read.table(
  "data/wide_eg.csv",
  skip = 2,
  header = TRUE,
  sep = ","
)
head(blood)

# Go grab files directly from the web
blood <- read.csv(
  url("https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv"),
  skip = 2
)
glimpse(blood)

# Using readr functions from the tidyverse
read_csv("data/species.csv")
read_csv(
  "https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv",
  skip = 2
)

# stringsAsFactors!
# Read the same file three ways and compare the column types.
# read.csv() with its default stringsAsFactors setting (the default is
# FALSE since R 4.0; older versions converted strings to factors).
species <- read.csv("data/species.csv")
glimpse(species)

# Explicitly keep character columns as characters
species <- read.csv("data/species.csv", stringsAsFactors = FALSE)
glimpse(species)

# readr's read_csv()
species <- read_csv("data/species.csv")
glimpse(species)

# How to export data
glimpse(blood)
head(blood)

write.csv(blood, file = "data/blood_clean.csv")
# The output file is passed positionally: readr renamed this argument from
# `path` to `file`, and the positional form works with either name.
write_csv(blood, "data/blood_clean_tidyverse.csv")

# Some inconsistencies with write.csv and write_csv
# Quotations added around character columns in write.csv()
# write.csv(blood, file = "data/blood_clean_no_quotes.csv", quote = FALSE)

# Handling row names in a write.csv
write.csv(
  blood,
  file = "data/blood_clean_no_quote_no_row_names.csv",
  quote = FALSE,
  row.names = FALSE
)

nrow(blood) # number of rows in blood
seq_len(nrow(blood)) # vector from 1 to the number of rows in blood
letters # all the lowercase letters
# the lowercase letters a through the number of rows in blood
letters[seq_len(nrow(blood))]

# Use letters as row names.
# NOTE(review): assumes blood has at most 26 rows; beyond that the index
# runs past the end of `letters` -- confirm against the data.
row.names(blood) <- letters[seq_len(nrow(blood))]
blood

# Re-read the exported file and use one of its columns as row names.
# NOTE(review): presumably `control` exists in this file and has unique
# values (row names must be unique) -- verify against the data.
blood <- read.csv("data/blood_clean_no_quote_no_row_names.csv")
row.names(blood) <- blood$control
blood

# Drop the 3rd and 4th columns by position
blood <- read.csv("data/blood_clean_no_quote_no_row_names.csv")
blood %>%
  select(-(3:4))

tolower(LETTERS) # makes all characters into lower case