# Day 3

# Challenge 2
# Load any libraries you may need
# Subset the gapminder data to only Oceania countries post-1980.
# Remove the continent column
# Make a scatter plot of gdpPercap vs. population colored by country
# Advanced: How would you determine the median population for the countries in the Americas between 1970 and 1980?

library(tidyverse)
library(gapminder)

# Always start by reminding yourself what the dataset looks like:
# glimpse() lists every column with its type and first few values.
glimpse(gapminder)

# "Only Oceania countries post-1980" describes ROWS, and rows are
# chosen with filter(). Two conditions must hold at the same time:
#   1. the continent column is "Oceania"
#   2. the year is greater than 1980
# Conditions separated by commas inside filter() are combined with AND.
gapminder %>%
  filter(continent == "Oceania", year > 1980)

# "Remove the continent column" describes COLUMNS (variables), and
# columns are kept or dropped with select().

# Exercise: as written, this drops the year column. Change it so
# that it drops the continent column instead.
gapminder %>%
  filter(continent == "Oceania", year > 1980) %>%
  select(-year)

# Store the finished subset under its own name. This is optional:
# the pipeline could also be piped straight into ggplot().
modernOceania <- gapminder %>%
  filter(continent == "Oceania", year > 1980) %>%
  select(-continent)

glimpse(modernOceania)

# Build the plot in small steps and make sure each one works
# before adding the next.
# Step 1: plain scatter plot of GDP per capita against population.
modernOceania %>%
  ggplot(aes(x = pop, y = gdpPercap)) +
  geom_point()

# Step 2: map country to the point colour.
modernOceania %>%
  ggplot(aes(x = pop, y = gdpPercap, colour = country)) +
  geom_point()

# What happens if we assign a plot to a variable?
# The plot object itself is stored under a NEW name. (Reusing the name
# modernOceania here would overwrite the data frame of that name, and
# the earlier ggplot(modernOceania, ...) calls would fail if re-run.)
modernOceaniaPlot <-
  gapminder %>%
  filter(continent == "Oceania" & year > 1980) %>%
  select(-continent) %>%
  ggplot(aes(x = pop, y = gdpPercap, color = country)) +
  geom_point()

# Run the variable name to see the plot
modernOceaniaPlot

# Add to the plot: a stored plot can take further layers with `+`.
# The stored object is not changed unless the result is assigned back.
modernOceaniaPlot +
  geom_smooth(method = "lm")

# Layers are drawn in the order they are added to the plot.
# Here geom_smooth comes first, so geom_point is drawn on top of it.
gapminder %>%
  filter(continent == "Oceania", year > 1980) %>%
  select(-continent) %>%
  ggplot(aes(x = pop, y = gdpPercap, colour = country)) +
  geom_smooth(method = "lm") +
  geom_point()

# Mapped versus fixed visual features
# Example 1: colour is MAPPED to a variable (country), so it goes
# inside aes().
gapminder %>%
  filter(continent == "Oceania", year > 1980) %>%
  select(-continent) %>%
  ggplot() +
  geom_point(mapping = aes(x = pop, y = gdpPercap, colour = country))

# Example 2: colour is FIXED to a single value, so it goes outside
# aes() and every point is red.
gapminder %>%
  filter(continent == "Oceania", year > 1980) %>%
  select(-continent) %>%
  ggplot() +
  geom_point(mapping = aes(x = pop, y = gdpPercap), colour = "red")

# Example 3: both at once. Fill is mapped to country, while the
# outline colour, point shape and size are fixed. Shape 21 is a
# circle with separate fill and outline colours. The x axis is on a
# log10 scale.
gapminder %>%
  filter(continent == "Oceania", year > 1980) %>%
  select(-continent) %>%
  ggplot() +
  geom_point(
    mapping = aes(x = pop, y = gdpPercap, fill = country),
    colour = "red",
    shape = 21,
    size = 3
  ) +
  scale_x_log10()


# Adjusting ggplot themes -------------------------------------------------
# Starting point: the plot from the end of yesterday. Life expectancy
# against GDP per capita (log10 x axis), points coloured by continent.
# Colour is mapped only inside geom_point(), so geom_smooth() fits one
# line to all of the data.
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm")

# Adding a theme_*() layer swaps in a complete built-in theme
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  theme_bw()

# A custom theme changes individual plot elements. This one removes
# the major and minor grid lines and makes the legend title
# steel blue at twice the default size.
mytheme <- theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  legend.title = element_text(size = rel(2), colour = "steelblue")
)

ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  mytheme

# More ready-made themes come from the ggthemes package.
# Install it once with install.packages("ggthemes") before loading it.
library(ggthemes)

ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  theme_excel()

# A brief look at colour palettes: viridis.
# Install it once with install.packages("viridis") before loading it.
library(viridis)

# continent is a categorical variable, so ask for the discrete
# version of the palette.
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  scale_color_viridis(discrete = TRUE)


# Multi-panel plots -------------------------------------------------------

# First attempt: life expectancy over time for every country.
# Nothing tells geom_line() which rows belong together, so all
# observations are joined into a single path.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_line()

# Grouping by country gives one line per country, coloured by
# continent. That is still a lot of overlapping lines and hard to
# make sense of.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line(aes(colour = continent))

# Keep one line per country, but give each continent its own
# facet (that is, a sub-plot) with facet_wrap().
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  facet_wrap(~ continent)


# Let's save a plot to our computer
# First store the plot in a variable so it can be handed to ggsave().
lifeExpVStime <-
  ggplot(gapminder, aes(x = year, y = lifeExp, group = country)) +
  geom_line(aes(color = continent)) +
  facet_wrap(~ continent)

# ggsave() does not create a missing output folder by default, so the
# save fails if "figures" is not there. Create it first; this does
# nothing (and gives no warning) if the folder already exists.
dir.create("figures", showWarnings = FALSE, recursive = TRUE)

# filename is the file's name, path is the folder it goes in, and
# device is the file format to write.
ggsave(filename = "lifeExpVStime.pdf",
       plot = lifeExpVStime,
       path = "figures/",
       device = "pdf")

# You can also add the whole file path as the
# filename argument, then you don't have to
# specify the path argument
ggsave(filename = "figures/lifeExpVStime.pdf",
       plot = lifeExpVStime)

# Open the help page to see the other options (width, height, dpi, ...)
?ggsave


# Data import -------------------------------------------------------------

# read.csv() imports a .csv file into R. Called on its own, it
# only prints the data to the console.
read.csv(file = "data/species.csv")

# To keep the data, assign the result of read.csv() to a new
# R object.
species <- read.csv(file = "data/species.csv")

# Two ways of inspecting the structure of the imported data frame
str(species)
glimpse(species)

# This prints a one-column version of the data, but species itself
# (and what the view tab shows for it) is unchanged...
select(species, taxa)

# ...until the result is assigned back to the object.
species <- select(species, taxa)

# Help page for read.csv2 and the related read.* functions
?read.csv2

# What if the data are *almost* tidy?
# Read the file with the defaults first and look at what comes back.
blood <- read.csv(file = "data/wide_eg.csv")
head(blood)

# read.table() is a more flexible reader. Here it skips the first two
# lines of the file (presumably non-data lines; check the raw file),
# treats the next line as the column names, and splits on commas.
blood <- read.table(
  file = "data/wide_eg.csv",
  sep = ",",
  header = TRUE,
  skip = 2
)

head(blood)

# Files can also be read directly from the web.

# The same local read as above, for comparison...
blood <- read.table(
  file = "data/wide_eg.csv",
  sep = ",",
  header = TRUE,
  skip = 2
)

# ...and the same file read from a URL instead of from disk.
blood <- read.csv(
  file = url("https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv"),
  skip = 2
)

glimpse(blood)

# The readr functions from the tidyverse (note the underscore in
# read_csv) take a local path or a URL directly.

read_csv(file = "data/species.csv")
read_csv(
  file = "https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv",
  skip = 2
)

# stringsAsFactors!
# Before R 4.0.0, read.csv() converted text columns to factors by
# default. Since R 4.0.0 the default is stringsAsFactors = FALSE.
# Ask for factors explicitly so this comparison shows the difference
# on any version of R.
species <- read.csv("data/species.csv",
                    stringsAsFactors = TRUE)
glimpse(species) # text columns show up as <fct>

# The same file with text columns kept as plain character vectors
species <- read.csv("data/species.csv",
                    stringsAsFactors = FALSE)
glimpse(species) # text columns show up as <chr>

# readr's read_csv() reads text columns as character
species <- read_csv("data/species.csv")
glimpse(species)

# How to export data
# Check what we are about to write out
glimpse(blood)
head(blood)

# Base R writer
write.csv(blood, file = "data/blood_clean.csv")

# readr writer. The output file is passed by position because the
# argument was renamed from `path` to `file` in readr 1.4.0 (`path` is
# deprecated); passing it unnamed works on old and new versions alike.
write_csv(blood, "data/blood_clean_tidyverse.csv")

# Some inconsistencies with write.csv and write_csv

# Quotations added around character columns in write.csv()
# Turn them off with quote = FALSE
write.csv(blood, file = "data/blood_clean_no_quotes.csv",
          quote = FALSE)

# Handling row names in a write.csv
# write.csv() writes the row names as an extra first column unless
# row.names = FALSE
write.csv(blood, file = "data/blood_clean_no_quote_no_row_names.csv",
          quote = FALSE,
          row.names = FALSE)

# Building row names step by step.
# seq_len(n) is used instead of 1:n because 1:0 counts backwards
# (1, 0) when a data frame has no rows, whereas seq_len(0) is empty.
nrow(blood) # number of rows in blood
seq_len(nrow(blood)) # vector from 1 to the number of rows in blood
letters # all the lowercase letters
letters[seq_len(nrow(blood))] # the lowercase letters a through the number of rows in blood
# NOTE: there are only 26 letters, so this only works while blood has
# at most 26 rows (row names cannot be missing values).
row.names(blood) <- letters[seq_len(nrow(blood))]

blood

# Re-read the exported file and use one of its columns as row names.
# Assumes the file has a `control` column whose values are unique --
# TODO confirm against the data.
blood <- read.csv("data/blood_clean_no_quote_no_row_names.csv")
row.names(blood) <- blood$control
blood

# Drop columns by position: everything except the 3rd and 4th columns
blood <- read.csv("data/blood_clean_no_quote_no_row_names.csv")
blood %>%
  select(-(3:4))

# tolower() needs a character vector to work on; calling it with no
# argument is an error.
tolower("Hello World") # makes all characters into lower case: "hello world"