# Day 2

# You can set options for how RStudio looks using Tools > Global Options

# Reviewing the Day 1 Content ---------------------------------------------

# Challenge 2:

# Two elephants, weighed in different units
elephant1_kg <- 3492
elephant2_lb <- 7757

# Put both weights on the same scale before comparing them:
# 1 lb is 0.453592 kg, so scale the pound value into kilograms
# and store it as a new elephant2_kg
elephant2_kg <- 0.453592 * elephant2_lb

# Is the first elephant the heavier one? (prints TRUE or FALSE)
elephant2_kg < elephant1_kg


# DAY 2: Installing Packages ----------------------------------------------

# Install the packages this script loads, skipping any that are already
# available so that re-running the script does not reinstall them.
# "tidyverse" is installed (it brings in tidyr as well) because
# library(tidyverse) is called right after this.
needed_pkgs <- c("gapminder", "tidyverse")
is_installed <- vapply(needed_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
if (!all(is_installed)) {
  install.packages(needed_pkgs[!is_installed])
}

# you can check your CRAN mirror (where packages are downloaded from) using
# Tools > Global Options > Packages. Click "Change" at the top to pick a different mirror

library(tidyverse)
library(gapminder)


# DATA CLASSES ------------------------------------------------------------

# the main types we will mostly deal with are:
## integer
## numeric
## character
## logical (T/F)
## factor

# Vectors are the workhorse of R; a vector can be numeric or character (among other types)

# numeric vector
weight_g <- c(50, 60, 65, 82)
weight_g

# character vector
animals <- c("mouse", "rat", "dog")
animals

# use different functions to assess data type and shape
# length: number of elements
length(weight_g)

# class: the type of data stored
class(animals)

# str (structure): compact summary of the object
str(animals)

# c() also appends: add a fifth weight to the end of the vector
weight_g <- c(weight_g, 90)

# a vector holds a single type, so mixing types coerces the elements:
# character + numeric -> everything becomes character
mixed <- c(animals, weight_g)

# spell out TRUE/FALSE; T and F are ordinary variables that can be reassigned
# NOTE(review): the name `logical` is also the name of a base R function;
# it is kept here in case other code refers to this variable
logical <- c(TRUE, FALSE, TRUE, TRUE, TRUE, TRUE)

# numeric + logical -> everything becomes numeric (TRUE = 1, FALSE = 0)
mixed2 <- c(weight_g, logical)

# functions to convert between data types
# (each call needs the object to convert as its argument)
as.character(logical)
as.numeric(animals) # words cannot be read as numbers: NA plus a warning
as.numeric(logical) # how to convert logical to numeric
as.factor(animals)
as.integer(logical)


# FACTORS -----------------------------------------------------------------

# a factor pairs integer codes with character labels (its "levels")

animal_factors <- factor(animals)

# the integer code behind each element
as.integer(animal_factors)

# the labels themselves, and how many distinct labels there are
levels(animal_factors)
nlevels(animal_factors)

# factor labels may be words or digit strings (e.g. codes with leading zeros)

hucs <- c("08123", "09234", "07222")
factor(hucs)

# an example of how to re-order your factors

lomedhi <- factor(c("low", "med", "high"))
levels(lomedhi)

# labels are matched exactly, so "hi" becomes its own level next to "high"
lomedhi <- factor(c("low", "med", "high", "med", "hi"))
levels(lomedhi)

# above, a vector built with c() was nested directly inside factor() to
# create the new factor vector.

levels(lomedhi)

# tell R the order you want the "levels" to occur in
lomedhi <- factor(lomedhi, levels = c("low", "med", "high"))

# the levels now appear in the requested order
levels(lomedhi)

# you can list whatever levels you want: any value that does not match one
# of the listed levels becomes NA, and a listed level with no matching value
# is kept with a count of zero.
lomedhi <- factor(lomedhi, levels = c("low", "med", "High"))

summary(lomedhi)

# dplyr -------------------------------------------------------------------

# first let's look at subsetting in base R
# we need to grab the dataset from yesterday and load it
# (keep a handle on the connection so it can be closed once load() is done)
continents_con <- url("https://mikoontz.github.io/data-carpentry-week/data/continents.RDA")
load(continents_con)
close(continents_con)

# find total land area in continents with at least 10% of worlds pop
# ("at least" means >=, so a continent at exactly 10% is included)

df <- continents[continents$percent_total_pop >= 10, ]
df
sum(df$area_km2)

# loading libraries
library(gapminder)
library(tidyverse)

# inspect the gapminder data: its class, structure, and first rows
class(gapminder)
str(gapminder)

head(gapminder)

# make sure we are working with a tibble.
# (dplyr's tbl_df() used to do this but is deprecated;
# tibble::as_tibble() is the supported replacement)
gapminder <- tibble::as_tibble(gapminder)


# dplyr VERBS -------------------------------------------------------------

# select (works to subset to columns)
# filter (works to subset to rows)
# arrange (sort data in order you prefer)
# mutate (make new columns in same dataframe)
# summarize (summarize groups of data)

# how many unique countries?
# unique() lists each distinct country; length() of that list is the count
unique(gapminder$country)
length(unique(gapminder$country))

### FILTER

# filter() takes the data first, then the condition(s) the rows must satisfy
filter(gapminder, country == "United States")

# "|" means OR: keep rows that meet one condition or the other
filter(gapminder, country == "Afghanistan" | country == "United States")

# "&" means AND: both conditions must hold for the same row, and these
# two conflict with each other
filter(gapminder, country == "Afghanistan" & country == "United States")

# several conditions can also be listed, separated by commas
filter(gapminder, country == "United States", year > 2000)

### SELECT

# keep only two columns
select(gapminder, country, lifeExp)

# select() can rename the columns it keeps (new_name = old_name)
select(gapminder, ThePlace = country, HowLongTheyLive = lifeExp)

# rename() changes the names without selecting: the other columns stay
rename(gapminder, ThePlace = country, HowLongTheyLive = lifeExp)

# to keep the result as a new dataframe, assign it with "<-"
df1 <- select(gapminder, ThePlace = country, HowLongTheyLive = lifeExp)

# drop columns with a minus sign; the remaining columns are kept
df2 <- select(gapminder, -pop, -continent)

# helper functions pick columns by pattern, e.g. names starting with "c"
df3 <- select(gapminder, starts_with("c"))

# move or rearrange column order: pop first, then everything else

df4 <- select(gapminder, pop, everything())

# try challenge with continents dataset but using dplyr
# "find total land area in continents with at least 10% of worlds pop"
# ("at least" means >=, so a continent at exactly 10% is included)

# two steps: keep the qualifying rows, then add up their areas
totlandarea <- filter(continents, percent_total_pop >= 10)
totlandarea <- sum(totlandarea$area_km2)

## PIPES

# ctrl + shift + m == "%>%"

# the same calculation as one pipeline: each step's result is passed on
# as the first argument of the next step
totlandarea <- continents %>%
  filter(percent_total_pop >= 10) %>%
  select(area_km2) %>%
  sum()

### MUTATE

# mutate() adds a column (a new variable) computed from existing ones
gapminder %>%
  mutate(total_gdp = gdpPercap * pop)

# compute total GDP, keep the large values only, and sort ascending
df5 <- gapminder %>%
  mutate(total_gdp = gdpPercap * pop) %>%
  filter(total_gdp > 1.868e+11) %>%
  select(country, total_gdp) %>%
  arrange(total_gdp)

glimpse(df5) # dplyr's counterpart to "str()"

head(df5)

# view by descending value of total_gdp
head(arrange(df5, desc(total_gdp)))

### SUMMARIZE

# mean and median GDP per capita of the United States,
# computed within each year/country group
gapminder %>%
  filter(country == "United States") %>%
  group_by(year, country) %>%
  summarize(
    mean_gdp = mean(gdpPercap),
    median_gdp = median(gdpPercap)
  )
  
  
# Challenge Part I:

# Reload packages in case we closed R
library(tidyverse)
library(gapminder)

# Split, apply, combine strategy of data summarization
glimpse(gapminder)

# mutate() works element-by-element: it adds a column to the
# data frame and leaves the number of rows unchanged

withTotGDP <- mutate(gapminder, totGDP = pop * gdpPercap)

glimpse(withTotGDP)

# Split (group_by), apply (var), combine (summarize):
# variance of total GDP within each year

withTotGDP %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  arrange(year)

# What happens if we don't do a group_by?
# (the variance is taken over all rows at once)
withTotGDP %>%
  summarize(varGDP = var(totGDP))

# How might we pull out just the first year and the last
# year from the summarized data frame?
# year is a number, so compare it with numbers rather than strings
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952, 2007))

# How to compare the two values we just summarized
# and subsetted to? diff() gives the second value minus the first
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952, 2007)) %>%
  summarize(diff(varGDP))

# Grab the first row using slice(1)
firstGDP <-
  gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952, 2007)) %>%
  select(varGDP) %>%
  slice(1)

# ... and the second (last) row using slice(2)
lastGDP <-
  gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  filter(year %in% c(1952, 2007)) %>%
  select(varGDP) %>%
  slice(2)

# Subtract the two values we just saved!
lastGDP - firstGDP

# Can we reference elements in a tibble using square
# bracket notation? .[row, column] picks one cell
gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  .[1, 1]

# Here's what the period does '.': it stands for whatever
# was piped in from the left-hand side
testDF <-
  gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP))

# testDF has only two columns (year, varGDP), so a column index
# of 3 would be out of range; use the same cell as shown below
testDF %>%
  .[1, 1]

# ... is equivalent to

testDF[1, 1]

# Keep only the varGDP column of the summary
testDF <-
  gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(varGDP = var(totGDP)) %>%
  select(varGDP)

# The answer to Challenge 2:
# per-year variance of total GDP and of GDP per capita

gapminder %>%
  mutate(totGDP = pop * gdpPercap) %>%
  group_by(year) %>%
  summarize(
    var_totGDP = var(totGDP),
    var_perCapGDP = var(gdpPercap)
  )


# Data visualization with ggplot2 -----------------------------------------

# Motivating example!
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Think of a plot as a 'mapping' from our data to the
# visual features of the plot. aes() declares that mapping
# [the aesthetic of a particular visual feature]; no geom
# has been added yet in this call
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp))

# Let's add our first geom_
gapminder %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Modify the code that we wrote above so that we can
# visualize how life expectancy has changed over time.

gapminder %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_point()

# bonus! color the points by the continent that they
# are on

ggplot(gapminder, aes(x = year, y = lifeExp, color = continent)) +
  geom_point()

# Can we put an aes() function call within the geom_point()? Let's try.

# Add a line geometry to our ggplot

ggplot(gapminder, aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_line()

# Different geometries can carry different aesthetics:
# here only the lines are grouped, one line per country

ggplot(gapminder, aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_line(aes(group = country))

# What happens when we put the grouping variable at the
# top level of the aes() hierarchy?
ggplot(gapminder, aes(x = year, y = lifeExp, group = country)) +
  geom_point() +
  geom_line(aes(color = continent))

# Layers are drawn in the order they are added, so add the
# points last to put them on top of the lines
ggplot(gapminder, aes(x = year, y = lifeExp, group = country)) +
  geom_line(aes(color = continent)) +
  geom_point()

# How can we plot a continuous variable versus a categorical variable?

ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_boxplot()

# Challenge: using violin plots
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_violin()

# Filling each violin plot by continent
ggplot(gapminder, aes(x = continent, y = lifeExp)) +
  geom_violin(aes(fill = continent))

# what if we put aes(fill = continent) in the beginning?
ggplot(gapminder, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_violin()

# color versus fill for violin plots
# what if we map color (instead of fill) in the beginning?
ggplot(gapminder, aes(x = continent, y = lifeExp, color = continent)) +
  geom_violin()

# both color and fill
ggplot(
  gapminder,
  aes(x = continent, y = lifeExp, color = continent, fill = continent)
) +
  geom_violin()

# What happens if we don't put a value in an aesthetic?
# (color is set to a fixed value outside of aes())
ggplot(gapminder, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_violin(color = "red")

# Get rid of the border?
ggplot(gapminder, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_violin(color = NA)

# Why doesn't this work? fill = continent is trying to
# map data to a visual feature, so it must go in a call
# to the aes() function
# (wrapped in try() so the error message is shown without
# stopping the rest of the script when it is run in one go)
try(print(
  gapminder %>%
    ggplot(aes(x = continent, y = lifeExp)) +
    geom_violin(color = "red", fill = continent)
))

# fixed!
gapminder %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_violin(aes(fill = continent), color = "red")

# Starting to modify the scales
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Those data are hard to read because there is a huge
# density of points at the low end of gdpPercap, so put
# the x axis on a log10 scale
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

# Let's add a best fit line to our plot; with no method
# given, the default best fit line here is a gam()
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth()

# Fit a linear model using ordinary least squares regression
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(method = "lm")

# Challenge:
# This works, but I went over 20 characters.
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(aes(group = continent), method = "lm")

# Where else can we put the group aesthetic
# (here continent is mapped to color at the top level instead)
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  scale_x_log10() +
  geom_smooth(method = "lm")

# One best fit line, but color points by continent
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  scale_x_log10() +
  geom_smooth(method = "lm")