# Day 4

suppressPackageStartupMessages(library(tidyverse)) # keep messages from appearing in html output or on your screen

# create a directory to put the data in
# dir.create("data") # make sure we have a data dir to download file to

# download a copy locally
download.file(url="https://mikoontz.github.io/data-carpentry-week/data/combined.csv", destfile = "data/combined.csv") # this downloads and names your file

# read in that data to your environment
survey <- read.csv("data/combined.csv")

### Summarize Data

# Now let's figure out what we have using `summary` and `glimpse` functions.

summary(survey)
glimpse(survey)

# a way to view counts of categories in a variable
table(survey$species_id) # this is a factor vector

# test if table() works on character data types
table(as.character(survey$species_id)) # works on character vectors

# let's look at sex column for NA's and blanks
table(survey$sex)

# So it looks like many different species, but abundances (counts) are widely variable. Some have 1, others have as many as 2000.

# Also appears to have many NA's in the `hindfoot_length` and `weight` columns. And we have "blanks" in the `sex` column.

## Filter Data

### Remove NAs and Blanks
# Let's get rid of NA's from **`weight`** & **`hindfoot_length`** columns, and the mysterious blanks (`""`) from the sex column. Write that object to a new dataframe called "*surveys_complete*".

surveys_complete <- survey %>% 
  filter(!is.na(weight), # remove NAs in weight
         !is.na(hindfoot_length), # remove NAs in hindfoot_length
         sex!="", # could also use !sex==""
         species_id!="") # remove missing species_id

## Extract the most common species_id
species_counts <- surveys_complete %>%
  group_by(species_id) %>% # grouping here so the count/tally will be by this group
  tally %>%
  filter(n >= 50) # filter to common species

# quick histogram of species obs
hist(species_counts$n)

## Only keep the most common species
surveys_complete2 <- surveys_complete %>%
  filter(species_id %in% species_counts$species_id)


# learning %in% -----------------------------------------------------------


animals<- c("racoon", "rabbit", "cat", "dog", "frog","penguin")

friendlyanimals <- c("frog", "dog")

animals %in% friendlyanimals # finds match with friendlyanimals only

!animals %in% friendlyanimals # finds only animals that don't match friendly animals

friendlyanimals %in% animals

friendlyanimals %in% animals # will work either way,
# usually use % going from "big table" to "lookup/little table"

## Quick note about weird factor things.

# notice there are levels with no observations
table(surveys_complete2$species_id) 

# factors retain levels even when no observations exist

# how to get rid of "empty factor levels"
surveys_complete2$species_id <- factor(surveys_complete2$species_id)

# that should have dropped unused factors from this column and re-assigned to the dataframe
table(surveys_complete2$species_id) 

# **Note** when you save this to `.csv` the data will not retain all these unused factors, so you won't need to convert or "drop levels" if you are exporting and reading a csv back in. 

## Export/Save Data as CSV

# Export without rownames using `write.csv`

write.csv(surveys_complete2, file = "data_output/surveys_complete.csv", row.names = FALSE)


# Visualize Data with `ggplot2`

# Need to read in our data and libraries first.

library(tidyverse)
surveys_complete <- read.csv("data_output/surveys_complete.csv")


# Now we have a clean dataset and we want to start looking at it. Let's make some visualizations and plot our data.

## Boxplot & Violin Plots 

# Create a boxplot/violinplot to visualize the distribution of the `weight` variable for each species.  
# Create a different box/violinplot to show the distribution of the `hindfoot_length` variable for each species.

# Let's look at `weight`

surveys_complete %>% 
ggplot() + geom_boxplot(aes(x=species_id, y=weight), fill="skyblue") +
theme_bw()

surveys_complete %>% 
ggplot() + geom_violin(aes(x=species_id, y=weight), fill="skyblue") +
theme_classic()


# Ok! Now let's look at `hindfoot_length`
surveys_complete %>% 
  ggplot() + geom_boxplot(aes(x=species_id, y=hindfoot_length), fill="salmon") +
  theme_bw()


surveys_complete %>% 
  ggplot() + geom_violin(aes(x=species, y=hindfoot_length), fill="salmon") +
  theme_bw()

# Boxplots are great for showing a lot of information about the spread of a variable, but some information from the raw data is lost. 

# Add a point geometry to your box/violinplot to visualze the raw `hindfoot_length data`

surveys_complete %>% 
  ggplot(aes(x=species_id, y=hindfoot_length)) +
  geom_boxplot() +
  geom_point(color="forestgreen") +
  theme_bw()


# Unfortunately, adding the raw data makes our figure “overplotted”. There are too many points overlapping and information is lost this way, too.

# - Make a new boxplot of `hindfoot_length` for each species, but this time plot a jittered version of the raw data as a geometry on top of your boxplot

surveys_complete %>% 
  ggplot(aes(x=species_id, y=hindfoot_length)) +
  geom_boxplot() +
  geom_jitter()


# Color by plot id:
  
surveys_complete %>% 
  ggplot(aes(x=species_id, y=hindfoot_length, color=plot_id)) +
  geom_jitter(alpha=0.5) +
  geom_boxplot()


# Whoa, plotted the `plot_id` as a numeric range instead of independent observations or locations. How do we fix?

surveys_complete %>% 
  ggplot(aes(x=species_id, y=hindfoot_length)) + # change to factor for plotting only
  geom_jitter(aes(color=as.factor(plot_id)),alpha=0.5) +
  geom_boxplot()

# adding some more elements
surveys_complete %>% 
  ggplot(aes(x=species_id, y=hindfoot_length)) + # change to factor for plotting only
  geom_jitter(aes(color=as.factor(plot_id)),alpha=0.5) +
  geom_boxplot() + 
  # add a better title and axis label
  xlab("Species ID") + ylab("Hindfoot Length (mm)")+
  ggtitle("A Wild Plot")


# How to change legend title?

surveys_complete %>% 
  ggplot(aes(x=species_id, y=hindfoot_length)) + # change to factor for plotting only
  geom_jitter(aes(color=as.factor(plot_id)),alpha=0.5) +
  geom_boxplot() + 
  # add a better title and axis label
  xlab("Species ID") + ylab("Hindfoot Length (mm)")+
  ggtitle("A Wild Plot") +
  scale_color_manual(name="Plot ID")


# Afternoon ---------------------------------------------------------------


# Custom functions --------------------------------------------------------

# Bare bones function call
hypotenuse <- function(a, b) {
  myHypotenuse <- sqrt(a^2 + b^2)
  return(myHypotenuse)
}

hypotenuse(4, 5)

# Difference between hypotenuse and myHypotenuse
# hypotenuse is the name of the function
# myHypotenuse is the name of an object that I've assigned within the
# function

hypotenuse <- function(a, b) {
  bookshelves <- sqrt(a^2 + b^2)
  return(bookshelves)
}

hypotenuse(4, 5)

# Objects defined within the function stay within the function!
bookshelves # doesn't exist, because I defined it within a function

# you can also pass an operation directly to return() without any need
# to define an intermediate variable
hypotenuse <- function(a, b) {
  return(sqrt(a^2 + b^2))
}

hypotenuse(43, 5)

# In R, you don't even need to use a return function
hypotenuse <- function(a, b) {
  sqrt(a^2 + b^2)
}

hypotenuse(43, 5)

# Naming your arguments
hypotenuse(a = 43, b = 5)
hypotenuse(b = 5, a = 43)

# Default argument values
# Remember the log() function?
?log

hypotenuse <- function(a, b = 12) {
  theHypotenuse <- sqrt(a^2 + b^2)
  return(theHypotenuse)
}

hypotenuse(a = 5)
hypotenuse(b = 4)

# What if you just put 5 instead of a = 5
hypotenuse(5)

# If you have a default value, what if you want to pass a different
# value for a particluar time that you call your function?
hypotenuse(a = 4, b = 30)


# Defensive programming ---------------------------------------------------

hypotenuse <- function(a, b) {

if(a <= 0) {
    stop("Triangle side must have a length greater than 0")
  }
  theHypotenuse <- sqrt(a^2 + b^2)
  return(theHypotenuse)
}

hypotenuse(a = 2, b = 12)
hypotenuse(a = -4, b = 12)

# local versus global variables with resepect to functions

hypotenuse <- function(a, b) {
  myHypotenuse <- sqrt(a^2 + b^2)
  return(myHypotenuse)
}


# Temperature conversion --------------------------------------------------
# Add to the code below to make a function that converts Fahrenheit
# to Kelvin
# Remember that temperature in K is equal to the F temperature
# minus 32 * 5/9 and the whole thing plus 273.15

F_to_X <- function(temp) {
  K <- ((temp - 32) * 5/9) + 273.15
  return(K)
}

# Freezing point of water in Kelvin
F_to_X(32)

myTemps <- c(0, 32, 98.6, 212)
F_to_X(myTemps)

# A function that converts K to C
# subtract 273.15 from K to get to C
K_to_C <- function(temp) {
  C <- temp - 273.5
  return(C)
}

K_to_C(100)

# We can nest functions and combine them at will

K_to_C(F_to_X(98.6))


# use the source() function to read other .R files in your 
# project and let you use the code that is within them.

source("scripts/temperature_conversions.R")


library(gapminder)
library(tidyverse)
glimpse(gapminder)

# Create a function that plots population growth through time
plotPopGrowth <- function(countryToPlot, theData = gapminder) {
  oneCountry <- theData %>%
    filter(country == countryToPlot)
  
  
}


# How do I go about making a function?
# Let's try with Canada

justCanada <-
  gapminder %>%
  filter(country == "Canada")

myPlot <-
  ggplot(justCanada, aes(x = year, y = pop)) +
  geom_line()

# Now try to generalize
plotPopGrowth <- function(countryToPlot, theData = gapminder) {
  oneCountry <- theData %>%
    filter(country == countryToPlot)%>%
    ggplot(aes(x = year, y = pop)) +
    geom_line()
  
  return(oneCountry) 
}

plotPopGrowth(countryToPlot = "Costa Rica")

blood <- read_csv("https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv", skip = 2)

library(tidyr)
head(blood)

blood_long <- gather(blood, key = condition, value = albumin, control, treatment1, treatment2)

ggplot(blood_long, aes(x = sex, y = albumin)) +
  geom_violin(aes(fill = sex))

load("data/continents.RDA")
continents
head(gapminder)

gm_withAttributes <- left_join(gapminder, continents)

head(gm_withAttributes)


plotPopGrowth <- function(countryToPlot, theData = gapminder) {
  oneCountry <- theData %>%
    filter(country == countryToPlot) %>%
    ggplot(aes(x = year, y = pop)) +
    geom_line()
  
  return(oneCountry) 
}

countries <- c("United States", "Canada")

plotPopGrowth(countries)


for(i in seq_along(unique(gapminder$country))) {

}