# Day 4 suppressPackageStartupMessages(library(tidyverse)) # keep messages from appearing in html output or on your screen # create a directory to put the data in # dir.create("data") # make sure we have a data dir to download file to # download a copy locally download.file(url="https://mikoontz.github.io/data-carpentry-week/data/combined.csv", destfile = "data/combined.csv") # this downloads and names your file # read in that data to your environment survey <- read.csv("data/combined.csv") ### Summarize Data # Now let's figure out what we have using `summary` and `glimpse` functions. summary(survey) glimpse(survey) # a way to view counts of categories in a variable table(survey$species_id) # this is a factor vector # test if table() works on character data types table(as.character(survey$species_id)) # works on character vectors # let's look at sex column for NA's and blanks table(survey$sex) # So it looks like many different species, but abundances (counts) are widely variable. Some have 1, others have as many as 2000. # Also appears to have many NA's in the `hindfoot_length` and `weight` columns. And we have "blanks" in the `sex` column. ## Filter Data ### Remove NAs and Blanks # Let's get rid of NA's from **`weight`** & **`hindfoot_length`** columns, and the mysterious blanks (`""`) from the sex column. Write that object to a new dataframe called "*surveys_complete*". surveys_complete <- survey %>% filter(!is.na(weight), # remove NAs in weight !is.na(hindfoot_length), # remove NAs in hindfoot_length sex!="", # could also use !sex=="" species_id!="") # remove missing species_id ## Extract the most common species_id species_counts <- surveys_complete %>% group_by(species_id) %>% # grouping here so the count/tally will be by this group tally %>% filter(n >= 50) # filter to common species # quick histogram of species obs hist(species_counts$n) ## Only keep the most common species surveys_complete2 <- surveys_complete %>% filter(species_id %in% species_counts$species_id) # learning %in% ----------------------------------------------------------- animals<- c("racoon", "rabbit", "cat", "dog", "frog","penguin") friendlyanimals <- c("frog", "dog") animals %in% friendlyanimals # finds match with friendlyanimals only !animals %in% friendlyanimals # finds only animals that don't match friendly animals friendlyanimals %in% animals friendlyanimals %in% animals # will work either way, # usually use % going from "big table" to "lookup/little table" ## Quick note about weird factor things. # notice there are levels with no observations table(surveys_complete2$species_id) # factors retain levels even when no observations exist # how to get rid of "empty factor levels" surveys_complete2$species_id <- factor(surveys_complete2$species_id) # that should have dropped unused factors from this column and re-assigned to the dataframe table(surveys_complete2$species_id) # **Note** when you save this to `.csv` the data will not retain all these unused factors, so you won't need to convert or "drop levels" if you are exporting and reading a csv back in. ## Export/Save Data as CSV # Export without rownames using `write.csv` write.csv(surveys_complete2, file = "data_output/surveys_complete.csv", row.names = FALSE) # Visualize Data with `ggplot2` # Need to read in our data and libraries first. library(tidyverse) surveys_complete <- read.csv("data_output/surveys_complete.csv") # Now we have a clean dataset and we want to start looking at it. Let's make some visualizations and plot our data. ## Boxplot & Violin Plots # Create a boxplot/violinplot to visualize the distribution of the `weight` variable for each species. # Create a different box/violinplot to show the distribution of the `hindfoot_length` variable for each species. # Let's look at `weight` surveys_complete %>% ggplot() + geom_boxplot(aes(x=species_id, y=weight), fill="skyblue") + theme_bw() surveys_complete %>% ggplot() + geom_violin(aes(x=species_id, y=weight), fill="skyblue") + theme_classic() # Ok! Now let's look at `hindfoot_length` surveys_complete %>% ggplot() + geom_boxplot(aes(x=species_id, y=hindfoot_length), fill="salmon") + theme_bw() surveys_complete %>% ggplot() + geom_violin(aes(x=species, y=hindfoot_length), fill="salmon") + theme_bw() # Boxplots are great for showing a lot of information about the spread of a variable, but some information from the raw data is lost. # Add a point geometry to your box/violinplot to visualze the raw `hindfoot_length data` surveys_complete %>% ggplot(aes(x=species_id, y=hindfoot_length)) + geom_boxplot() + geom_point(color="forestgreen") + theme_bw() # Unfortunately, adding the raw data makes our figure “overplotted”. There are too many points overlapping and information is lost this way, too. # - Make a new boxplot of `hindfoot_length` for each species, but this time plot a jittered version of the raw data as a geometry on top of your boxplot surveys_complete %>% ggplot(aes(x=species_id, y=hindfoot_length)) + geom_boxplot() + geom_jitter() # Color by plot id: surveys_complete %>% ggplot(aes(x=species_id, y=hindfoot_length, color=plot_id)) + geom_jitter(alpha=0.5) + geom_boxplot() # Whoa, plotted the `plot_id` as a numeric range instead of independent observations or locations. How do we fix? surveys_complete %>% ggplot(aes(x=species_id, y=hindfoot_length)) + # change to factor for plotting only geom_jitter(aes(color=as.factor(plot_id)),alpha=0.5) + geom_boxplot() # adding some more elements surveys_complete %>% ggplot(aes(x=species_id, y=hindfoot_length)) + # change to factor for plotting only geom_jitter(aes(color=as.factor(plot_id)),alpha=0.5) + geom_boxplot() + # add a better title and axis label xlab("Species ID") + ylab("Hindfoot Length (mm)")+ ggtitle("A Wild Plot") # How to change legend title? surveys_complete %>% ggplot(aes(x=species_id, y=hindfoot_length)) + # change to factor for plotting only geom_jitter(aes(color=as.factor(plot_id)),alpha=0.5) + geom_boxplot() + # add a better title and axis label xlab("Species ID") + ylab("Hindfoot Length (mm)")+ ggtitle("A Wild Plot") + scale_color_manual(name="Plot ID") # Afternoon --------------------------------------------------------------- # Custom functions -------------------------------------------------------- # Bare bones function call hypotenuse <- function(a, b) { myHypotenuse <- sqrt(a^2 + b^2) return(myHypotenuse) } hypotenuse(4, 5) # Difference between hypotenuse and myHypotenuse # hypotenuse is the name of the function # myHypotenuse is the name of an object that I've assigned within the # function hypotenuse <- function(a, b) { bookshelves <- sqrt(a^2 + b^2) return(bookshelves) } hypotenuse(4, 5) # Objects defined within the function stay within the function! bookshelves # doesn't exist, because I defined it within a function # you can also pass an operation directly to return() without any need # to define an intermediate variable hypotenuse <- function(a, b) { return(sqrt(a^2 + b^2)) } hypotenuse(43, 5) # In R, you don't even need to use a return function hypotenuse <- function(a, b) { sqrt(a^2 + b^2) } hypotenuse(43, 5) # Naming your arguments hypotenuse(a = 43, b = 5) hypotenuse(b = 5, a = 43) # Default argument values # Remember the log() function? ?log hypotenuse <- function(a, b = 12) { theHypotenuse <- sqrt(a^2 + b^2) return(theHypotenuse) } hypotenuse(a = 5) hypotenuse(b = 4) # What if you just put 5 instead of a = 5 hypotenuse(5) # If you have a default value, what if you want to pass a different # value for a particluar time that you call your function? hypotenuse(a = 4, b = 30) # Defensive programming --------------------------------------------------- hypotenuse <- function(a, b) { if(a <= 0) { stop("Triangle side must have a length greater than 0") } theHypotenuse <- sqrt(a^2 + b^2) return(theHypotenuse) } hypotenuse(a = 2, b = 12) hypotenuse(a = -4, b = 12) # local versus global variables with resepect to functions hypotenuse <- function(a, b) { myHypotenuse <- sqrt(a^2 + b^2) return(myHypotenuse) } # Temperature conversion -------------------------------------------------- # Add to the code below to make a function that converts Fahrenheit # to Kelvin # Remember that temperature in K is equal to the F temperature # minus 32 * 5/9 and the whole thing plus 273.15 F_to_X <- function(temp) { K <- ((temp - 32) * 5/9) + 273.15 return(K) } # Freezing point of water in Kelvin F_to_X(32) myTemps <- c(0, 32, 98.6, 212) F_to_X(myTemps) # A function that converts K to C # subtract 273.15 from K to get to C K_to_C <- function(temp) { C <- temp - 273.5 return(C) } K_to_C(100) # We can nest functions and combine them at will K_to_C(F_to_X(98.6)) # use the source() function to read other .R files in your # project and let you use the code that is within them. source("scripts/temperature_conversions.R") library(gapminder) library(tidyverse) glimpse(gapminder) # Create a function that plots population growth through time plotPopGrowth <- function(countryToPlot, theData = gapminder) { oneCountry <- theData %>% filter(country == countryToPlot) } # How do I go about making a function? # Let's try with Canada justCanada <- gapminder %>% filter(country == "Canada") myPlot <- ggplot(justCanada, aes(x = year, y = pop)) + geom_line() # Now try to generalize plotPopGrowth <- function(countryToPlot, theData = gapminder) { oneCountry <- theData %>% filter(country == countryToPlot)%>% ggplot(aes(x = year, y = pop)) + geom_line() return(oneCountry) } plotPopGrowth(countryToPlot = "Costa Rica") blood <- read_csv("https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv", skip = 2) library(tidyr) head(blood) blood_long <- gather(blood, key = condition, value = albumin, control, treatment1, treatment2) ggplot(blood_long, aes(x = sex, y = albumin)) + geom_violin(aes(fill = sex)) load("data/continents.RDA") continents head(gapminder) gm_withAttributes <- left_join(gapminder, continents) head(gm_withAttributes) plotPopGrowth <- function(countryToPlot, theData = gapminder) { oneCountry <- theData %>% filter(country == countryToPlot) %>% ggplot(aes(x = year, y = pop)) + geom_line() return(oneCountry) } countries <- c("United States", "Canada") plotPopGrowth(countries) for(i in seq_along(unique(gapminder$country))) { }