# Day 5

library(tidyverse)

# Load the wide-format example data; the first two lines of the file are
# skipped before parsing starts.
blood <- read_csv(
  file = "https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv",
  skip = 2
)

# Peek at the first few rows
blood %>% head()

# These data are *not* in tidy format because a single variable is
# spread out across 3 columns instead of being stacked into a single
# column

# Ultimate goal is tidy data, which in this case will look like
# a 36 row data frame with one column per variable
# (subject, sex, condition, albumin).

# The way to make this work is to understand key-value pairs

# The **key** describes the columns-- what is one word that we
# can use that will be the umbrella term for the columns that
# we want to gather. Umbrella == the variable.

# The **value** describes the cells within the columns that I 
# want to gather.

# use the gather function. Pass the data as the first argument, then
# the key, then the value, then the columns that you want to gather
# Importantly, we get the key and the value by **thinking about the
# data** and not necessarily by looking at our particular data frame
# Gather the three measurement columns by naming each one explicitly
tidy_blood <-
  blood %>%
  gather(key = condition, value = albumin, control, treatment1, treatment2)


# Other ways to pick the columns that you want to gather
# A leading - (minus sign) says **don't** gather that column
blood %>%
  gather(key = condition, value = albumin, -subject, -sex)

# Columns can also be picked by their position (index) in the data frame
blood %>%
  gather(key = condition, value = albumin, 3:5)


# workflow using spread ---------------------------------------------------

# Read in our pcr data
# Read the qPCR export, skipping the first seven lines of the file.
pcr <- read_csv(
  "https://dl.dropboxusercontent.com/s/xz6qid3w5ds3tbr/qPCR_amplification_fluorescence_by_cycle.csv?dl=0",
  skip = 7
)

head(pcr)

# Preview the rename without assigning it, so the original column names are
# still present for the namespaced version below.
# NOTE(review): '?Rn' may be a mis-encoded column name -- confirm against the
# CSV header.
pcr %>%
  rename(Marker = 'Target Name', xRn = '?Rn')

# sometimes there are conflicts with function names between packages.
# if two packages have functions with the same name, the most recently
# loaded package will take precedence. Specify from which package
# you want to use a function using packageName::
# e.g., dplyr::select()

# Assign the rename exactly once: renaming a second time would fail because
# 'Target Name' and '?Rn' no longer exist after the first assignment.
pcr <-
  pcr %>%
  dplyr::rename(Marker = 'Target Name', xRn = '?Rn')

# lets reshape our data to be in wide format so it will work with
# a different kind of software that requires that format
# Be sure to select only the columns relevant for spreading
# Drop Rn first, then spread xRn so that each Cycle value becomes a column
pcr_wide <- spread(select(pcr, -Rn), key = Cycle, value = xRn)

# write_csv(pcr_wide, "data/pcr_wide.csv")
pcr_wide %>% glimpse()
pcr_wide %>% head()

# For comparison: the same spread with the extra Rn column left in place.
# Look at the result to see why that column had to be removed first.
pcr_wide2 <- spread(pcr, key = Cycle, value = xRn)

pcr_wide2 %>% glimpse()

# The tidyverse (including dplyr and ggplot) expects tidy data, so the long
# version is the one to plot
pcr %>% glimpse()

ggplot(pcr, aes(x = Cycle, y = xRn, color = Marker)) +
  geom_point() +
  geom_smooth()


# gather with multiple data types -----------------------------------------
# This is Mike spitballing here. Don't take any of this to heart, since it's
# just scrap paper
# Ten individuals, eight columns of standard-normal draws (10 values each)
multiType <- data.frame(individual = 1:10, matrix(rnorm(80), nrow = 10))

# Name the value columns <locus>.<replicate>: two replicates per locus
names(multiType) <- c(
  "individual",
  paste0(rep(c("ae12", "oaf17", "maf50", "oar29"), each = 2), ".", 1:2)
)
multiType %>% head()

# Stack every column except `individual` into locus/expression pairs
multiType %>%
  gather(key = locus, value = expression, -individual)


# Read in our pcr data
# Start again from a fresh copy of the pcr data
pcr <- read_csv(
  "https://dl.dropboxusercontent.com/s/xz6qid3w5ds3tbr/qPCR_amplification_fluorescence_by_cycle.csv?dl=0",
  skip = 7
)

pcr %>%
  spread(key = Cycle, value = Rn)

pcr <- dplyr::rename(pcr, Marker = 'Target Name', xRn = '?Rn')

# Wide table of xRn: one column per cycle
xRn_wide <- spread(select(pcr, -Rn), key = Cycle, value = xRn)

# Prefix columns 3 to 47 so they stay distinguishable after the join below
names(xRn_wide)[3:47] <- paste("xRn", names(xRn_wide)[3:47], sep = ".")
xRn_wide %>% glimpse()

# Same again for Rn
Rn_wide <- spread(select(pcr, -xRn), key = Cycle, value = Rn)

names(Rn_wide)[3:47] <- paste("Rn", names(Rn_wide)[3:47], sep = ".")

xRn_wide %>% dim()
Rn_wide %>% dim()

# No `by` given, so the join uses every column name the two tables share
all_wide <- xRn_wide %>% full_join(Rn_wide)
all_wide %>% dim()
all_wide %>% glimpse()


# CITING R ----------------------------------------------------------------

# citing a specific package
# how to cite one particular package
citation("tidyverse")

# how to cite R itself
citation()


# plotly -------------------------------------------------------------------

# Install plotly only when it is missing, so re-running the script does not
# reinstall the package every time
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
library(plotly)


library(tidyverse)
library(gapminder)


# A ggplot: population by year, one colour per continent
plot1 <- ggplot(data = gapminder) +
  geom_point(aes(x = year, y = pop, color = continent))

# make it plotly: convert the ggplot object into a plotly figure
ggplotly(plot1)


# JOINS in R --------------------------------------------------------------
library(tidyverse)
library(gapminder)

# let's create some dataframes:

# Fix the random seed so the sample() draws below come out the same on any
# computer
set.seed(12345)


# x has keys A, B, C, E
x <- data.frame(
  key = c("A", "B", "C", "E"),
  value1 = sample.int(10, 4),
  stringsAsFactors = FALSE
)

# y has keys A, B, C, D
y <- data.frame(
  key = LETTERS[1:4],
  value2 = sample.int(10, 4),
  stringsAsFactors = FALSE
)

# Look at both tables
x
y

# Now let's practice some joins:

# INNER JOIN: keep only the rows whose key appears in both tables
inner_join(x, y) # run without specifying a join column using "by="

inner_join(x, y, by = "key") # no message from dplyr, because explicit in join column

# right_join: keep every row of y (the right table) and add the columns of x
# wherever the key matches

right_join(x, y, by = "key")

# left join: keep every row of x (the left table) and add the columns of y
# wherever the key matches

left_join(x, y, by = "key")


# what if join column doesn't match in name?

# Same keys as before, but the key column of x2 is called `letters`
x2 <- data.frame(
  letters = c("A", "B", "C", "E"),
  value1 = sample.int(10, 4),
  stringsAsFactors = FALSE
)

y2 <- data.frame(
  key = LETTERS[1:4],
  value2 = sample.int(10, 4),
  stringsAsFactors = FALSE
)

# x2 and y2 share no column name, so this call fails on purpose. try() still
# shows the error but lets the rest of the script keep running when sourced.
try(inner_join(x2, y2)) # error message is clear, need a by=

inner_join(x2, y2, by = c("letters" = "key")) # this works

# this won't work because it looks for a "key" column in x2: the name on the
# left of `=` must be a column of the first table, so order within `by` matters

try(inner_join(x2, y2, by = c("key" = "letters")))

# full join

full_join(x2, y2, by = c("letters" = "key"))


# appending or binding dataframes is slightly different, but easy to do:
# if two tables with exact same format exist (same column names):

# bind_rows(example1, example2) # for appending rows (need same cols)
# bind_cols(example1, example2) # append cols


# SEMI JOINS

x %>% semi_join(y, by = "key")

# ANTI_JOIN

# find everything that doesn't match between two tables

x %>% anti_join(y, by = "key") # only records of x that don't exist in y

# records whose key exists in only one of the two tables

x %>%
  anti_join(y, by = "key") %>%
  full_join(anti_join(y, x, by = "key"), by = "key") %>%
  arrange(key)

# SET OPERATIONS

# Two small tibbles that agree on their first row and differ on the second.
# tibble() replaces the deprecated data_frame().
df1 <- tibble(
  x = LETTERS[1:2],
  y = c(1L, 1L)
)
df2 <- tibble(
  x = LETTERS[1:2],
  y = 1:2
)


# which rows are common in both datasets?
intersect(df1, df2)

# which unique rows appear in either dataset?
union(df1, df2)

# which rows are in df2 but not in df1?
setdiff(df2, df1)


# LOOPS -------------------------------------------------------------------


# log() is vectorised: it works on the whole vector at once, no loop needed
x <- c(1, 4, 98, 2)
log(x)

# basic for loop: print the numbers 1 to 10
for (i in seq_len(10)) {
  print(i)
}

# letters loop

letters

# looping over the positions of `letters` prints each position...
for (i in seq_along(letters)) {
  print(i)
}

# ...so to print the letter itself, use the position to index the vector

for (i in seq_along(letters)) { # seq_along gives one index per element
  print(letters[[i]]) # brackets pull out the element at that position
}


# a plain number range works as the index too (first ten letters only)
for (i in seq_len(10)) {
  print(letters[[i]]) # brackets pull out the element at that position
}


# LOOPING WITH GGPLOT -----------------------------------------------------

# libraries
library(tidyverse)
library(gapminder)

# function to print a specific country:
# Line plot of population over time for one country.
#   countryToPlot: value compared against the `country` column
#   theData:       data frame with `country`, `year` and `pop` columns
#                  (defaults to gapminder)
# Returns the ggplot object; stops with an error when no rows match.
plotPopGrowth <- function(countryToPlot, theData = gapminder) {
  # keep only the rows for the requested country
  country_rows <- filter(theData, country == countryToPlot)

  # guard clause: nothing to plot when the country is not in the data
  if (nrow(country_rows) == 0) {
    stop("No Data for this Country, please try again")
  }

  # build the plot
  ggplot(country_rows) +
    geom_line(aes(x = year, y = pop, color = country))
}

# Misspelled country name: fails on purpose. try() shows the error message
# without halting the rest of the script when it is sourced.
try(plotPopGrowth("Afghanistans")) # gives error
plotPopGrowth("Afghanistan")

# try saving each country to a file with a for loop:

# need a list of unique countries
countries <- unique(gapminder$country)
countries


# now we can write the for loop:

# make sure the output folder exists before saving into it
dir.create("figures", showWarnings = FALSE)

for (i in seq_along(countries)) {
  country_plot <- plotPopGrowth(countries[i])
  # paste0() joins the pieces with no separator, giving file names such as
  # figures/plot_Afghanistan.png. Passing `plot` explicitly avoids relying on
  # ggsave()'s "last plot" default.
  ggsave(
    filename = paste0("figures/plot_", countries[i], ".png"),
    plot = country_plot
  )
  print(paste0("Plot saved for ", countries[i]))
}


# LOOP THROUGH CSVs and READ IN -------------------------------------------


# use csv's from this example: https://github.com/gge-ucd/wRangling_Seminar/blob/master/documents/ice_C02_web_data.Rmd

# download zip from here:
# make sure the download folder exists, otherwise download.file() fails
dir.create("data", showWarnings = FALSE)

# mode = "wb" writes the zip archive in binary mode
download.file(
  url = "https://github.com/gge-ucd/wRangling_Seminar/raw/master/data/2001_mauna_loa_met_data.zip",
  destfile = "data/2001_mauna_loa_met.zip",
  mode = "wb"
)

# unzip files: leaving the trailing / off exdir works on both Mac and Windows
unzip(zipfile = "data/2001_mauna_loa_met.zip", exdir = "data_output")


# Read multiple csv's into list and dataframes
# `pattern` is a regular expression, not a glob: "\\.txt$" matches only names
# that end in ".txt"
filenames <- list.files(path = "data_output", pattern = "\\.txt$")

filenames

# list of the column names (from Mauna Loa website)
metColnames <- c(
  "siteID", "year", "month", "day", "hour24", "min", "windDir",
  "windSpeed_m_s", "windSteady", "baro_hPa", "temp_C_2m", "temp_C_10m",
  "temp_C_towertop", "rel_humid", "precip_intens_mm_hr"
)

# base R
# Time how long it takes to read every file with base R. assign() stores each
# table in an object named after its file.
system.time(
  for (i in filenames) {
    filepath <- file.path("data_output", i)
    # FALSE rather than F: F is an ordinary variable that can be reassigned
    assign(i, read.table(filepath, header = FALSE, col.names = metColnames))
  }
)

# the file.path command makes a file path:
file.path("data_output", filenames[1])


# tidyverse
# Same loop with readr. It overwrites the objects created above; col_types
# gives one type letter per column in metColnames.
system.time(
  for (i in filenames) {
    filepath <- file.path("data_output", i)
    assign(i, read_table(filepath,
      col_names = metColnames,
      col_types = "ciiiiiininnnnii"
    ))
  }
)

# can bind all together with bind_rows:
# bind together (works either way)
# Stack the twelve monthly tables by naming each one explicitly
df <- bind_rows(met_mlo_insitu_1_obop_minute_2001_01.txt,
                met_mlo_insitu_1_obop_minute_2001_02.txt,
                met_mlo_insitu_1_obop_minute_2001_03.txt,
                met_mlo_insitu_1_obop_minute_2001_04.txt,
                met_mlo_insitu_1_obop_minute_2001_05.txt,
                met_mlo_insitu_1_obop_minute_2001_06.txt,
                met_mlo_insitu_1_obop_minute_2001_07.txt,
                met_mlo_insitu_1_obop_minute_2001_08.txt,
                met_mlo_insitu_1_obop_minute_2001_09.txt,
                met_mlo_insitu_1_obop_minute_2001_10.txt,
                met_mlo_insitu_1_obop_minute_2001_11.txt,
                met_mlo_insitu_1_obop_minute_2001_12.txt)


# lists objects in the environment whose names end in .txt
ls(pattern = "\\.txt$")

# mget() takes a character vector of object names and returns the matching
# objects as a named list (see ?mget). It needs that vector as an argument;
# calling it with no arguments is an error.

# put it together with bind_rows:
df <- bind_rows(mget(ls(pattern = "\\.txt$")))