# Day 5

library(tidyverse)

blood <- read_csv(
  "https://mikoontz.github.io/data-carpentry-week/data/wide_eg.csv",
  skip = 2
)
head(blood)

# These data are *not* in tidy format because a single variable is
# spread out across 3 columns instead of being stacked into a single
# column

# Ultimate goal is tidy data, which in this case will look like
# a 36 row data frame with 3 variables.

# The way to make this work is to understand key-value pairs
# The **key** describes the columns-- what is one word that we
# can use that will be the umbrella term for the columns that
# we want to gather. Umbrella == the variable.

# The **value** describes the cells within the columns that I
# want to gather.

# use the gather function. Pass the data as the first argument, then
# the key, then the value, then the columns that you want to gather

# Importantly, we get the key and the value by **thinking about the
# data** and not necessarily by looking at our particular data frame
tidy_blood <- gather(
  blood,
  key = condition,
  value = albumin,
  control, treatment1, treatment2
)

# ways to get the columns that you want to gather
# use - (minus sign) to say **don't** gather these columns
gather(blood, key = condition, value = albumin, -subject, -sex)

# Use the index values to gather a column
gather(blood, key = condition, value = albumin, 3:5)

# workflow using spread ---------------------------------------------------

# Read in our pcr data
pcr <- read_csv(
  "https://dl.dropboxusercontent.com/s/xz6qid3w5ds3tbr/qPCR_amplification_fluorescence_by_cycle.csv?dl=0",
  skip = 7
)
head(pcr)

# The un-namespaced call below is left commented out: it can pick up a
# same-named function from another package, and running it *and* the
# namespaced version would fail the second time because the columns have
# already been renamed.
# pcr <- pcr %>% rename(Marker = 'Target Name', xRn = '?Rn')

# sometimes there are conflicts with function names between packages.
# if two packages have functions with the same name, the most recently
# loaded package will take precedence. Specify from which package
# you want to use a function using packageName::
# e.g., dplyr::select()
pcr <- pcr %>%
  dplyr::rename(Marker = 'Target Name', xRn = '?Rn')

# lets reshape our data to be in wide format so it will work with
# a different kind of software that requires that format
# Be sure to select only the columns relevant for spreading
pcr_wide <- pcr %>%
  select(-Rn) %>%
  spread(key = Cycle, value = xRn)

# write_csv(pcr_wide, "data/pcr_wide.csv")

glimpse(pcr_wide)
head(pcr_wide)

# Why did we need to remove the column that we don't want to
# spread
pcr_wide2 <- pcr %>%
  spread(key = Cycle, value = xRn)
glimpse(pcr_wide2)

# But consider that tidyverse (including dplyr and ggplot)
# expect tidy data
glimpse(pcr)
pcr %>%
  ggplot(aes(x = Cycle, y = xRn, color = Marker)) +
  geom_point() +
  geom_smooth()

# gather with multiple data types -----------------------------------------
# This is Mike spitballing here. Don't take any of this to heart, since it's
# just scrap paper

multiType <- data.frame(
  individual = 1:10,
  replicate(n = 8, expr = rnorm(10))
)
colnames(multiType) <- c(
  "individual",
  "ae12.1", "ae12.2",
  "oaf17.1", "oaf17.2",
  "maf50.1", "maf50.2",
  "oar29.1", "oar29.2"
)
head(multiType)

gather(multiType, key = locus, value = expression, -individual)

# Read in our pcr data
pcr <- read_csv(
  "https://dl.dropboxusercontent.com/s/xz6qid3w5ds3tbr/qPCR_amplification_fluorescence_by_cycle.csv?dl=0",
  skip = 7
)

spread(pcr, key = Cycle, value = Rn)

pcr <- pcr %>%
  dplyr::rename(Marker = 'Target Name', xRn = '?Rn')

xRn_wide <- pcr %>%
  select(-Rn) %>%
  spread(key = Cycle, value = xRn)

# prefix the spread-out cycle columns so they stay distinguishable after
# the two wide tables are joined
colnames(xRn_wide)[3:47] <- paste0("xRn.", colnames(xRn_wide)[3:47])
glimpse(xRn_wide)

Rn_wide <- pcr %>%
  select(-xRn) %>%
  spread(key = Cycle, value = Rn)
colnames(Rn_wide)[3:47] <- paste0("Rn.", colnames(Rn_wide)[3:47])

dim(xRn_wide)
dim(Rn_wide)

# no `by =` given, so dplyr joins on every column name the two tables share
all_wide <- full_join(xRn_wide, Rn_wide)
dim(all_wide)
glimpse(all_wide)

# CITING R ----------------------------------------------------------------

# citing a specific package
citation(package = "tidyverse")

# citing R program
citation()

# plotly -------------------------------------------------------------------

# install plotly only if it is missing, so re-running the script does not
# re-install the package every time
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
library(plotly)
library(tidyverse)
library(gapminder)

# A ggplot
plot1 <- ggplot(data = gapminder) +
  geom_point(aes(x = year, y = pop, color = continent))

# make it plotly
ggplotly(plot1)

# JOINS in R --------------------------------------------------------------

library(tidyverse)
library(gapminder)

# let's create some dataframes:
# set.seed() makes it possible to recreate or sample the same numbers across
# any computer...the sample() function is a random sample generator.
set.seed(12345)

x <- data.frame(
  key = LETTERS[c(1:3, 5)],
  value1 = sample(1:10, 4),
  stringsAsFactors = FALSE
)
y <- data.frame(
  key = LETTERS[c(1:4)],
  value2 = sample(1:10, 4),
  stringsAsFactors = FALSE
)

# View the data
x
y

# Now let's practice some joins:

# INNER JOIN: find only matching columns and rows from each table
inner_join(x, y) # run without specifying a join column using "by="
inner_join(x, y, by = "key") # no message from dplyr, because explicit in join column

# right_join (keep every row of the right table, y, and add the values
# from x wherever the key matches)
right_join(x, y, by = "key")

# left join
left_join(x, y, by = "key")

# what if join column doesn't match in name?
x2 <- data.frame(
  letters = LETTERS[c(1:3, 5)],
  value1 = sample(1:10, 4),
  stringsAsFactors = FALSE
)
y2 <- data.frame(
  key = LETTERS[c(1:4)],
  value2 = sample(1:10, 4),
  stringsAsFactors = FALSE
)

# This call is *expected* to fail: x2 and y2 share no column name, so a
# by= is needed. try() still prints the error but lets the rest of the
# script keep running when it is sourced.
try(inner_join(x2, y2)) # error message is clear, need a by=

inner_join(x2, y2, by = c("letters" = "key")) # this works

# this won't work because trying to find "key" column in x2...order of columns in by matters
# (also wrapped in try() because the failure is the point of the example)
try(inner_join(x2, y2, by = c("key" = "letters")))

# full join
full_join(x2, y2, by = c("letters" = "key"))

# appending or binding dataframes is slightly different, but easy to do:
# if two tables with exact same format exist (same column names):

# bind_rows(example1, example2) # for appending rows (need same cols)
# bind_cols(example1, example2) # append cols

# SEMI JOINS
semi_join(x, y, by = "key")

# ANTI_JOIN
# find everything that doesn't match between two tables
anti_join(x, y, by = "key") # only records that don't exist in Y

# records that exist in only ONE of the two tables (in x but not y, or in
# y but not x)
full_join(
  anti_join(x, y, by = "key"),
  anti_join(y, x, by = "key"),
  by = "key"
) %>%
  arrange(key)

# SET OPERATIONS
# tibble() replaces the deprecated data_frame()
df1 <- tibble(x = LETTERS[1:2], y = c(1L, 1L))
df2 <- tibble(x = LETTERS[1:2], y = 1:2)

# which rows are common in both datasets?
intersect(df1, df2)

# we want all unique rows between both datasets?
union(df1, df2)

# what's unique or different to one dataframe vs. the other?
setdiff(df2, df1)

# LOOPS -------------------------------------------------------------------

x <- c(1, 4, 98, 2)
log(x)

# basic for loop:
for (i in 1:10) {
  print(i)
}

# letters loop
letters

# can do this simply with i:
for (i in seq_along(letters)) {
  print(i)
}

# only prints position in vector, how do we print letter?
# seq_along is handy for subsetting or indexing a position in a vector
for (i in seq_along(letters)) {
  print(letters[i]) # use brackets to access the position in vector
}

# alternatively, just use a number index to access position in vector
for (i in 1:10) {
  print(letters[i]) # use brackets to access the position in vector
}

# LOOPING WITH GGPLOT -----------------------------------------------------

# libraries
library(tidyverse)
library(gapminder)

#' Plot population over time for a single country
#'
#' @param countryToPlot Value matched against the `country` column of
#'   `theData`.
#' @param theData Data frame with `country`, `year` and `pop` columns.
#'   Defaults to `gapminder`.
#' @return A ggplot object: a line of `pop` against `year`, coloured by
#'   `country`. Stops with an error when no rows match `countryToPlot`.
plotPopGrowth <- function(countryToPlot, theData = gapminder) {
  # this filters data to a specific country
  oneCountry <- theData %>%
    filter(country == countryToPlot)

  # error checking: bail out when the filter left nothing to plot
  if (nrow(oneCountry) == 0) {
    stop("No Data for this Country, please try again")
  }

  # this builds (and returns) the plot
  ggplot(data = oneCountry) +
    geom_line(aes(x = year, y = pop, color = country))
}

# The misspelled name is *expected* to fail; try() prints the error but
# lets the rest of the script keep running when it is sourced.
try(plotPopGrowth("Afghanistans")) # gives error
plotPopGrowth("Afghanistan")

# try saving each country to a file with a for loop:

# need a list of unique countries
countries <- unique(gapminder$country)
countries

# make sure the output folder exists before writing into it
dir.create("figures", showWarnings = FALSE)

# now we can write the for loop:
for (i in seq_along(countries)) {
  countryPlot <- plotPopGrowth(countries[i])
  # paste0() joins the pieces with no separator; the previous sep = ","
  # produced names like "figures/plot_,Afghanistan,.png".
  # Passing plot = explicitly saves exactly the plot built above.
  ggsave(
    filename = paste0("figures/plot_", countries[i], ".png"),
    plot = countryPlot
  )
  print(paste0("Plot saved for ", countries[i]))
}

# LOOP THROUGH CSVs and READ IN -------------------------------------------

# use csv's from this example: https://github.com/gge-ucd/wRangling_Seminar/blob/master/documents/ice_C02_web_data.Rmd

# make sure the download folder exists
dir.create("data", showWarnings = FALSE)

# download zip from here:
download.file(
  url = "https://github.com/gge-ucd/wRangling_Seminar/raw/master/data/2001_mauna_loa_met_data.zip",
  destfile = "data/2001_mauna_loa_met.zip"
)

# unzip files (for MAC)
unzip(zipfile = "data/2001_mauna_loa_met.zip", exdir = "data_output/")

# unzip files for WINDOWS (get rid of / at end of exdir)
unzip(zipfile = "data/2001_mauna_loa_met.zip", exdir = "data_output")

# Read multiple csv's into list and dataframes
# `pattern` is a regular expression (not a shell glob): match names that
# end in ".txt"
filenames <- list.files(path = "data_output", pattern = "\\.txt$")
filenames

# list of the column names (from Mauna Loa website)
metColnames <- c(
  "siteID", "year", "month", "day", "hour24", "min",
  "windDir", "windSpeed_m_s", "windSteady", "baro_hPa",
  "temp_C_2m", "temp_C_10m", "temp_C_towertop",
  "rel_humid", "precip_intens_mm_hr"
)

# base R
# each file is read into an object named after the file itself
system.time(
  for (i in filenames) {
    filepath <- file.path("data_output", i)
    assign(i, read.table(filepath, header = FALSE, col.names = metColnames))
  }
)

# the file.path command makes a file path:
file.path("data_output", filenames[1])

# tidyverse
# col_types gives one letter per column in metColnames
# (c = character, i = integer, n = number)
system.time(
  for (i in filenames) {
    filepath <- file.path("data_output", i)
    assign(
      i,
      read_table(
        filepath,
        col_names = metColnames,
        col_types = c("ciiiiiininnnnii")
      )
    )
  }
)

# can bind all together with bind_rows:
# bind together (works either way)
df <- bind_rows(
  met_mlo_insitu_1_obop_minute_2001_01.txt,
  met_mlo_insitu_1_obop_minute_2001_02.txt,
  met_mlo_insitu_1_obop_minute_2001_03.txt,
  met_mlo_insitu_1_obop_minute_2001_04.txt,
  met_mlo_insitu_1_obop_minute_2001_05.txt,
  met_mlo_insitu_1_obop_minute_2001_06.txt,
  met_mlo_insitu_1_obop_minute_2001_07.txt,
  met_mlo_insitu_1_obop_minute_2001_08.txt,
  met_mlo_insitu_1_obop_minute_2001_09.txt,
  met_mlo_insitu_1_obop_minute_2001_10.txt,
  met_mlo_insitu_1_obop_minute_2001_11.txt,
  met_mlo_insitu_1_obop_minute_2001_12.txt
)

# lists objects in environment whose names end in .txt
ls(pattern = "\\.txt$")

# mget() takes a character vector of object names and returns the objects
# themselves as a named list. (It needs that vector as an argument; a bare
# mget() call is an error.)

# put it together with bind_rows:
df <- bind_rows(mget(ls(pattern = "\\.txt$")))