####### Statistics Programming with R, Session 2 #######
#
# Interactive walkthrough script: statements are meant to be run one at a
# time so that printed results and plots can be inspected.

# Part 1: Statistical Testing and Prediction ----

data(mtcars)
View(mtcars)
?mtcars

# Install ggplot2 only when it is missing, instead of on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Scatter plot of fuel economy against weight.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point()

# Correlation between weight and mpg, then the corresponding test.
cor(mtcars$wt, mtcars$mpg)
cor.test(mtcars$wt, mtcars$mpg)

# Keep the test result so its components can be inspected.
ct <- cor.test(mtcars$wt, mtcars$mpg)
ct
ct$p.value
str(ct)

# Simple linear regression of mpg on weight.
fit <- lm(mpg ~ wt, data = mtcars)
summary(fit)

# Fitted values for the training data, then a prediction for a new car.
predict(fit)
newcar <- data.frame(wt = 4.5)
predict(fit, newdata = newcar)

# Same scatter plot with the fitted regression line overlaid.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method = "lm")

# Multiple regression: wt and am as separate predictors.
mfit <- lm(mpg ~ wt + am, data = mtcars)
summary(mfit)

# I() makes `+` arithmetic: a single predictor equal to wt + am.
mfit <- lm(mpg ~ I(wt + am), data = mtcars)

ggplot(mtcars, aes(x = wt, y = mpg, color = am)) +
  geom_point()
ggplot(mtcars, aes(x = factor(am), y = mpg)) +
  geom_boxplot()

mfit <- lm(mpg ~ wt + disp, data = mtcars)
summary(mfit)

mfit <- lm(mpg ~ wt + qsec, data = mtcars)
summary(mfit)

ggplot(mtcars, aes(x = wt, y = mpg, color = qsec)) +
  geom_point()
ggplot(mtcars, aes(x = wt, y = mpg, color = qsec)) +
  geom_point(size = 5) +
  scale_color_continuous(low = "blue", high = "yellow")

# Two equivalent ways to use squared weight as a predictor: inline with I(),
# or through a precomputed column.
mfit <- lm(mpg ~ I(wt^2) + disp, data = mtcars)
mtcars$weight_squared <- mtcars$wt^2
mfit <- lm(mpg ~ weight_squared + disp, data = mtcars)

?nls

# Part 2: Exploratory Data Analysis ----

if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
browseVignettes()

# Loads the objects stored in the remote .RData file into the workspace;
# the rest of the script uses the object named `x`.
load(url("http://dgrtwo.github.io/files/undata-213.RData"))
dim(x)
View(x)

# as_tibble() replaces the deprecated tbl_df().
x <- as_tibble(x)
class(x)

# Column selection: by name, by range, by exclusion, and with renaming.
select(x, rcid, date, vote)
select(x, rcid, date, vote, unres, uniquename)
select(x, rcid:unres, uniquename)
select(x, -uniquename)
select(x, rcid:unres, country = uniquename)

newx <- select(x, rcid:unres, country = uniquename)

# Pipe illustration only (f, g and h are placeholders, not real functions):
#   f(g(h(x)))
# is written with the pipe as
#   h(x) %>% g() %>% f()

newx <- x %>%
  select(rcid:unres, country = uniquename)

# Keep only rows whose vote code is below 8.
newx <- x %>%
  select(rcid:vote, country = uniquename) %>%
  filter(vote < 8)

# Inspect the country names, then preview them with double quotes removed.
newx$country
gsub('"', '', newx$country)

newx <- x %>%
  select(rcid:vote, country = uniquename) %>%
  filter(vote < 8) %>%
  mutate(country = gsub('"', '', country))

newx <- x %>%
  select(rcid:vote, country = uniquename) %>%
  filter(vote < 8) %>%
  mutate(
    country = gsub('"', '', country),
    unres = gsub('"', '', unres)
  )

#newx <- x %>% select(rcid:vote, country = uniquename) %>% filter(vote < 8) %>%
#  mutate(country = gsub('"', '', country), unres = gsub('"', '', unres), vote = vote*2)

# Labels indexed by the numeric vote code (1, 2, 3). Defined before first use
# and shared by both variants below.
votes <- c("Yes", "Abstain", "No")

newx <- newx %>%
  mutate(vote = factor(votes[vote]))

if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
library(tidyr)

# either: split the date by hand with base R
newx <- x %>%
  select(rcid:vote, country = uniquename) %>%
  filter(vote < 8) %>%
  mutate(
    country = gsub('"', '', country),
    unres = gsub('"', '', unres)
  ) %>%
  mutate(vote = factor(votes[vote]))

# Split every date once on "-" and pick the pieces by position; vapply()
# guarantees a character result whatever the input length.
date_parts <- strsplit(as.character(newx$date), split = "-", fixed = TRUE)
newx$year <- as.numeric(vapply(date_parts, `[`, character(1), 1L))
newx$month <- as.numeric(vapply(date_parts, `[`, character(1), 2L))
newx$day <- as.numeric(vapply(date_parts, `[`, character(1), 3L))

# or: let tidyr::separate() split the date column
newx2 <- x %>%
  select(rcid:vote, country = uniquename) %>%
  filter(vote < 8) %>%
  mutate(
    country = gsub('"', '', country),
    unres = gsub('"', '', unres)
  ) %>%
  separate(date, c("year", "month", "day")) %>%
  mutate(
    year = as.numeric(year),
    month = as.numeric(month),
    day = as.numeric(day)
  ) %>%
  mutate(vote = factor(votes[vote]))

newx <- newx2

# everyone: summaries and plots built on `newx`

# Number of votes per year.
votesumm <- newx %>%
  group_by(year) %>%
  summarize(numvotes = n())

# Number of votes and share of "Yes" votes per year.
votesumm <- newx %>%
  group_by(year) %>%
  summarize(numvotes = n(), yes = mean(vote == "Yes"))

ggplot(votesumm, aes(x = year, y = yes)) +
  geom_line()
ggplot(votesumm, aes(x = year, y = yes)) +
  geom_line() +
  geom_smooth()

# Same summary per year and country; ungroup() so no grouping leaks into
# later steps.
votesumm <- newx %>%
  group_by(year, country) %>%
  summarize(numvotes = n(), yes = mean(vote == "Yes")) %>%
  ungroup()

sort(unique(votesumm$country))

chosen_countries <- c(
  "Finland", "United States of America", "U.S.S.R", "Australia",
  "Germany", "Germany, East", "Germany, West"
)

country_summary <- votesumm %>%
  filter(country %in% chosen_countries)

ggplot(country_summary, aes(x = year, y = yes, color = country)) +
  geom_line()

ggplot(
  country_summary,
  aes(x = year, y = yes, color = country, fill = country)
) +
  geom_line() +
  geom_smooth()

ggplot(
  country_summary,
  aes(x = year, y = yes, color = country, fill = country)
) +
  geom_line() +
  geom_smooth() +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")