---
title: "Linear Regression"
author: "JJB + Course"
date: "02/20/2019"
output:
  html_document:
    toc: true
    toc_float:
      collapsed: false
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE
)
```

# Linear Relationships

Recall that a line is given by the formula of:

$y = b + m\cdot x$

where:

- $y$ is the response
- $b$ is the intercept or where the line crosses the _y_-axis.
- $m$ is the **slope** or change between values.
- $x$ is the predictor or input.

```{r compute-linear-relationship}
# Define constants
m = 1.5
b = 2
n = 20

# Compute values
x = seq(-10, 10, length.out = n)
y = m * x + b
```

```{r show-linear-relationship, echo = FALSE}
# install.packages("ggplot2")
library("ggplot2")

df = data.frame(x = x, y = y)

ggplot(df) +
  aes(x = x, y = y) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_line(color = "blue") +
  geom_point(size = 3, color = "orange") +
  labs(
    title = "Observing a Linear Relationship",
    subtitle = paste("with", n, "observations"),
    x = "Predictor Value (x)",
    y = "Response Value (y)"
  ) +
  theme_bw()
```

```{r data-gen-procedures, echo = FALSE}
# Simulate data from y = m * x + b over an evenly spaced grid of x values.
gen_linear_data = function(n, m = 1, b = 2, start = -10, end = 10) {
  # Compute values
  x = seq(start, end, length.out = n)
  y = m * x + b

  out = data.frame(x = x, y = y)
  # Stash the sample size so graphs can report it in their subtitle.
  attr(out, "n") = n
  out
}

# Simulate data from y = a * x^2 + b * x + c over an evenly spaced grid.
gen_quad_data = function(n, a = 1, b = 2, c = 3, start = -10, end = 10) {
  # Compute values
  x = seq(start, end, length.out = n)
  y = a * x^2 + b * x + c

  out = data.frame(x = x, y = y)
  attr(out, "n") = n
  out
}

# Plot (x, y) pairs; optionally overlay an lm() fit instead of connecting
# the true points with a line.
make_relationship_graph = function(df,
                                   estimate = FALSE,
                                   title = "Observing Relationships") {
  # Pick the middle layer: estimated regression line vs. true relationship.
  estimate_layer = if (estimate) {
    geom_smooth(method = "lm", color = "blue")
  } else {
    geom_line(color = "blue")
  }

  ggplot(df) +
    aes(x = x, y = y) +
    geom_hline(yintercept = 0) +
    geom_vline(xintercept = 0) +
    estimate_layer +
    geom_point(size = 3, color = "orange") +
    labs(
      title = title,
      subtitle = paste("with", attr(df, "n"), "observations"),
      x = "Predictor Value (x)",
      y = "Response Value (y)"
    ) +
    theme_bw()
}
```

```{r sample-relationships}
make_relationship_graph(gen_linear_data(10),
                        title = "Observing a Positive Linear Relationship")
make_relationship_graph(gen_linear_data(10, m = -1),
                        title = "Observing a Negative Linear Relationship")
make_relationship_graph(gen_quad_data(10, a = 1, start = 0),
                        title = "Observing a Quadratic Relationship (non-linear)")
```

What happens if the points are not _perfectly_ straight?

```{r compute-fragmented-linear-relationship}
# Recalculate y with additional "noise" or error.
y = m * x + b + rnorm(n, sd = 3) # Add gaussian noise
```

```{r show-fragmented-linear-relationship, echo = FALSE}
# Update the data.frame
df$y = y

# Replot graph
ggplot(df) +
  aes(x = x, y = y) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  # geom_smooth(method = "lm") +
  geom_point(size = 3, color = "orange") +
  labs(
    title = "Observing a Partial Linear Relationship",
    subtitle = paste("with", n, "observations"),
    x = "Predictor Value (x)",
    y = "Response Value (y)"
  ) +
  theme_bw()
```

How can we estimate what the "linear line" that goes through the majority of points is?

# SLR

**Simple Linear Regression (SLR)** is given as:

${y_i} = {\beta _0} + {\beta _1}{x_i} + {\varepsilon _i}$

where:

- $i$ is the observation identifier.
- $y_i$ is the response value for the observation.
- $\beta _0$ is the intercept value.
- $\beta _1$ is the slope value.
- $x_i$ is the predictor or input for the observation.

## Example: Simulating SLR Data

Before running algorithms on real-world data, we first try to see if we can recover an "oracle" model, where we know the exact parameters the data was simulated under.
```{r simulating-data}
# Set seed for reproducibility
set.seed(9123)

# Number of observations
n = 500

# Design matrix: n x 2 (first column of 1s carries the intercept)
X = cbind(1, rnorm(n, mean = 2))

# True beta values: 2 x 1
beta = c(2.5, 4)

# Compute response variable with error: n x 1
y = X[, 1] * beta[1] + X[, 2] * beta[2] + rnorm(n, sd = 3)
```

```{r show-pattern, echo = FALSE}
df = data.frame(x = X[, 2], y = y)

ggplot(df, aes(x, y)) +
  geom_point() +
  labs(
    title = "Generated Random Normal Data",
    subtitle = paste("Created using", n, "observations"),
    x = "Simulated Predictor Values",
    y = "True Response"
  )
```

## Example: Computing SLR with analytical forms

Let's apply the analytical SLR solutions.

```{r compute-slr}
# (No randomness in this chunk, so no seed is required.)

# Mean of Response (y)
y_mu = mean(y)

# Mean of Predictor (x)
x = X[, 2]
x_mu = mean(x)

# Estimate Slope
beta1_hat = sum((x - x_mu) * (y - y_mu)) / sum((x - x_mu) ^ 2)

# Estimate Intercept
beta0_hat = y_mu - beta1_hat * x_mu

# Compare the estimates (first column) against the true values (second column)
cbind(c(beta0_hat, beta1_hat), beta)
```

These estimates are then shown on the graph via the "blue" line.

```{r show-lm-form, echo = FALSE}
library("ggplot2")

df = data.frame(x = X[, 2], y = y)

ggplot(df, aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(
    title = "Estimated Coefficients on Generated Data",
    subtitle = paste("with", n, "observations"),
    x = "Simulated Predictor Values",
    y = "True Response"
  )
```

## Example: Optimizing for SLR

```{r optim-r-slr}
set.seed(8812)

# An all-zero starting value: 2 x 1
beta_hat = rep(0, 2)

# Write the cost function to minimize: the residual sum of squares
min_rss_slr = function(par, X, y) {
  rss = sum((y - (X[, 1] * par[1] + X[, 2] * par[2]))^2)
  return(rss)
}

# Perform the minimization.
# NOTE(review): the optimizer is started at the TRUE beta, not at beta_hat,
# so the trace begins near the minimum (this matches the hardcoded RSS path
# plotted below). In practice you would start from a neutral guess such as
# beta_hat — confirm the intent before changing it.
model_opt = optim(par = beta, fn = min_rss_slr, method = "BFGS",
                  control = list(trace = TRUE),
                  X = X, y = y)

# Check parameter difference
cbind(model_opt$par, beta)
```

The optimization stopped when we moved away from the "minimum" of the data.
```{r show-optim-path}
# RSS values taken from the BFGS trace produced by the previous chunk.
ggplot(data.frame(x = c(1, 2, 3),
                  y = c(4545.000693, 4542.177432, 4545.000693)),
       aes(x, y)) +
  geom_point() +
  geom_line() +
  geom_point(data = data.frame(x = 2, y = 4542.177432),
             color = "red", size = 3) +
  labs(
    title = "Optimization of RSS",
    subtitle = "using the BFGS optimizer",
    y = "RSS value",
    x = "Iteration"
  )
```

# Matrices

## Example: Creating a Matrix

Matrices are built in a different way than data.frames. In particular, data is supplied as a 1D vector and then dimensions are added.

```{r construct-matrix}
# Construct a matrix
z = matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 3, ncol = 2
)

# Two dimensions
dim(z)

# Length is the number of elements
length(z)
```

**Note:** You only need to supply _one_ dimension and _R_ will automatically calculate the other.

```{r auto-calc-dimension}
# Construct a matrix
z_auto = matrix(
  c(1, 2, 3, 4, 5, 6),
  nrow = 3
)

# Show they're the same.
all.equal(z, z_auto)
```

Construct matrix by **column**.

```{r fill-order-col}
z_col = matrix(c(1, 2, 3, 4, 5, 6),
               nrow = 3, ncol = 2,
               byrow = FALSE) # Set to FALSE (default)
z_col

# Note, this is the default.
all.equal(z, z_col)
```

Construct matrix by **row**.

```{r fill-order-row}
z_row = matrix(c(1, 2, 3, 4, 5, 6),
               nrow = 3, ncol = 2,
               byrow = TRUE) # Set to TRUE
z_row
```

## Example: Adding Data to a Matrix

Unlike a data.frame, the matrix has a special way of adding new _data_.

```{r}
# Starting matrix
the_matrix = matrix(
  c(1, 2, 3, 4),
  nrow = 2, ncol = 2
)

# Note: Use the following sparingly

# Add data by column
cbind(the_matrix, c(5, 6))

# Add data by row
rbind(the_matrix, c(5, 6))
```

# MLR

## Example: Fitting a Linear Regression with lm

Fitting an ordinary linear model in _R_ uses:

```{r, eval = FALSE}
model_fit = lm(
  y ~ x1 + x2,
  data = data_set
)
```

where:

- the first argument is a formula with the response variable to the left of the `~`, read "is modeled as", and the predictor variables to the right
- the second argument specifies the data set.
Inside the formula, model terms can be combined using `+` to give:^[${\beta _0}$ represents the intercept term and is automatically included.]

$${y_i} = {\beta _0} + {\beta _1}{x_{i,1} } + {\beta _2}{x_{i,2} } + {\varepsilon _i}$$

## Example: Fits with lm

```{r model-data}
model_fit = lm(mpg ~ wt + qsec, data = mtcars)
model_fit
```

## Example: The Formula Object and Design Matrix

Specify the response (⟵) and predictors (→)

```{r sample-model-frame-manipulation}
model_formula = mpg ~ wt + qsec
```

Construct a model frame to hold the formula and data

```{r model-frame-here}
model_frame = model.frame(model_formula, data = mtcars)
```

Retrieve the Design Matrix on the right (→) hand side of the formula

```{r model-design-matrix}
X = model.matrix(model_formula, model_frame)
X
```

Retrieve the Response on the left (⟵) hand side of the formula

```{r extract-model-response}
y = model.response(model_frame)
y
```

Comparing the difference between model matrices _with_ and _without_ the intercept.

Incorporate the intercept back in:

```{r view-model-matrix-differences}
X = model.matrix(
  mpg ~ wt + qsec, # By default, the intercept is included.
  data = mtcars
)
head(X)
```

Remove the intercept:

```{r remove-model-int}
X_noint = model.matrix(
  mpg ~ wt + qsec - 1, # note the -1 to remove intercept
  data = mtcars
)
head(X_noint)
```

## Example: Inference with lm

By using the summary() function, we can obtain detailed information about the model's properties. These properties allow us to perform _inference_ as to the quality of the model from a statistical point of view.

```{r model-summary}
summary(model_fit)
```

## Example: Calling lm.fit with pre-made X and y

Sometimes, we may wish to directly obtain a fit without using a formula approach.

```{r directly-call-lm}
model_fit_design = lm.fit(x = X, y = matrix(mtcars$mpg))
```

Note: The output structure is relatively _intact_ compared to an lm() call.
```{r model-fit-example}
model_fit_design$coefficients
model_fit$coefficients
```

## Example: Predictions

Predict the responses with new data using the predict function.

```{r my-predictions}
future_car = data.frame(wt = 0.5, qsec = 12)
y_hat = predict(model_fit, newdata = future_car)
y_hat
```

This is equivalent to calling:

\begin{align*}
\widehat {MPG} &= {{\hat \beta }_0} + {{\hat \beta }_1}wt + {{\hat \beta }_2}qsec \\
\widehat {MPG} &= {{\hat \beta }_0} + {{\hat \beta }_1}\left( {0.5} \right) + {{\hat \beta }_2}\left( {12} \right)
\end{align*}

**Note:** Leaving off newdata shows the predictions for the training data set.

```{r my-model-training-predictions}
y_hat_model = predict(model_fit)
head(y_hat_model)
```

**Note:** predict is also a generic function.

# Factors and the Design Matrix

## Example: Factors in Data Frames

```{r factors-in-data-frames}
id = c(1, 2, 3, 55)
sex = c("M", "F", "F", "M")
height = c(6.1, 5.5, 5.2, 5.9)

subject_heights_vec = data.frame(
  id = id,
  sex = sex,
  height = height
)

# Notice sex is a factor inside the data.frame
# (NOTE: true under R < 4.0, where stringsAsFactors defaulted to TRUE;
#  from R 4.0 onward sex remains a character column here.)
str(subject_heights_vec)

# But outside, on the vector it is a character
class(sex)

summary(subject_heights_vec)
summary(sex)
```

```{r df-without-factors}
# Data frame without factors
subject_heights = data.frame(
  id = c(1, 2, 3, 55),
  sex = c("M", "F", "F", "M"),
  height = c(6.1, 5.5, 5.2, 5.9),
  stringsAsFactors = FALSE # Manually disabling factors
)
```

Data Frame with Factors

```{r df-with-factors}
subject_heights_fct = data.frame(
  id = c(1, 2, 3, 55),
  sex = c("M", "F", "F", "M"),
  height = c(6.1, 5.5, 5.2, 5.9),
  stringsAsFactors = TRUE # default behavior in R < 4.0
)
```

Notice output difference between character and factor

```{r}
summary(subject_heights)
summary(subject_heights_fct)
```

This is shown in the structure between the two data.frames

```{r}
str(subject_heights)
str(subject_heights_fct)
```

## Example: Factor Vector

```{r}
sex = c("M", "F", "F", "M") # Character vector
sex_factor = factor(sex)    # Convert from character to factor
as.numeric(sex_factor)      # Retrieve the underlying integer level codes
```
 {r} letters  {r example-of-binary-op, eval = FALSE} # This will fail. 3 * "F"  {r} 3 * 1  ### Exercise: Make a Design Matrix {r sample-data} class_seating = data.frame( distance = c("back", "back", "front", "back", "front", "front"), side = c("right", "left", "middle", "middle", "right", "left") )  Checking with R ... (Recall: -1 omits the intercept.) {r example-model-call}  ## Example: Factor Math Failures {r factor-math-folies, eval = FALSE} x = c(3L, -1L, 22L, 9L, 0L, 22L, 9L) # Create Integer Vector my_factor = as.factor(x) # Cast integer to factor my_factor + 10 # Error in an unexpected way min(my_factor) # Show stopping error  ## Example: Exploring a Factor {r explore-factor} x = c(3L, -1L, 22L, 9L, 0L, 22L, 9L) # Create Integer Vector my_factor = as.factor(x) # Cast integer to factor levels(my_factor) # List of Levels my_factor[1] = 9 # Modify with a pre-existing level my_factor[2] = 18 # Error if level isn't present already my_factor x_recovery = levels(my_factor)[my_factor] # Recover element-wise values x_recovery class(x_recovery) x_recovery = as.numeric(x_recovery) # Cast to appropriate type x_recovery + 10  ## Example: Ordered Factors {r ordered-factors-demo} yields = c("hi", "low", "med", "low", "med", "low") # Factor without Order yields_fct = factor(yields) yields_fct # Notice order isn't set appropriately yields_order = factor(yields, ordered = TRUE) # Add Order yields_order # Correct ordering from low to high yields_fixed_order = factor(yields, levels = c("low", "med", "hi"), ordered = TRUE) yields_fixed_order  ## Exercise: To be a Factor or an Ordered Factor Determine whether the following should be either a factor or an ordered factor **Months:** (Jan, Feb, ... , Nov, Dec) **Colors:** (red, orange, ... , black, green) **Alphabet:** (a, b, ... , y, z)