---
title: "Linear Regression"
author: "JJB + Course"
date: "7/30/2018"
output: html_document
---

# SLR

## Example: Simulating SLR Data

```{r}
# Simulate data

# Set seed for reproducibility
set.seed(9123)

# Number of observations
n = 500

X = cbind(1, rnorm(n, mean = 2))  # Design matrix: n x 2
beta = c(2.5, 4)                  # True beta values: 2 x 1

# Response variable with error: n x 1
y = X[, 1] * beta[1] + X[, 2] * beta[2] + rnorm(n, sd = 3)
```

```{r, echo = FALSE}
library("ggplot2")
df = data.frame(x = X[, 2], y = y)
ggplot(df, aes(x, y)) +
  geom_point() +
  labs(title = "Generated Data",
       subtitle = paste("with", n, "observations"),
       x = "Simulated Predictor Values",
       y = "True Response")
```

```{r}
# Set seed for reproducibility
set.seed(9123)

# Mean of Response (y)
y_mu = mean(y)

# Mean of Predictor (x)
x = X[, 2]
x_mu = mean(x)

# Estimate Slope: sum((x - x_bar)(y - y_bar)) / sum((x - x_bar)^2)
beta1_hat = sum((x - x_mu) * (y - y_mu)) / sum((x - x_mu) ^ 2)

# Estimate Intercept: y_bar - beta1_hat * x_bar
beta0_hat = y_mu - beta1_hat * x_mu

# Compare estimates (first column) with the true values (second column)
cbind(c(beta0_hat, beta1_hat), beta)
```

```{r, echo = FALSE}
library("ggplot2")
df = data.frame(x = X[, 2], y = y)
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(title = "Estimated Coefficients on Generated Data",
       subtitle = paste("with", n, "observations"),
       x = "Simulated Predictor Values",
       y = "True Response")
```

## Example: Optimizing for SLR

```{r}
set.seed(8812)

beta_hat = rep(0, 2) # Provide initial beta value: 2 x 1

# Write the cost function to minimize
min_rss_slr = function(par, X, y) {
  rss = sum((y - (X[, 1] * par[1] + X[, 2] * par[2]))^2)
  return(rss)
}

# Perform the minimization.
# Fix: start the optimizer from the initial guess `beta_hat` (as set up
# above), not from the true `beta` -- otherwise `beta_hat` is unused and
# the optimizer begins at the answer it is supposed to find.
model_opt = optim(par = beta_hat, fn = min_rss_slr, method = "BFGS",
                  control = list(trace = TRUE), X = X, y = y)

# Check parameter difference
cbind(model_opt$par, beta)
```

```{r}
# Illustrative trace of RSS values across optimizer iterations
ggplot(data.frame(x = c(1, 2, 3),
                  y = c(4545.000693, 4542.177432, 4545.000693)),
       aes(x, y)) +
  geom_point() +
  geom_line() +
  geom_point(data = data.frame(x = 2, y = 4542.177432),
             color = "red", size = 3) +
  labs(title = "Optimization of RSS",
       subtitle = "using the BFGS optimizer",
       y = "RSS value", x = "Iteration")
```

# MLR

## Example: Fitting a Linear Regression with lm

Fitting an ordinary linear model in _R_ uses:

```{r, eval = FALSE}
model_fit = lm(
  y ~ x1 + x2,
  data = data_set
)
```

where:

- the first argument is a formula with the dependent variable to left of
  the ~, read "is modeled as", and the independent variables to the right
- the second argument specifies the data set.

Inside the formula, model terms can be combined using + to
give:^[${\beta _0}$ represents the intercept term and is automatically
included.]

$${y_i} = {\beta _0} + {\beta _1}{x_{i,1} } + {\beta _2}{x_{i,2} } + {\varepsilon _i}$$

## Example: Fits with lm

```{r}
model_fit = lm(mpg ~ wt + qsec, data = mtcars)
model_fit
```

## Example: The Formula Object and Design Matrix

```{r}
# Add the intercept
X = model.matrix(mpg ~ wt + qsec, data = mtcars)
head(X)

# Remove the intercept
X_noint = model.matrix(mpg ~ wt + qsec - 1, data = mtcars)
head(X_noint)
```

## Example: Inference with lm

```{r}
summary(model_fit)
```

## Example: Calling lm.fit with pre-made X and y

```{r}
model_fit_design = lm.fit(x = X, y = matrix(mtcars$mpg))
model_fit_design$coefficients
model_fit$coefficients
```

## Example: Predictions

Predict the responses with new data using the predict function

```{r}
future_car = data.frame(wt = 0.5, qsec = 12)

# Fix: predict.lm's argument is `newdata`, not `new_data`. A misspelled
# argument is silently absorbed by `...`, so predictions would be made on
# the original training data instead of `future_car`.
y_hat = predict(model_fit, newdata = future_car)
head(y_hat)
```

**Note:** predict is also a generic function.

## Constructing lm Generic

### Example: Writing a my_lm() function - Generic Dispatcher

To begin, we start with the basic definition for a generic method.
```{r make_generic}
# Generic entry point: dispatches on the class of the first argument
my_lm = function(x, ...) {
  UseMethod("my_lm")
}
```

**Note:** Under this approach, we can extend my_lm to work with
formula (e.g. y ~ x)

### Example: Writing a my_lm() function - Class Constructor

Next, we'll make a constructor for the class results:

```{r}
# Bundle the fit results into a classed list of class "my_lm"
new_my_lm = function(beta_hat, cov_mat, sigma2, df) {
  structure(
    list(
      coefs   = beta_hat,
      cov_mat = cov_mat,
      sigma   = sqrt(sigma2),
      df      = df
    ),
    class = c("my_lm", "list")
  )
}
```

### Example: Writing a my_lm() function - Dispatch on Formula

```{r}
my_lm.formula = function(formula, data = list(), ...) {
  # Build the model frame, then the design matrix from its terms
  mf = model.frame(formula = formula, data = data)
  design = model.matrix(attr(mf, "terms"), data = mf)

  # Extract the response vector
  response = model.response(mf)

  # Delegate to the matrix method
  my_lm(design, response, ...)
}
```

### Writing a my_lm() function - Dispatch on Matrix

```{r}
my_lm.matrix = function(X, y, ...) {
  # (X'X)^-1, reused for both the estimates and their covariance
  xtx_inv = solve(t(X) %*% X)

  # Least-squares estimates: (X'X)^-1 X' y
  est = xtx_inv %*% t(X) %*% y

  # Residual degrees of freedom: n - p
  resid_df = nrow(X) - ncol(X)

  # Estimate of the residual variance
  s2 = sum((y - X %*% est) ^ 2) / resid_df

  # Cov(Beta_hat) = sigma^2 * (X' X)^(-1),
  # with symmetric row/column names taken from the design matrix
  vcov_est = s2 * xtx_inv
  dimnames(vcov_est) = list(colnames(X), colnames(X))

  # Return a classed list
  new_my_lm(est, vcov_est, s2, resid_df)
}
```

**Note:** The implementation is not stable if multicollinearity is
present. Use [QR decomposition](
https://bookdown.org/rdpeng/advstatcomp/textbooks-vs-computers.html).

### Writing a my_lm() function - Default Method

```{r}
# Fallback for unsupported input types
my_lm.default = function(object, ...) {
  stop("Cannot operate on ", class(object), ".")
}
```

## Print Functions

### Example: Writing a print.my_lm() function

We can hook the my_lm class directly into generic print function

```{r}
print.my_lm = function(x, ...) {
  cat("\nCoefficients:\n")
  print(x$coefs)
}
```

Here we end up calling the default matrix print method using x$coefs.
### Example: Comparing print() Output (print.my_lm() vs. print.lm())

```{r}
# Our Implementation of lm
print(my_lm(X = cbind(1, mtcars$disp), y = mtcars$mpg))

# Base R implementation
print(lm(mpg ~ disp, data = mtcars))
```

## Summary Functions

### Example: Writing a summary.my_lm() function

```{r writing_summary_my_lm}
# Note that summary(object, ...) instead of summary(x, ...)!
summary.my_lm = function(object, ...) {
  estimate = object$coefs                          # Beta Hat
  sterr = sqrt(diag(object$cov_mat))               # STD Error
  t_test = estimate / sterr                        # t-Test value
  pval = 2 * pt(-abs(t_test), df = object$df)      # Two-sided p-value

  # Make output matrix
  mat = cbind("Estimate" = estimate,
              "Std. Err" = sterr,
              "t value" = t_test,
              "Pr(>|t|)" = pval)

  rownames(mat) = rownames(object$cov_mat)         # Naming

  return(structure(list(mat = mat), class = "summary.my_lm"))
}
```

### Example: Writing a print.summary.my_lm() function

We can control how the summary generic function should look on print via
print.summary.my_lm. That is, we are defining a print function for
another generic.

```{r}
# Note that print(x, ...)!!
print.summary.my_lm = function(x, ...) {
  printCoefmat(x$mat, P.value = TRUE, has.Pvalue = TRUE)
}
```

### Example: Comparing summary() Output: summary.my_lm()

```{r}
# Our Implementation of lm
# (fixed: `mtcars\$mpg` contained a stray backslash, a parse error)
print(summary(my_lm(X = cbind("(Intercept)" = 1, "disp" = mtcars$disp),
                    y = mtcars$mpg)))
```

### Example: Comparing summary() Output: summary.lm()

```{r}
# Base R implementation
print(summary(lm(mpg ~ disp, data = mtcars)))
```