# Overview of Regressions with R #### # Tuesday 1st and Wednesday 2nd July 2025 # UCL CASC course # Welcome # Day 1 #### # Linear Regression #### # set working directory setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/datasets") setwd("/Volumes/groupfolders-2/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/R/Teachers R script during the course") setwd("/Volumes/groupfolders/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/R/Teachers R script during the course") #list files in directory list.files() #Load tango data tango <- read.csv("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/datasets/QPCR tango.csv") tango <- read.csv('QPCR tango.csv') #Load all the datasets load("Regressions with R datasets.Rdata") #variables names names(tango) #Check data types class(tango$Genotype) class(tango$RU) class(tango$Tango) #Set as categoric tango$Genotype=factor(tango$Genotype) ?factor tango$RU=factor(tango$RU) ?as.factor #Summarise data summary(tango$Tango) hist(tango$Tango) #mean of Tango expression levels by RU (Treatment) tapply(tango$Tango,tango$RU, mean) ?tapply #T-test t.test(Tango~RU, data=tango)#better way t.test(tango$Tango~tango$RU) #Mann-Whitney test wilcox.test(Tango~RU, data=tango) #Summarise predictors #frequencies/count t1=table(tango$RU) t2=table(tango$Genotype) #proportions prop.table(t1) prop.table(t2) prop.table(t2)*100#percentages #Fit Linear Regression models #lm(Y~X1+X2+X3, data=data) #glm(Y~X1+X2+X3, data=data, family='gaussian') #lm is better as it gives more useful output #Intercept only model m0 = lm(Tango~1, data=tango) summary(m0) #Tango=0.86 #confidence interval confint(m0) #With Treatment type (RU) as predictor m1 = lm(Tango~RU, data=tango) summary(m1) #Tango=0.71+0.30*Present #0.71 (intercept): Mean tango expression #for Absent group #0.30 (slope for P):mean difference in #tango expression between Present(P) #and Absent (A) groups. #p-value:0.055 (Difference is not statistically #significant) relevel()#function for changing reference confint(m1) # Welcome back after lunch ==== summary(m1) confint(m1) # 95% confidence interval ?confint coef(m1) # diagnostic tools for linear models ==== names(tango) names(summary(m1)) summary(m1)$r.squared summary(m1)$adj.r.squared summary(m1) deviance(m1) BIC(m1) AIC(m1) # Akaike Information Criterion residuals(m1) hist(residuals(m1)) plot(tango$RU,residuals(m1)) class(tango$RU) plot(as.numeric(tango$RU),residuals(m1)) abline(h=0) # h for horizontal influence.measures(m1) hist(dfbeta(m1)[,1]) # intercept hist(dfbeta(m1)[,2]) # slope hist(cooks.distance(m1)) hist(dffits(m1)) install.packages('car') library(car) vif(m1) # exercise 1 ==== summary(tango$Genotype) levels(tango$Genotype) class(tango$Genotype) m1 <- lm(Tango ~ Genotype, data = tango) summary(m1) # default reference level is Abeta tapply(tango$Tango, tango$Genotype, mean) # Consider both treatment and genotype jointly. # Start with the plot, then add them in # the model. m3 <- lm(Tango ~ RU + Genotype, data = tango) summary(m3) # the simplest of plots: plot(as.numeric(factor(tango$Genotype)), tango$Tango, col=c("red","green")[tango$RU], pch=c("X","O")[tango$RU]) # or with a bit more editing: par(pty="s") layout(1) plot(as.numeric(factor(tango$Genotype)), tango$Tango, ylab="Tango",xlab="Genecode", cex=2,xaxt="n", col=c("red","green")[tango$RU], cex.axis=2, cex.lab=1.5, pch=c("X","O")[tango$RU]) axis(1,1:3,labels=levels(factor(tango$Genotype)), cex.axis=2) legend("top",legend=levels(tango$RU)[c(1,2)], col=c("red","green")[c(1,2)], pch=c("X","O")[c(1,2)],box.lty=2) # a. Are both terms needed in the model? # YES # b. Is the interaction term significant? # NO, so remove the interaction m4 <- lm(Tango ~ RU * Genotype , data = tango) summary(m4) # tango = constant + b1*treatment + b2*genotype + b3*treatment*genotype # tango = 0.79 + 0.31*Treated - 0.4*RNAI + 0.15*UASArm + 0.03*Treated*RNAI # - 0.08*Treated*UASArm # c. Complete the gaps in the following statement: # look at model 3 (without the interaction terms): m3 <- lm(Tango ~ RU + Genotype, data = tango) summary(m3) # Treatment is associated with a 0.3 change in the mean/average tango # expression whilst adjusting/accounting for genotype. # d. Is there collinearity between the two independent variables, genotype and # treatment? vif(m3) m0 <- lm(Tango ~ 1, data = tango) m1 <- lm(Tango ~ Genotype, data = tango) m2 <- lm(Tango ~ RU, data = tango) m3 <- lm(Tango ~ RU + Genotype, data = tango) m4 <- lm(Tango ~ RU * Genotype , data = tango) summary(m0) summary(m1) # default reference level is Abeta summary(m2) # best model summary(m3) summary(m4) BIC(m0) # intercept only BIC(m1) # RU only BIC(m2) # genotype only BIC(m3) # RU and genotype BIC(m4) # plus interaction summary(m0)$adj.r.squared summary(m1)$adj.r.squared summary(m2)$adj.r.squared summary(m3)$adj.r.squared summary(m4)$adj.r.squared # m3: RU and genotype residuals(m3) hist(residuals(m3)) hist(dfbeta(m3)[,1]) hist(dfbeta(m3)[,2]) install.packages('performance') library(performance) check_model(m3) #or use: plot(m3) # remember to hit enter for the next plot # predictions/fitted values: fitted(m3) predict(m3) # predict can be used to predict the outcome from # specific values of the explanotary variables: predict(m3, newdata=data.frame(RU='A',Genotype='RNAi')) #0.414 = predicted average tango expression for subjects # not receiving treatment and of the RNAi genotype # to stratify: m5 <- lm(Tango~RU, data=tango[tango$Genotype=='Abeta',]) summary(m5) # notice the lower df (degrees of freedom) # which are directly connected to the sample size used # recap of some of the functions used so far ==== # setwd() # read.csv # list.files() # load # names() # class # factor # as.factor # summary # hist # tapply # t.test # wilcox.test # table # prop.table # lm # glm # summary # confint # relevel # summary(model)$adj.r.squared # BIC # deviance # predict # fitted # check_model # vif # plot(model) # residuals # influence.measures # dfbeta # dffits # cooks.distance # example of relevel function ==== # relevel(tango$Genotype, ref="UAS Arm") m7 <- lm(Tango~relevel(tango$Genotype, ref="UAS Arm"), data=tango) summary(m7) # or: tango$G2 <- relevel(tango$Genotype, ref="UAS Arm") m7 <- lm(Tango~G2, data=tango) summary(m7) # Welcome to Day 2 ==== # Logistic regression ==== # getting to know the titanic dataset ==== setwd("/Volumes/groupfolders-1/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/R/Teachers R script during the course") load("Regressions with R datasets.Rdata") View(titanic) dim(titanic) titanic$survived class(titanic$survived) titanic$survived <- factor(titanic$survived) summary(titanic$survived) titanic$age class(titanic$age) summary(titanic$age) hist(titanic$age) plot(titanic$age, titanic$survived) # fit the logistic model (glm) ==== m1 <- glm(survived~age,data=titanic, family=binomial) summary(m1) # null model: intercept model # residual deviance: deviance of the fitted model deviance(m1) AIC(m1) # logit(prob) = -0.13 -0.007*age # need to exp the coefficients exp(coef(m1)) exp(confint(m1)) # the coefficient for age: OR=0.99, 95%CI (0.98,1.00) # exercise 3 ==== # Explore the joint association of survival with sex, # age, fare and passenger class. #Convert categorical variables into factors titanic$survived <- factor(titanic$survived) titanic$pclass <- factor(titanic$pclass) class(titanic$age) class(titanic$sex) class(titanic$fare) m2 <- glm(survived ~ age + sex + fare + pclass, data = titanic, family=binomial()) summary(m2) # or by adding ordered(pclass), you will check # for a linear or quadratic effect betweeen the levels m2 <- glm(survived ~ age + sex + fare + ordered(pclass), data = titanic, family=binomial()) summary(m2) #Exponentiate coefficients and confidence intervals exp(coef(m2)) exp(confint(m2)) # What’s the Odds Ratio for gender? 0.082 # the odds survival for males is 0.082 times the odds # of survival for females, in other words, # the odds for males decreased by (1-0.082)*100=91.8% # compared the females adjusting for sex, fare, pcalss # for age: (1-0.966)*100=3.4% --> for a year increase # in age, the odds of surival decrease by 3.4% after # adjusting for sex, fare, pcalss # What is the reference level for passenger class? 1st #The reference level for passenger class is therefore pclass1 # Does a higher fare increase or decrease the odds of survival? # the OR is just over 1, meaning that increases in fare # increase the odds of survival, albeit not statistically # significant m3 <- glm(survived ~ age + sex + fare, data = titanic, family=binomial()) summary(m3) m4 <- glm(survived ~ age + sex + pclass, data = titanic, family=binomial()) summary(m4) exp(coef(m4)) BIC(m2) BIC(m3) BIC(m4) # Would you consider including any interactions in the model, if so which ones? # model with all possible interactions -> m5 <- glm(survived ~ age * sex * pclass , data = titanic, family=binomial()) summary(m5) exp(coef(m5)) # manually adding the desired interactions -> m6 <- glm(survived ~ age + sex + pclass + sex*age, data = titanic, family=binomial()) summary(m6) exp(coef(m6)) # over and above the main effect of older age (slight reduction) # and males (further reduction), there was even further # reduction in the odds of survival for older males # What is the predicted odds and probability of survival # for a 20 year old female at 2nd # class at a fare of 30 dollars? # default option for predict is to calculate the # predicted odds, but that can be changed to prob # via the type argument predict(m2, newdata = data.frame(age=20,sex='female', pclass='2', fare=30)) # predicted odds predict(m2, newdata = data.frame(age=20,sex='female', pclass='2', fare=30), type='response') # predicted probability # dealing with missing data ==== titanic2 <- titanic[!is.na(titanic$age),] dim(titanic2) install.packages("generalhoslem") library(generalhoslem) mnew <- glm(survived~age, family=binomial, data=titanic2) logitgof(titanic2$survived,fitted(mnew)) # looking for a non-sign difference between # observed outcome and fitted values # Ordinal logistic regression ==== # getting to know the climbing dataset ==== View(cd) names(cd) cd$height table(cd$height) dim(cd) cd$height <- factor(cd$height, levels=1:3, labels=c('bottom','middle','top')) summary(cd$height) summary(cd$genomic) table(cd$genomic,cd$height) addmargins(table(cd$genomic,cd$height)) # fitting ordinal regression models ==== install.packages('ordinal') library(ordinal) ?clm m1 <- clm(height~genomic, data=cd) summary(m1) exp(coef(m1)) # ln cumulative odds = 1.52 # exp cum odds = 4.5 # >1 -> increase in the odds of flying higher for # wt group vs foxonull group -> 350% increase in the # odds (4.5-1)*100=350 table(cd$genomic,cd$height) exp(coef(m1)) exp(confint(m1)) nominal_test(m1) # Exercise 4 ==== cd$RU <- factor(cd$RU) cd$genomic <- factor(cd$genomic) names(cd) class(cd$day) m0 <- clm(height ~ 1, data=cd) summary(m0) m1 <- clm(height ~ RU, data = cd) summary(m1) exp(coef(m1)) m2 <- clm(height ~ day, data = cd) summary(m2) exp(coef(m2)) m3 <- clm(height ~ genomic, data = cd) summary(m3) exp(coef(m3)) m4 <- clm(height ~ RU + genomic + day, data = cd) summary(m4) exp(coef(m4)) # those treated had a 97% increase in odds of flying # higher, (1.97-1)*100 # 6.12 -> (6.12-1)*100=512% increase in the odds of # flyging higher for wt vs foxonull after adjusting # for treatment and duration in days m5 <- clm(height ~ RU * genomic * day, data = cd) summary(m5) BIC(m0) BIC(m1) BIC(m2) BIC(m3) BIC(m4) BIC(m5) # final model: m4 exp(coef(m4)) exp(confint(m4)) nominal_test(m4) # those with the wt gene have higher cumulative odds # of flying higher compared to foxonull after adjusting # treatment and days m6 <- clm(height ~ RU + genomic + factor(day), data = cd) summary(m6) exp(coef(m6)) BIC(m6) predict(m4, data.frame(genomic='wt',RU='P',day=5), type='class') # predicts outcome class predict(m4, data.frame(genomic='wt',RU='P',day=5), type='prob') # predicted probability for each # outcome level predict(m4, data.frame(genomic='wt',RU='P',day=5), type='cum.prob') # predicted cumulative # probability for each outcome level #Welcome back from lunch (Day 2)#### load("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/datasets/Regressions with R datasets.Rdata") #Poisson Regression #### list.files() fa = read.csv('feeding assay.csv') sapply(fa, class) fa$Genotype=factor(fa$Genotype) fa$Food=factor(fa$Food) table(fa$Food) #Model with food type m1 = glm(Sumfeeding~Food, data=fa, family = poisson) summary(m1) #ln(Sumfeeding)=3.46+0.33*1Y+0.13*2Y exp(m1$coefficients) tapply(fa$Sumfeeding, fa$Food, mean) exp(confint(m1)) #Model with Food type and Genotype m2=glm(Sumfeeding~Food+Genotype,data=fa, family=poisson) summary(m2) table(fa$Genotype) exp(m2$coefficients) exp(confint(m2)) #Adding an offset variable #(Total number of flies in each unit) fa$Totalfeeding=fa$Sumfeeding+fa$Sumnonfeeding m2b=glm(Sumfeeding~Food+Genotype+ offset(log(Totalfeeding)), data=fa, family=poisson) summary(m2b) exp(m2b$coefficients) #Testing equidispersion tapply(fa$Sumfeeding, fa$Food, function(X){c(mean(X), var(X))}) tapply(fa$Sumfeeding, fa$Food, mean) tapply(fa$Sumfeeding, fa$Food, var) install.packages('AER') library(AER) dispersiontest(m2) #Assumption of equidispersion is not satisfied #Negative Binomial Regression library(MASS) m3 = glm.nb(Sumfeeding~Food+Genotype, data=fa) summary(m3) exp(m3$coefficients) exp(confint(m3)) BIC(m2) BIC(m3) #Cox Regression (Survival Analysis)#### lfs = read.csv('lifespan.csv') table(lfs$event) install.packages('survival') library(survival) #Create survival object (for outcome) lfs0 = Surv(lfs$day, lfs$event, type='right') ?Surv summary(lfs0) #Survival Plots m.surv = survfit(lfs0~genotype,data=lfs) plot(m.surv) class(lfs$genotype) table(lfs$genotype) plot(m.surv,lty=2:3,pty='s',mark.time=T) legend('bottomleft', legend=levels(lfs$genotype), lty=2:3) #Test difference in survival (log-rank test) survdiff(lfs0~genotype, data=lfs) #Null:No difference in survival distribution #between the groups summary(survdiff(lfs0~genotype, data=lfs)) summary(m.surv) #Cox regression #coxph() m1 = coxph(lfs0~genotype, data=lfs) summary(m1) #HR: 0.12 #Hazard for S106/Hazard for foxonull #Hazard of death is 88% lower in S106 #compared to foxonull #Hazard of death is 8 times higher in #foxonull compared to S106 1/0.12 1/c(0.1009, 0.141) #Add treatment type m2=coxph(lfs0~genotype+RU, data=lfs) summary(m2) #HR for genotype:0.12 #Hazard is 88% lower for S106 compared #to foxonull after accounting for #Treatment type #HR for treatment type: 0.71 ?predict.coxph() #Predict hazard for fly with #genotype=foxonull, Treated predict(m2, newdata=data.frame(genotype='foxonullS106Foxo',RU='P'), type='risk') # predict(m2, # newdata=data.frame(genotype='foxonullS106Foxo',RU='P'), # type='survival') ?survfit m2.pred=survfit(m2, newdata= data.frame(genotype='foxonullS106Foxo',RU='P'), time=) plot(m2.pred)