# Overview of Regressions with R ####
# Tuesday 1st and Wednesday 2nd July 2025 
# UCL CASC course
# Welcome 

# Day 1 ####

# Linear Regression ####
# set working directory
setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/datasets")
setwd("/Volumes/groupfolders-2/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/R/Teachers R script during the course")
setwd("/Volumes/groupfolders/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/R/Teachers R script during the course")

#list files in directory
list.files()

#Load tango data
tango <- read.csv("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/datasets/QPCR tango.csv")

tango <- read.csv('QPCR tango.csv')

#Load all the datasets
load("Regressions with R datasets.Rdata")

#variables names
names(tango)

#Check data types
class(tango$Genotype)

class(tango$RU)

class(tango$Tango)

#Set as categoric
tango$Genotype=factor(tango$Genotype)
?factor

tango$RU=factor(tango$RU)
?as.factor

#Summarise data
summary(tango$Tango)

hist(tango$Tango)

#mean of Tango expression levels by RU (Treatment)
tapply(tango$Tango,tango$RU, mean)

?tapply

#T-test
t.test(Tango~RU, data=tango)#better way

t.test(tango$Tango~tango$RU)

#Mann-Whitney test
wilcox.test(Tango~RU, data=tango)

#Summarise predictors

#frequencies/count
t1=table(tango$RU)

t2=table(tango$Genotype)

#proportions
prop.table(t1)

prop.table(t2)

prop.table(t2)*100#percentages

#Fit Linear Regression models
#lm(Y~X1+X2+X3, data=data)

#glm(Y~X1+X2+X3, data=data, family='gaussian')

#lm is better as it gives more useful output

#Intercept only model
m0 = lm(Tango~1, data=tango)

summary(m0)

#Tango=0.86

#confidence interval
confint(m0)

#With Treatment type (RU) as predictor
m1 = lm(Tango~RU, data=tango)

summary(m1)

#Tango=0.71+0.30*Present

#0.71 (intercept): Mean tango expression
#for Absent group

#0.30 (slope for P):mean difference in
#tango expression between Present(P)
#and Absent (A) groups. 

#p-value:0.055 (Difference is not statistically 
#significant)

relevel()#function for changing reference

confint(m1)

# Welcome back after lunch ====

summary(m1)
confint(m1) # 95% confidence interval
?confint
coef(m1)

# diagnostic tools for linear models ====
names(tango)
names(summary(m1))
summary(m1)$r.squared
summary(m1)$adj.r.squared
summary(m1)
deviance(m1)
BIC(m1)
AIC(m1) # Akaike Information Criterion
residuals(m1)
hist(residuals(m1))
plot(tango$RU,residuals(m1))
class(tango$RU)
plot(as.numeric(tango$RU),residuals(m1))
abline(h=0) # h for horizontal

influence.measures(m1)
hist(dfbeta(m1)[,1]) # intercept
hist(dfbeta(m1)[,2]) # slope
hist(cooks.distance(m1))
hist(dffits(m1))

install.packages('car')
library(car)
vif(m1)

# exercise 1 ====

summary(tango$Genotype)
levels(tango$Genotype)
class(tango$Genotype)

m1 <- lm(Tango ~ Genotype, data = tango)
summary(m1) # default reference level is Abeta
tapply(tango$Tango, tango$Genotype, mean)


# Consider both treatment and genotype jointly. 
# Start with the plot, then add them in
# the model.
m3 <- lm(Tango ~ RU + Genotype, data = tango)
summary(m3)

# the simplest of plots:
plot(as.numeric(factor(tango$Genotype)),
     tango$Tango,
     col=c("red","green")[tango$RU],
     pch=c("X","O")[tango$RU])

# or with a bit more editing:
par(pty="s")
layout(1)
plot(as.numeric(factor(tango$Genotype)),
     tango$Tango,
     ylab="Tango",xlab="Genecode",
     cex=2,xaxt="n",
     col=c("red","green")[tango$RU],
     cex.axis=2,
     cex.lab=1.5,
     pch=c("X","O")[tango$RU])

axis(1,1:3,labels=levels(factor(tango$Genotype)),
     cex.axis=2)

legend("top",legend=levels(tango$RU)[c(1,2)],
       col=c("red","green")[c(1,2)],
       pch=c("X","O")[c(1,2)],box.lty=2)

# a. Are both terms needed in the model? 
# YES
#   b. Is the interaction term significant?
# NO, so remove the interaction 
m4 <- lm(Tango ~ RU * Genotype , data = tango)
summary(m4)

# tango = constant + b1*treatment + b2*genotype + b3*treatment*genotype
# tango = 0.79 + 0.31*Treated - 0.4*RNAI + 0.15*UASArm + 0.03*Treated*RNAI 
#      - 0.08*Treated*UASArm


#   c. Complete the gaps in the following statement:
# look at model 3 (without the interaction terms):
m3 <- lm(Tango ~ RU + Genotype, data = tango)
summary(m3)
#   Treatment is associated with a 0.3 change in the mean/average tango
# expression whilst adjusting/accounting for genotype.

# d. Is there collinearity between the two independent variables, genotype and
# treatment?
vif(m3)



m0 <- lm(Tango ~ 1, data = tango)
m1 <- lm(Tango ~ Genotype, data = tango)
m2 <- lm(Tango ~ RU, data = tango)
m3 <- lm(Tango ~ RU + Genotype, data = tango)
m4 <- lm(Tango ~ RU * Genotype , data = tango)

summary(m0)
summary(m1) # default reference level is Abeta
summary(m2) # best model 
summary(m3)
summary(m4)

BIC(m0) # intercept only
BIC(m1) # RU only
BIC(m2) # genotype only
BIC(m3) # RU and genotype
BIC(m4) # plus interaction

summary(m0)$adj.r.squared
summary(m1)$adj.r.squared
summary(m2)$adj.r.squared
summary(m3)$adj.r.squared
summary(m4)$adj.r.squared

# m3: RU and genotype

residuals(m3)
hist(residuals(m3))
hist(dfbeta(m3)[,1])
hist(dfbeta(m3)[,2])

install.packages('performance')
library(performance)
check_model(m3)

#or use:
plot(m3) # remember to hit enter for the next plot

# predictions/fitted values:
fitted(m3)
predict(m3)
# predict can be used to predict the outcome from
# specific values of the explanotary variables:
predict(m3, newdata=data.frame(RU='A',Genotype='RNAi'))
#0.414 = predicted average tango expression for subjects
# not receiving treatment and of the RNAi genotype

# to stratify: 
m5 <- lm(Tango~RU, data=tango[tango$Genotype=='Abeta',])
summary(m5) # notice the lower df (degrees of freedom)
# which are directly connected to the sample size used

# recap of some of the functions used so far ====
# setwd()
# read.csv
# list.files()
# load
# names()
# class
# factor
# as.factor
# summary
# hist
# tapply
# t.test
# wilcox.test
# table
# prop.table
# lm
# glm
# summary
# confint
# relevel
# summary(model)$adj.r.squared
# BIC
# deviance
# predict
# fitted 
# check_model
# vif
# plot(model)
# residuals
# influence.measures
# dfbeta
# dffits
# cooks.distance

# example of relevel function ====
# relevel(tango$Genotype, ref="UAS Arm")

m7 <- lm(Tango~relevel(tango$Genotype, ref="UAS Arm"), data=tango)
summary(m7)

# or:
tango$G2 <- relevel(tango$Genotype, ref="UAS Arm")
m7 <- lm(Tango~G2, data=tango)
summary(m7)







# Welcome to Day 2 ====
# Logistic regression ====
# getting to know the titanic dataset ====

setwd("/Volumes/groupfolders-1/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/R/Teachers R script during the course")
load("Regressions with R datasets.Rdata")

View(titanic)
dim(titanic)
titanic$survived
class(titanic$survived)
titanic$survived <- factor(titanic$survived)
summary(titanic$survived)

titanic$age
class(titanic$age)
summary(titanic$age)
hist(titanic$age)

plot(titanic$age, titanic$survived)

# fit the logistic model (glm) ====
m1 <- glm(survived~age,data=titanic,
          family=binomial)
summary(m1)
# null model: intercept model
# residual deviance: deviance of the fitted model
deviance(m1)
AIC(m1)

# logit(prob) = -0.13 -0.007*age
# need to exp the coefficients
exp(coef(m1))
exp(confint(m1))

# the coefficient for age: OR=0.99, 95%CI (0.98,1.00) 

# exercise 3 ====

# Explore the joint association of survival with sex, 
# age, fare and passenger class.

#Convert categorical variables into factors 

titanic$survived <- factor(titanic$survived)
titanic$pclass <- factor(titanic$pclass)
class(titanic$age)
class(titanic$sex)
class(titanic$fare)


m2 <- glm(survived ~ age +
            sex + 
            fare + 
            pclass,
          data = titanic, 
          family=binomial())
summary(m2)

# or by adding ordered(pclass), you will check 
# for a linear or quadratic effect betweeen the levels
m2 <- glm(survived ~ age +
            sex + 
            fare + 
            ordered(pclass),
          data = titanic, 
          family=binomial())
summary(m2)


#Exponentiate coefficients and confidence intervals
exp(coef(m2))
exp(confint(m2))

# What’s the Odds Ratio for gender? 0.082
# the odds survival for males is 0.082 times the odds
# of survival for females, in other words,
# the odds for males decreased by (1-0.082)*100=91.8%
# compared the females adjusting for sex, fare, pcalss

# for age: (1-0.966)*100=3.4% --> for a year increase
# in age, the odds of surival decrease by 3.4% after
# adjusting for sex, fare, pcalss

# What is the reference level for passenger class? 1st
#The reference level for passenger class is therefore pclass1

# Does a higher fare increase or decrease the odds of survival?
# the OR is just over 1, meaning that increases in fare
# increase the odds of survival, albeit not statistically 
# significant 

m3 <- glm(survived ~ age +
            sex + 
            fare,
          data = titanic, 
          family=binomial())
summary(m3)


m4 <- glm(survived ~ age +
            sex + 
            pclass,
          data = titanic, 
          family=binomial())
summary(m4)
exp(coef(m4))

BIC(m2)
BIC(m3)
BIC(m4)


# Would you consider including any interactions in the model, if so which ones?
# model with all possible interactions -> 
m5 <- glm(survived ~ age *
            sex * 
            pclass ,
          data = titanic, 
          family=binomial())
summary(m5)
exp(coef(m5))

# manually adding the desired interactions ->
m6 <- glm(survived ~ age +
            sex + 
            pclass + 
            sex*age,
          data = titanic, 
          family=binomial())
summary(m6)
exp(coef(m6))

# over and above the main effect of older age (slight reduction) 
# and males (further reduction), there was even further
# reduction in the odds of survival for older males

# What is the predicted odds and probability of survival 
# for a 20 year old female at 2nd
# class at a fare of 30 dollars?

# default option for predict is to calculate the 
# predicted odds, but that can be changed to prob
# via the type argument
predict(m2, newdata = data.frame(age=20,sex='female',
                pclass='2',
                fare=30)) # predicted odds

predict(m2, newdata = data.frame(age=20,sex='female',
                                 pclass='2',
                                 fare=30),
        type='response') # predicted probability

# dealing with missing data ====
titanic2 <- titanic[!is.na(titanic$age),]
dim(titanic2)

install.packages("generalhoslem")
library(generalhoslem)
mnew <- glm(survived~age,
            family=binomial,
        data=titanic2)
logitgof(titanic2$survived,fitted(mnew))
# looking for a non-sign difference between
# observed outcome and fitted values







# Ordinal logistic regression ====
# getting to know the climbing dataset ====
View(cd)
names(cd)
cd$height
table(cd$height)
dim(cd)

cd$height <- factor(cd$height,
                    levels=1:3,
                    labels=c('bottom','middle','top'))
summary(cd$height)

summary(cd$genomic)

table(cd$genomic,cd$height)
addmargins(table(cd$genomic,cd$height))

# fitting ordinal regression models ====

install.packages('ordinal')
library(ordinal)

?clm

m1 <- clm(height~genomic, 
          data=cd)
summary(m1)
exp(coef(m1))
# ln cumulative odds = 1.52
# exp cum odds = 4.5
# >1 -> increase in the odds of flying higher for 
# wt group vs foxonull group -> 350% increase in the
# odds (4.5-1)*100=350

table(cd$genomic,cd$height)
exp(coef(m1))
exp(confint(m1))
nominal_test(m1)

# Exercise 4 ====
cd$RU <- factor(cd$RU)
cd$genomic <- factor(cd$genomic)    
names(cd)
class(cd$day)

m0 <- clm(height ~ 1,
          data=cd)
summary(m0)

m1 <- clm(height ~ RU,
          data = cd)
summary(m1)
exp(coef(m1))

m2 <- clm(height ~ day,
          data = cd)
summary(m2)
exp(coef(m2))

m3 <- clm(height ~ genomic,
          data = cd)
summary(m3)
exp(coef(m3))

m4 <- clm(height ~ RU + genomic + day,
          data = cd)
summary(m4)
exp(coef(m4))
# those treated had a 97% increase in odds of flying
# higher, (1.97-1)*100
# 6.12 -> (6.12-1)*100=512% increase in the odds of 
# flyging higher for wt vs foxonull after adjusting 
# for treatment and duration in days

m5 <- clm(height ~ RU * genomic * day,
          data = cd)
summary(m5)

BIC(m0)
BIC(m1)
BIC(m2)
BIC(m3)
BIC(m4)
BIC(m5)

# final model: m4
exp(coef(m4))
exp(confint(m4))
nominal_test(m4)

# those with the wt gene have higher cumulative odds
# of flying higher compared to foxonull after adjusting
# treatment and days

m6 <- clm(height ~ RU + genomic + factor(day),
          data = cd)
summary(m6)
exp(coef(m6))
BIC(m6)

predict(m4, data.frame(genomic='wt',RU='P',day=5),
        type='class') # predicts outcome class
predict(m4, data.frame(genomic='wt',RU='P',day=5),
        type='prob') # predicted probability for each 
  # outcome level
predict(m4, data.frame(genomic='wt',RU='P',day=5),
        type='cum.prob') # predicted cumulative 
# probability for each outcome level

#Welcome back from lunch (Day 2)####


load("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Regressions with R/### LATEST ###/datasets/Regressions with R datasets.Rdata")

#Poisson Regression ####
list.files()
fa = read.csv('feeding assay.csv')

sapply(fa, class)

fa$Genotype=factor(fa$Genotype)
fa$Food=factor(fa$Food)

table(fa$Food)

#Model with food type
m1 = glm(Sumfeeding~Food, data=fa,
         family = poisson)

summary(m1)

#ln(Sumfeeding)=3.46+0.33*1Y+0.13*2Y

exp(m1$coefficients)

tapply(fa$Sumfeeding, fa$Food, mean)

exp(confint(m1))

#Model with Food type and Genotype

m2=glm(Sumfeeding~Food+Genotype,data=fa,
       family=poisson)

summary(m2)

table(fa$Genotype)

exp(m2$coefficients)
exp(confint(m2))

#Adding an offset variable
#(Total number of flies in each unit)

fa$Totalfeeding=fa$Sumfeeding+fa$Sumnonfeeding

m2b=glm(Sumfeeding~Food+Genotype+
          offset(log(Totalfeeding)), 
        data=fa,
        family=poisson)

summary(m2b)

exp(m2b$coefficients)

#Testing equidispersion
tapply(fa$Sumfeeding, fa$Food, 
       function(X){c(mean(X), var(X))})

tapply(fa$Sumfeeding, fa$Food, mean)

tapply(fa$Sumfeeding, fa$Food, var)

install.packages('AER')
library(AER)

dispersiontest(m2)
#Assumption of equidispersion is not satisfied

#Negative Binomial Regression
library(MASS)

m3 = glm.nb(Sumfeeding~Food+Genotype, data=fa)

summary(m3)

exp(m3$coefficients)

exp(confint(m3))

BIC(m2)
BIC(m3)

#Cox Regression (Survival Analysis)####

lfs = read.csv('lifespan.csv')

table(lfs$event)

install.packages('survival')
library(survival)

#Create survival object (for outcome)

lfs0 = Surv(lfs$day, lfs$event, 
            type='right')

?Surv

summary(lfs0)

#Survival Plots
m.surv = survfit(lfs0~genotype,data=lfs)

plot(m.surv)

class(lfs$genotype)

table(lfs$genotype)

plot(m.surv,lty=2:3,pty='s',mark.time=T)
legend('bottomleft',
       legend=levels(lfs$genotype),
       lty=2:3)

#Test difference in survival (log-rank test)
survdiff(lfs0~genotype, data=lfs)

#Null:No difference in survival distribution
#between the groups

summary(survdiff(lfs0~genotype, data=lfs))

summary(m.surv)

#Cox regression
#coxph()

m1 = coxph(lfs0~genotype, data=lfs)

summary(m1)


#HR: 0.12

#Hazard for S106/Hazard for foxonull

#Hazard of death is 88% lower in S106
#compared to foxonull

#Hazard of death is 8 times higher in 
#foxonull compared to S106

1/0.12

1/c(0.1009, 0.141)

#Add treatment type

m2=coxph(lfs0~genotype+RU, data=lfs)

summary(m2)

#HR for genotype:0.12

#Hazard is 88% lower for S106 compared
#to foxonull after accounting for
#Treatment type

#HR for treatment type: 0.71

?predict.coxph()

#Predict hazard for fly with
#genotype=foxonull, Treated

predict(m2, 
newdata=data.frame(genotype='foxonullS106Foxo',RU='P'),
type='risk')

# predict(m2, 
#         newdata=data.frame(genotype='foxonullS106Foxo',RU='P'),
#         type='survival')



?survfit
m2.pred=survfit(m2, 
    newdata=
data.frame(genotype='foxonullS106Foxo',RU='P'), 
time=)

plot(m2.pred)