#### Introduction to Statistics with R ####
# using four #s or 4 =s at the end of a comment,
# you create a collapsable heading 

# The course will begin at 9.45am 
# on Monday 7th July, 2025.
# We look forward to meeting you 
# then.

# When you arrive, please do the 
# following:
# - Login to the computer if you 
#   are using our desktop PCs 
#   (we can provide login details)
# - Open Rstudio (look under start 
#   menu and search 'Rstudio')
# - Open the following web page:
#   http://casc-platforms.com/summerR.html 


# UCL Guest Wifi (Requires setting up sky account)

# Applet
# https://statspace.elearning.ubc.ca/handle/123456789/76
# CLT applet leading to:
# https://www.zoology.ubc.ca/~whitlock/Kingfisher/CLT.htm 

# comment 
# http://casc-platforms.com/ 

# Fundamentals rules of R 
# comment 
# case sensitive
# calculator
# vocabulary, syntax, grammar 
# objects and functions = 2 main families of entries
# function = 'verb' / action
# objects = e.g. datasets, variables, table, graph etc
# object orientated language
# symbols: () {} [] , '' "" = > < etc 

# defining a new object ====
qone=81
# or 
qone <- 81
# if you add a space between < and - :
qone < - 81 # is qone less than -81?

# first use of a function ====
# need a function to calculate the square root
# squareroot(qone) # this is wrong as that is
# not the function name
sqrt(qone)

# some other objects ====
pi
letters
LETTERS

# some other functions:
date()

# setting the working directory, i.e. the ====
# folder you have saved the datasets, etc.

getwd() # gives you the location of the current
    # working directory
# Session - Set Working Directory - Choose Directory
setwd("/Volumes/groupfolders/ICH_StatsAdmin/Lecture Notes-Presentations/Summer Winter School/2024 2025/Teacher's script file")

setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Summer Winter School/2024 2025/Teacher's script file")

list.files()

# read in R the acupuncture data set ====
acu = read.csv("acupuncture data.csv")
peth <- read.csv("pethidine.csv")

# types of variables ====


acu
View(acu)
# age # this won't work because i have not defined the 
    # dataset
acu$age
dim(acu) # dimensions: number of rows & numbers of columns
names(acu)

acu$age
class(acu$age)
# numeric 
# integer for round whole numbers
# factor = categorical
# character which cannot be analysed in R, i.e. text and 
# needs to become a factor to be analysed 

class(acu$pk1)
class(acu$sex)

# acu dataset is a 2 dimensional object, i.e it has 
# rows and columns
# datasetname[rownumbers,columnnumbers]
acu[,2] # the 2nd column of the acu dataset
# instead of:
acu$age

# pull out all the data of the 5th subject/row:
acu[5,]

acu[acu[,1]==138, ] # from the acu dataset, give me
# all the data of all the variables from the
# subject whose ID number (column 1) is 138
# or: 
acu[acu$id==138, ] 

class(acu$id)

class(acu) # data frame = data set

# help ====
?sqrt

# deleting an object ====
# rm
rm(xx) # remove/delete 

# remove a column and export 
acu <- acu[,-94]
# i am redefining acu to be equal to the original acu
# dataset except column number 95
# recap of functions used so far ====
# citation() # how to cite R
# sqrt # square root
# getwd # get working directory
# setwd # change the working directory
# read.csv # read a csv file in R
# names # list of all the variable names 
# dim # dimensions of a dataset
# class # type of variable/object 
# View # open the data in a new tab
# list.files # list all files in wd
# ? 
# rm

# Alternatively (using variable name): 
#   #using subset function
#   subset(acu, select=-gender)
# or 
# acu$gender=NULL
# 
# and date format 

# Welcome to Day 2 ====


# Edit - Folding - Collapse all 
# Editing and using variables ====
names(peth)
names(peth)[5]
peth[42,6] # the 42nd row of the 6th column (Gestage)
# or
peth[42,"Gestage"]
# or
peth[peth$Baby=='42','Gestage']
View(peth)
class(peth)

# all the data of babies with
# gestation age < 35
peth[peth$Gestage<=35,]

# or if you are intersted in
# specific columns:
peth[peth$Gestage<=35,c('Female','CD4')]

# i am now interested in the Female variable
# for those babies with really low OR (|) high
# gestational age
peth[peth$Gestage<=33 | peth$Gestage>=41, 'Female']

# order of symbols: 
# <=
# >=
# ==

# examples of the c function
c(letters, LETTERS)
c(5,1,89,-45,pi)

# create a new variable
peth$ID <- 1:42 # from 1 to 42
View(peth)
dim(peth)
# or
peth$ID <- 1:dim(peth)[1] # from 1 to the total number
# of rows in the peth dataset

# to reorder variables/columns:
dim(peth)
names(peth)
peth$ID <- 1:dim(peth)[1]
peth$ID
dim(peth)
names(peth)
peth <- peth[,c(10,1:9)] # I have moved ID from the 10th column
# to the first column followed by the original 9 columns
names(peth)
peth[,c(1,10,2:9)] # example where ID is moved to the 
# 2nd place


# create a new variable in the acu datasets, called age2
# which should be equal to age squared (^)
acu$age2 <- acu$age^2
acu$age2
View(acu)


# Edit - Clear Console 
# Summarising categorical variables ====
names(acu)[1:10]
# sex: 0:males, 1 :females
acu$sex
class(acu$sex)
summary(acu$sex)
# create a new variable sex2 to be the factor of sex
# factor = categorical variable
acu$sex2 <- factor(acu$sex)
acu$sex2
class(acu$sex2)
summary(acu$sex2)
# we will now add labels to levels 0 and 1
acu$sex2 <- factor(acu$sex,
                   levels=c(0,1),
                   labels=c('males',"females"))
summary(acu$sex2)
# to calculate prop/%s we need a table of frequencies
table(acu$sex2)
prop.table(table(acu$sex2)) # takes a table and makes it
# into proportions
prt <- prop.table(table(acu$sex2))*100
round(prt) # rounding to whole numbers
round(prt,2) # with two decimal points
# or in one step: 
round(prop.table(table(acu$sex2))*100)

# to add the % sign next to the percentages: 
paste(round(prt), '%')
?round

table(acu$sex2)
addmargins(table(acu$sex2)) # add totals round a table

# manually calculation of odds:
64/337
# or name the table and pull the numbers automatically:
ts <- table(acu$sex2)
ts
ts[1]/ts[2] # ratio of probabilities

acu$sex <- factor(acu$sex)
# graphs for a single categorical variable ====
acu$withdrawal_reason
class(acu$withdrawal_reason)
barplot(table(acu$withdrawal_reason))
summary(acu$withdrawal_reason)

acu$wr <- factor(acu$withdrawal_reason)
class(acu$wr)
summary(acu$wr)
barplot(table(acu$wr),
        main='Title of the plot: First barplot',
        sub='Subtitle position',
        col=1:8, # color of the bars,
        xlab='Withdrawal reason', # labels of the xaxis
        ylab='Frequencies' # label of y axis
        )

# for sex
barplot(ts)


# Summarising numerical variables ====
names(acu)[1:10]
# pk1: baseline headache score
acu$pk1
class(acu$pk1)
summary(acu$pk1)
mean(acu$pk1)
median(acu$pk1)
min(acu$pk1)
max(acu$pk1)
IQR(acu$pk1)
sd(acu$pk1)
var(acu$pk1)
quantile(acu$pk1,0.65) 

# graph of a single numerical variable ====
hist(acu$pk1,breaks=5,
     ylim=c(0,200), # limits of the y axis
     main='Histogram of pk1'
     )

abline(v=27.5) # add a line on an existing graph
  # can only be used in 3 occassions
  # 1st to add a vertical line (v)
  # 2nd to add a horizontal line (h)
  # 3rd to add a line with a given constant and slope 

abline(v=median(acu$pk1))

lines(x=c(42.6,42.6),y=c(-0.43,80))
  # the lines function adds a line based on 
  # x and y coordinates


# Exercises ====

# Compare the length of the pk5 variable against the 
# actual number of observed measurements through summary
# pk5: headache score at 12 months
length(acu$pk5)
acu$pk5
class(acu$pk5)
summary(acu$pk5)
401-100
# 301 observed values


# How many females were aged between 20 and 25 
# years old and how many 
# males were in the same age group?
# use of square brackets
from20to25 <- acu[acu$age>=20 | acu$age<=25, 'sex']
from20to25
length(from20to25)
table(from20to25)

# Check the class of the ‘group’ variable and 
# change the class to 
# something more appropriate, i.e. factor.
acu$group
class(acu$group)
acu$group2 <- factor(acu$group)
# or 
acu$group2 <- factor(acu$group,
                     levels=c(0,1),
                     labels=c('placebo','acupuncture'))
class(acu$group2)
summary(acu$group2)

# Define the following object in R:   
   mixedvar <- c(1,54,'cat',6,'red')
   mixedvar
# What is the class of mixedvar? 
   class(mixedvar)
# What happens 
# if you turn it into a 
# numeric variable?
   mixedvar <- as.numeric(mixedvar)   
   mixedvar
   
#   
# Summarise the variables age, sex, migraine and 
# chronicity from 
# the acupuncture dataset (variables 2, 3, 4 and 5). 
# Check how many 
# non-missing values each summary is based on. 
# For numerical variables,
# also calculate the standard deviation and 
# interquartile range. 
# 

summary(acu$age)  
sd(acu$age)
IQR(acu$age)
  # no NAs for age

summary(acu$chronicity)  
sd(acu$chronicity)
IQR(acu$chronicity)
  # no NAs for chronicity

summary(acu)

summary(acu$sex2)
table(acu$migraine)   
summary(acu$migraine)

# the quickest way to get to the number of NAs is the
# summary function 
   
# Note: For the variable sex, 0 = male and 1 = female. 
# For the variable 
# migraine, 0 = no (tension-type) and 
# 1 = yes (migraine). 
# group= 0: placebo, 1: acupuncture


# exploring the Normal distribution of the data ====
hist(acu$pk1)
summary(acu$pk1)
mean(acu$pk1)+1.96*sd(acu$pk1) # upper limit of the range
mean(acu$pk1)-1.96*sd(acu$pk1) # lower limit of the range

# exercise 8 ====
hist(acu$chronicity)
urr <- mean(acu$chronicity)+1.96*sd(acu$chronicity) # upper
lrr <- mean(acu$chronicity)-1.96*sd(acu$chronicity) # lower
summary(acu$chronicity)

urr
lrr
table(acu$chronicity>=48.4)
# or 
table(acu$chronicity>=urr)
table(acu$chronicity<=lrr)

table(acu$chronicity>=48.4 | acu$chronicity<=-5.5)

# Comparing groups ====
# correlation coefficient - association between two
# numerical variables ====
acu$pk1
summary(acu$pk1)
hist(acu$pk1)

acu$allmedsbaseline
summary(acu$allmedsbaseline)
hist(acu$allmedsbaseline)

cor(acu$pk1,acu$allmedsbaseline) # Pearson's correlation coef
plot(acu$pk1,acu$allmedsbaseline) # scatterplot


# one numeric and one categoric ====
mean(acu$age[acu$sex==0]) # average age for males
mean(acu$age[acu$sex==1]) # average age for females
# or i can use a new function: tapply
tapply(FUN=mean,
       X=acu$age, # X: numerical variable
       INDEX=acu$sex2) # INDEX: categorical variable
# apply function MEAN on the age variable and split
# the results by males and females (i.e.the sex variable)
# or you can shorten the command by using the arguments in
# their default order:
?tapply
# tapply(X,INDEX,FUN)
tapply(acu$age,acu$sex2,mean)

tapply(acu$age,acu$sex2, summary)
tapply(acu$age,acu$sex2, sd)

tapply(acu$age,acu$sex2, median)


# two categorical variables ====
table(acu$group2,acu$response)

#Welcome to Day 3 ####

#Recap of day 2 (last session)####


acu$group2 <- factor(acu$group,
                     levels=c(0,1),
                     labels=c('placebo','acupuncture'))


acu$sex2 <- factor(acu$sex,
                   levels=c(0,1),
                   labels=c('males',"females"))

#numeric differences
tapply(acu$age, acu$sex2, mean)

tapply(acu$age, acu$sex2, sd)

tapply(acu$age, acu$sex2, 
       function(X){c(mean(X),sd(X))})

?tapply

tapply(FUN=mean, INDEX=acu$sex2, X=acu$age)

#categoric differences
#compare number of good responses between acupuncture
#and control groups

table(acu$group2, acu$response)

#add label to response

acu$response2 = factor(acu$response,
                       levels=c(0,1),
                       labels=c('poor', 'good'))

t.gr = table(acu$group2, acu$response2)

#get proportions
prop.table(t.gr)#overall proportions

prop.table(t.gr, margin = 1)#get row proportions

#get %
prop.table(t.gr, margin = 1)*100

#difference in proportions

pt.gr=prop.table(t.gr, margin = 1)#get row proportions

pt.gr[2,2]-pt.gr[1,2]

#Relative Risk (RR)
pt.gr[2,2]/pt.gr[1,2]
#1.68 times higher likelihood (risk) of good response
#in acupuncture group compared to placebo group

#Odds ratio

odds_acu= pt.gr[2,2]/pt.gr[2,1]

odds_pla= pt.gr[1,2]/pt.gr[1,1]

#OR
odds_acu/odds_pla
#Odds of a good response is 2.48 times higher in
#acupuncture group compared to placebo

#Another way of getting RR and OR (using epiR)
install.packages('epiR')
library(epiR)

?epi.2by2

epi.2by2(t.gr)#Uses table wrongly

#change ordering of rows and columns
t.gr2=t.gr[c(2,1), c(2,1)]

t.gr2

epi.2by2(t.gr2)

#missing data
#noted as NA in R

is.na(acu$pk5) #identify those with missing values

which(is.na(acu$pk5))


sum(is.na(acu$pk5))#number of missing values

table(is.na(acu$pk5))#number of missing 
#and non-missing

#identify those with non-missing data
!is.na(acu$pk5) 

?is.na

#replace -999 as NA
#acu$pk5[acu$pk5==-999]=NA

#number that are non-missing
sum(!is.na(acu$pk5))

#Create subset of original data containing
#those with non-missing pk5

acu2 = acu[!is.na(acu$pk5),]

dim(acu2)

#Identify all rows with complete data in acu

complete.cases(acu)

#number with complete data (no missing values)
sum(complete.cases(acu))


#Exercise 9 ####
tapply(acu2$age, acu2$group2, 
       function(X){c(mean(X), sd(X))})


t.sex=table(acu2$sex2, acu2$group2)

t.sex

round(prop.table(t.sex, margin=2)*100)

#Set migraine as categoric

acu2$migraine2 = factor(acu2$migraine,
                        levels = c(0,1),
    labels=c('tension-type headache','migraine'))

table(acu2$migraine2, acu2$group2)

t.mg = table(acu2$migraine2, acu2$group2)

prop.table(t.mg, margin=2)*100

#chronicity
tapply(acu2$chronicity, acu2$group2, 
       function(X){c(mean(X), sd(X))})

#Break until 1.55pm

# Exercise 10 ####

# preliminary calculations
mean_Ex10 <- mean(acu$age)
sd_Ex10 <- sd(acu$age)
n_Ex10 <- sum(!is.na(acu$age))
se_Ex10 <- sd_Ex10/sqrt(n_Ex10)

# 95% confidence interval
mean_Ex10+(1.96*se_Ex10)
mean_Ex10-(1.96*se_Ex10)

# does this work?
SE = sd_Ex10/sqrt(sum(acu$age))
# No

# 95% range
mean_Ex10+(1.96*sd_Ex10)
mean_Ex10-(1.96*sd_Ex10)

 # break until 3:45

# t.test can give us CIs

t.test(acu$age)

#Welcome to Day 4 ####

# Significance testing ####

#One Sample t-test ####
t.test(acu$age)
# p-value < 2.2e-16
# p < 2.2 x 10^-16
# p < 0.0001 / 0.001

t.test(acu$age, mu =  45)
# observed vs hypothesis ... whats the difference?
# translate that into a 'test statistic'
# whats the probability of the observed

# two-sample t-test ####

# (1) Is there a significant difference in 
# the average age of patients in the acupuncture 
# dataset that completed the 12-month follow-up 
# and those that did not?

# age is outcome (mean)
# completed 12 month fup vs. not (e.g. pk5)

# data frame only containing people with 12 month
# follow-up
acu2 <- acu[!is.na(acu$pk5),]
dim(acu2)
# only those with missing 12 month data
acu3 <- acu[is.na(acu$pk5),]
dim(acu3)

# summary with 12 month follow up data
summary(acu2$age)
# without
summary(acu3$age)

# do t-test
t.test(acu2$age, acu3$age, var.equal = FALSE)
46.33887 - 43.13000 

# Normal distribution of the outcome 
# variable in each group
hist(acu2$age)
hist(acu3$age)

# Sample size at least 20 in each group
length(acu2$age) # doesn't check missing data
dim(acu2)
summary(acu2$age)
# these do
sum(!is.na(acu2$age))
sum(!is.na(acu3$age))
#but this one is scary!
length(acu2$age[!is.na(acu2$age)])
# maybe this is good too ...
length(na.omit(acu2$age))
length(na.omit(acu3$age))

# Approximate equality of the standard 
# deviations of the two groups/variables.
# one SD is not more than twice the other.
sd(acu2$age)
sd(acu3$age)

# (2) Is there a significant difference in the 
# 12 month follow up values of patients in the 
# groups receiving and not receiving 
# acupuncture?

# outcome = pk5
# acupuncture vs. control

t.test(acu$pk5 ~ acu$group2)
# ~ tilde = "depends on"

# or, alternatives:
t.test(pk5 ~ group2, data = acu)

# break until 11.20

# paired t-test ####

# (3) Is there a significant difference
# in the baseline and 12-month follow 
# up headache scores of the control 
# group in the acupuncture data set?

# baseline (pk1) vs. 12 month (pk5)
# on the subset of control treatment.

t.test(acu$pk1[acu$group == 0], 
  acu$pk5[acu$group == 0], 
  paired = TRUE)
 
# There is a significant decrease in 
# the control group over time.
# (p < 0.001)

summary(acu$pk1[acu$group == 0])
summary(acu$pk5[acu$group == 0])

# same as one sample t-test ...
acu$diff <- acu$pk5 - acu$pk1

t.test(acu$diff[acu$group == 0], 
  mu = 0)

# assumptions? 

# normally distributed data
# ... of the differences
hist(acu$diff[acu$group == 0])

# n > 20
sum(!is.na(acu$diff) & acu$group == 0) 

# chi- squared test ####

# (4) How does patients’ response 
# (>35% improvement from baseline in 
# headache severity score compared to 
# baseline) compare between acupuncture 
# and control treatment?

# summary (as we saw earlier)
t.gr
round(100*prop.table(t.gr, margin = 1),1)

# let's test to see if this is signif
chisq.test()

# significant!

# alternative ...

prop.test(t.gr)

# assumptions:
# n >20 in both groups (min 40 overall)
# 0.1 < p < 0.9 in both groups.
addmargins(t.gr)

# mann-whitney U test ####

# (1) Is there a significant difference in 
# the average age of patients in the acupuncture 
# dataset that completed the 12-month follow-up 
# and those that did not?

# acu2 = data frame containing just the people
# with 12 month follow up
dim(acu2)
# acu3 = those that dropped out.
dim(acu3)

# do t-test
t.test(acu2$age, acu3$age)

# non-parametric equivalent
wilcox.test(acu2$age, acu3$age)
median(acu2$age)
median(acu3$age)

# (2) does acupuncture work?

# t-test
t.test(acu$pk5 ~ acu$group2)

# alternative ...
wilcox.test(acu$pk5 ~ acu$group2)

tapply(acu$pk5, acu$group2, summary)

# lunch until 1.45 pm - see you soon!

#Fisher's exact test ####

#Assumption for chi-square test

#- n>20 in each group
#- non-extreme proportions/percentages in each group
#(i.e. 0.1<p<0.9)

#If above are violated, use Fisher's test
fisher.test(t.gr)

#p-value<0.001
#The difference in proportion of good responses
#between acupuncture and placebo groups is 
#statistically significant

prop.table(t.gr, margin=1)

#ANOVA ####

table(acu$acupuncturist)

#RQ: Is there a difference in baseline headache
#score (pk1) across the acupuncturists participants
#were assigned to

#Null hypothesis: no difference in pk1 across
#levels of acupuncturist

aovtest = aov(pk1~acupuncturist, data=acu)

summary(aovtest)

#difference in average pk1 between the groups is
#not statistically significant (p-value=0.724)

tapply(acu$pk1, acu$acupuncturist, mean)

tapply(acu$pk1, acu$acupuncturist, sd)

#post-hoc test

#bonferroni correction
pairwise.t.test(acu$pk1,acu$acupuncturist,
                p.adjust.method = 'bonferroni')

?p.adjust.methods

#Kruskal-Wallis ANOVA ####

#RQ: Is there a difference in baseline headache
#score (pk1) across the acupuncturists participants
#were assigned to

#Null hypothesis: no difference in pk1 across
#levels of acupuncturist

kruskal.test(pk1~acupuncturist, data=acu)

#There is some difference in the distribution of 
#pk1 across the groups (p-value=0.012)

pairwise.wilcox.test(acu$pk1, acu$acupuncturist,
              p.adjust.method ='bonferroni')


tapply(acu$pk1, acu$acupuncturist, median)

table(acu$acupuncturist)

#pairwise.prop.test() pairwise test for 
#chi square

#Exercise 11 ####
#q1
t.test(acu$pk1[acu$group == 1],
       acu$pk5[acu$group == 1],
       paired = TRUE)

#p-value: 3.74 *10^-14
#p-value<0.001

#Significant difference between the baseline 
#and 12 months scores for acu group

hist(acu$diff[acu$group==1])

#wilcoxon paired test
wilcox.test(acu$diff[acu$group==1])

wilcox.test(acu$pk1[acu$group == 1],
       acu$pk5[acu$group == 1],
       paired = TRUE)


#q2
#1st method (different paired t-test for
#each group)

t.test(acu$pk1[acu$group == 1],
       acu$pk5[acu$group == 1],
       paired = TRUE)

t.test(acu$pk1[acu$group == 0],
       acu$pk5[acu$group == 0],
       paired = TRUE)

#2nd method (Two-sample t-test on diff)
acu$diff = acu$pk1-acu$pk5

hist(acu$diff[acu$group==1])
hist(acu$diff[acu$group==0])

t.test(diff~group2, data=acu)

#q3
t.test(acu$pk1)
t.test(acu$pk5)

#Baseline (pk1): 95%CI:(24.92, 28.10)

#12-month (pk5): 95%CI:(17.31, 20.85)

t.test(acu$diff)

#95% CI for difference: (5.14, 7.83)


#Welcome to Day 5! ####

#Load data and setup (peth) ####
peth <- read.csv("pethidine.csv")

peth$sex=factor(peth$Female,
            levels=c(0,1),
            labels=c('male','female'))


peth$Pethidine2=factor(peth$Pethidine,
                levels=c(0,1),
                labels=c('No','Yes'))

peth$Ethnicity2=factor(peth$Ethnicity,
          levels=c(0,1,2),
    labels=c('White','Black','Asian'))


#check for missing
sapply(peth, function(X){sum(is.na(X))})

sum(is.na(peth$CD4))


#Linear Regression ####

#m <-lm(Y~X, data=data)#one predictor

#m <-lm(Y~X1+X2+X3, data=data)#multiple predictors


?lm

#One numeric predictor (Birthweight)####
m1 = lm(VmaxFRC~Birthweight, data=peth)
summary(m1)

#VmaxFRC=26.63+0.04*BW

#Intercept: If BW=0, mean VMaxFRC will
#be 26.63

#Slope (for BW): For each 1g increase
#in BW, VmaxFRC is expected to increase
#by 0.04

#For 100g increase in BW, VmaxFRC is
#expected to increase by 4 

#p-value for BW (0.036): The association 
#between BW and VmaxFRC is statistically
#significant

confint(m1)

#95% CI for slope for BW: (0.003,0.07)

m1$coefficients
names(m1)

#Adjusted R-squared: 0.085 or 8.5%
#8.5% of total variability in Vmax is 
#explained by model including BW only

#plot fitted model
plot(peth$Birthweight, peth$VmaxFRC,
     xlim=c(0,3000))

#add line
abline(m1, col='red')

#One Binary Predictor (Sex)####

m2 = lm(VmaxFRC~sex, data=peth)
summary(m2)

#VmaxFRC=91.88+23.42*Female

#91.88(Intercept): The mean value of
#Vmax if Female=0(ie. Male). The mean
#Vmax for males is 91.88

#Slope(23.42): The mean difference in 
#Vmax between female and male is 23.42

#or: Females have higher Vmax on average 
#by 23.42 compared to males

#p-value for sex (0.098): The difference
#in Vmax between males and females is 
#not stats sig

confint(m2)

#95% CI for slope: (-4.49, 51.33)

plot(jitter(peth$Female), peth$VmaxFRC)

# recap of functions so far (before lunch Day 5) ====
# sapply
# lm
# summary
# coef
# confint
# names
# plot
# abline 
# exercise 13 ====
m3 <- lm(VmaxFRC~Pethidine2, data=peth)
summary(m3)   
# 108: predicted value of VMAXFRC for the group
# not taking pethidine

# -38: decrease in average VMAXFRC for
# mothers taking pethidine vs not

# p = 0.02 < 0.05 so a statistically 
# significant decrease

coef(m3)
confint(m3)
# we are compatible with a decrease between
# 6.4 to 70.4 in the population with 
# 95% confidence 

names(summary(m3))
summary(m3)$adj.r # 11.1%
# compare to earlier models:
summary(m2)$adj.r # 4.7%
summary(m1)$adj.r # 8.4%

# predictions:
m3.pred <- predict(m3, 
    newdata = data.frame(Pethidine2=c('No','Yes')))
m3.pred
# 108 is the average predicted Vmax for the No group
# 69.6 is the average predicted Vmax for the Yes group
tapply(peth$VmaxFRC,peth$Pethidine2,mean,na.rm=T)

# boxplot
plot(peth$Pethidine2, peth$VmaxFRC)
# instead, you should prefer the dot plot
plot(jitter(peth$Pethidine), peth$VmaxFRC, 
     xlab = "Pethidine Use", 
     ylab = "VmaxFRC", 
     xaxt="n") # x axis type=null
axis(1, # 1: x axis 
     at=c(0,1), # draw marks at points 0 and 
     labels=c("No","Yes")) # label the marks No and Yes
points(x = c(0,1), # the x coordinates of the points
       y = m3.pred, # the y coordinates of the points
       pch="-", # point character/symbol = -
       col="red", # colour
       cex=4) # character expansion/size of symbol

# two numerical predictors ====
m1 <- lm(VmaxFRC ~ Birthweight, data=peth)
# I now want to adjust for gestation age 
m4 <- lm(VmaxFRC ~ Gestage + Birthweight,
         data=peth)
summary(m4)
coef(m4)
# 100gr increase in birthweight is associated with
# 8units increase (100*0.08) on average vmax while
# adjusting for gestational age 
confint(m4)
# The 8units increase could be between 3.3 to 13.2 units
# in the population with 95% confidence 
coef(m1)
predict(m4, newdata=data.frame('Gestage'=40,
                            'Birthweight'=2400))
# the predicted average Vmax for babies borth at 
# 40 weeks weighing 2400 grs is 107

# use of dummy variables with ethnicity ====
m5 <- lm(VmaxFRC ~ Ethnicity2,
         data=peth)
summary(m5)
# White: the reference category/level
# chosen automatically by R (but it CAN
# be changed, with function relevel)
# Intercept = average vmax when all predictors
# are equal to 0, i.e. black=0, asian=0 --> WHITE
# intercept is the value of the outcome for the
  # reference category

# -46: change of the vmax for black compared to
  # white, decrease of 46 units

# -29: change of the vmax for asian compared to
# white, decrease of 29 units

m6 <- lm(VmaxFRC ~ relevel(Ethnicity2,ref='Black'),
         data=peth)
summary(m6)


# exercise 15 ====

m1 <- lm(VmaxFRC ~ Birthweight
          , data=peth)
m2 <- lm(VmaxFRC ~ sex
          , data=peth)
m3 <- lm(VmaxFRC ~ Gestage + Birthweight
          , data=peth)
m4 <- lm(VmaxFRC ~ Ethnicity2
          , data=peth)
m5 <- lm(VmaxFRC ~ Ethnicity2 + Gestage + Birthweight
         + sex,
          , data=peth)

deviance(m1) # VmaxFRC~Birthweight
deviance(m2) # VmaxFRC~sex
deviance(m3) # VmaxFRC~Gestage + Birthweigh
deviance(m4) # VmaxFRC~ Ethnicity2
deviance(m5) # VmaxFRC ~ Ethnicity2+sex+Gestage+Birthweigh 
# m5: lowest deviance

AIC(m1)
AIC(m2)  
AIC(m3)
AIC(m4)
AIC(m5)
# consistent with deviance, m5 the best


BIC(m1)
BIC(m2) 
BIC(m3)
BIC(m4)
BIC(m5)
# consistent with deviance, m5 the best

summary(m5)


# diagnostics (residuals, influential and vif)
summary(m5)

hist(residuals(m5))
plot(peth$Gestage[!is.na(peth$Female) & !is.na(peth$VmaxFRC)],
     residuals(m5))
abline(h=0,col='red')

head(dfbeta(m5)) # first 6 entries
tail(dfbeta(m5)) #  bottom 6 entries
hist(dfbeta(m5)[,1]) # for the intercept
hist(dfbeta(m5)[,2]) # for black
hist(dfbeta(m5)[,3]) # for asian
hist(dfbeta(m5)[,4]) # for gestage and more

# dffits()
# cooks.distance()

install.packages('car') # companion of applied regression
library(car)
vif(m5)