#### Introduction to Statistics with R #### # using four #s or 4 =s at the end of a comment, # you create a collapsable heading # The course will begin at 9.45am # on Monday 7th July, 2025. # We look forward to meeting you # then. # When you arrive, please do the # following: # - Login to the computer if you # are using our desktop PCs # (we can provide login details) # - Open Rstudio (look under start # menu and search 'Rstudio') # - Open the following web page: # http://casc-platforms.com/summerR.html # UCL Guest Wifi (Requires setting up sky account) # Applet # https://statspace.elearning.ubc.ca/handle/123456789/76 # CLT applet leading to: # https://www.zoology.ubc.ca/~whitlock/Kingfisher/CLT.htm # comment # http://casc-platforms.com/ # Fundamentals rules of R # comment # case sensitive # calculator # vocabulary, syntax, grammar # objects and functions = 2 main families of entries # function = 'verb' / action # objects = e.g. datasets, variables, table, graph etc # object orientated language # symbols: () {} [] , '' "" = > < etc # defining a new object ==== qone=81 # or qone <- 81 # if you add a space between < and - : qone < - 81 # is qone less than -81? # first use of a function ==== # need a function to calculate the square root # squareroot(qone) # this is wrong as that is # not the function name sqrt(qone) # some other objects ==== pi letters LETTERS # some other functions: date() # setting the working directory, i.e. the ==== # folder you have saved the datasets, etc. getwd() # gives you the location of the current # working directory # Session - Set Working Directory - Choose Directory setwd("/Volumes/groupfolders/ICH_StatsAdmin/Lecture Notes-Presentations/Summer Winter School/2024 2025/Teacher's script file") setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Summer Winter School/2024 2025/Teacher's script file") list.files() # read in R the acupuncture data set ==== acu = read.csv("acupuncture data.csv") peth <- read.csv("pethidine.csv") # types of variables ==== acu View(acu) # age # this won't work because i have not defined the # dataset acu$age dim(acu) # dimensions: number of rows & numbers of columns names(acu) acu$age class(acu$age) # numeric # integer for round whole numbers # factor = categorical # character which cannot be analysed in R, i.e. text and # needs to become a factor to be analysed class(acu$pk1) class(acu$sex) # acu dataset is a 2 dimensional object, i.e it has # rows and columns # datasetname[rownumbers,columnnumbers] acu[,2] # the 2nd column of the acu dataset # instead of: acu$age # pull out all the data of the 5th subject/row: acu[5,] acu[acu[,1]==138, ] # from the acu dataset, give me # all the data of all the variables from the # subject whose ID number (column 1) is 138 # or: acu[acu$id==138, ] class(acu$id) class(acu) # data frame = data set # help ==== ?sqrt # deleting an object ==== # rm rm(xx) # remove/delete # remove a column and export acu <- acu[,-94] # i am redefining acu to be equal to the original acu # dataset except column number 95 # recap of functions used so far ==== # citation() # how to cite R # sqrt # square root # getwd # get working directory # setwd # change the working directory # read.csv # read a csv file in R # names # list of all the variable names # dim # dimensions of a dataset # class # type of variable/object # View # open the data in a new tab # list.files # list all files in wd # ? # rm # Alternatively (using variable name): # #using subset function # subset(acu, select=-gender) # or # acu$gender=NULL # # and date format # Welcome to Day 2 ==== # Edit - Folding - Collapse all # Editing and using variables ==== names(peth) names(peth)[5] peth[42,6] # the 42nd row of the 6th column (Gestage) # or peth[42,"Gestage"] # or peth[peth$Baby=='42','Gestage'] View(peth) class(peth) # all the data of babies with # gestation age < 35 peth[peth$Gestage<=35,] # or if you are intersted in # specific columns: peth[peth$Gestage<=35,c('Female','CD4')] # i am now interested in the Female variable # for those babies with really low OR (|) high # gestational age peth[peth$Gestage<=33 | peth$Gestage>=41, 'Female'] # order of symbols: # <= # >= # == # examples of the c function c(letters, LETTERS) c(5,1,89,-45,pi) # create a new variable peth$ID <- 1:42 # from 1 to 42 View(peth) dim(peth) # or peth$ID <- 1:dim(peth)[1] # from 1 to the total number # of rows in the peth dataset # to reorder variables/columns: dim(peth) names(peth) peth$ID <- 1:dim(peth)[1] peth$ID dim(peth) names(peth) peth <- peth[,c(10,1:9)] # I have moved ID from the 10th column # to the first column followed by the original 9 columns names(peth) peth[,c(1,10,2:9)] # example where ID is moved to the # 2nd place # create a new variable in the acu datasets, called age2 # which should be equal to age squared (^) acu$age2 <- acu$age^2 acu$age2 View(acu) # Edit - Clear Console # Summarising categorical variables ==== names(acu)[1:10] # sex: 0:males, 1 :females acu$sex class(acu$sex) summary(acu$sex) # create a new variable sex2 to be the factor of sex # factor = categorical variable acu$sex2 <- factor(acu$sex) acu$sex2 class(acu$sex2) summary(acu$sex2) # we will now add labels to levels 0 and 1 acu$sex2 <- factor(acu$sex, levels=c(0,1), labels=c('males',"females")) summary(acu$sex2) # to calculate prop/%s we need a table of frequencies table(acu$sex2) prop.table(table(acu$sex2)) # takes a table and makes it # into proportions prt <- prop.table(table(acu$sex2))*100 round(prt) # rounding to whole numbers round(prt,2) # with two decimal points # or in one step: round(prop.table(table(acu$sex2))*100) # to add the % sign next to the percentages: paste(round(prt), '%') ?round table(acu$sex2) addmargins(table(acu$sex2)) # add totals round a table # manually calculation of odds: 64/337 # or name the table and pull the numbers automatically: ts <- table(acu$sex2) ts ts[1]/ts[2] # ratio of probabilities acu$sex <- factor(acu$sex) # graphs for a single categorical variable ==== acu$withdrawal_reason class(acu$withdrawal_reason) barplot(table(acu$withdrawal_reason)) summary(acu$withdrawal_reason) acu$wr <- factor(acu$withdrawal_reason) class(acu$wr) summary(acu$wr) barplot(table(acu$wr), main='Title of the plot: First barplot', sub='Subtitle position', col=1:8, # color of the bars, xlab='Withdrawal reason', # labels of the xaxis ylab='Frequencies' # label of y axis ) # for sex barplot(ts) # Summarising numerical variables ==== names(acu)[1:10] # pk1: baseline headache score acu$pk1 class(acu$pk1) summary(acu$pk1) mean(acu$pk1) median(acu$pk1) min(acu$pk1) max(acu$pk1) IQR(acu$pk1) sd(acu$pk1) var(acu$pk1) quantile(acu$pk1,0.65) # graph of a single numerical variable ==== hist(acu$pk1,breaks=5, ylim=c(0,200), # limits of the y axis main='Histogram of pk1' ) abline(v=27.5) # add a line on an existing graph # can only be used in 3 occassions # 1st to add a vertical line (v) # 2nd to add a horizontal line (h) # 3rd to add a line with a given constant and slope abline(v=median(acu$pk1)) lines(x=c(42.6,42.6),y=c(-0.43,80)) # the lines function adds a line based on # x and y coordinates # Exercises ==== # Compare the length of the pk5 variable against the # actual number of observed measurements through summary # pk5: headache score at 12 months length(acu$pk5) acu$pk5 class(acu$pk5) summary(acu$pk5) 401-100 # 301 observed values # How many females were aged between 20 and 25 # years old and how many # males were in the same age group? # use of square brackets from20to25 <- acu[acu$age>=20 | acu$age<=25, 'sex'] from20to25 length(from20to25) table(from20to25) # Check the class of the ‘group’ variable and # change the class to # something more appropriate, i.e. factor. acu$group class(acu$group) acu$group2 <- factor(acu$group) # or acu$group2 <- factor(acu$group, levels=c(0,1), labels=c('placebo','acupuncture')) class(acu$group2) summary(acu$group2) # Define the following object in R: mixedvar <- c(1,54,'cat',6,'red') mixedvar # What is the class of mixedvar? class(mixedvar) # What happens # if you turn it into a # numeric variable? mixedvar <- as.numeric(mixedvar) mixedvar # # Summarise the variables age, sex, migraine and # chronicity from # the acupuncture dataset (variables 2, 3, 4 and 5). # Check how many # non-missing values each summary is based on. # For numerical variables, # also calculate the standard deviation and # interquartile range. # summary(acu$age) sd(acu$age) IQR(acu$age) # no NAs for age summary(acu$chronicity) sd(acu$chronicity) IQR(acu$chronicity) # no NAs for chronicity summary(acu) summary(acu$sex2) table(acu$migraine) summary(acu$migraine) # the quickest way to get to the number of NAs is the # summary function # Note: For the variable sex, 0 = male and 1 = female. # For the variable # migraine, 0 = no (tension-type) and # 1 = yes (migraine). # group= 0: placebo, 1: acupuncture # exploring the Normal distribution of the data ==== hist(acu$pk1) summary(acu$pk1) mean(acu$pk1)+1.96*sd(acu$pk1) # upper limit of the range mean(acu$pk1)-1.96*sd(acu$pk1) # lower limit of the range # exercise 8 ==== hist(acu$chronicity) urr <- mean(acu$chronicity)+1.96*sd(acu$chronicity) # upper lrr <- mean(acu$chronicity)-1.96*sd(acu$chronicity) # lower summary(acu$chronicity) urr lrr table(acu$chronicity>=48.4) # or table(acu$chronicity>=urr) table(acu$chronicity<=lrr) table(acu$chronicity>=48.4 | acu$chronicity<=-5.5) # Comparing groups ==== # correlation coefficient - association between two # numerical variables ==== acu$pk1 summary(acu$pk1) hist(acu$pk1) acu$allmedsbaseline summary(acu$allmedsbaseline) hist(acu$allmedsbaseline) cor(acu$pk1,acu$allmedsbaseline) # Pearson's correlation coef plot(acu$pk1,acu$allmedsbaseline) # scatterplot # one numeric and one categoric ==== mean(acu$age[acu$sex==0]) # average age for males mean(acu$age[acu$sex==1]) # average age for females # or i can use a new function: tapply tapply(FUN=mean, X=acu$age, # X: numerical variable INDEX=acu$sex2) # INDEX: categorical variable # apply function MEAN on the age variable and split # the results by males and females (i.e.the sex variable) # or you can shorten the command by using the arguments in # their default order: ?tapply # tapply(X,INDEX,FUN) tapply(acu$age,acu$sex2,mean) tapply(acu$age,acu$sex2, summary) tapply(acu$age,acu$sex2, sd) tapply(acu$age,acu$sex2, median) # two categorical variables ==== table(acu$group2,acu$response) #Welcome to Day 3 #### #Recap of day 2 (last session)#### acu$group2 <- factor(acu$group, levels=c(0,1), labels=c('placebo','acupuncture')) acu$sex2 <- factor(acu$sex, levels=c(0,1), labels=c('males',"females")) #numeric differences tapply(acu$age, acu$sex2, mean) tapply(acu$age, acu$sex2, sd) tapply(acu$age, acu$sex2, function(X){c(mean(X),sd(X))}) ?tapply tapply(FUN=mean, INDEX=acu$sex2, X=acu$age) #categoric differences #compare number of good responses between acupuncture #and control groups table(acu$group2, acu$response) #add label to response acu$response2 = factor(acu$response, levels=c(0,1), labels=c('poor', 'good')) t.gr = table(acu$group2, acu$response2) #get proportions prop.table(t.gr)#overall proportions prop.table(t.gr, margin = 1)#get row proportions #get % prop.table(t.gr, margin = 1)*100 #difference in proportions pt.gr=prop.table(t.gr, margin = 1)#get row proportions pt.gr[2,2]-pt.gr[1,2] #Relative Risk (RR) pt.gr[2,2]/pt.gr[1,2] #1.68 times higher likelihood (risk) of good response #in acupuncture group compared to placebo group #Odds ratio odds_acu= pt.gr[2,2]/pt.gr[2,1] odds_pla= pt.gr[1,2]/pt.gr[1,1] #OR odds_acu/odds_pla #Odds of a good response is 2.48 times higher in #acupuncture group compared to placebo #Another way of getting RR and OR (using epiR) install.packages('epiR') library(epiR) ?epi.2by2 epi.2by2(t.gr)#Uses table wrongly #change ordering of rows and columns t.gr2=t.gr[c(2,1), c(2,1)] t.gr2 epi.2by2(t.gr2) #missing data #noted as NA in R is.na(acu$pk5) #identify those with missing values which(is.na(acu$pk5)) sum(is.na(acu$pk5))#number of missing values table(is.na(acu$pk5))#number of missing #and non-missing #identify those with non-missing data !is.na(acu$pk5) ?is.na #replace -999 as NA #acu$pk5[acu$pk5==-999]=NA #number that are non-missing sum(!is.na(acu$pk5)) #Create subset of original data containing #those with non-missing pk5 acu2 = acu[!is.na(acu$pk5),] dim(acu2) #Identify all rows with complete data in acu complete.cases(acu) #number with complete data (no missing values) sum(complete.cases(acu)) #Exercise 9 #### tapply(acu2$age, acu2$group2, function(X){c(mean(X), sd(X))}) t.sex=table(acu2$sex2, acu2$group2) t.sex round(prop.table(t.sex, margin=2)*100) #Set migraine as categoric acu2$migraine2 = factor(acu2$migraine, levels = c(0,1), labels=c('tension-type headache','migraine')) table(acu2$migraine2, acu2$group2) t.mg = table(acu2$migraine2, acu2$group2) prop.table(t.mg, margin=2)*100 #chronicity tapply(acu2$chronicity, acu2$group2, function(X){c(mean(X), sd(X))}) #Break until 1.55pm # Exercise 10 #### # preliminary calculations mean_Ex10 <- mean(acu$age) sd_Ex10 <- sd(acu$age) n_Ex10 <- sum(!is.na(acu$age)) se_Ex10 <- sd_Ex10/sqrt(n_Ex10) # 95% confidence interval mean_Ex10+(1.96*se_Ex10) mean_Ex10-(1.96*se_Ex10) # does this work? SE = sd_Ex10/sqrt(sum(acu$age)) # No # 95% range mean_Ex10+(1.96*sd_Ex10) mean_Ex10-(1.96*sd_Ex10) # break until 3:45 # t.test can give us CIs t.test(acu$age) #Welcome to Day 4 #### # Significance testing #### #One Sample t-test #### t.test(acu$age) # p-value < 2.2e-16 # p < 2.2 x 10^-16 # p < 0.0001 / 0.001 t.test(acu$age, mu = 45) # observed vs hypothesis ... whats the difference? # translate that into a 'test statistic' # whats the probability of the observed # two-sample t-test #### # (1) Is there a significant difference in # the average age of patients in the acupuncture # dataset that completed the 12-month follow-up # and those that did not? # age is outcome (mean) # completed 12 month fup vs. not (e.g. pk5) # data frame only containing people with 12 month # follow-up acu2 <- acu[!is.na(acu$pk5),] dim(acu2) # only those with missing 12 month data acu3 <- acu[is.na(acu$pk5),] dim(acu3) # summary with 12 month follow up data summary(acu2$age) # without summary(acu3$age) # do t-test t.test(acu2$age, acu3$age, var.equal = FALSE) 46.33887 - 43.13000 # Normal distribution of the outcome # variable in each group hist(acu2$age) hist(acu3$age) # Sample size at least 20 in each group length(acu2$age) # doesn't check missing data dim(acu2) summary(acu2$age) # these do sum(!is.na(acu2$age)) sum(!is.na(acu3$age)) #but this one is scary! length(acu2$age[!is.na(acu2$age)]) # maybe this is good too ... length(na.omit(acu2$age)) length(na.omit(acu3$age)) # Approximate equality of the standard # deviations of the two groups/variables. # one SD is not more than twice the other. sd(acu2$age) sd(acu3$age) # (2) Is there a significant difference in the # 12 month follow up values of patients in the # groups receiving and not receiving # acupuncture? # outcome = pk5 # acupuncture vs. control t.test(acu$pk5 ~ acu$group2) # ~ tilde = "depends on" # or, alternatives: t.test(pk5 ~ group2, data = acu) # break until 11.20 # paired t-test #### # (3) Is there a significant difference # in the baseline and 12-month follow # up headache scores of the control # group in the acupuncture data set? # baseline (pk1) vs. 12 month (pk5) # on the subset of control treatment. t.test(acu$pk1[acu$group == 0], acu$pk5[acu$group == 0], paired = TRUE) # There is a significant decrease in # the control group over time. # (p < 0.001) summary(acu$pk1[acu$group == 0]) summary(acu$pk5[acu$group == 0]) # same as one sample t-test ... acu$diff <- acu$pk5 - acu$pk1 t.test(acu$diff[acu$group == 0], mu = 0) # assumptions? # normally distributed data # ... of the differences hist(acu$diff[acu$group == 0]) # n > 20 sum(!is.na(acu$diff) & acu$group == 0) # chi- squared test #### # (4) How does patients’ response # (>35% improvement from baseline in # headache severity score compared to # baseline) compare between acupuncture # and control treatment? # summary (as we saw earlier) t.gr round(100*prop.table(t.gr, margin = 1),1) # let's test to see if this is signif chisq.test() # significant! # alternative ... prop.test(t.gr) # assumptions: # n >20 in both groups (min 40 overall) # 0.1 < p < 0.9 in both groups. addmargins(t.gr) # mann-whitney U test #### # (1) Is there a significant difference in # the average age of patients in the acupuncture # dataset that completed the 12-month follow-up # and those that did not? # acu2 = data frame containing just the people # with 12 month follow up dim(acu2) # acu3 = those that dropped out. dim(acu3) # do t-test t.test(acu2$age, acu3$age) # non-parametric equivalent wilcox.test(acu2$age, acu3$age) median(acu2$age) median(acu3$age) # (2) does acupuncture work? # t-test t.test(acu$pk5 ~ acu$group2) # alternative ... wilcox.test(acu$pk5 ~ acu$group2) tapply(acu$pk5, acu$group2, summary) # lunch until 1.45 pm - see you soon! #Fisher's exact test #### #Assumption for chi-square test #- n>20 in each group #- non-extreme proportions/percentages in each group #(i.e. 0.1 WHITE # intercept is the value of the outcome for the # reference category # -46: change of the vmax for black compared to # white, decrease of 46 units # -29: change of the vmax for asian compared to # white, decrease of 29 units m6 <- lm(VmaxFRC ~ relevel(Ethnicity2,ref='Black'), data=peth) summary(m6) # exercise 15 ==== m1 <- lm(VmaxFRC ~ Birthweight , data=peth) m2 <- lm(VmaxFRC ~ sex , data=peth) m3 <- lm(VmaxFRC ~ Gestage + Birthweight , data=peth) m4 <- lm(VmaxFRC ~ Ethnicity2 , data=peth) m5 <- lm(VmaxFRC ~ Ethnicity2 + Gestage + Birthweight + sex, , data=peth) deviance(m1) # VmaxFRC~Birthweight deviance(m2) # VmaxFRC~sex deviance(m3) # VmaxFRC~Gestage + Birthweigh deviance(m4) # VmaxFRC~ Ethnicity2 deviance(m5) # VmaxFRC ~ Ethnicity2+sex+Gestage+Birthweigh # m5: lowest deviance AIC(m1) AIC(m2) AIC(m3) AIC(m4) AIC(m5) # consistent with deviance, m5 the best BIC(m1) BIC(m2) BIC(m3) BIC(m4) BIC(m5) # consistent with deviance, m5 the best summary(m5) # diagnostics (residuals, influential and vif) summary(m5) hist(residuals(m5)) plot(peth$Gestage[!is.na(peth$Female) & !is.na(peth$VmaxFRC)], residuals(m5)) abline(h=0,col='red') head(dfbeta(m5)) # first 6 entries tail(dfbeta(m5)) # bottom 6 entries hist(dfbeta(m5)[,1]) # for the intercept hist(dfbeta(m5)[,2]) # for black hist(dfbeta(m5)[,3]) # for asian hist(dfbeta(m5)[,4]) # for gestage and more # dffits() # cooks.distance() install.packages('car') # companion of applied regression library(car) vif(m5)