#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#
# Further Topics in R - 26th November 2025  #
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

# Welcome!
# Join the live lecture (available from 9.15am)

### Before 9.30am:
# - Please open RStudio
# - Visit http://www.casc-platforms.com/furtherR
#   and leave this open
# - Open a pdf copy of the notes
# - Make sure you have downloaded all the datasets
#   and unzipped the data folder.

### Timetable:
# 9:15am - 9:30am: Registration
# 9:30am - 11:00am: Session I
# 11:00am - 11:15am: Break
# 11:15am - 12:45pm: Session II
# 12.45pm - 1.45pm: Lunch Break
# 1.45pm - 3.00pm: Session III
# 3.00pm - 3.15pm: Break
# 3.15pm - 5.00pm: Session IV

# Session 1: Recap R Environments ####

# Objects: both `=` and `<-` assign a value to a name
height = c(180,176,190,168)
weight = c(78,90,100,55)
# or
height <- c(180, 176, 190, 168)
weight <- c(78, 90, 100, 55)

# extract a single value
height[1]
weight[3]

# extract multiple values
height[1:3]
weight[c(2, 4)]

# Data frames
data.frame()  # with no arguments this creates an empty data frame
df1 <- data.frame(height, weight)

# extract data from data frame
# weight for the first individual
df1[1, 2]
df1[1, "weight"]

# all the weights
df1[, "weight"]
df1$weight

# height for participants 1 and 3
df1[c(1, 3), "height"]

class(df1)

# list
l1 <- list(height = height, weight = weight, df1 = df1)
l1

# to access the elements of list: [[]]
l1[[1]]
l1[["height"]]
l1[["df1"]]

# Functions: function_name(argument/object)
mean(df1$weight)
is.na(df1$height)       # check for missing values
sum(is.na(df1$height))  # count number of missing values
sum(c(2, 5, 7))

# Other functions:
# (round() and is.na() need at least one argument; calling them empty
#  raises an error, so each is shown here with an example input)
length(height)
round(mean(weight), digits = 1)
is.na(weight)
unique(weight)

# Brackets
# () : Define the arguments of a function
# [] : Refer to elements of an object
# {} : Defining our own functions and loops

# Setting Working Directory
getwd()  # Check working directory
# The relative paths below (and in later sections) are resolved against this
# folder, so it must point at your unzipped copy of the datasets.
setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Further topics in R/Datasets")

baseline <- read.csv("weightloss/baseline.csv")
weights1 <- read.csv("weightloss/weights1.csv")
weights2 <- read.csv("weightloss/weights2.csv")

# glimpse into the data
head(weights2)

# what type of object have I loaded
class(baseline)

# Organising the Data Frame (merging them) ####

# Merge weights1 and weights2 according to rows.
# First remove the app.date column (column 3) from weights1. Dropping it by
# name keeps this step safe to re-run and independent of the column position.
weights1 <- weights1[, setdiff(names(weights1), "app.date"), drop = FALSE]
# or (equivalent alternative - run only ONE of the two, because app.date no
# longer exists once it has been removed):
# weights1 <- subset(weights1, select = -c(app.date))

# re-organising weights2 so its columns are in the same order as weights1
weights2 <- weights2[, names(weights1)]

# Function to combine by row: rbind()
weights.row <- rbind(weights1, weights2)

# Combine by column: cbind()
# This call gives an error on purpose (it is the point of the example).
# try() shows the error message but lets the rest of the script run when
# the file is sourced as a whole.
try(dfcomb <- cbind(baseline, weights.row))

# Using the merge function
dfcomb <- merge(baseline, weights.row)
# default: by = intersect(names(baseline), names(weights.row))
# or
dfcomb <- merge(baseline, weights.row, by = "id")
# or
dfcomb <- merge(baseline, weights.row, by.x = "id", by.y = "id")

# want to keep all of the information for both of the data frames
dfcomb <- merge(baseline, weights.row, all = TRUE)

# TIDYVERSE: Merging ####

# install (only if it is not installed yet) & load tidyverse
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# The pipe operator: %>% (Ctrl/Cmd + Shift + M)
# base R
baseline_female <- baseline[baseline$male == 0, ]
# tidyverse
baseline_female <- baseline %>% filter(male == 0)

# Merging
# base R: dfcomb = merge(baseline, weights.row)
baseline %>% inner_join(weights.row)

# base R: dfcomb = merge(baseline, weights.row, by = "id")
baseline %>% inner_join(weights.row, by = "id")

# base R: dfcomb = merge(baseline, weights.row, by.x = "id", by.y = "id")
baseline %>% inner_join(weights.row, by = c("id" = "id"))

# we have more detail on Tidyverse in the notes on Moodle!
# Exercise 2 (page 18) ####
library(datasets)

# number of rows in each beaver dataset
nrow(beaver1)
nrow(beaver2)
# or look at rows and columns together
dim(beaver1)
dim(beaver2)

# copy the built-in datasets into 'visible' variables
dfb1 <- beaver1
dfb2 <- beaver2

# stack the two data frames on top of each other
beavercombo <- rbind(dfb1, dfb2)
# or tidyverse:
# beavercombo <- dfb1 %>% bind_rows(dfb2)

# check the result of rbind
dim(beavercombo)
head(beavercombo)
sum(is.na(beavercombo))

# drop 'day' and 'activ'
# tip: Alt/Options + - gives <-
dfb1 <- subset(dfb1, select = -c(day, activ))
dfb2 <- subset(dfb2, select = -c(day, activ))

beverall <- merge(dfb1, dfb2, by = "time")
# tidyverse: dfb1 %>% inner_join(dfb2, by = "time")

# save to CSV
write.csv(beavercombo, file = "beavercombo.csv")

# Conditional statements ####

# If statement
# if (condition is TRUE){
#   Do this
# } else{
#   Do this instead
# }

# Example
val.a <- 1
val.b <- 2

if (val.a == 1) {
  val.a
} else {
  val.b
}

if (val.a == 9) {
  val.a
} else {
  val.b
}
# or
ifelse(val.a == 9, val.a, val.b)

val.a == 1  # equality check
val.a != 1  # inequality check
val.a > 1

# OR (|)
val.a == 1 | val.b == 1
val.a == 3 | val.b == 1

# AND (&)
val.a == 1 & val.b == 2
val.a <= 1 & val.b > 5

# Create a new vector
x.vect <- c(1, 3, 6.7, 3.9, 0.13, 5.1, 7.03)

x.vect == 1  # element-by-element comparison with 1

# How many are 1
sum(x.vect == 1)

# Are any of them 1?
any(x.vect == 1)

# Are all of them 1?
all(x.vect == 1)

# Suppose we want to know whether any of them are equal to 1, 3 or 5
x.vect == c(1, 3, 5)  # WRONG way of checking!!! (compares by recycling)

# Correct way: using %in%
x.vect %in% c(1, 3, 5)

# Are any of the values of x.vect contained in c(1,3,5)?
any(x.vect %in% c(1, 3, 5))

# More if statement examples
# Check if environment contains object: height
ls()  # Gives a list of objects in env

if ("height" %in% ls()) {
  "Object already exists"
} else {
  "Object does not exist"
}

rm(height)  # remove height

if ("height" %in% ls()) {
  "Object already exists"
} else {
  "Object does not exist"
}

# is.numeric(x): checks if x is numeric
# is.factor(x):  checks if x is a factor/categoric
# (both need an argument; calling them empty raises an error)

if (is.numeric(x.vect)) {
  mean(x.vect)
} else if (is.factor(x.vect)) {
  table(x.vect)
}

class(x.vect)
is.numeric(x.vect)

if (is.numeric(x.vect)) {
  mean(x.vect)
} else if (is.factor(x.vect)) {
  table(x.vect)
} else {
  "Not a numeric or categoric variable"
}

xchar <- c("London", "Manchester", "Kent")
class(xchar)

# Same check applied to xchar (every branch must refer to xchar itself)
if (is.numeric(xchar)) {
  mean(xchar)
} else if (is.factor(xchar)) {
  table(xchar)
} else {
  "Not a numeric or categoric variable"
}

# Exercise 3 ####
list.files()
lab <- read.csv("lab data.csv")

# Sort by ID if it is numeric, otherwise by volunteer if that is numeric
if (is.numeric(lab$ID)) {
  lab <- lab[order(lab$ID), ]
} else if (is.numeric(lab$volunteer)) {
  lab <- lab[order(lab$volunteer), ]
} else {
  print("Data frame not sorted")
}

# change volunteer to factor
lab$volunteer <- as.factor(lab$volunteer)

if (is.numeric(lab$ID)) {
  lab <- lab[order(lab$ID), ]
} else if (is.numeric(lab$volunteer)) {
  lab <- lab[order(lab$volunteer), ]
} else {
  print("Data frame not sorted")
}

# Loops ####

# For loop
# for (series of values){
#   Do this
# }

# Square all integer values from 1 to 5
for (i in 1:5) {
  i^2  # nothing is shown: results are not auto-printed inside a loop
}

for (i in 1:5) {
  print(i^2)
}

# Calculate log of heights using a for loop
# height was deleted by rm(height) above, so restore it from df1 first
height <- df1$height
# seq_along() is safe even when the vector is empty (1:length(x) is not)
for (i in seq_along(height)) {
  print(log(height[i]))
}

# Loop through string
for (colours in c("red", "blue", "green")) {
  print(colours)
}

# Commands used within loops
# break command (stop the loop)
for (colours in c("red", NA, "green")) {
  print(colours)
  if (is.na(colours)) break
}

for (colours in c("red", NA, "green")) {
  if (is.na(colours)) break
  print(colours)
}

# warning (adds a warning message without stopping the loop)
for (colours in c("red", NA, "green")) {
  if (is.na(colours)) warning("This is a missing value!")
  print(colours)
}

# stop (similar to break, but adds error message)
# The error is deliberate; try() displays it but lets the rest of the script
# continue when the whole file is sourced.
try(
  for (colours in c("red", NA, "green")) {
    if (is.na(colours)) stop("This is a missing value!")
    print(colours)
  }
)

# next (skips over value)
for (colours in c("red", NA, "green")) {
  if (is.na(colours)) next
  print(colours)
}

# Exercise 4 ####
boxplot(lab$ctrl.CD11b)

names(lab)[3]

for (i in c(3, 4, 7, 8)) {
  boxplot(lab[, i], main = names(lab)[i])
}

# How to get the variables that contain CD11b
# grep()
# grep(pattern, vector)
locs <- grep("CD11b", names(lab))
locs

for (i in locs) {
  boxplot(lab[, i], main = names(lab)[i])
}

# While loop
i <- 1
while (i <= 5) {
  print(i^2)
  i <- i + 1
}

# Practical example of using loop to load multiple datasets
getwd()
setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Further topics in R/Datasets")
getwd()

list.files("weightloss")
filenames <- list.files("weightloss")
# remove baseline.csv by name rather than by position, so the result does
# not depend on how the files happen to be ordered
filenames <- setdiff(filenames, "baseline.csv")
filenames

# read in weights1
A1 <- read.csv("weightloss/weights1.csv")

# create a list with one (empty) slot per file, then fill it in the loop
l <- vector("list", length(filenames))
for (i in seq_along(filenames)) {
  # file.path() joins folder and file name with "/"
  l[[i]] <- read.csv(file.path("weightloss", filenames[i]))
}
# paste('A','B')  # How paste works

l[[1]]

# install plyr only if it is not installed yet
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)
# rbind.fill() stacks data frames and fills columns missing in some with NA
weights.long <- rbind.fill(l)

# Apply functions ####

# Check the class of each variable in the lab dataset
for (i in seq_along(colnames(lab))) {
  print(class(lab[, i]))
}

# sapply(): Repeat a command across columns of data
#           (output is a vector) --- c()
# lapply(): Repeat a command across columns of data
#           (output is list) --- list()
# tapply(): Computes summaries of our data across groups

sapp <- sapply(X = lab, FUN = class)
lapp <- lapply(lab, class)

# tapply(variable, grouping_variable, summary)
# Mean weight by id
tapply(weights.long$weight.kg, weights.long$id, mean)
tapply(weights.long$weight.kg, weights.long$id, sd)

# tidyverse approach
# (dplyr:: prefix because plyr, loaded above, also defines summarise())
weights.long %>%
  group_by(id) %>%
  dplyr::summarise(mean = mean(weight.kg),
                   sd = sd(weight.kg))

# Writing Functions ####

# function_name = function(input arguments){
#   Do this with input
#   arguments
#   return(this)
# }

# we create a function that adds 1 to our input
# X: a numeric value or vector; returns X + 1 (element-wise)
addone <- function(X) {
  Y <- X + 1
  return(Y)
}

# testing our function
addone(12)
addone(5)
addone(c(12, 13))

# Function that calculates BMI
# height: height (the examples below use metres); weight: weight (kg in the
# examples). Returns weight / height^2.
calc_BMI <- function(height, weight) {
  BMI <- weight / height^2
  return(BMI)
}

# testing calc_BMI
calc_BMI(1.8, 70)
calc_BMI(1.73, 70)
calc_BMI(weight = 70, height = 1.73)
calc_BMI(70, 1.73)  # arguments matched by position: height = 70, weight = 1.73

# Two outputs: Function that computes both BMI
# and assigns a weight class
# Returns a list with elements BMI and BMIclass.
calc_BMI <- function(height, weight) {
  # calculate BMI
  BMI <- weight / height^2

  # allocating BMI class
  # ifelse() works element-wise, so the function also accepts vectors of
  # heights and weights (a plain if() only accepts a single TRUE/FALSE)
  BMIclass <- ifelse(BMI > 30, "Overweight", "Not Overweight")

  return(list(BMI = BMI, BMIclass = BMIclass))
}

example1 <- calc_BMI(height = 1.65, weight = 75)

# Length of airway (cm) = 4.35 + (0.09 x gestational age in weeks)
#                         + (1.09 x log(birth weight in kilograms))
airwaylength <- function(gest, bweight) {
  predval <- 4.35 + 0.09 * gest + 1.09 * log(bweight)
  return(predval)
}

airwaylength(gest = 30, bweight = 1)

# Length of airway (cm) = 1.752 + (0.026 x gestational age in weeks)
#                         + (0.045 x log(birth weight in kilograms))
#                         + (0.642 x foot length in cm)
#                         + (0.09 x crown rump length in cm)
# formula: 1 (default) uses the first equation, 2 uses the second.
# FL and CRL are only used when formula = 2.
airwaylength <- function(gest, bweight, FL, CRL, formula = 1) {
  if (formula == 1) {
    predval <- 4.35 + 0.09 * gest + 1.09 * log(bweight)
  } else if (formula == 2) {
    predval <- 1.752 + 0.026 * gest + 0.045 * log(bweight) +
      0.642 * FL + 0.09 * CRL
  } else {
    # without this branch predval would not exist and return() would fail
    # with a confusing "object 'predval' not found" error
    stop("formula must be 1 or 2")
  }
  return(predval)
}

airwaylength(gest = 30, bweight = 1, FL = 5, CRL = 27)
airwaylength(gest = 30, bweight = 1, FL = 5, CRL = 27, formula = 2)

round(8.172, digits = 2)

# edit function to round the values
# ...: additional arguments passed on to round() (e.g. digits)
airwaylength <- function(gest, bweight, FL, CRL, formula = 1, ...) {
  if (formula == 1) {
    predval <- 4.35 + 0.09 * gest + 1.09 * log(bweight)
  } else if (formula == 2) {
    predval <- 1.752 + 0.026 * gest + 0.045 * log(bweight) +
      0.642 * FL + 0.09 * CRL
  } else {
    stop("formula must be 1 or 2")
  }
  return(round(predval, ...))  # this passes the additional arguments to round()
}

airwaylength(gest = 30, bweight = 1, FL = 5, CRL = 27, formula = 2, digits = 1)

# exercise 6 ####

# Write a function to calculate the difference between the means of
# two variables (note: you can use the built-in function called mean in your code).
# Name this function meandiff and include two arguments called var1 and var2.
meandiff <- function(var1, var2) {
  mdif <- mean(var1) - mean(var2)
  return(mdif)
}

meandiff(var2 = lab$ctrl.igm, var1 = lab$stim.igm)

# Modify your function so that you can specify how many digits
# you want to round the output off to (hint: use '...' as shown on the
# previous page of these notes).
meandiff <- function(var1, var2, ...) {
  mdif <- mean(var1) - mean(var2)
  # rounding; ... is passed on to round()
  mdif <- round(mdif, ...)
  return(mdif)
}

meandiff(var2 = lab$ctrl.igm, var1 = lab$stim.igm, digits = 2)

# Use a conditional statement (see section 6)
# inside your function so that a warning message
# is produced if there are any missing values in
# either variable. You can use the function
# called warning (as shown on page 29).
#
# var1, var2: numeric vectors; ...: passed on to round().
# Always returns a number: with missing values present the warning is
# raised and the result is NA (mean() of a vector containing NA is NA).
meandiff <- function(var1, var2, ...) {
  # || is the scalar "or" meant for if() conditions
  if (anyNA(var1) || anyNA(var2)) {
    warning("There is a missing values in at least one of the variables inputted.")
  }
  mdif <- mean(var1) - mean(var2)
  return(round(mdif, ...))
}

meandiff(var2 = lab$ctrl.igm, var1 = lab$stim.igm, digits = 2)
meandiff(var2 = lab$ctrl.fup.CD11b, var1 = lab$ctrl.CD11b, digits = 2)