#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#
# Further Topics in R - 26th November 2025  #
#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#~#

# Welcome!

# Join the live lecture (available from 9.15am)

### Before 9.30am:
# - Please Open RStudio
# - Visit http://www.casc-platforms.com/furtherR
#   and leave this open 
# - Open a pdf copy of the notes
# - Make sure you have downloaded all the datasets
#   and unzipped the data folder.

### Timetable:
# 9:15am - 9:30am: Registration
# 9:30am - 11:00am: Session I
# 11:00am - 11:15am: Break
# 11:15am - 12:45pm: Session II
# 12.45pm - 1.45pm: Lunch Break
# 1.45pm - 3.00pm: Session III
# 3.00pm - 3.15pm: Break
# 3.15pm - 5.00pm: Session IV

# Session 1: Recap R Environments ####

# Objects: numeric vectors created with c()
height <- c(180, 176, 190, 168)
weight <- c(78, 90, 100, 55)
# or: `=` also works as an assignment operator and creates the same objects
height = c(180, 176, 190, 168)
weight = c(78, 90, 100, 55)

# extract a single value by its position
height[1]
weight[3]

# extract multiple values
height[1:3]
weight[c(2, 4)]

# Data frames
data.frame() # called with no arguments this gives an empty data frame

# one column per vector; the columns are named after the vectors
df1 <- data.frame(height, weight)

# extract data from a data frame: df[row, column]
# weight for the first individual (column by position, then by name)
df1[1, 2]
df1[1, "weight"]

# all the weights (leaving the row index empty selects every row)
df1[, "weight"]
df1$weight

# height for participants 1 and 3
df1[c(1, 3), "height"]

class(df1)


# Lists: can hold objects of different types, here two vectors and a data frame
l1 <- list(height = height, weight = weight, df1 = df1)
l1

# to access the elements of a list use [[ ]], by position or by name
l1[[1]]
l1[["height"]]
l1[["df1"]]

# Functions: function_name(argument/object)
mean(df1$weight)
is.na(df1$height) # check for missing values
sum(is.na(df1$height)) # count number of missing values

sum(c(2, 5, 7))

# Other functions:
# Each call is given an argument: round() and is.na() called with nothing
# inside the brackets stop with an error, which would halt the script when
# it is run from top to bottom.
length(height) # number of elements
round(mean(df1$weight), digits = 1) # round to 1 decimal place
is.na(weight) # TRUE/FALSE for each element
unique(weight) # the distinct values


# Brackets
# () : Define the arguments of a function
# [] : Refer to elements of an object
# {} : Defining our own functions and loops

# Setting Working Directory
getwd() # show the current working directory

# point R at the folder that holds the course datasets
setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Further topics in R/Datasets")

# read the three weight-loss files (paths are relative to the working directory)
baseline <- read.csv("weightloss/baseline.csv")
weights1 <- read.csv("weightloss/weights1.csv")
weights2 <- read.csv("weightloss/weights2.csv")

# glimpse into the data (first few rows)
head(weights2)

# what type of object have I loaded?
class(baseline)

# Organising the Data Frame (merging them)

# Merge weights1 and weights2 according to rows

# Remove the app.date column from weights1, selecting it by name.
# Only ONE of the two alternatives may be run: once the column has been
# removed, subset(..., select = -c(app.date)) stops with
# "object 'app.date' not found".
weights1 <- subset(weights1, select = -c(app.date))
# or, by position (app.date is column 3):
# weights1 <- weights1[, -3]

# re-organising weights2 to match weights1 (swap columns 2 and 3)
weights2 <- weights2[, c(1, 3, 2)]

# Function to combine by row: rbind()
weights.row <- rbind(weights1, weights2)

# Combine by column: cbind()
dfcomb <- cbind(baseline, weights.row) # gives error

# Using the merge function
# With no `by` argument, merge() joins on the columns both data frames share:
# by = intersect(names(baseline), names(weights.row))
dfcomb <- merge(baseline, weights.row)

# or, naming the key column explicitly
dfcomb <- merge(baseline, weights.row, by = "id")

# or, naming the key column separately for each data frame
dfcomb <- merge(baseline, weights.row, by.x = "id", by.y = "id")

# want to keep all of the information for both of the data frames
# (all = TRUE also keeps the rows that have no match in the other one)
dfcomb <- merge(baseline, weights.row, all = TRUE)

# TIDYVERSE: Merging ####
# Install tidyverse only when it is missing (re-installing on every run is
# slow and needs an internet connection), then load it.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# The pipe operator: %>% (Ctrl/Cmd + Shift + M)
# x %>% f(y) passes x as the first argument of f, i.e. f(x, y)

# base R: keep the rows of baseline where male is 0
baseline_female <- baseline[baseline$male == 0, ]

# tidyverse
baseline_female <- baseline %>% filter(male == 0)

# Merging

# base R: dfcomb = merge(baseline, weights.row)
baseline %>% inner_join(weights.row)

# base R: dfcomb = merge(baseline, weights.row, by = "id")
baseline %>% inner_join(weights.row, by = "id")

# base R: dfcomb = merge(baseline, weights.row, by.x = "id", by.y = "id")
baseline %>% inner_join(weights.row, by = c("id" = "id"))

# we have more detail on Tidyverse in the notes on Moodle!

# Exercise 2 (page 18)
library(datasets)

# how many rows
nrow(beaver1)
nrow(beaver2)

# or: dim() gives the number of rows and the number of columns
dim(beaver1)
dim(beaver2)

# saved it to a 'visible' variable
dfb1 <- beaver1
dfb2 <- beaver2

# combine by row
beavercombo <- rbind(dfb1, dfb2)
# or tidyverse:
# beavercombo <- dfb1 %>% bind_rows(dfb2)

# checking the rbind: size, first rows, number of missing values
dim(beavercombo)
head(beavercombo)
sum(is.na(beavercombo))

# take out 'day' and 'activ'
# tip: Alt/Options + - gives <-
dfb1 <- subset(dfb1, select = -c(day, activ))
dfb2 <- subset(dfb2, select = -c(day, activ))

# merge the two beavers on the shared 'time' column
# NOTE(review): "beverall" looks like a typo for "beaverall"; the name is
# kept unchanged in case it is used elsewhere
beverall <- merge(dfb1, dfb2, by = "time")
# tidyverse: dfb1 %>% inner_join(dfb2, by = "time")

# write to CSV (save) in the current working directory
write.csv(beavercombo, file = "beavercombo.csv")

#Conditional statements ####

#If statement: run one piece of code or another depending on a condition

# if (condition is TRUE){
#   Do this
# } else{
#   Do this instead
# }

#Example

val.a <- 1
val.b <- 2

# the condition is TRUE, so the first branch runs
if (val.a == 1) {
  val.a
} else {
  val.b
}

# the condition is FALSE, so the else branch runs
if (val.a == 9) {
  val.a
} else {
  val.b
}

#or in one line: ifelse(condition, value if TRUE, value if FALSE)
ifelse(val.a == 9, val.a, val.b)


#Comparisons return TRUE or FALSE
val.a == 1 #equality check

val.a != 1 #inequality check

val.a > 1 #greater than

#OR (|): TRUE when at least one side is TRUE

val.a == 1 | val.b == 1

val.a == 3 | val.b == 1

#AND (&): TRUE only when both sides are TRUE

val.a == 1 & val.b == 2

val.a <= 1 & val.b > 5

#Create a new vector
x.vect <- c(1, 3, 6.7, 3.9, 0.13, 5.1, 7.03)

x.vect == 1 # check if each element is 1

#How many are 1 (each TRUE counts as 1 when summed)
sum(x.vect == 1)

#Are any of them 1?
any(x.vect == 1)

#Are all of them 1?
all(x.vect == 1)


#Suppose we want to check whether any of them are equal to 1, 3 or 5

# WRONG Way of checking!!!
# == compares position by position, re-using c(1, 3, 5) along x.vect,
# rather than testing each element against all three values
x.vect == c(1, 3, 5)

#Correct way: using %in% (is each element of x.vect one of 1, 3, 5?)

x.vect %in% c(1, 3, 5)

#Are any of the values of x.vect contained in c(1,3,5)?

any(x.vect %in% c(1, 3, 5))

#More if statement examples

#Check if environment contains object: height

ls() #Gives a list of objects in env

if ('height' %in% ls()){
  'Object already exists'
} else{
  'Object does not exist'
}

rm(height) #remove height (it is no longer available to code further down)

if ('height' %in% ls()){
  'Object already exists'
} else{
  'Object does not exist'
}

# Type checks. Each needs an argument: called with empty brackets,
# is.numeric() and is.factor() stop with an error.
is.numeric(x.vect) #Checks if numeric
is.factor(x.vect) #checks if factor/categoric

# mean for a numeric variable, frequency table for a factor;
# with no final else, nothing happens for any other type
if (is.numeric(x.vect)){
  mean(x.vect)
} else if (is.factor(x.vect)){
  table(x.vect)
}

class(x.vect)
is.numeric(x.vect)

# same check with a final else that catches every other type
if (is.numeric(x.vect)){
  mean(x.vect)
} else if (is.factor(x.vect)){
  table(x.vect)
}else{
  'Not a numeric or categoric variable'
}

# a character vector is neither numeric nor a factor
xchar = c('London','Manchester', 'Kent')
class(xchar)

# every branch now refers to xchar (the numeric branch previously took the
# mean of x.vect, a copy-paste slip from the example above)
if (is.numeric(xchar)){
  mean(xchar)
} else if (is.factor(xchar)){
  table(xchar)
}else{
  'Not a numeric or categoric variable'
}

#Exercise 3 ####
list.files() # files in the current working directory

lab <- read.csv("lab data.csv")

# Sort the rows of lab by ID if that column is numeric; failing that, by
# volunteer if that column is numeric; otherwise report that nothing was sorted
if (is.numeric(lab$ID)) {
  lab <- lab[order(lab$ID), ]
} else if (is.numeric(lab$volunteer)) {
  lab <- lab[order(lab$volunteer), ]
} else {
  print("Data frame not sorted")
}

#change volunteer to factor, so that it no longer counts as numeric

lab$volunteer <- as.factor(lab$volunteer)

# run the same check again now that volunteer is a factor
if (is.numeric(lab$ID)) {
  lab <- lab[order(lab$ID), ]
} else if (is.numeric(lab$volunteer)) {
  lab <- lab[order(lab$volunteer), ]
} else {
  print("Data frame not sorted")
}

#Loops ####

#For loop

# for (series of values){
#   Do this
# }

#Square all integer values from 1 to 5

# nothing is displayed here: inside a loop results are not printed automatically
for (i in 1:5){
  i^2
}

# print() is needed to display each result
for (i in 1:5){
  print(i^2)
}


#Calculate log of heights using a for loop

# height was deleted earlier in this script with rm(height), so it is
# re-created here; otherwise the loop stops with "object 'height' not found"
height <- c(180, 176, 190, 168)

# seq_along(height) is 1, 2, ..., length(height), and is empty (so the loop
# is skipped) when the vector has no elements
for (i in seq_along(height)){
  print(log(height[i]))
}

#Loop through string
# a loop can step through the values themselves rather than positions

for (colours in c('red', 'blue', 'green')){
  print(colours)
}
  
#Commands used within loops
# Each loop below steps through 'red', NA, 'green' and reacts to the
# missing value in a different way.

#break command (stop the loop)

# print comes before the check, so the NA is printed and then the loop ends
for (colours in c('red', NA, 'green')){
  print(colours)
  if (is.na(colours)) break
}


# check comes before print, so the loop ends without printing the NA
for (colours in c('red', NA, 'green')){
  if (is.na(colours)) break
  print(colours)
  
}

#warning (adds a warning message without stopping the loop)

# the loop carries on after the warning, so all three values are printed
for (colours in c('red', NA, 'green')){
  if (is.na(colours)) warning('This is a missing value!')
  print(colours)
  
}

#stop (ends the loop like break, but by raising an error with a message)

# only 'red' is printed; the error ends the loop at the NA
# NOTE(review): presumably run line by line in the lecture; if the whole
# script is run in one go this error also stops everything after it
for (colours in c('red', NA, 'green')){
  if (is.na(colours)) stop('This is a missing value!')
  print(colours)
  
}

#next (skips over value)

# the NA is skipped and the loop continues, so 'red' and 'green' are printed
for (colours in c('red', NA, 'green')){
  if (is.na(colours)) next
  print(colours)
  
}

#Exercise 4 ####

# boxplot of a single column
boxplot(lab$ctrl.CD11b)

# name of the third column
names(lab)[3]

# one boxplot per column, choosing the columns by position and using
# each column's name as the plot title
for (i in c(3, 4, 7, 8)) {
  boxplot(lab[, i], main = names(lab)[i])
}

#How to get the variables whose names contain CD11b

#grep()

#grep(pattern, vector) gives the positions of the elements that match pattern

locs <- grep("CD11b", names(lab))

locs

# same loop as above, with the column positions found by grep()
for (i in locs) {
  boxplot(lab[, i], main = names(lab)[i])
}


#While loop: repeats for as long as the condition is TRUE

i <- 1
while (i <= 5) {
  print(i^2)
  i <- i + 1 # move on to the next value, otherwise the loop never ends
}

#Practical example of using loop to load multiple datasets

getwd()

setwd("S:/ICH_StatsAdmin/Lecture Notes-Presentations/Further topics in R/Datasets")

getwd()

list.files('weightloss')

filenames <- list.files('weightloss')

#remove baseline.csv
# It is dropped by name rather than with filenames[-1], so the right file is
# excluded even if the folder's contents or their order change.
filenames <- filenames[filenames != 'baseline.csv']

filenames

#read in weights1
A1 <- read.csv("weightloss/weights1.csv")

#create a list with one empty slot per file
l <- vector("list", length(filenames))

# seq_along() gives an empty sequence when there are no files, whereas
# 1:length(filenames) would give c(1, 0) and try to read "weightloss/NA"
for (i in seq_along(filenames)){
  l[[i]] <- read.csv(paste0("weightloss/", filenames[i]))
}

#paste0('A','B') #How paste0 works: joins the strings with nothing in between

l[[1]]

# install plyr only when it is missing, then load it
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages('plyr')
}
library(plyr)

# combine the data frames held in the list into one data frame
weights.long <- rbind.fill(l)

# Apply functions ####

# Check the class of each variable in the lab dataset

for (i in 1:ncol(lab)) {
  print(class(lab[, i]))
}


# sapply(): Repeat a command across columns of data
# (output is a vector) --- c()

# lapply(): Repeat a command across columns of data
# (output is list) --- list()

# tapply(): Computes summaries of our data across groups


# the class of every column, without writing a loop
sapp <- sapply(lab, class)
lapp <- lapply(X = lab, FUN = class)

# tapply(variable, grouping_variable, summary)

# Mean and standard deviation of weight by id
tapply(X = weights.long$weight.kg, INDEX = weights.long$id, FUN = mean)
tapply(X = weights.long$weight.kg, INDEX = weights.long$id, FUN = sd)


# tidyverse approach
# (summarise is called as dplyr::summarise to name the package explicitly)
weights.long %>%
  group_by(id) %>%
  dplyr::summarise(mean = mean(weight.kg), sd = sd(weight.kg))


#Writing Functions ####

# function_name = function(input arguments){
#   Do this with input arguments
#   return(this)
#   
# }

# addone(): returns its input increased by 1
#   X: a number, or a vector of numbers (1 is added to every element)

addone <- function(X) {
  X + 1
}

# testing our function
addone(12)
addone(5)

# it also works on a vector
addone(c(12, 13))

# Function that calculates BMI: weight divided by height squared
#   height, weight: numeric values
#   (the calls below pass height in metres and weight in kilograms —
#    presumably the intended units)
calc_BMI <- function(height, weight) {
  height_sq <- height^2
  weight / height_sq
}

# testing calc_BMI
calc_BMI(1.8, 70)
calc_BMI(1.73, 70)

# named arguments can be given in any order
calc_BMI(weight = 70, height = 1.73)
# unnamed arguments are matched by position: here height = 70, weight = 1.73
calc_BMI(70, 1.73)

# Two outputs: Function that computes both BMI
# and assigns a weight class
#
#   height, weight: numeric; single values or vectors of equal length
# Returns a list with two elements:
#   BMI      - weight / height^2
#   BMIclass - "Overweight" where BMI > 30, otherwise "Not Overweight"
#              (NA where BMI is missing)

calc_BMI <- function(height, weight) {
  # calculate BMI
  BMI <- weight / height^2

  # allocating BMI class
  # ifelse() works element by element, so several people can be classified
  # in one call; a plain if () stops when given more than one value
  BMIclass <- ifelse(BMI > 30, "Overweight", "Not Overweight")

  list(BMI = BMI, BMIclass = BMIclass)
}


example1 <- calc_BMI(height = 1.65, weight = 75)


# Length of airway (cm) = 4.35 + (0.09 x gestational age in weeks)
# + (1.09 x log(birth weight in kilograms))
#
#   gest:    gestational age in weeks
#   bweight: birth weight in kilograms
# Returns the predicted airway length in cm

airwaylength <- function(gest, bweight) {
  gest_term <- 0.09 * gest
  weight_term <- 1.09 * log(bweight)
  4.35 + gest_term + weight_term
}

airwaylength(gest = 30, bweight = 1)

# Length of airway (cm) = 1.752 + (0.026 x gestational age in weeks)
# + (0.045 x log(birth weight in kilograms))
# + (0.642 x foot length in cm)
# + (0.09 x crown rump length in cm)
#
# airwaylength(): predicted airway length in cm from one of two formulas
#   gest:    gestational age in weeks
#   bweight: birth weight in kilograms
#   FL:      foot length in cm (only used by formula 2)
#   CRL:     crown rump length in cm (only used by formula 2)
#   formula: 1 (default) = gestational age and birth weight only;
#            2 = the four-variable formula written out above
# Stops with an error if formula is anything other than 1 or 2.

airwaylength <- function(gest, bweight, FL, CRL, formula = 1) {
  # reject an unsupported choice up front; otherwise neither branch would run
  # and the function would fail with "object 'predval' not found"
  if (length(formula) != 1 || !formula %in% c(1, 2)) {
    stop("formula must be 1 or 2", call. = FALSE)
  }

  if (formula == 1) {
    predval <- 4.35 + 0.09 * gest + 1.09 * log(bweight)
  } else {
    predval <- 1.752 + 0.026 * gest + 0.045 * log(bweight) + 0.642 * FL + 0.09 * CRL
  }
  predval
}

# formula 1 is used unless we ask for formula 2
airwaylength(gest = 30, bweight = 1, FL = 5, CRL = 27)

airwaylength(gest = 30, bweight = 1, FL = 5, CRL = 27, formula = 2)

# round() with digits = 2 keeps two decimal places
round(8.172, digits = 2)


# edit function to round the values
#
# airwaylength(): predicted airway length in cm, rounded with round()
#   gest:    gestational age in weeks
#   bweight: birth weight in kilograms
#   FL:      foot length in cm (only used by formula 2)
#   CRL:     crown rump length in cm (only used by formula 2)
#   formula: 1 (default) or 2, choosing which prediction formula is used
#   ...:     further arguments handed on to round(), e.g. digits = 1
# Stops with an error if formula is anything other than 1 or 2.
airwaylength <- function(gest, bweight, FL, CRL, formula = 1, ...) {
  # reject an unsupported choice up front; otherwise neither branch would run
  # and the function would fail with "object 'predval' not found"
  if (length(formula) != 1 || !formula %in% c(1, 2)) {
    stop("formula must be 1 or 2", call. = FALSE)
  }

  if (formula == 1) {
    predval <- 4.35 + 0.09 * gest + 1.09 * log(bweight)
  } else {
    predval <- 1.752 + 0.026 * gest + 0.045 * log(bweight) + 0.642 * FL + 0.09 * CRL
  }

  # this pipes through the additional arguments to round()
  round(predval, ...)
}

airwaylength(gest = 30, bweight = 1, FL = 5, CRL = 27, formula = 2, digits = 1)


# exercise 6 ####
#Write a function to calculate the difference between the means of
#two variables (note: you can use the built-in function called mean in your code).
#Name this function meandiff and include two arguments called var1 and var2.
#
# meandiff(): mean of var1 minus mean of var2
meandiff <- function(var1, var2) {
  mean(var1) - mean(var2)
}

# arguments are matched by name, so var2 can be written before var1
meandiff(var2 = lab$ctrl.igm, var1 = lab$stim.igm)

#Modify your function so that you can specify how many digits
#you want to round the output off to (hint: use '...' as shown on the previous page of these notes).
#
# meandiff(): mean of var1 minus mean of var2, rounded with round()
#   ...: further arguments handed on to round(), e.g. digits = 2
meandiff <- function(var1, var2, ...) {
  round(mean(var1) - mean(var2), ...)
}
# digits = 2 is not an argument of meandiff itself; it travels through ... to round()
meandiff(var2 = lab$ctrl.igm, var1 = lab$stim.igm, digits = 2)

# Use a conditional statement (see section 6)
# inside your function so that a warning message
# is produced if there are any missing values in
# either variable. You can use the function
# called warning (as shown on page 29).
#
# meandiff(): mean of var1 minus mean of var2, rounded with round()
#   var1, var2: numeric vectors
#   ...:        further arguments handed on to round(), e.g. digits = 2
# If either variable contains missing values a warning is given and the
# means are calculated from the remaining values, so a number is always
# returned. (Previously the function only warned and returned the warning
# text instead of a difference.)

meandiff <- function(var1, var2, ...) {
  if (anyNA(var1) || anyNA(var2)) {
    warning(
      "There are missing values in at least one of the variables inputted; ",
      "they were ignored when calculating the means.",
      call. = FALSE
    )
  }

  # na.rm = TRUE changes nothing when there are no missing values
  mdif <- mean(var1, na.rm = TRUE) - mean(var2, na.rm = TRUE)
  round(mdif, ...)
}

# try the function on two pairs of columns from lab
meandiff(var2 = lab$ctrl.igm, var1 = lab$stim.igm, digits = 2)
meandiff(var2 = lab$ctrl.fup.CD11b, var1 = lab$ctrl.CD11b, digits = 2)