# Introduction to R ####
# Welcome to CASC's Intro to R course :-)
# Wednesday 29 October 2025

# Welcome!
# Join the live lecture (available from 9.15am)

### Before 9.30am:
# 1. Please open RStudio
# 2. Download the dataset from Moodle
# 3. Open a PDF copy of the notes from Moodle
# https://moodle.ucl.ac.uk/course/view.php?id=12595
# password: DatAnaROct2563
# NOTE(review): the enrolment key above is stored in plain text - confirm this
# script is only ever shared with enrolled students.

# Timetable ====
# 9:15am - 9:30am: Registration
# 9:30am - 11:00am: Session I
# 11:00am - 11:15am: Break
# 11:15am - 12:45pm: Session II
# 12.45pm - 1.45pm: Lunch Break
# 1.45pm - 3.00pm: Session III
# 3.00pm - 3.15pm: Break
# 3.15pm - 5.00pm: Session IV

# Welcome!
# not a command, a comment instead

# R rules ####
# case sensitive
# calculator
# objects: datasets, variables, etc
# (R: object-oriented language)
# functions: verbs
# grammar, syntax: symbols () , [] {} + etc

# defining an object ====
qone <- 81 # `<-` assigns the value 81 to the object qone
qone       # typing the name of an object prints its value
letters
LETTERS
pi

# applying a function ====
# calculate the square root of qone
# squareroot(qone): this is the wrong function name
sqrt(qone)
?sqrt # or help(sqrt)

# input of the acupuncture dataset ====
# option 1: read the file using its full path
acu <- read.csv("~/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Onedrive-Documents/## from my laptop/Courses/Intro to R/##LATEST##/R course files for teachers USB/acupuncture data.csv")
View(acu)

# option 2: set the working directory first, then read the file by name only
getwd() # what is the current working directory?
# to change the working directory (WD) I will use the drop-down menu
# Session - Set working directory, which runs a setwd() call like this one:
setwd("~/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Onedrive-Documents/## from my laptop/Courses/Intro to R/##LATEST##/R course files for teachers USB")
acu <- read.csv("acupuncture data.csv")

# getting to know the dataset ====
View(acu)  # displays the dataset in another tab
names(acu) # gives a list of all variable names
dim(acu)   # dimensions of the dataset:
           # number of rows followed by number of columns
class(acu) # what type of object is it?
# a single variable is pulled out of the data frame with $
acu$age
length(acu$age)
# dim(acu$age) won't work because age is not 2-dimensional;
# conversely, do not use length() on data frames / datasets
acu$pk1 # pk1: baseline headache score
length(acu$pk1)
class(acu$pk1)
head(acu$sex) # the first 6 entries
tail(acu$sex) # the last 6 entries

# a column can also be picked by its number instead of typing acu$age
head(names(acu)) # the first 6 variable names
acu[, 2]         # square brackets filter as [rows, columns]:
                 # here only the 2nd column is returned

# more on filters []:
acu[, 2]  # 2nd column of acu: age
acu[2, ]  # 2nd row of acu: the 2nd patient
acu[2, 2] # the age of the 2nd patient: 52

# ages of the first 10 patients
1:10         # the integers from 1 to 10
acu[1:10, 2] # rows 1 to 10 of the 2nd column

# age and sex of the first 10 patients
acu[1:10, 2:3]

# age and group of the first 10 patients
c(2, 9)
acu[1:10, c(2, 9)]

# all the ages of the women (sex equal to 1)
is_woman <- acu$sex == 1
acu[is_woman, 2]
age.w <- acu[is_woman, 2]
length(age.w)

# how many males & females are aged between 50 and 60?
aged_50_to_60 <- acu$age >= 50 & acu$age <= 60
acu[aged_50_to_60, 3]
table(acu[aged_50_to_60, 3])

# add a new variable to a dataset ====
age.sq <- acu$age^2  # a free-standing object ...
acu$age.sq <- age.sq # ... and the same values stored as a new column of acu
dim(acu)

# add new rows with missing data:
# assigning to row 410 fills every row up to 410 with NA
acu[410, ] <- NA
dim(acu)
# if they were added, drop the new rows again to get back to 401 rows
acu <- acu[-(402:410), ]

# recode age into categories ====
# <= 39: "below 39"; between 39 and 53: "inbetween"; >= 53: "above 53"
# age.cat is printed after every step to watch the categories fill in
acu$age.cat[acu$age <= 39] <- "below 39"
acu$age.cat
acu$age.cat[acu$age > 39 & acu$age < 53] <- "inbetween"
acu$age.cat
acu$age.cat[acu$age >= 53] <- "above 53"
acu$age.cat
table(acu$age.cat)

# missing values ====
acu$pk2         # pk2: headache score at 6 months
length(acu$pk2) # number of entries, observed and missing together
pk2_missing <- is.na(acu$pk2)
pk2_missing       # for each entry of pk2: missing (TRUE) or not (FALSE)
sum(pk2_missing)  # adds up the TRUEs: 75 missing values in pk2
sum(!pk2_missing) # ! turns the question into "is it not missing?":
                  # 326 observed values in pk2
# Exercise 3
# - Compare the length of the pk5 variable
#   against the actual number of observed
#   measurements, i.e. remove NAs
#   (Hint: use is.na())
# pk5: headache score at 12 months
length(acu$pk5)
acu$pk5
sum(is.na(acu$pk5))  # 100 missing
sum(!is.na(acu$pk5)) # 301 observed

# Create a new acupuncture data frame, acu2, containing only the patients
# with non-missing pk5, and acu3 with those missing pk5.
acu2 <- acu[!is.na(acu$pk5), ]
dim(acu2)
acu3 <- acu[is.na(acu$pk5), ]
dim(acu3)

# Welcome back after lunch ====

# Summarising categorical variables ====
# summary()
head(acu$sex)
# in R a variable must have class "factor" to be treated as categorical
class(acu$sex)
summary(acu$sex)
# create a new variable that is a factor/categorical,
# with named labels for the codes 0/1
acu$sex.cat <- factor(
  x = acu$sex,
  levels = c(0, 1),
  labels = c("male", "female")
)
class(acu$sex.cat)
head(acu$sex.cat)
# summarising the factor now behaves as we expect for a
# categorical variable
summary(acu$sex.cat)

# store the frequency table first, in preparation for adding margins;
# passing the variable itself gives this error:
# Error in addmargins(acu$sex.cat) : 'A' must be an array or table
tab_sex <- table(acu$sex.cat)
addmargins(tab_sex) # adding margins to the table
prop.table(tab_sex) # table of proportions
round(prop.table(tab_sex) * 100, 2) # as percentages, rounded to 2 decimals

# save the edited dataset to the working directory (a different file path
# can be given instead); row.names = FALSE stops write.csv() from writing
# the automatic row numbers as an extra unnamed first column
write.csv(acu, "acupuncture_edited.csv", row.names = FALSE)
list.files() # run to check the files in the working directory

# Summarising numerical variables ====
class(acu$pk1)
head(acu$pk1)
tail(acu$pk1)
length(acu$pk1)

# sum the entries in this variable
sum(acu$pk1)

# round the entries
round(acu$pk1)    # with no digits specified, rounds to whole numbers
round(acu$pk1, 1) # rounds to 1 decimal place

# sort the first 6 entries, largest first
sort(head(acu$pk1), decreasing = TRUE)

# summary function
summary(acu$pk1)

# functions to obtain summary statistics
# (templates only, kept as comments because a call without data such as
#  mean() fails; replace x with a numeric variable, e.g. acu$pk1)
# mean(x)            the mean
# median(x)          the median
# sd(x)              the standard deviation
# var(x)             the variance
# IQR(x)             the interquartile range
# range(x)           the minimum and the maximum
#                    (diff(range(x)) gives max value - min value)
# quantile(x, 0.25)  first quartile
# quantile(x, 0.5)   second quartile
# quantile(x, 0.75)  third quartile

# mean of pk1
mean(acu$pk1)
# first quartile of pk1
quantile(acu$pk1, 0.25)

# handling missing data in statistical summaries
summary(acu$pk5)
mean(acu$pk5)               # returns NA
mean(acu$pk5, na.rm = TRUE) # na.rm = TRUE drops the NAs before averaging

# Exercise 4 ====
# How many females were aged between 20 and 25 years old
# and how many males were in the same age group?
acu.f <- acu[acu$sex == 1, ]
acu.m <- acu[acu$sex == 0, ]
# the first number returned by dim() is the number of patients
dim(acu.f[acu.f$age >= 20 & acu.f$age <= 25, ])
dim(acu.m[acu.m$age >= 20 & acu.m$age <= 25, ])

# What was the average chronicity (acu$chronicity) of the patients
# in this trial? How does it compare with the median?
mean(acu$chronicity)
median(acu$chronicity)
summary(acu$chronicity)

# How does average chronicity differ between males and females?
# mean chronicity where sex is 1, where sex is 0, and the difference
mean(acu$chronicity[acu$sex == 1])
mean(acu$chronicity[acu$sex == 0])
mean(acu$chronicity[acu$sex == 1]) - mean(acu$chronicity[acu$sex == 0])

# Comparison of groups ====

# 2 categorical variables ====
acu$group
class(acu$group)
# transforming into a factor
acu$group2 <- factor(
  x = acu$group,
  levels = c(0, 1),
  labels = c("placebo", "acupuncture")
)
summary(acu$group2)
# cross-tabulation of the two categorical variables: sex (rows) by group
# (columns)
table_group <- table(acu$sex.cat, acu$group2)
addmargins(table_group)
prop.table(table_group, margin = 1) # proportions by row
prop.table(table_group, margin = 2) # proportions by column

# numerical variables ====
# summary of chronicity within each level of sex.cat, arguments given by name
tapply(FUN = summary, X = acu$chronicity, INDEX = acu$sex.cat)
# the same call with the arguments in tapply()'s own order: X, INDEX, FUN
tapply(acu$chronicity, acu$sex.cat, summary)
# calculate the standard deviation per group
tapply(FUN = sd, X = acu$chronicity, INDEX = acu$sex.cat)

# Graphs ====
# scatterplot between pk1 (baseline headache score)
# and pk5 (12 months follow up score)
class(acu$pk1)
class(acu$pk5)
plot(y = acu$pk1, x = acu$pk5) # flipping axes for plot
plot(acu$pk1, acu$pk5)
par(pty = "s") # par: graphical parameters, pty: plot type, s: square
plot(acu$pk1, acu$pk5)
par(pty = "m") # par: graphical parameters, pty: plot type, m: maximum
plot(acu$pk1, acu$pk5)

plot(
  acu$pk1, acu$pk5,
  main = "Main title of the plot",
  sub = "Subtitle at the bottom of the plot",
  xlab = "Label of the x axis",
  ylab = "Label of the y axis",
  pch = 19,         # point character, i.e. the symbol used
  cex = 0.5,        # character expansion, i.e. size of the symbols
  xlim = c(0, 100), # limits of the X axis, min to max
  ylim = c(0, 100), # limits of the Y axis, min to max
  col = "red"
)
# points(), abline() and lines() draw on top of the current plot
points(x = 20, y = 80, col = "magenta", cex = 2, pch = "%")
points(x = 50, y = 80, col = "magenta", cex = 2, pch = "@")
abline(h = 80, v = 20) # h: horizontal, v: vertical
lines(
  x = c(0, 100), y = c(0, 100),
  lty = 2, # line type: solid, dashed, dotted, etc
  lwd = 2, # line width
  col = "blue"
)

# Exercise 6 ====
# define a variable x that is equal to integers 1 to 20
x <- 1:20
# define a variable y equal to the cubed values of x
y <- x^3
# draw a histogram of y
hist(y)
# then plot a scatterplot between x and y as done previously
plot(x, y)
# add on scatterplot a vertical and horizontal line
# at points 16 and 4096 respectively
abline(v = 16, h = 4096)
# create one window that contains all graphs
# (hint: use par(mfrow = c()))
par(mfrow = c(2, 2)) # multiple figures by row, 2 rows by 2 columns
hist(y)
plot(x, y)
plot(x, y)
abline(v = 16, h = 4096)
plot(y, x)

layout(1) # reverts back to one graph at a time
hist(y)
plot(x, y)
plot(x, y)
abline(v = 16, h = 4096)
plot(y, x)

# Statistical Significance/Inference (CI, p-values) ====
?t.test

# one-sample t test ====

# index of functions ====
# (names only, kept as comments: running e.g. sqrt() with no arguments
#  is an error)
# sqrt()  read.csv()  names()  View()  dim()  class()
# setwd()  getwd()  length()  head()  tail()  c()
# table()  is.na()  !is.na()  sum()
# symbols:  :  ,  ()  []  <-  =  $
# summary()  round()  mean()  median()  sd()  var()
# range()  quantile()  IQR()
# write.csv()  addmargins()  prop.table()  tapply()