# Introduction to R ####

# Welcome to CASC's  Intro to R course :-)

# Wednesday 29 October 2025

# Welcome!

# Join the live lecture (available from 9.15am)

### Before 9.30am:
# 1. Please Open RStudio
# 2. Download the dataset from moodle
# 3. Open a pdf copy of the notes from moodle

# https://moodle.ucl.ac.uk/course/view.php?id=12595
# password: DatAnaROct2563

# Timetable ====
# 9:15am - 9:30am: Registration
# 9:30am - 11:00am: Session I
# 11:00am - 11:15am: Break
# 11:15am - 12:45pm: Session II
# 12.45pm - 1.45pm: Lunch Break
# 1.45pm - 3.00pm: Session III
# 3.00pm - 3.15pm: Break
# 3.15pm - 5.00pm: Session IV

# Welcome! 


# not a command, a comment instead

# R rules ####

# case sensitive
# calculator
# objects: datasets, variables, etc
# (R: object-oriented language)
# functions: verbs
# grammar, syntax: symbols () , [] {} + etc 

# defining an object ====
# store the value 81 under the name qone, then print it
qone <- 81
qone

# objects that are already built into R
letters
LETTERS
pi

# applying a function ====
# the square root of qone
# note: squareroot(qone) is not the right function name
sqrt(qone)

# two ways of opening the help page for sqrt
?sqrt
help(sqrt)

# input of the acupuncture dataset ====
# option 1: read the csv file by giving its full path
acu <- read.csv(file = "~/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Onedrive-Documents/## from my laptop/Courses/Intro to R/##LATEST##/R course files for teachers USB/acupuncture data.csv")
View(acu)

# option 2: set the working directory (WD) first,
# then read the file by its name only
getwd() # shows the current working directory
# to change the WD I will use the drop down menu:
# Session - Set working directory
setwd(dir = "~/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Onedrive-Documents/## from my laptop/Courses/Intro to R/##LATEST##/R course files for teachers USB")
acu <- read.csv(file = "acupuncture data.csv")

# getting to know the dataset ====
View(acu) # opens the dataset in another tab
names(acu) # lists all the variable names
dim(acu) # dimensions of the dataset:
  # number of rows, then number of columns
class(acu) # what type of object is it?

# a single variable is picked out with $
acu$age
length(acu$age)
# dim(acu$age) won't work as age is not 2-dimensional
# do not use length() for data frames/sets

acu$pk1 # pk1: baseline headache score
length(acu$pk1)
class(acu$pk1)

head(acu$sex) # first 6 entries
tail(acu$sex) # last 6 entries

# rather than typing acu$age, the variable can be
# reached through its column number
head(names(acu)) # first 6 variable names
acu[, 2] # filter the acu dataset so that only
# the 2nd column is shown
# the filter reads [rows, columns]

# more on filters []:
acu[, 2] # 2nd column of acu: age
acu[2, ] # 2nd row of acu: 2nd patient
acu[2, 2] # age of the 2nd patient: 52


# ages of the first 10 patients
1:10 # the integers from 1 to 10
acu[1:10, 2] # rows 1 to 10 of the 2nd column

# age and sex of the first 10 patients
acu[1:10, 2:3]

# age and group of the first 10 patients
c(2, 9)
acu[1:10, c(2, 9)]

# keep only the age data of the women
acu[acu$sex == 1, 2]
age.w <- acu[acu$sex == 1, 2]

length(acu[acu$sex == 1, 2])

# how many males & females do we have aged
# between 50 and 60?
acu[acu$age >= 50 & acu$age <= 60, 3]
table(acu[acu$age >= 50 & acu$age <= 60, 3])

# add a new variable to a dataset ====
age.sq <- acu$age^2 # a separate object, not stored in acu
acu$age.sq <- acu$age^2 # a new column stored in acu
dim(acu)

# add new rows with missing data
acu[410, ] <- NA
dim(acu)

# if they were added, remove the new rows again to
# get back to 401 rows
acu <- acu[-(402:410), ]


# recode age into categories ====
# age <= 39: 'below 39', 39 < age < 53: 'inbetween', age >= 53: 'above 53'
# the variable is printed after each step to show it being filled in
acu$age.cat[acu$age <= 39] <- "below 39"
acu$age.cat
acu$age.cat[acu$age > 39 & acu$age < 53] <- "inbetween"
acu$age.cat
acu$age.cat[acu$age >= 53] <- "above 53"
acu$age.cat
table(acu$age.cat)

# missing values ====
acu$pk2 # headache score at 6 months
length(acu$pk2) # counts every entry, both
  # observed and missing

is.na(acu$pk2) # for each entry of pk2: is it
# missing (TRUE) or not (FALSE)?
sum(is.na(acu$pk2)) # adds up all the TRUEs: 75
# so there are 75 missing values in pk2
sum(!is.na(acu$pk2)) # adds up all the non-NAs
# so there are 326 observed values in pk2
# with ! the question becomes "is it not missing?"


# Exercise 3
# - Compare the length of the pk5 variable
# against the actual number of observed
# measurements, i.e. remove NAs
# (Hint: Use the is.na())

# pk5: headache score at 12 months
length(acu$pk5)
acu$pk5
sum(is.na(acu$pk5)) # 100 missing
sum(!is.na(acu$pk5)) # 301 observed


# Create a new acupuncture data frame,
#   acu2, containing only the patients with non-
# missing pk5 and acu3 with those missing pk5.

# rows where pk5 was observed
acu2 <- acu[!is.na(acu$pk5), ]
dim(acu2)

# rows where pk5 is missing
acu3 <- acu[is.na(acu$pk5), ]
dim(acu3)


#
# Welcome back after lunch ====

# Summarising categorical variables ====
# summary()
head(acu$sex)
# in R a variable should have class factor to be treated as categorical
class(acu$sex)

summary(acu$sex)

# build a factor (categorical) version of sex in which
# the codes 0/1 carry named labels
acu$sex.cat <- factor(
  x = acu$sex,
  levels = c(0, 1),
  labels = c("male", "female")
)
class(acu$sex.cat)
head(acu$sex.cat)

# summarising the factor now behaves as we expect
# for a categorical variable
summary(acu$sex.cat)
tab_sex <- table(acu$sex.cat) # stored as a table, ready for adding margins
# passing the factor itself instead gives this error:
# Error in addmargins(acu$sex.cat) : 'A' must be an array or table


addmargins(tab_sex) # table with margins added
prop.table(tab_sex) # table of proportions

round(prop.table(tab_sex) * 100, digits = 2) # as percentages, rounded to 2 decimal places

write.csv(acu, file = "acupuncture_edited.csv") # saves the dataset to the working directory;
                                                 # a different file path can be given.
list.files() # lists the files in the working directory

# Summarising numerical variables ====
class(acu$pk1)
head(acu$pk1)
tail(acu$pk1)
length(acu$pk1)

# add up the entries of this variable
sum(acu$pk1)

# rounding the entries
round(acu$pk1) # no digits given: rounds to the nearest whole number
round(acu$pk1, digits = 1)

# put the data in order (here: largest first)
sort(head(acu$pk1), decreasing = TRUE)

# the summary function
summary(acu$pk1)

# functions to obtain summary statistics
# (templates only: replace x with a numeric vector such as acu$pk1;
#  they are kept as comments because calling them without an
#  argument stops with an error)
# mean(x)            # the mean
# median(x)          # the median
# sd(x)              # the standard deviation
# var(x)             # the variance
# IQR(x)             # the interquartile range
# range(x)           # the minimum and the maximum
#                    # (diff(range(x)) gives max value - min value)
# quantile(x, 0.25)  # first quartile
# quantile(x, 0.5)   # second quartile (the median)
# quantile(x, 0.75)  # third quartile

# mean of pk1
mean(acu$pk1)

# first quartile of pk1
quantile(acu$pk1, probs = 0.25)

# handling missing data in statistical summaries
summary(acu$pk5)
mean(acu$pk5) # returns NA
mean(acu$pk5, na.rm = TRUE) # na.rm = TRUE: the NAs are removed first

# Exercise 4 ====
# How many females were aged between 20 and 25 years old
# and how many males were in the same age group?
acu.f <- acu[acu$sex == 1, ] # females only
acu.m <- acu[acu$sex == 0, ] # males only

# the first number given by dim() is the number of rows kept
dim(acu.f[acu.f$age >= 20 & acu.f$age <= 25, ])
dim(acu.m[acu.m$age >= 20 & acu.m$age <= 25, ])

# What was the average chronicity (acu$chronicity) of the patients
# in this trial? How does it compare with the median?
mean(acu$chronicity)
median(acu$chronicity)

summary(acu$chronicity)

# How does average chronicity differ between males and females?
mean(acu$chronicity[acu$sex == 1]) # females
mean(acu$chronicity[acu$sex == 0]) # males
mean(acu$chronicity[acu$sex == 1]) - mean(acu$chronicity[acu$sex == 0])



# Comparison of groups ====
# 2 categorical variables ====
acu$group
class(acu$group)

# turn group into a factor with named labels
acu$group2 <- factor(
  x = acu$group,
  levels = c(0, 1),
  labels = c("placebo", "acupuncture")
)

summary(acu$group2)

# two-way table: sex in the rows, group in the columns
table_group <- table(acu$sex.cat, acu$group2)
addmargins(table_group)

prop.table(table_group, margin = 1) # proportions within each row
prop.table(table_group, margin = 2) # proportions within each column


# numerical variables ====
# summary of chronicity within each sex category;
# the arguments are named, so they can be given in any order
tapply(FUN = summary, X = acu$chronicity, INDEX = acu$sex.cat)

# the same call with the arguments unnamed, in their original order
tapply(acu$chronicity, acu$sex.cat, summary)

# standard deviation of chronicity within each sex category
tapply(FUN = sd, X = acu$chronicity, INDEX = acu$sex.cat)


# Graphs ====
# scatterplot between pk1 (baseline headache score)
# and pk5 (12 months follow up score)

class(acu$pk1)
class(acu$pk5)

plot(x = acu$pk5, y = acu$pk1) # axes flipped: pk5 on x, pk1 on y
plot(acu$pk1, acu$pk5)

# par: graphical parameters, pty: plot type, s: square
par(pty = "s")
plot(acu$pk1, acu$pk5)

# par: graphical parameters, pty: plot type, m: maximum
par(pty = "m")
plot(acu$pk1, acu$pk5)

plot(
  acu$pk1, acu$pk5,
  main = "Main title of the plot",
  sub = "Subtitle at the bottom of the plot",
  xlab = "Label of the x axis",
  ylab = "Label of the y axis",
  pch = 19, # point character, i.e. the symbol used
  cex = 0.5, # character expansion, i.e. size of the symbols
  xlim = c(0, 100), # limits of the X axis, min to max
  ylim = c(0, 100), # limits of the Y axis, min to max
  col = "red"
)

# add single points on top of the existing plot
points(x = 20, y = 80, col = "magenta", cex = 2, pch = "%")

points(x = 50, y = 80, col = "magenta", cex = 2, pch = "@")

abline(h = 80, v = 20) # h: horizontal, v: vertical

lines(
  x = c(0, 100), y = c(0, 100),
  lty = 2, # line type: solid, dashed, dotted, etc
  lwd = 2, # line width
  col = "blue"
)


# Exercise 6 ====
# define a variable x that holds the integers 1 to 20
x <- 1:20
# define a variable y that holds the cubed values of x
y <- x^3

# draw a histogram of y
hist(y)

# then draw a scatterplot of x against y as done previously
plot(x, y)

# add a vertical and a horizontal line to the scatterplot,
# at 16 and 4096 respectively
abline(v = 16, h = 4096)

# create one window that contains all graphs
# (hint: use par(mfrow = c()))
par(mfrow = c(2, 2)) # multiple figures by row, 2 rows by 2 columns
hist(y)
plot(x, y)
plot(x, y)
abline(v = 16, h = 4096)
plot(y, x)

layout(1) # back to one graph at a time
hist(y)
plot(x, y)
plot(x, y)
abline(v = 16, h = 4096)
plot(y, x)

# Statistical Significance/Inference (CI, p-values) ====
# open the help page of the t.test function
help(t.test)

# one-sample t test ====


#
# index of functions ====
# reference list of the functions and symbols used in this script.
# The entries are kept as comments: calling most of them without
# arguments (e.g. sqrt() or read.csv()) stops with an error.
# Use ?name, e.g. ?sqrt, to open the help page of a function.
#
# sqrt()
# read.csv()
# names()
# View()
# dim()
# class()
# setwd()
# getwd()
# length()
# head()
# tail()
# c()
# table()
# is.na()
# !is.na()
# sum()
# : , () [] <- = $
# summary()
# round()
# mean()
# median()
# sd()
# var()
# range()
# quantile()
# IQR()
# write.csv()
# addmargins()
# prop.table()
# tapply()