A Crash Course in R

This code has been kindly contributed by Robin Edwards (from UCL CASA).

There are many useful introductory guides out there to R, but below is the kind of thing I now wish I’d been given when I first started using it – something with simple logically-progressive examples and minimal explanatory text. Copy the text below into a new script in R and run line-by-line to give a quick intro to many of R’s most basic principles and functionality. You can also download a text file with it here. It is by no means comprehensive, even at the most basic level, but still I hope someone finds it useful. You may want to look at RStudio as it is more user-friendly.

## A CRASH COURSE IN [R] PROGRAMMING
## Robin Edwards (geotheory.co.uk), March 2013
## In RStudio run through line-by-line using Ctrl + Enter

# basic R environmental functions
x=3.14159; y=’hello world’; z=TRUE # create some objects. In RStudio they’ll appear in ‘Workspace’
ls() # list the objects in the Workspace
print(y) # print information to R ‘Console’
rm(y) # remove an object
rm(list=ls()) # remove all
getwd() # find current working directory
setwd(“/Users/robinedwards/Documents”) # set working directory as preferred
print ( “R ignores the ‘white-space’ in command syntax” )

# use ‘?’ for help on any R function (if its library is loaded in the session)
?max
??csv # search for a text string in R documentation library
library(help=utils) # get help on a particular package (list its functions)

# ‘str’ is a powerful tool for investigating the underlying structure of any R object
str(max)

# CREATING AND MANIPULATING R OBJECTS

# assigning values to variables
n = 5 # is possible but
n 5 -> n
rm(n)

# R objects can be of various data types, but probably most common are ‘numeric’ and ‘character’
( num ( char <- ‘any text string’ )

# create a VECTOR (array) using the ‘c()’ concatenate function
( vec

# a vector series
( vec

# R vectors can be accessed in various ways using [ ] brackets
vec[3]
vec[3:6]
vec[ c(1,3,8) ]
vec[vec > 15]

# check a vector contains a value
5 %in% vec
12 %in% vec

# finding first index position of a matching value/sting
( x = c(‘one’, ‘five’, ‘two’, 3, ‘two’) )
match(‘two’, x)
match(c(‘two’,’five’), x)

# a MATRIX is a 2D vector (essentially a vector of vectors) of matching data type
( matrx = matrix(1:15, 3, 5) )
( matrx dim(matrx) print(matrx)
t(matrx) # a matrix can be easily transposed

# an ARRAY is a generic vector but with more flexibiity. A 1D array is the same as a normal vector,
# and a 2D array is like a matrix. But arrays can store data with ‘n’ dimensions:
( arry

# Using square brackets on arrays
arry[12] # a single criterion (argument) selects the array’s n’th record
arry[3,1,2] # or use multiple arguments that reflect the array’s dimensionality
arry[,,2]
arry[,1,]

# a DATA.FRAME is like a matrix, but accomodates fields (columns) with different data types
(df

# They can be viewed easily
View(df)

# examine their internal stucture
str(df)

# data interrogation with square brackets
df[1,]
df[2:3,]
df[,1]
df[2,1]

# data.frame and matrix objects can have field (column) and record (row) names
dimnames(df)
colnames(df)
names(df) # not for matrix objects
row.names(df)

# interrogate data.frames by field name using the ‘$’ operator. the result is a simple vector
df$name
df$name[2]

# names can be reassigned
names(df) row.names(df) print(df)

# check dimensions of vector/matrix/array/data.frame objects
length(vec)
dim(df)
dim(arry)
nrow(df)
ncol(df)

# R has various inbuilt data.frame datasets used to illustrate how functions operate e.g.
data()
InsectSprays # this guide makes use of these datasets
warpbreaks

# examine contents
head(InsectSprays) # list the top records of a vector / matrix / d.f.
tail(InsectSprays, n=3) # bottom the 3
summary(InsectSprays) # summarise a data vector

# aggregate() is a powerful function for summarising categorical data
aggregate(InsectSprays$count, by=list(InsectSprays$spray), FUN=mean)
sumInsects names(sumInsects) print(sumInsects)

# subset/apply filter to a data.frame
warpbreaks[warpbreaks$wool==’A’,] # by 1 condition
warpbreaks[warpbreaks$tension %in% c(‘L’,’M’),] # multiple conditions

# adding entries is possible (if a bit tricky)
(newrow (warpbreaks

# but LISTS are better at this
lst = list()

# ways to assign/add items
lst[1] = “one”
lst[[2]] <- “two”
lst[length(lst)+1] <- “three”
print(lst)

# data retrieval
lst[[1]] # double brackets means the object returned is of the data class of the list item
lst[2:3] # selecting a more than 1 list item is possible with single brackets..
lst[c(1,3)] # but the object returned (from single bracket interrogation) is a list

# delete list items
lst[[3]] lst[1:2] lst

# entries can be any object type (like python), including other lists (double bracketting)
lst[[1]] lst[[2]] <- ‘item2′
lst
lst[[1]][[1]]

# Data in lists can also be stored and recalled by key word/number (like Python’s dictionary class)
dict dict[‘wed’] print(dict)
dict[[‘tues’]]
dict[c(‘mon’,’wed’)]

# reorder a vector with ‘sort’
vec sort(vec)

# or a dataframe with ‘order’
df[order(df$years),]

# LOGICAL objects (booleans) are binary true/false objects that facilitate conditional data processing
(bool (bool

# query an object’s data/structure type with ‘class()’
class(bool)
class(num) # numeric is the default data type for number objects
class(as.integer(num)) # integer class exists but is not default
class(char) # character class
class(’237′ ) # numbers aren’t always numeric type
as.numeric(’237′) # but can be converted
as.character(237) # and vice verse

# Child-objects are often of different class to parents
class(df)
class(df[,2])
class(df[,1])

# FACTOR objects are vectors of items that have been categorised by unique values
factr str(factr)
levels(factr)
table(factr)

# you may encounter problems converting a factor of numeric data to numeric type
as.numeric(factr)

# instead do this
as.numeric(as.character(factr))

# editing factors can be tricky
print(df)
df$person[1] <- ‘Matthew’

# instead convert to character or numeric etc
df$person df$person[1] <- ‘Matthew’
df$person levels(df$person)

# LOGICAL OPERATIONS
2 + 2 == 4 # ‘==’ denotes value equality
3 <= 2 # less than or equal to
3 >= 2 # greater than or equal to
‘string’ == “string”
‘b’ >= ‘a’ # strings can be ranked
3 != 3 # NOT operator
c(4,2,6) == c(4,2,8) # vector comparisons return locical vectors
TRUE == T # ‘T’ and ‘F’ default as boolean shortcuts (until overwritten)
TRUE & TRUE # AND operator
TRUE | FALSE # OR operator
F | F

# IF/ELSE statement (used in most logical procedures)
x if(x < 5){
print(‘x is less than 5′)
} else{
print(‘x is not less than 5′)
}

if(T|F) print(‘single liners can dispense with curly brackets’)
if(T&F) print(”) else print(“but then ‘else’ only works on the same line”)

# LOOPING FUNCTIONS – very useful for handling repetitive operations

# ‘FOR’ loop
for(i in 1:10){
print(paste(‘number ‘,i))
}

# WHILE loop (be careful to include safeguards to prevent infinite loops)
i = 30
while(i > 0){
print(paste(‘number ‘,i))
i = i – 3
}

# creating a function
multiply tot return(tot)
}

multiply(3,5)
# note ‘tot’ wasn’t remembered outside the function – functions are contained environments
# if required use ‘<<-’ for global assignment but be careful not to overwrite R’s internal objects
# its generally better to do this:
newVar

# handling ‘NA’ values
(x = 1:5)
x[8] = 8
x[3] = NA
print(x) # sometimes functions will fail because of NA values
na.omit(x) # iterates full list but ignores NAs
x[na.omit(x)]
is.na(x) # alternatively
x[!is.na(x)]

# useful basic math functions
seq(-2, 2, by=.2) # sequence of equal difference
seq(length=10, from=-5, by=.2) # with range defined by vector length
rnorm(20, mean = 0, sd = 1) # random normal distribution
runif(20, min=0, max=100) # array of random numbers
sample(0:100, 20, replace=TRUE) # array of random integers
table(warpbreaks[,2:3]) # array summary stats (powerful summary tool)
min(vec)
max(vec)
range(vec)
mean(vec)
median(vec)
sum(vec)
prod(vec)
abs(-5) # magnitude
sd(rnorm(10)) # standard deviation
4^2 # square
sqrt(16) # square root
5%%3 # modulo (remainder after subtraction of any multiple)
6%%2
for(i in 1:100) if(i%%20==0) print(i) # useful for running an operation every n’th cycle

# Importing and exporting data using comma-separated file
write.csv(df, ‘example.csv’) # save to csv file
rm(df)
(df

# PLOTTING IN R

# some basic functionality
plot(1:10)
plot(sort(rnorm(100)), pch=16, cex=0.5) # specifying point and size respectively
plot(x=1:25, y=25:1, pch=1:25) # x & y inputs, and showing the available point symbols
plot(sin, -pi, 2*pi)
hist(rnorm(1000), breaks=50)
barplot(sumInsects$sum, names.arg = sumInsects$group)
pie(sumInsects$sum, labels = sumInsects$group)

# plots with more visual components are built up incrementally
x plot(x, pch=17)
lines(x, col=’#00FF00′)
points(x+5, pch=16, col=’red’)

# stacking charts
warpbreaks
sumWB names(sumWB) sumWB
(data barplot(data, names.arg=c(‘Group A’,’Group B’),
legend.text=c(‘L’,’M’,’H’), args.legend = list(x = “right”))

barplot(data, names.arg=c(‘Group A’,’Group B’), beside=T,
legend.text=c(‘L’,’M’,’H’), args.legend = list(x = “topright”))

# ‘symbols()’ is a good way to represent a 3rd data dimension (use square root for area proportionality)
(cities lon=c(-0.1,-2.6,-2.2,-1.5), lat=c(51.5,51.4,53.5,53.8), pop=c(8,1,2.7,0.8)))
symbols(x=cities$lon, y=cities$lat, circles=sqrt(cities$pop), inches=0.3,
bg=’red’, fg=NULL, asp=T, xlab=’Longitude’, ylab=’Latitude’)
abline(h=(seq(51,53,1)), col=”lightgray”, lty=1)
abline(v=(seq(-4,1,1)), col=”lightgray”, lty=1)
text(x=cities$lon, y=cities$lat+0.2, labels=cities$city)

# But for much easier and more elegant data visualisation use GGPLOT2

# END OF SCRIPT