R-programming

Sep 12th, 2014 11:30 am

Overview and History of R

Basically R is a dialect of S.

Recommended Books:

* Statistical Models in S. John M. Chambers.
* Software for Data Analysis, Programming with R. John M. Chambers.

Getting Help

In R:

?subject

Vector

This is the very basic object in R. It contains multiple elements of the same type.

We can build a vector with c() and listing its members.

# Basic vector construction
vec <- c(2, 5, 8)

# Object class
# A vector is atomic: it contains a collection of elements of one type
> class(vec)
  [1] "numeric"

# Object structure summary
> str(vec)
 num [1:3] 2 5 8

 # Object length
 > length(vec)
[1] 3

# We can freely attach a comment to the object
> comment(vec)
  NULL
> comment(vec) <- "A dummy sample"
> comment(vec)
  [1] "A dummy sample"

# Names of the vector elements
> names(vec)
  NULL
> names(vec) <- c("two", "five", "eight")
> vec
  two  five eight
    2     5     8
# We can use those names to select vector elements
> vec["five"]
five
   5
#However, it is still a vector
> class(vec)
[1] "numeric"

# Object dimensions.
# The vector is "dimensionless",  a matrix, array or data frame are not.
> dim(vec)
  NULL
# Each dimension can have its own names
> dimnames(vec)
  NULL

Handling Vector Content

# Append elements
> vec <- c(2, 5, 8)
> vec
[1] 2 5 8
> vec_ext <- append(vec, c(98, 99))
> vec_ext
[1]  2  5  8 98 99

# Insert new element at any position
> vec_mod <- append(vec, c(98,99), after=2)
> vec_mod
[1]  2  5 98 99  8

# Build a vector from other vectors
# e.g. prefix and suffix a vector
> vec2 <- c(c(80,81), vec, c(98,99))
> vec2
[1] 80 81  2  5  8 98 99

Type Coercion (Type Casting)

# We can cast a vector with one type of content to another one
vec <- c(2, 5, 8)

# As character elements
vec_text <- as.character(vec)
vec_text
  [1] "2" "5" "8"

# As logical elements
# Zero is FALSE, otherwise is TRUE
vec_log <- as.logical(vec)
vec_log
[1] TRUE TRUE TRUE

Matrix

# Basic matrix construction
> mtx <- matrix(1:6, nrow=2, ncol=3)

# Notice how the values are assigned column-wise.
# As a vector, all the matrix elements are of the same type.
# We can think of the matrix as a vector segmented in columns
> mtx
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6

# Object class
# (R >= 4.0.0 reports both classes; older versions print just "matrix")
> class(mtx)
[1] "matrix" "array"

# Object length
> length(mtx)
[1] 6

# Number of columns and rows
> nrow(mtx)
[1] 2
> ncol(mtx)
[1] 3

# Object structure summary
> str(mtx)
 int [1:2, 1:3] 1 2 3 4 5 6

# Object dimensions.
> dim(mtx)
[1] 2 3

# Names of each object dimension
> dimnames(mtx)
NULL

Handling Matrix Content

# cbind and rbind are a more powerful way to build a matrix
# Each parameter in the call becomes a column of
# the resulting matrix.
# The smaller parameters (with smaller count of elements)
# are recycled, so they have to be a sub-multiple of
# the largest one. Otherwise you will get a warning message.
> mtx <- cbind(seq=1:8, quarter=1:4, bin=0:1)
> mtx
     seq quarter bin
[1,]   1       1   0
[2,]   2       2   1
[3,]   3       3   0
[4,]   4       4   1
[5,]   5       1   0
[6,]   6       2   1
[7,]   7       3   0
[8,]   8       4   1

# Get the column names
> colnames(mtx)
[1] "seq"     "quarter" "bin"

# We can build the same matrix using:
> seq <- 1:8
> quarter <- 1:4
> bin <- 0:1
> mtx <- cbind(seq, quarter, bin)

# The name of the columns once again will be
> colnames(mtx)
[1] "seq"     "quarter" "bin"

# row names
> row.names(mtx)
NULL

# The matrix structure
> str(mtx)
 int [1:8, 1:3] 1 2 3 4 5 6 7 8 1 2 ...
 - attr(*, "dimnames")=List of 2
  ..$ : NULL
  ..$ : chr [1:3] "seq" "quarter" "bin"

# You can flatten ("vectorize") a matrix with c()
> vec <- c(mtx)
> vec
 [1] 1 2 3 4 5 6 7 8 1 2 3 4 1 2 3 4 0 1 0 1 0 1 0 1

# In the same way you can use mtx[[i]] to traverse mtx as a vector
> mtx[[5]]
[1] 5

# All the matrix elements should have the same type.
# Any attempt to use mixed types will cause the casting of all elements to
# a broader type
> mtx <- cbind(seq=1:8, quarter=1:4, gender=c("male", "female"))
> mtx
     seq quarter gender
[1,] "1" "1"     "male"
[2,] "2" "2"     "female"
[3,] "3" "3"     "male"
[4,] "4" "4"     "female"
[5,] "5" "1"     "male"
[6,] "6" "2"     "female"
[7,] "7" "3"     "male"
[8,] "8" "4"     "female"

# row wise binding
> mtx <- rbind(seq=1:8, quarter=1:4, bin=0:1)
> mtx
        [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
seq        1    2    3    4    5    6    7    8
quarter    1    2    3    4    1    2    3    4
bin        0    1    0    1    0    1    0    1

# row names
> row.names(mtx)
[1] "seq"     "quarter" "bin"

# The matrix structure
> str(mtx)
 int [1:3, 1:8] 1 1 0 2 2 1 3 3 0 4 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:3] "seq" "quarter" "bin"
  ..$ : NULL

List

> l <- list(1, "Hello", c(18,65))
> l
[[1]]
[1] 1

[[2]]
[1] "Hello"

[[3]]
[1] 18 65

Handling Environments

rm: Remove objects from a specified environment
ls: List objects
objects

Handling packages

install.packages
update.packages
library: load a library
packageVersion: Find the version number of a package

Factor

A factor is a special type of vector used to represent categorical data.

# Let's build a character vector holding the category values
> has_stock <- c("yes", "yes", "no", "yes", "no")
> has_stock
[1] "yes" "yes" "no"  "yes" "no"

# Convert the vector to an *unordered* factor
> fctr <- factor(has_stock)
# Notice the following values are not quoted: they are not
# characters, they are factors!
# Also notice the sequence of the levels
> fctr
[1] yes yes no  yes no
Levels: no yes

# Object class
> class(fctr)
[1] "factor"

# Object structure summary
> str(fctr)
 Factor w/ 2 levels "no","yes": 2 2 1 2 1

# Object length
> length(fctr)
[1] 5

# Factors have the attribute levels, which returns the encoded
# labels. By default the levels are sorted alphabetically.
# Each value is encoded with the index of its level in that sequence.
# i.e. {1:no, 2:yes}
> levels(fctr)
[1] "no"  "yes"

# You can specify the encoding with the levels parameter
> fctr <- factor(c("yes", "yes", "no", "yes", "no"),
+    levels = c("yes", "no"))
# Now the levels are encoded as specified
> levels(fctr)
[1] "yes" "no"
> str(fctr)
 Factor w/ 2 levels "yes","no": 1 1 2 1 2

Handling Factors Content

# *table()* returns an object of 'table' type with the frequency
# of each distinct level
> fctr <- factor(c("yes", "yes", "no", "yes", "no"))
> table(fctr)
fctr
 no yes
  2   3

# unclass() gives access to the coded values
> ucl <- unclass(fctr)
> str(ucl)
 int [1:5] 2 2 1 2 1
 - attr(*, "levels")= chr [1:2] "no" "yes"
# Get the factor as label
> fctr[2]
[1] yes
Levels: no yes
# Get the factor as code
 > ucl[2]
[1] 2

Missing Values

NA: Not Available; a missing value of any type. NaN: Not a Number; it has a “numeric nature” but is not a valid numeric value (e.g. the value of 0/0). R also treats NaN as a missing value, but only for numeric data.

# is.na() and is.nan() are used to identify these missing values.
# Not Available
> notAvail <- NA
> is.na(notAvail)
[1] TRUE

# Not a Number
> notNumber <- NaN
> is.nan(notNumber)
[1] TRUE

# A NaN is a missing value
> is.na(notNumber)
[1] TRUE

# But a missing value is not a NaN
> is.nan(notAvail)
[1] FALSE

Sequences

> seq(1,10,by=2)
[1] 1 3 5 7 9
> seq(1,10,length=4)
[1]  1  4  7 10

> x <- c(1,2,3,5,8,13,21)
> seq(along = x)
[1] 1 2 3 4 5 6 7
> seq_along(x)
[1] 1 2 3 4 5 6 7

ifelse(condition, yes_value, no_value)

Subsetting

Operators to extract subsets of R objects:

[ Always returns an object of the same class as the source;
[[ Extract elements of a list or a data frame by name or index. Extracts a single element.
$ Extract elements of a list or a data frame by name

> vect <- 1:20

# Extract the element with a given index
> vect[2]
[1] 2
> vect[[3]]
[1] 3

# Extract the elements with the given indices
> vect[c(3,5,7)]
[1] 3 5 7
> vect[3:7]
[1] 3 4 5 6 7

#Extract the elements flagged with a TRUE value
#Notice the logical selector values are recycled
> vect[c(TRUE,FALSE)]
 [1]  1  3  5  7  9 11 13 15 17 19

#Extract the elements multiple of 5
> vect[vect %% 5 == 0]
[1]  5 10 15 20
# Notice the condition is producing a vector of logical values
# selecting the chosen values
cond <- vect %% 5 == 0
> vect[cond]
[1]  5 10 15 20

#Selecting a value with an out of range index
> vect[30]
[1] NA

Subsetting a Matrix

# Remember the first coordinate is the row and the second is the column.
# However, the matrix -by default- is filled column-wise.
> mtx <- matrix(1:6, 2, 3)
> mtx
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6

# Select an element by its indices
> mtx[2,3]
[1] 6

# If one index is missing, the whole row/column is selected
> mtx[,2]
[1] 3 4

# Select the elements in the second row and the 2nd and 3rd columns
> mtx[2,2:3]
[1] 4 6

# The result is a vector
> class(mtx[2,2:3])
[1] "integer"

# We can select a sub-matrix using the setting drop = FALSE
> mtx[2,2:3, drop=FALSE]
     [,1] [,2]
[1,]    4    6

#Selecting a value with an out of range index
> mtx[3,1]
Error in mtx[3, 1] : subscript out of bounds

Subsetting a List

> lst <- list(seq=1:5, age=45)
> lst
$seq
[1] 1 2 3 4 5

$age
[1] 45

# Selecting with an [index] returns a list (name, and value) with the given element
> lst[2]
$age
[1] 45

# Selecting with an [[index]] returns only the value of the given element
> lst[[2]]
[1] 45
# We can use the element name [[name]] instead of the index
> lst[["age"]]
[1] 45
> lst["age"]
$age
[1] 45

# The $ operator extracts the list element in the same way as [[name]]
> lst$age
[1] 45

# Extracting a non existent element
> lst[8]
$<NA>
NULL
> lst["hi"]
$<NA>
NULL
> lst[[8]]
Error in lst[[8]] : subscript out of bounds

Manipulating a data Frame

> df <- data.frame( col1=rnorm(10), col2=runif(10))

# Select rows 1 to 3 and column named "col2"
> df[1:3,"col2"]
[1] 0.3847243 0.4879190 0.2292318

# Select all rows but the first three ones
> df[-(1:3),]
          col1      col2
4   0.33903024 0.2793928
5  -1.45067820 0.7794572

# Select rows by two logical conditions joined by logical "and" (&).
# The comma is to indicate there is not restriction about the columns,
# this way all the columns in the data frame will appear in the output
> df[(df$col1>0 & df$col2>0), ]
       col1      col2
2 0.4996887 0.4879190
4 0.3390302 0.2793928
:
# This selection is similar to the previous one
# but the conditions are joined by a logical "or" (|)
> df[(df$col1>0 | df$col2>0), ]
          col1      col2
1  -0.28077454 0.3847243
2   0.49968870 0.4879190
:

# The function which returns a vector with the indices
# of the elements satisfying the given condition
> which(df$col1>0)
[1] 2 4 6 7

# With the sort function you can sort the data
> sort(df$col1)
 [1] -1.45067820 -0.37078268 -0.32129470 -0.28077454 -0.10086351 -0.09041061
 [7]  0.33903024  0.49968870  0.59671933  1.57662348
# By default the sort is for increasing values
> sort(df$col1, decreasing = TRUE)
 [1]  1.57662348  0.59671933  0.49968870  0.33903024 -0.09041061 -0.10086351
 [7] -0.28077454 -0.32129470 -0.37078268 -1.45067820

# The rows of the data frame can be sorted using the order function
> df[order(df$col1, decreasing = TRUE),]
          col1      col2
6   1.57662348 0.3149501
7   0.59671933 0.9490971

# The order function returns the row indices sorted as specified
> order(df$col1, decreasing = TRUE)
 [1]  6  7  2  4  8  3  1  9 10  5

# You can order by multiple columns
> df[order(df$col1, df$col2),]
          col1      col2
5  -1.45067820 0.7794572
10 -0.37078268 0.4501038

## Add a new column  ##
# Notice how the new values are recycled
> df$new <- c("hi", "lo")
> df
          col1      col2 new
1  -0.28077454 0.3847243  hi
2   0.49968870 0.4879190  lo
3  -0.10086351 0.2292318  hi

# Select rows with a column value in a given list
> df[df$new %in% c("hi"),]
        col1      col2 new
1 -0.2807745 0.3847243  hi
3 -0.1008635 0.2292318  hi

## Delete a column ##
> df$new <- NULL
> df
          col1      col2
1  -0.28077454 0.3847243
2   0.49968870 0.4879190

## Get a new data frame by adding a column with cbind ##
> df2 <- cbind(df, height=c("hi", "lo"))
> df2
          col1      col2 height
1  -0.28077454 0.3847243     hi
2   0.49968870 0.4879190     lo

dplyr

filter(.data, …) summarise(.data, …) summarize(.data, …) mutate(.data, …) arrange(.data, …) select(.data, …)

# Sort the rows with the col1 values in ascending order
arrange(df, col1)
# Same as previous but with descending order
arrange(df, desc(col1))

Subsetting Nested Elements of a List

# Select the 3rd element in the first list element
> lst[[1]][3]
[1] 3

# This is interpreted as 2 indices (in two dimensions)
# Which is wrong for a list
> lst[1,3]
Error in lst[1, 3] : incorrect number of dimensions

# Select the 1st and 3rd members in the list
> lst[c(1,3)]
$seq
[1] 1 2 3 4 5

$<NA>
NULL

# Select from the first list member, its 3rd element
> lst[[c(1,3)]]
[1] 3

# "Similar" subsetting with the indices not as part of a vector:
> lst[[1,3]]
Error in lst[[1, 3]] : incorrect number of subscripts

# Partial name matching
# Notice the list has "seq" member, but it can be
# retrieved just mentioning part of the name
> lst$s
[1] 1 2 3 4 5
# However, partial name matching doesn't work with the [[name]] operator:
> lst[["s"]]
NULL
# Unless we use the setting exact = FALSE
> lst[["s", exact = FALSE]]
[1] 1 2 3 4 5

Removing NA values

# The function complete.cases() returns a logical vector flagging the
# positions where both vectors have non-NA values:
> x <- c(1, 2, NA, 4, NA, 6)
> y <- c("1", "2", NA, "4", "5", NA)
> complete <- complete.cases(x, y)
> x[complete]
[1] 1 2 4
> y[complete]
[1] "1" "2" "4"

Control Structures

if

# Complete "if"
if(<condition>) {
  ## do something
} else {
  ## do something else
}

# "if" with complete "else if"
if(<condition1>) {
  ## do something
} else if(<condition2>) {
  ## do something different
} else {
  ## do something different
}

# Valued if
y <- if(x > 3) {
  10
} else {
  0
}

for

# In all the following cases "Iterate" can be a single R sentence or more
# than one enclosed by curly brackets {# Iterate}
for(i in 1:4) # Iterate
for(i in seq_along(x)) # Iterate
for(letter in x) # Iterate

while

while (<condition>) # Iterate

repeat

The iteration body must contain a break statement to terminate an otherwise never-ending loop.

repeat # Iterate

break and next

next is used to skip to the following iteration in a loop; break is used to exit a loop immediately.

Functions

The value returned by the function is the result of the last evaluated expression.

f <- function(<arguments>) {
  ## Do something interesting
}

Argument Matching

Check for exact match for a named argument
Check for a partial match
Check for a positional match

Defining a Function

# Example signature: `a` is required, while `b`, `c` and `d` have defaults.
# A NULL default (as for `d`) is a common way to mark an optional argument.
f <- function(a, b = 1, c = 2, d = NULL) {
  ## Do something interesting
}

Lazy Evaluation

Arguments to functions are evaluated lazily, so they are evaluated only as needed.

The “…” Argument

The … argument indicates a variable number of arguments that are usually passed on to other functions.

myplot <- function(x, y, type = "l", ...) {
  plot(x, y, type = type, ...)
}

The “…” Argument

The … argument is also necessary when the number of arguments passed to the function cannot be known in advance.

> args(paste)
function (..., sep = " ", collapse = NULL)

One catch with … is that any arguments that appear after … on the argument list must be named explicitly and cannot be partially matched: the argument names must be fully specified.

> args(paste)
function (..., sep = " ", collapse = NULL)
> paste("a", "b", sep = ":")
[1] "a:b"

# The partial match for "se" is ignored! so "se" is another element of the "..." argument
# and concatenated together with "a" and "b".
> paste("a", "b", se = ":")
[1] "a b :"

Scoping Rules

How does R know which value to assign to which symbol?

When R tries to bind a value to a symbol, it searches through a series of environments to find the appropriate value. When you are working on the command line and need to retrieve the value of an R object, the order is roughly

Search the global environment for a symbol name matching the one requested.
Search the namespaces of each of the packages on the search list

The search list can be found by using the search function.

> search()
[1] ".GlobalEnv" "package:stats" "package:graphics"
[4] "package:grDevices" "package:utils" "package:datasets"
[7] "package:methods" "Autoloads" "package:base"

The global environment or the user’s workspace is always the first element of the search list and the base package is always the last.

The order of the packages on the search list matters! Users can configure which packages get loaded on startup so you cannot assume that there will be a set list of packages available.

When a user loads a package with library the namespace of that package gets put in position 2 of the search list (by default) and everything else gets shifted down the list.

Note that R has separate namespaces for functions and non-functions so it’s possible to have an object named c and a function named c.

Scoping Rules

R uses lexical scoping or static scoping. A common alternative is dynamic scoping.
Related to the scoping rules is how R uses the search list to bind a value to a symbol
Lexical scoping turns out to be particularly useful for simplifying statistical computations

f <- function(x, y) {
  x^2 + y / z
}

In this case “z” is called a free variable.

Lexical scoping in R means that the values of free variables are searched for in the environment in which the function was defined.

An environment is a collection of (symbol, value) pairs, i.e. x is a symbol and 3.14 might be its value.
Every environment has a parent environment; it is possible for an environment to have multiple “children”
The only environment without a parent is the empty environment
A function + an environment = a closure or function closure.

Searching for the value for a free variable:

If the value of a symbol is not found in the environment in which a function was defined, then the search is continued in the parent environment.
The search continues down the sequence of parent environments until we hit the top-level environment; this is usually the global environment (workspace) or the namespace of a package.
After the top-level environment, the search continues down the search list until we hit the empty environment. If a value for a given symbol cannot be found once the empty environment is arrived at, then an error is thrown.

# Build a function that raises its argument to the power n.
# The returned closure keeps n in the environment where it was defined.
make.power <- function(n) {
  # Evaluate n right away: arguments are evaluated lazily, so without this
  # the value would only be looked up on the first call of the returned
  # function, by which time the caller's variable may have changed
  # (e.g. when make.power is called inside a loop).
  force(n)
  pow <- function(x) {
    x^n
  }
  pow
}
> cube <- make.power(3)
> square <- make.power(2)
> cube(3)
[1] 27
> square(3)
[1] 9

# What’s in a function’s environment?
> ls(environment(cube))
[1] "n" "pow"
> get("n", environment(cube))
[1] 3

Lexical vs. Dynamic Scoping

y <- 10 ## One variable y
f <- function(x) {
  y <- 2  ## A second variable y !!
  y^2 + g(x)
}
g <- function(x) {
  x*y
}

With lexical scoping the value of y in the function g is looked up in the environment in which the function was defined, in this case the global environment, so the value of y is 10.
With dynamic scoping, the value of y is looked up in the environment from which the function was called (sometimes referred to as the calling environment). So the value of y would be 2.

Date and Time

Dates are represented by the Date class
Times are represented by the POSIXct or the POSIXlt class
Dates are stored internally as the number of days since 1970-01-01
Times are stored internally as the number of seconds since 1970-01-01

# We can "unclass" a date and get the number of days since 1970-01-01
unclass(as.Date("1970-01-02"))
## [1] 1

Time

POSIXct is just a very large integer under the hood; it is a useful class when you want to store times in something like a data frame
POSIXlt is a list underneath and it stores a bunch of other useful information like the day of the week, day of the year, month, day of the month. If you subtract two dates represented with this class, you will get the number of days between those dates.

There are a number of generic functions that work on dates and times

weekdays: give the day of the week
months: give the month name
quarters: give the quarter number (“Q1”, “Q2”, “Q3”, or “Q4”)

x <- Sys.time()
x
## [1] "2013-01-24 22:04:14 EST"
p <- as.POSIXlt(x)
names(unclass(p))
## [1] "sec" "min" "hour" "mday" "mon"
## [6] "year" "wday" "yday" "isdst"
p$sec
## [1] 14.34

There is the strptime function in case your dates are written in a different format

datestring <- c("January 10, 2012 10:40", "December 9, 2011 9:10")
x <- strptime(datestring, "%B %d, %Y %H:%M")
x
## [1] "2012-01-10 10:40:00 EST" "2011-12-09 09:10:00 EST"

Check ?strptime for the formatting string options.

General functions

quantile(pack_sum$count, probs = 0.99)

dplyr

result3 <-
  cran %>%
  group_by(package) %>%
  summarize(count = n(),
            unique = n_distinct(ip_id),
            countries = n_distinct(country),
            avg_bytes = mean(size)
  ) %>%
  filter(countries > 60) %>%
  arrange(desc(countries), avg_bytes)

# Print result to console
print(result3)

Summarize data

* head(x, n = 6L)
* tail(x, n = 6L)
* summary(object)
* str(object)
* quantile(x)
* table(x): Cross Tabulation and Table Creation
* colSums
* rowSums
* xtabs(Freq ~ Gender + Admit, data = DF): Builds a cross table with gender distinct values as rows, Admit distinct values as columns and the frequency as content of the table.
* ftable(xtab): A more readable presentation of the cross tab when there are more than 2 dimensions required to show the data.
* print(x, units = "Mb"): prints a size in megabytes (presumably x is the result of object.size())

# Count positive values in every column
> colSums(df>0)
col1 col2
   4   10

Check existence

# Check if there is at least one NA
> any(is.na(df$col1))
[1] FALSE
# Check if col2 is always positive
> all(df$col2 > 0)
[1] TRUE

# Check if all values in the data frame are not NA
> all(colSums(is.na(df))==0)
[1] TRUE

Application

# Demo: authenticate against the GitHub API with OAuth 2.0 using httr
# and query the current rate limit.
library(httr)
# The package is named "httpuv" (not "httuv"); presumably loaded here because
# httr relies on it to receive the OAuth callback on localhost.
library(httpuv)

# 1. Find OAuth settings for github:
#    http://developer.github.com/v3/oauth/
oauth_endpoints("github")

# 2. Register an application at https://github.com/settings/applications;
#    Use any URL you would like for the homepage URL (http://github.com is fine)
#    and http://localhost:1410 as the callback url
#
#    Insert your client ID and secret below - if secret is omitted, it will
#    look it up in the GITHUB_CONSUMER_SECRET environmental variable.
myapp <- oauth_app("github", "56b637a5baffac62cad9")

# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)

# 4. Use API
gtoken <- config(token = github_token)
req <- with_config(gtoken, GET("https://api.github.com/rate_limit"))
# Turn an HTTP error status into an R error before reading the body
stop_for_status(req)
content(req)