# template elements
2018-03-22
Bobae Kang
(Bobae.Kang@illinois.gov)
[1] "Look, a data frame!"
column1 column2 column3 column4 column5
1 11 12 13 14 15
2 21 22 23 24 25
3 31 32 33 34 35
4 41 42 43 44 45
5 51 52 53 54 55
as.data.frame()
(… and vice versa, with as.list())
I have created an R package icjiar
, which comes with some sample datasets, including a data frame of ISP UCR data (ispcrime
). Let's take a look:
# install.packages("devtools")
# devtools::install_github("bobaekang/icjiar")
library(icjiar)
class(ispcrime) # the class of ispcrime object is "data.frame"
[1] "data.frame"
is.data.frame(ispcrime) # check if ispcrime is a data.frame; TRUE, as expected
[1] TRUE
str(ispcrime) # reports the "structure" of the data frame
'data.frame': 510 obs. of 12 variables:
$ year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
$ county : Factor w/ 102 levels "Adams","Alexander",..: 1 2 3 4 5 6 7 8 9 10 ...
$ violentCrime : int 218 119 6 59 7 42 13 8 12 1210 ...
$ murder : int 0 0 1 0 0 0 0 0 0 5 ...
$ rape : int 37 14 0 24 1 4 0 1 1 127 ...
$ robbery : int 15 4 0 8 0 3 0 0 0 208 ...
$ aggAssault : int 166 101 5 27 6 35 13 7 11 870 ...
$ propertyCrime: int 1555 290 211 733 38 505 56 206 119 5332 ...
$ burglary : int 272 92 58 152 14 90 14 38 41 1384 ...
$ larcenyTft : int 1241 183 147 563 22 405 41 165 71 3756 ...
$ MVTft : int 36 11 5 14 1 8 1 2 3 164 ...
$ arson : int 6 4 1 4 1 2 0 1 4 28 ...
head(ispcrime, 5) # returns the first n rows of the data frame (default 6)
year county violentCrime murder rape robbery aggAssault propertyCrime
1 2011 Adams 218 0 37 15 166 1555
2 2011 Alexander 119 0 14 4 101 290
3 2011 Bond 6 1 0 0 5 211
4 2011 Boone 59 0 24 8 27 733
5 2011 Brown 7 0 1 0 6 38
burglary larcenyTft MVTft arson
1 272 1241 36 6
2 92 183 11 4
3 58 147 5 1
4 152 563 14 4
5 14 22 1 1
dim(ispcrime) # returns the dimension of the data frame (row column)
[1] 510 12
nrow(ispcrime) # returns the number of rows in the data frame
[1] 510
ncol(ispcrime) # returns the number of columns in the data frame
[1] 12
colnames(ispcrime) # returns a vector containing the column names
[1] "year" "county" "violentCrime" "murder"
[5] "rape" "robbery" "aggAssault" "propertyCrime"
[9] "burglary" "larcenyTft" "MVTft" "arson"
Accessing desired subsets
ispcrime$year # access a column by name
ispcrime[[1]] # access the first column by index
ispcrime[, 1] # yet another way to access the first column!
[1] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011
[15] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011
[29] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011
[43] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011
[57] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011
[71] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011
[85] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011
[99] 2011 2011 2011 2011 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012
[113] 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012
[127] 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012
[141] 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012
ispcrime[1, ] # access the first row by index
year county violentCrime murder rape robbery aggAssault propertyCrime
1 2011 Adams 218 0 37 15 166 1555
burglary larcenyTft MVTft arson
1 272 1241 36 6
# access a specific cell (first row of the first column)
ispcrime$year[1]
ispcrime[[1]][1]
ispcrime[1, 1]
[1] 2011
data.frame()
as.data.frame()
Using data.frame()
fruits <- c("apple", "banana", "clementine")
animals <- c("dogs", "cats", "llamas")
icecream_flavors <- c("chocolate", "vanila", "cookie dough")
df1 <- data.frame(fruits, animals, icecream_flavors)
print(df1)
fruits animals icecream_flavors
1 apple dogs chocolate
2 banana cats vanila
3 clementine llamas cookie dough
df2 <- data.frame(
fruits = c("apple", "banana", "clementine"),
animals = c("dogs", "cats", "llamas"),
icecream_flavors = c("chocolate", "vanila", "cookie dough")
)
print(df2)
fruits animals icecream_flavors
1 apple dogs chocolate
2 banana cats vanila
3 clementine llamas cookie dough
Converting a list using as.data.frame()
lt <- list(
fruits = c("apple", "banana", "clementine"),
animals = c("dogs", "cats", "llamas"),
icecream_flavors = c("chocolate", "vanila", "cookie dough")
)
df3 <- as.data.frame(lt)
print(df3)
fruits animals icecream_flavors
1 apple dogs chocolate
2 banana cats vanila
3 clementine llamas cookie dough
Change column names
colnames(df1) <- c("my_fruits", "my_animals", "my_flavors")
print(df1)
my_fruits my_animals my_flavors
1 apple dogs chocolate
2 banana cats vanila
3 clementine llamas cookie dough
Add columns
# using $ index
df1$my_colors <- c("red", "green", "orange")
# using cbind() function
my_cities <- c("Chicago", "New Work", "Los Angeles")
df1 <- cbind(df1, my_cities)
print(df1)
my_fruits my_animals my_flavors my_colors my_cities
1 apple dogs chocolate red Chicago
2 banana cats vanila green New Work
3 clementine llamas cookie dough orange Los Angeles
Modify columns
df1[["my_colors"]] <- c("maroon", "blue", "purple")
df1$my_cities <- c("Chicago", "London", "Paris")
df1
my_fruits my_animals my_flavors my_colors my_cities
1 apple dogs chocolate maroon Chicago
2 banana cats vanila blue London
3 clementine llamas cookie dough purple Paris
Remove columns
# assigning NULL
df1$my_colors <- NULL
df1
my_fruits my_animals my_flavors my_cities
1 apple dogs chocolate Chicago
2 banana cats vanila London
3 clementine llamas cookie dough Paris
# subsetting
df1 <- df1[, 1:3] # or c("my_fruits", "my_animals", "my_flavors")
df1
my_fruits my_animals my_flavors
1 apple dogs chocolate
2 banana cats vanila
3 clementine llamas cookie dough
Add rows
new_row <- data.frame(
my_fruits = "strawberry",
my_animals = "monkeys",
my_flavors = "butter pecan"
)
df1 <- rbind(df1, new_row)
df1
my_fruits my_animals my_flavors
1 apple dogs chocolate
2 banana cats vanila
3 clementine llamas cookie dough
4 strawberry monkeys butter pecan
Remove rows
# subsetting
df1 <- df1[1:3, ]
df1
my_fruits my_animals my_flavors
1 apple dogs chocolate
2 banana cats vanila
3 clementine llamas cookie dough
Modify cells
# this doesn't work ... why?
df1$my_flavors[1] <- "mint chocolate chip"
df1
my_fruits my_animals my_flavors
1 apple dogs <NA>
2 banana cats vanila
3 clementine llamas cookie dough
# because the column is a factor, and only
# values matching its existing levels can be assigned
df1$my_flavors
[1] <NA> vanila cookie dough
Levels: chocolate cookie dough vanila butter pecan
# first we coerce the column into character class
df1$my_flavors <- as.character(df1$my_flavors)
# now works!
df1$my_flavors[1] <- "mint chocolate chip"
df1
my_fruits my_animals my_flavors
1 apple dogs mint chocolate chip
2 banana cats vanila
3 clementine llamas cookie dough
data.frame
is rarely used since better alternatives are available.
tibble
data.table
data.frame
# A tibble: 5 x 5
column1 column2 column3 column4 column5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 11. 12. 13. 14. 15.
2 21. 22. 23. 24. 25.
3 31. 32. 33. 34. 35.
4 41. 42. 43. 44. 45.
5 51. 52. 53. 54. 55.
tidyverse
framework (we'll come back to this)
tidyverse
syntax
data.frame
object into a tibble
can be done with as_tibble()
from the tibble
package
column1 column2 column3 column4 column5
1: 11 12 13 14 15
2: 21 22 23 24 25
3: 31 32 33 34 35
4: 41 42 43 44 45
5: 51 52 53 54 55
data.table
package.
data.frame
object into a data.table
can be done with as.data.table()
The capabilities of R are extended through user-created packages, which allow specialized statistical techniques, graphical devices, import/export capabilities, reporting tools […], etc.
- “R (programming language)”, Wikipedia
# first we should install the desired package
install.packages("some_package")
# then we import the package to use its functionalities
library(some_package)
install.packages("package")
install_github("author/package")
install_github()
is available via devtools
package.
Tidy data is data where:
(1) Each variable is in a column
(2) Each observation is a row
(3) Each value is a cell.
Anything that is not tidy!
Untidy example 1
year/county violentCrime/propertyCrime
1 2011/Adams 218/1555
2 2011/Alexander 119/290
3 2011/Bond 6/211
4 2011/Boone 59/733
5 2011/Brown 7/38
6 2011/Bureau 42/505
Untidy example 2
index year county typeViolent valueViolent
1 1 2011 Adams murder 0
2 1 2011 Adams rape 37
3 1 2011 Adams robbery 15
4 1 2011 Adams aggAssault 166
5 2 2011 Alexander murder 0
6 2 2011 Alexander rape 14
7 2 2011 Alexander robbery 4
8 2 2011 Alexander aggAssault 101
9 3 2011 Bond murder 1
10 3 2011 Bond rape 0
ggplot2
for data visualization
dplyr
for data manipulation
tidyr
for creating “tidy data”
readr
for data import/export
purrr
for loop operations
tibble
for data representation
The goal [of the style guide] is to make our R code easier to read, share, and verify.
- Google's R Style guide
Naming a variable (e.g. for firearm arrests)
# Good
firearm_arrests
fa_arr
# Bad
arrests_with_firearm_charges # too verbose
firearmArrests # violating underscore convention
FireArm_Arrests # mixing underscore with other way of naming
farr # not descriptive enough
x # not descriptive at all
Naming a function (e.g. for counting arrests)
# Good
count_arr <- function(x) { ... }
# Bad
num_arr <- function(x) { ... } # noun for a function
do_arr <- function(x) { ... } # not descriptive enough
count <- function(x) { ... } # too generic (common name)
Reserved words in R
if else repeat while function for
in next break # used in loops, conditions, functions
TRUE FALSE # logical values
NULL # undefined
Inf # infinity
NaN # Not a Number
NA # not available (missing)
NA_integer_ NA_real_
NA_complex_ NA_character_ # NA for atomic vector types
... # dot method for one function to pass arguments to another
+
, -
, <
, =
, etc.)
:
, ::
, and :::
(
, except when it is a function call
Adding spaces
# Good
greetings <- paste("Hello", "World!", sep = " ")
df[2, ]
x <- 1:10
base::Random() # calling a function while specifying its package
# Bad
greetings<-paste("Hello","world!",sep="")
df[ 2,]
x<- 1 : 10
base :: Random ()
Extra spacing
# for aligning function arguments
some_function (
first_argument = value_1
another_argument = value_2
example = value_3
)
# for aligning variable assignments
numbers <- c(1, 2, 3)
roman_numerals <- c("I", "II", "III")
letters <- c("a", "b", "c")
Indentation
# Good
if (x > 0) {
i = 0
while (i < 10) {
message("Wait, I'm in a loop")
i <- i + 1
}
message("x is positive.")
} else {
message("x is not positive")
}
# Bad
if (y > 0) {
j = 0
while (j < 10) {
message("Wait, I'm in a loop")
j <- j + 1
}
message("y is positive.")
} else {
message("y is not positive")
}
#
symbol) for clarification
# the following code calculates the average of some numbers
numbers <- c(1, 3, 5) # assign a vector of numbers to numbers object
average <- sum(numbers) / length(numbers) # divide the sum of the numbers vector by its length to get the average
References