# template elements
2018-02-13
Bobae Kang
(Bobae.Kang@illinois.gov)
Source: pixabay.com
Source: r-project.org
“R is a language and environment for statistical computing and graphics.” - The R Foundation
Benefits of using R
Source: Time Magazine
ISP UCR data (2011-2015)
# display the state police's crime data (a tibble shows only its first rows)
print(ispcrime_tbl)
# A tibble: 510 x 12
year county violentCrime murder rape robbery aggAssault propertyCrime
<int> <fct> <int> <int> <int> <int> <int> <int>
1 2011 Adams 218 0 37 15 166 1555
2 2011 Alexa~ 119 0 14 4 101 290
3 2011 Bond 6 1 0 0 5 211
4 2011 Boone 59 0 24 8 27 733
5 2011 Brown 7 0 1 0 6 38
6 2011 Bureau 42 0 4 3 35 505
7 2011 Calho~ 13 0 0 0 13 56
8 2011 Carro~ 8 0 1 0 7 206
9 2011 Cass 12 0 1 0 11 119
10 2011 Champ~ 1210 5 127 208 870 5332
# ... with 500 more rows, and 4 more variables: burglary <int>,
# larcenyTft <int>, MVTft <int>, arson <int>
# summarise the distributions of the violent and property crime counts
ispcrime_tbl %>%
  select(violentCrime, propertyCrime) %>%
  summary()
violentCrime propertyCrime
Min. : 0 Min. : 0
1st Qu.: 19 1st Qu.: 133
Median : 42 Median : 349
Mean : 501 Mean : 2913
3rd Qu.: 133 3rd Qu.: 1190
Max. :33348 Max. :178902
NA's :7 NA's :7
# keep the 2015 rows for counties whose name starts with "C",
# then add a total crime count and show only the columns of interest
ispcrime_tbl %>%
  filter(
    year == 2015,
    substr(county, 1, 1) == "C"
  ) %>%
  mutate(totalCrime = violentCrime + propertyCrime) %>%
  select(year, county, totalCrime)
# A tibble: 12 x 3
year county totalCrime
<int> <fct> <int>
1 2015 Calhoun NA
2 2015 Carroll 176
3 2015 Cass 154
4 2015 Champaign 6486
5 2015 Christian 292
6 2015 Clark 103
7 2015 Clay 191
8 2015 Clinton 423
9 2015 Coles 805
10 2015 Cook 153575
11 2015 Crawford 282
12 2015 Cumberland 42
# same idea for counties starting with "D", now for both 2014 and 2015
ispcrime_tbl %>%
  filter(
    year %in% c(2014, 2015),
    substr(county, 1, 1) == "D"
  ) %>%
  mutate(totalCrime = violentCrime + propertyCrime) %>%
  select(year, county, totalCrime)
# A tibble: 8 x 3
year county totalCrime
<int> <fct> <int>
1 2014 De Kalb 2218
2 2014 De Witt 182
3 2014 Douglas 116
4 2014 Du Page 12576
5 2015 De Kalb 2173
6 2015 De Witt 140
7 2015 Douglas 173
8 2015 Du Page 12538
# annual average crime count by county:
# total violent + property crime divided by the county's number of rows
# (na.rm = TRUE drops missing counts from the sum; n() still counts those rows)
ispcrime_tbl %>%
  group_by(county) %>%
  summarise(
    annualAvgCrime = sum(violentCrime, propertyCrime, na.rm = TRUE) / n()
  )
# A tibble: 102 x 2
county annualAvgCrime
<fct> <dbl>
1 Adams 1724
2 Alexander 385
3 Bond 190
4 Boone 426
5 Brown 39.0
6 Bureau 480
7 Calhoun 13.8
8 Carroll 196
9 Cass 109
10 Champaign 6567
# ... with 92 more rows
# same summary, ordered from the highest to the lowest average crime count
ispcrime_tbl %>%
  group_by(county) %>%
  summarise(
    annualAvgCrime = sum(violentCrime, propertyCrime, na.rm = TRUE) / n()
  ) %>%
  arrange(desc(annualAvgCrime))
# A tibble: 102 x 2
county annualAvgCrime
<fct> <dbl>
1 Cook 182818
2 Du Page 14316
3 Lake 12779
4 Winnebago 12275
5 Will 11078
6 St. Clair 9262
7 Sangamon 8876
8 Kane 8332
9 Peoria 7229
10 Champaign 6567
# ... with 92 more rows
# merge the regions data and count rows by region
# (every county appears once per year, so each county is counted repeatedly)
ispcrime_tbl %>%
  left_join(regions) %>%
  group_by(region) %>%
  count()
# A tibble: 4 x 2
# Groups: region [4]
region n
<fct> <int>
1 Central 230
2 Cook 5
3 Northern 85
4 Southern 190
# no duplicates: reduce to one row per county before joining,
# so that the count is the number of counties in each region
ispcrime_tbl %>%
  select(county) %>%
  unique() %>%
  left_join(regions) %>%
  group_by(region) %>%
  count()
# A tibble: 4 x 2
# Groups: region [4]
region n
<fct> <int>
1 Central 46
2 Cook 1
3 Northern 17
4 Southern 38
Example (1): Word cloud
Example (2): Dendrogram
Example (3): Network graph
Example (4): Line graph
Example (5): Choropleth map
# bar plot of violent crime count in 2015 by region
# each bar is the sum of violentCrime over the rows of a region;
# `fun` is used because `fun.y` is deprecated since ggplot2 3.3.0
barplot <- ggplot(
  filter(ispcrime_tbl2, year == 2015),
  aes(x = region, y = violentCrime, fill = region, group = region)
) +
  stat_summary(geom = "bar", fun = "sum")
barplot
# give the plot a title and switch to the classic theme with larger text
barplot2 <- barplot +
  labs(title = "Violent crime count in 2015 by region") +
  theme_classic(base_size = 15)
barplot2
# remove the axis names and the legend, and change the fill colors
# (the documented value for hiding the legend is lowercase "none")
barplot2 +
  labs(x = "", y = "") +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Spectral")
# histogram of burglary count by county, one panel per year
ggplot(ispcrime_tbl2, aes(x = burglary)) +
  geom_histogram() +
  facet_wrap(~ year) +
  labs(x = "Burglary count", y = "# counties") +
  theme_minimal(base_size = 15)
# same histogram without Cook county, with bars colored by year
# NOTE(review): this plot uses `Year` while the other plots use `year` --
# presumably a separate column of ispcrime_tbl2 defined elsewhere; confirm
ggplot(
  filter(ispcrime_tbl2, county != "Cook"),
  aes(x = burglary, fill = Year)
) +
  geom_histogram() +
  facet_wrap(~ Year) +
  labs(x = "Burglary count", y = "# counties") +
  theme_minimal(base_size = 15)
Example - Simple linear model
# regress violent crime count on property crime count and print the fit summary
lm_fit <- lm(violentCrime ~ propertyCrime, data = ispcrime)
summary(lm_fit)
Call:
lm(formula = violentCrime ~ propertyCrime, data = ispcrime)
Residuals:
Min 1Q Median 3Q Max
-2239.5 -2.2 57.0 78.3 3992.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -79.768287 16.496961 -4.835 1.77e-06 ***
propertyCrime 0.199367 0.001059 188.303 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 363.5 on 501 degrees of freedom
(7 observations deleted due to missingness)
Multiple R-squared: 0.9861, Adjusted R-squared: 0.986
F-statistic: 3.546e+04 on 1 and 501 DF, p-value: < 2.2e-16
# return the coefficient table of the model fit as a data frame
lm_fit %>%
  tidy()
term estimate std.error statistic p.value
1 (Intercept) -79.7682868 16.49696109 -4.835332 1.771126e-06
2 propertyCrime 0.1993675 0.00105876 188.302852 0.000000e+00
# attach the model's prediction and residual to each data point,
# and show the first rows
ispcrime %>%
  select(year, county, propertyCrime, violentCrime) %>%
  add_predictions(lm_fit) %>%
  add_residuals(lm_fit) %>%
  head()
year county propertyCrime violentCrime pred resid
1 2011 Adams 1555 218 230.24816 -12.248156
2 2011 Alexander 290 119 -21.95172 140.951715
3 2011 Bond 211 6 -37.70175 43.701747
4 2011 Boone 733 59 66.36808 -7.368081
5 2011 Brown 38 7 -72.19232 79.192322
6 2011 Bureau 505 42 20.91229 21.087706
# scatter plot of the data with the fitted regression line drawn on top
plot(violentCrime ~ propertyCrime, data = ispcrime)
abline(lm_fit)
# show diagnostic plots in a 2 x 2 grid
# par() returns the previous settings; they are restored afterwards so that
# later plots are not drawn into the same grid
old_par <- par(mfrow = c(2, 2))
plot(lm_fit)
par(old_par)
Generalized linear models
# examples of generalized linear models with glm();
# the `family` argument selects the type of model
logistic_reg <- glm(
  binary ~ x1 + x2,
  data = mydata,
  family = binomial()
)
poisson_reg <- glm(
  count ~ x1 + x2,
  data = mydata,
  family = poisson()
)
gamma_reg <- glm(
  y ~ x1 + x2,
  data = mydata,
  family = Gamma()
)
Other advanced models
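Time series analysis (stats and forecast packages)
Spatial analysis (spdep and spgwr packages)
Survival analysis (survival package)
Network analysis (network and igraph packages)
Text analysis (tm and tidytext packages)
Machine learning (caret and mlr packages)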