# template elements
2018-03-28
Bobae Kang
(Bobae.Kang@illinois.gov)
str_to_upper()
str_to_lower()
str_to_title()
str_trim()
str_squish()
str_c()
str_detect()
str_subset()
str_sub()
Note: Many stringr
functions have base R alternatives
str_to_upper(string, locale = "en")
str_to_lower(string, locale = "en")
str_to_title(string, locale = "en")
string
input is a character vector
locale
is “en”, for English
str_to_title()
capitalizes only the first letter of each word
Example
str_to_upper("hello world")
[1] "HELLO WORLD"
str_to_lower("HELLO WORLD")
[1] "hello world"
str_to_title("hello WORLD")
[1] "Hello World"
Base R alternative
# equivalent to str_to_upper()
toupper(string)
# equivalent to str_to_lower()
tolower(string)
str_trim(string, side = c("both", "left", "right"))
str_squish(string)
string
input is a character vector
side
input determines which side of a string to trim
Example
str_trim(" trim both ", side = "both")
[1] "trim both"
str_trim(" trim left only ", side = "left")
[1] "trim left only "
str_trim(" trim right only ", side = "right")
[1] " trim right only"
str_squish(" whitespaces all over the place ")
[1] "whitespaces all over the place"
Base R alternative
# equivalent to str_trim()
trimws(x, which = c("both", "left", "right"))
str_c(..., sep = "", collapse = NULL)
...
) is one or more character vectors
sep
is a separator string between input vectors; default value is none (""
).
collapse
is an optional string used to combine input vectors into a single string
Example
str_c(c("one", "two"), c("plus three", "minus four"), sep = " ")
[1] "one plus three" "two minus four"
str_c(c("one", "two", "three"), "plus four", sep = " ")
[1] "one plus four" "two plus four" "three plus four"
str_c(c("one", "two", "three"), collapse = " plus ")
[1] "one plus two plus three"
str_c(c("one", "two", "three"), "plus four", sep = " ", collapse = " and ")
[1] "one plus four and two plus four and three plus four"
Base R alternative
# equivalent to str_c()
paste (..., sep = " ", collapse = NULL)
str_detect(string, pattern)
string
input is a character vectorpattern
input is a character vector of length 1 that is a pattern to look for. A pattern input can include regular expressions.
TRUE
or FALSE
)Note: We will discuss regular expressions later.
Example
str = c("I like apple", "You like apple", "Apple, I like")
pat = "I like"
str_detect(str, pat)
[1] TRUE FALSE TRUE
Base R alternative
# equivalent to str_detect()
grepl(pattern, x, ...)
str_subset(string, pattern)
str_which(string, pattern)
string
input is a character vectorpattern
input is a character vector of length 1 that is a pattern to look for. A pattern input can include regular expressions.
str_subset()
returns the matching strings while str_which()
returns the index for the matchesExample
str = c("I like apple", "You like apple", "Apple, I like")
pat = "I like"
str_subset(str, pat)
[1] "I like apple" "Apple, I like"
str_which(str, pat)
[1] 1 3
Base R alternative
# equivalent to str_subset()
grep(pattern, x, value = TRUE, ...)
# equivalent to str_which()
grep(pattern, x, value = FALSE, ...)
str_sub(string, start = 1L, end = -1L)
str_sub(string, start = 1L, end = -1L, omit_na = FALSE) <- value
string
input is a character vector
start
and end
are integer vectors
start
is the position of the first substring character; default is the first character
end
is the position of the last substring character; default is the last character
start
to end
.
str_sub()
can be used to replace substrings when used with the assignment operator (<-
)
Example
str <- "Hello world"
str_sub(str, start = 7)
[1] "world"
str_sub(str, end = 5) <- "Hi"
str
[1] "Hi world"
Base R alternative
# equivalent to str_sub()
substr(x, start, stop)
# equivalent to str_sub() <- value
substr(x, start, stop) <- value
stringr
on tidyverse.org
stringr
CRAN documentation
stringr
GitHub repository
“Regular expressions are a concise and flexible tool for describing patterns in strings.”
-stringr.tidyverse.org
Class | Description |
---|---|
[[:digit:]] or \\d |
Any digits; i.e. [0-9] |
\\D |
Non-digits; i.e. [^0-9] |
[[:lower:]] |
Lower-case letters; i.e. [a-z] |
[[:upper:]] |
Upper-case letters; i.e. [A-Z] |
[[:alpha:]] |
Alphabetic characters; i.e. [A-Za-z] |
[[:alnum:]] |
Alphanumeric characters; i.e. [A-Za-z0-9] |
\\w |
Any word characters; i.e. [A-Za-z0-9_] |
\\W |
Non-word characters |
[[:blank:]] |
Space and tab |
[[:space:]] or \\s |
Space, tab, vertical tab, newline, form feed, carriage return |
\\S |
Not space; i.e. [^[:space:]] |
Example
str <- c("HELLO", "world", "123", "\n")
str_detect(str, "\\d") # has any digit
[1] FALSE FALSE TRUE FALSE
str_detect(str, "\\D") # has any non-digit character
[1] TRUE TRUE FALSE TRUE
str_detect(str, "\\w") # has any word (alphanumeric or underscore) character
[1] TRUE TRUE TRUE FALSE
str_detect(str, "\\s") # has any whitespace
[1] FALSE FALSE FALSE TRUE
Metacharacter | Description |
---|---|
\n |
New line |
\r |
Carriage return |
\t |
Tab |
\v |
Vertical tab |
\f |
Form feed |
Group | Description |
---|---|
. |
Any character except \n |
\| |
Or, e.g. (a\|b) |
[...] |
List permitted characters, e.g. [abc] |
[a-z] |
Specify character ranges |
[^...] |
List excluded characters |
(...) |
Grouping, enables back referencing using \\N where N is integer |
Example
str <- c("HELLO", "world", "123", "\n")
str_detect(str, ".") # has any character except \n
[1] TRUE TRUE TRUE FALSE
str_detect(str, "(d|1)") # has d or 1
[1] FALSE TRUE TRUE FALSE
str_detect(str, "[Oo]") # has O or o
[1] TRUE TRUE FALSE FALSE
str_detect(str, "[^HELLO123]") # has characters other than...
[1] FALSE TRUE FALSE TRUE
Anchor | Description |
---|---|
^ |
Start of the string |
$ |
End of the string |
\\b |
Empty string at either edge of a word |
\\B |
NOT the edge of a word |
\\< |
Beginning of a word |
\\> |
End of a word |
Example
str <- c("apple", "apricot", "banana", "pineapple")
str_detect(str, "^(a|ba)")
[1] TRUE TRUE TRUE FALSE
str_detect(str, "apple$")
[1] TRUE FALSE FALSE TRUE
Quantifier | Description |
---|---|
* |
Matches at least 0 times |
+ |
Matches at least 1 time |
? |
Matches at most 1 time; optional string |
{n} |
Matches exactly n times |
{n,} |
Matches at least n times |
{,n} |
Matches at most n times |
{n,m} |
Matches between n and m times |
Example
str <- c("apple", "apricot", "banana", "pineapple")
str_detect(str, "p*")
[1] TRUE TRUE TRUE TRUE
str_detect(str, "p+")
[1] TRUE TRUE FALSE TRUE
str_detect(str, "p{2,}")
[1] TRUE FALSE FALSE TRUE
Date
classPOSIXct
and POSIXlt
classesas_date()
as_datetime()
year()
, month()
, day()
, hour()
, …parse_date_time()
fast_strptime()
ymd_hms()
, ymd()
, …as_date(x, tz = NULL, origin = lubridate::origin)
as_datetime(x, tz = NULL, origin = lubridate::origin)
x
is a vector of POSIXt, numeric or character objectstz
is a time zone nameorigin
is a Date object or something that can be coerced into a Date object
"1970-01-01"
Example
as_date(17618)
[1] "2018-03-28"
class(as_date("20180328"))
[1] "Date"
as_datetime("2018/03/28")
[1] "2018-03-28 UTC"
class(as_datetime("2018-03-28"))
[1] "POSIXct" "POSIXt"
Base R alternative
# equivalent to as_date()
as.Date(x, ...)
# equivalent to as_datetime()
as.POSIXct(x, tz = "", ...)
year(x)
year(x) <- value
x
is a date-time object
value
is a numeric object
Function | Description |
---|---|
year() |
Get/set year component of a date-time |
month() |
Get/set months component of a date-time |
week() |
Get/set weeks component of a date-time |
day() |
Get/set days component of a date-time |
hour() |
Get/set hours component of a date-time |
minute() |
Get/set minutes component of a date-time |
second() |
Get/set seconds component of a date-time |
tz() |
Get/set time zone component of a date-time |
<-
).Example
today <- as_date("2018-03-28")
year(today)
[1] 2018
month(today) <- 4
today
[1] "2018-04-28"
parse_date_time(x, orders, tz = "UTC", truncated = 0, locale = Sys.getlocale("LC_TIME"), exact = FALSE, drop = FALSE, ...)
fast_strptime(x, format, tz = "UTC", lt = TRUE, cutoff_2000 = 68L)
x
is a character or numeric vector of datesorders
is a character vector of date-time order format
"ymd"
for year-month-date formatexact
is a boolean value for using the “exact” match for the date-time format specified by orders
drop
is a boolean value for dropping, or removing, the values not matching the format
format
is a character string of formats
Date format symbols
Symbol | Description | Example |
---|---|---|
%Y | Year in 4 digits | 2018 |
%y | Year in 2 digits | 18 |
%B | Month in words | March |
%b | Month in words, abbreviated | Mar |
%m | Month in 2 digits | 03 |
%d | Date in 2 digits | 28 |
Example
dates = c("2018-03-28", "2018/03/28", "20180328")
parse_date_time(dates, "ymd")
[1] "2018-03-28 UTC" "2018-03-28 UTC" "2018-03-28 UTC"
fast_strptime(dates[1], "%Y-%m-%d")
[1] "2018-03-28 UTC"
fast_strptime(dates[2], "%Y/%m/%d")
[1] "2018-03-28 UTC"
fast_strptime(dates[3], "%Y%m%d")
[1] "2018-03-28 UTC"
Base R alternative
# equivalent to fast_strptime()
strptime(x, format = "", tz = "")
parse_date_time()
parse_date_time()
ymd_hms(..., quiet = FALSE, tz = "UTC", locale = Sys.getlocale("LC_TIME"), truncated = 0)
ymd(..., quiet = FALSE, tz = NULL, locale = Sys.getlocale("LC_TIME"), truncated = 0)
...
argument is a character vector of dates in appropriate formatquiet
is a boolean value for displaying messagestz
is a character string specifying the time zone
ymd_hms
and other similar functions do the same work as parse_date_time()
, but with a predefined order.
Date-time | Date only | Time only |
---|---|---|
ymd_hms() |
ymd() |
hms() |
ymd_hm() |
ydm() |
hm() |
ymd_h() |
mdy() |
ms() |
mdy_hms() |
myd() |
|
mdy_hm() |
dmy() |
|
mdy_h() |
dym() |
|
dmy_hms() |
||
dmy_hm() |
||
dmy_h() |
y
is year
m
is month
d
is date
h
is hour
m
is minute
s
is second
lubridate
on tidyverse.orglubridate
CRAN documentationlubridate
Github repositorySource: Wikimedia Commons
readr
package (tidyverse)
read_csv()
write_csv()
data.table
package
fread()
fwrite()
read_csv(file, col_names = TRUE, col_types = NULL, na = c("", "NA"), trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), ...)
write_csv(x, path, na = "NA", append = FALSE, col_names = !append)
file
is a path to the .csv file to import
x
is a data object to export
path
is a path to the directory where the exported data will be created
read_csv()
is a tibble
object
write_csv()
is a .csv file
fread(input, sep="auto", sep2="auto", nrows=-1L, header="auto", na.strings="NA", stringsAsFactors=FALSE, skip=0L, colClasses=NULL, col.names,
strip.white=TRUE, fill=FALSE, ...)
fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",", na = "", row.names = FALSE, col.names = TRUE, ...)
input
is a path to the .csv file to importx
is a data object to exportfile
is a path to the directory where the exported data will be createdfread
is a data.table
objectfwrite
is a .csv file in a directoryBase R alternative
read.csv(file, header = TRUE, sep = ",", quote = "\"", dec = ".", fill = TRUE, ...)
write.csv(x, file = "", append = FALSE, quote = TRUE, sep = ",", row.names = TRUE, col.names = TRUE, ...)
readxl
package (tidyverse)
read_excel()
read_xls()
read_xlsx()
read_excel(path, sheet = NULL, range = NULL, col_names = TRUE, col_types = NULL, na = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max))
read_xls(path, ...)
read_xlsx(path, ...)
path
is a path to the excel file (.xls or .xlsx) to importsheet
is the name of a sheet in the excel file to importcol_names
is a boolean value for using the first row to import as column names skip
is a number of rows to skipguess_max
is a number of rows to use to guess the class of each columntibble
objectreadxl
on tidyverse.orgreadxl
CRAN documentationreadxl
Github repositoryhaven
package (tidyverse)
read_sav()
read_spss()
write_sav()
haven
also has functions to import/export the file formats of other statistical software
read_sav(file, user_na = FALSE)
read_spss(file, user_na = FALSE)
write_sav(data, path)
file
is a path to the SPSS file (.sav) to import in read_sav()
, or a path to export the data in write_sav()
data
is a data object to exportread_sav()
is a tibble
object
read_spss()
is a simple alias for read_sav()
write_sav()
is an SPSS data filehaven
on tidyverse.orghaven
CRAN documentationhaven
Github repositoryfeather
package (tidyverse)
read_feather()
write_feather()
read_feather(path, columns = NULL)
write_feather(x, path)
path
is a path to the .feather file to import in read_feather()
, or a path to export the data in write_feather()
x
is the data object to exportread_feather()
is a tibble
objectwrite_feather()
is a feather filefeather
CRAN documentationSource: Joel Ploz