ClassGroups: Character classes

Description

Match character classes.

Usage

alnum(lo, hi, char_class = TRUE)
alpha(lo, hi, char_class = TRUE)
blank(lo, hi, char_class = TRUE)
cntrl(lo, hi, char_class = TRUE)
digit(lo, hi, char_class = TRUE)
graph(lo, hi, char_class = TRUE)
lower(lo, hi, char_class = TRUE)
printable(lo, hi, char_class = TRUE)
punct(lo, hi, char_class = TRUE)
space(lo, hi, char_class = TRUE)
upper(lo, hi, char_class = TRUE)
hex_digit(lo, hi, char_class = TRUE)
any_char(lo, hi)
grapheme(lo, hi)
newline(lo, hi)
dgt(lo, hi, char_class = FALSE)
wrd(lo, hi, char_class = FALSE)
spc(lo, hi, char_class = FALSE)
not_dgt(lo, hi, char_class = FALSE)
not_wrd(lo, hi, char_class = FALSE)
not_spc(lo, hi, char_class = FALSE)
ascii_digit(lo, hi, char_class = TRUE)
ascii_lower(lo, hi, char_class = TRUE)
ascii_upper(lo, hi, char_class = TRUE)
ascii_alpha(lo, hi, char_class = TRUE)
ascii_alnum(lo, hi, char_class = TRUE)
char_range(lo, hi, char_class = lo < hi)

Arguments

A non-negative integer. Minimum number of repeats, when grouped.

positive integer. Maximum number of repeats, when grouped.

char_class

A logical value. Should x be wrapped in a character class? If NA, the function guesses whether that's a good idea.

Value

A character vector representing part or all of a regular expression.

References

http://www.regular-expressions.info/shorthand.html and http://www.rexegg.com/regex-quickstart.html#posix

Examples

Run this code

# R character classes
alnum()
alpha()
blank()
cntrl()
digit()
graph()
lower()
printable()
punct()
space()
upper()
hex_digit()

# Special chars
any_char()
grapheme()
newline()

# Generic classes
dgt()
wrd()
spc()

# Generic negated classes
not_dgt()
not_wrd()
not_spc()

# Non-locale-specific classes
ascii_digit()
ascii_lower()
ascii_upper()

# Don't provide a class wrapper
digit(char_class = FALSE) # same as DIGIT

# Match repeated values
digit(3)
digit(3, 5)
digit(0)
digit(1)
digit(0, 1)

# Ranges of characters
char_range(0, 7) # octal number

# Usage
(rx <- digit(3))
stringi::stri_detect_regex(c("123", "one23"), rx)

# Some classes behave differently under different engines
# In particular PRCE and Perl recognise all these characters
# as punctuation but ICU does not
p <- c(
  "!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "[", "]", "{", "}", ";",
  ":", "'", '"', ",", "<", ">", ".", "/", "?", "\\", "|", "`", "~"
)
icu_matched <- stringi::stri_detect_regex(p, punct())
p[icu_matched]
p[!icu_matched]
pcre_matched <- grepl(punct(), p)
p[pcre_matched]
p[!pcre_matched]

# A grapheme is a character that can be defined by more than one code point
# PCRE does not recognise the concept.
x <- c("Chloe", "Chlo\u00e9", "Chlo\u0065\u0301")
stringi::stri_match_first_regex(x, "Chlo" %R% capture(grapheme()))

# newline() matches three types of line ending: \r, \n, \r\n.
# You can standardize line endings using
stringi::stri_replace_all_regex("foo\nbar\r\nbaz\rquux", NEWLINE, "\n")