- 基本用法
- 正则表达式
- 使用正则表达式的stringr函数
1. 基本用法
- 字符串长度
str_length()
> str_length(c("a", "R for data science", NA))
[1] 1 18 NA
- 字符串组合
str_c()
# 将多个字符串组合为一个字符串
> str_c("x", "y")
[1] "xy"
# 缺失值可传染
> x <- c("abc", NA)
> str_c("|-", x, "-|")
[1] "|-abc-|" NA
# 使用 str_replace_na() 函数显示缺失值
> str_c("|-", str_replace_na(x), "-|")
[1] "|-abc-|" "|-NA-|"
# 可自动循环短向量
> str_c("prefix-", c("a", "b", "c"), "-suffix")
[1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
# 使用 collapse 参数将字符向量合并为单个字符串
> str_c(c("x", "y", "z"), collapse = ",")
[1] "x,y,z"
- 字符串取子集
str_sub()
# start 和 end 参数给出了提取字符串的位置
> x <- c("apple", "banana", "pear")
> str_sub(x, 1, 3)
[1] "app" "ban" "pea"
# 负数表示从后往前数
> str_sub(x, -3, -1)
[1] "ple" "ana" "ear"
# 通过赋值形式修改字符串
> str_sub(x, 1, 1) <- str_to_upper(str_sub(x, 1, 1))
> x
[1] "Apple" "Banana" "Pear"
- 字符串排序
str_sort()
或str_order()
> x <- c("apple", "eggplant", "banana")
> str_sort(x, locale = "en") # 英语
[1] "apple" "banana" "eggplant"
2. 正则表达式
2.1 特殊匹配
\
在正则表达式中作为转义字符,并且在字符串中也需要用\
进行转义。所以正则表达式字符串中需使用\\
匹配转义字符。
\\\\
对应 \
\\.
对应 .
\\d
对应 \d
2.2 字符类与字符选项
特殊模式可以匹配多个字符
.
匹配除换行符外所有字符
\d
匹配任意数字
\s
匹配任意 空白 字符(如空格、制表符、换行符)
[abc]
匹配a或b或c
[^abc]
匹配除a、b、c外的任何字符
[A-Za-z]
匹配所有大小写英文字符
- 需要在字符串中对
\
进行转义,如输入\\d
创建包含\d
的正则表达式
2.3 锚点
^
从字符串开头进行匹配
$
从字符串末尾进行匹配
\b
匹配单词的边界
2.4 重复
?
0次或1次
+
1次或多次
*
0次或多次
{n}
匹配n次
{n,m}
匹配n次到m次
- 默认将匹配尽量长的字符串,即贪婪匹配。在重复符号(如 +、*)后添加
?
将匹配尽量短的字符串
2.5 分组与回溯引用
\\1
匹配与第一个()
分组所匹配内容相同的文本
\\2
匹配与第二个()
分组所匹配内容相同的文本
3. 使用正则表达式的stringr函数
3.1 匹配检测
-
str_detect()
检测字符向量是否匹配某种模式,返回长度相同的逻辑向量
> x <- c("apple", "banana", "pear")
> str_detect(x, "e")
[1] TRUE FALSE TRUE
逻辑向量中FALSE值为0,TRUE值为1,使得sum() 和
mean()
函数可用于统计
> mean(str_detect(words, pattern = "[aeiou]$"))
[1] 0.2765306
当匹配的逻辑条件复杂时,既可以使用单个正则表达式,也可以用逻辑运算符将多个str_detect()
调用组合起来
# 如匹配不包含元音字母的单词(两种等价写法)
> no_vowels_1 <- !str_detect(words, "[aeiou]")
> no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
> identical(no_vowels_1, no_vowels_2)
[1] TRUE
-
str_subset()
选取出所匹配的子集
> str_subset(words, "x$")
[1] "box" "sex" "six" "tax"
结合filter()
操作,选出某一列中符合匹配模式的字符串
> df <- tibble(
+ word = words,
+ i = seq_along(words)
+ )
> df %>%
+ filter(str_detect(words, "x$"))
# A tibble: 4 x 2
word i
1 box 108
2 sex 747
3 six 772
4 tax 841
-
str_count()
返回字符串中匹配的数量
> str_count(x, "a")
[1] 1 3 1
str_count()
与 mutate()
配合使用
# 计算每个单词中所包含的元音和辅音字母的数量
> df %>%
+ mutate(
+ vowels = str_count(words, "[aeiou]"),
+ consonants = str_count(words, "[^aeiou]")
+ )
# A tibble: 980 x 4
word i vowels consonants
1 a 1 1 0
2 able 2 2 2
3 about 3 3 2
4 absolute 4 4 4
5 accept 5 2 4
6 account 6 3 4
7 achieve 7 4 3
8 across 8 2 4
9 act 9 1 2
10 active 10 3 3
# … with 970 more rows
- exercise
# words表格中哪个单词包含最大比例的元音字母
> df %>%
+ mutate(
+ vowels = str_count(word, "[aeiou]"),
+ length = str_length(word),
+ rate = vowels / length
+ ) %>%
+ arrange(desc(rate))
# A tibble: 980 x 5
word i vowels length rate
1 a 1 1 1 1
2 area 49 3 4 0.75
3 idea 412 3 4 0.75
4 age 22 2 3 0.667
5 ago 24 2 3 0.667
6 air 26 2 3 0.667
7 die 228 2 3 0.667
8 due 250 2 3 0.667
9 eat 256 2 3 0.667
10 europe 278 4 6 0.667
# … with 970 more rows
3.2 提取匹配内容
str_extract()
提取匹配的实际文本
# 以 stringr::sentences 为例
> head(sentences)
[1] "The birch canoe slid on the smooth planks."
[2] "Glue the sheet to the dark blue background."
[3] "It's easy to tell the depth of a well."
[4] "These days a chicken leg is a rare dish."
[5] "Rice is often served in round bowls."
[6] "The juice of lemons makes fine punch."
# 构建正则表达式 color_match
> colors <- c("red", "orange", "yellow", "green", "blue", "purple")
> color_match <- str_c(colors, collapse = "|")
> color_match
[1] "red|orange|yellow|green|blue|purple"
# 挑选出匹配的句子
> has_color <- str_subset(sentences, color_match)
> length(has_color)
# 提取匹配句子中包含的颜色
> matches <- str_extract(has_color, color_match)
> matches
[1] "blue" "blue" "red" "red" "red" "blue"
[7] "yellow" "red" "red" "green" "red" "red"
[13] "blue" "red" "red" "red" "red" "blue"
[19] "red" "blue" "red" "green" "red" "red"
[25] "red" "red" "red" "red" "green" "red"
[31] "green" "red" "purple" "green" "red" "red"
[37] "red" "red" "red" "blue" "red" "blue"
[43] "red" "red" "red" "red" "green" "green"
[49] "green" "red" "red" "yellow" "red" "orange"
[55] "red" "red" "red"
str_extract()
只提取第一个匹配,可使用str_extract_all()
得到所有匹配
- exercise
# 提取以ing结尾的所有单词
pattern1 <- "\\b[A-Za-z]+ing\\b"
str_extract(sentences, pattern1)
# 比较:
pattern2 <- "ing\\b"
str_extract(sentences, pattern2)
pattern1
与 pattern2
在 str_subset()
函数中都可以提取出包含ing结尾单词的句子, 但只有pattern1
可在 str_extract()
中提取出结尾为ing的单词
3.3 分组匹配
str_match()
可在提取出完整匹配后给出每个独立分组,结果返回矩阵
e.g. 提取跟在 a 或 the 后面的单词
# 构建正则表达式,将"单词"定义为至少包含1个非空格字符的字符序列(注意两个分组之间的空格)
> noun <- "(a|the) ([^ ]+)"
# 提取匹配到的字符串
> has_noun <- sentences %>%
+ str_subset(noun) %>%
+ head(10)
# 提取所匹配的具体内容
> has_noun %>% str_extract(noun)
[1] "the smooth" "the sheet" "the depth" "a chicken"
[5] "the parked" "the sun" "the huge" "the ball"
[9] "the woman" "a helps"
# 提取完整内容并显示分组的匹配结果
> has_noun %>% str_match(noun)
[,1] [,2] [,3]
[1,] "the smooth" "the" "smooth"
[2,] "the sheet" "the" "sheet"
[3,] "the depth" "the" "depth"
[4,] "a chicken" "a" "chicken"
[5,] "the parked" "the" "parked"
[6,] "the sun" "the" "sun"
[7,] "the huge" "the" "huge"
[8,] "the ball" "the" "ball"
[9,] "the woman" "the" "woman"
[10,] "a helps" "a" "helps"
-
str_match_all()
提取所有匹配
3.4 替换匹配的内容
str_replace(string, pattern, replacement)
使用固定字符串替换匹配内容
> x <- c("1 apple", "2 pear", "3 banana")
> str_replace(x, pattern = "[aeiou]", replacement = "-")
[1] "1 -pple" "2 p-ar" "3 b-nana"
> str_replace_all(x, "[aeiou]", "-")
[1] "1 -ppl-" "2 p--r" "3 b-n-n-"
# 可同时执行多个替换
> str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
[1] "one apple" "two pear" "three banana"
可利用回溯引用来插入匹配中的分组
> head(sentences)[1:5]
[1] "The birch canoe slid on the smooth planks."
[2] "Glue the sheet to the dark blue background."
[3] "It's easy to tell the depth of a well."
[4] "These days a chicken leg is a rare dish."
[5] "Rice is often served in round bowls."
> sentences %>%
+ str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
+ head(5)
[1] "The canoe birch slid on the smooth planks."
[2] "Glue sheet the to the dark blue background."
[3] "It's to easy tell the depth of a well."
[4] "These a days chicken leg is a rare dish."
[5] "Rice often is served in round bowls."
注意正则表达式中的空格!
3.5 拆分
str_split(string, pattern, n = Inf, simplify = FALSE)
默认结果返回列表形式
> sentences %>%
+ head(5) %>%
+ str_split(" ")
[[1]]
[1] "The" "birch" "canoe" "slid" "on" "the"
[7] "smooth" "planks."
[[2]]
[1] "Glue" "the" "sheet" "to"
[5] "the" "dark" "blue" "background."
[[3]]
[1] "It's" "easy" "to" "tell" "the" "depth" "of"
[8] "a" "well."
[[4]]
[1] "These" "days" "a" "chicken" "leg" "is"
[7] "a" "rare" "dish."
[[5]]
[1] "Rice" "is" "often" "served" "in" "round"
[7] "bowls."
# 设置 simplify = TRUE 返回矩阵
> sentences %>%
+ head(5) %>%
+ str_split(" ", simplify = TRUE)
[,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,] "The" "birch" "canoe" "slid" "on" "the" "smooth"
[2,] "Glue" "the" "sheet" "to" "the" "dark" "blue"
[3,] "It's" "easy" "to" "tell" "the" "depth" "of"
[4,] "These" "days" "a" "chicken" "leg" "is" "a"
[5,] "Rice" "is" "often" "served" "in" "round" "bowls."
[,8] [,9]
[1,] "planks." ""
[2,] "background." ""
[3,] "a" "well."
[4,] "rare" "dish."
[5,] "" ""
可通过 boundary()
函数指定按字符、行、句子或单词的边界进行拆分
> x <- "This is a sentence. This is another sentence"
> str_split(x, boundary("word"))[[1]]
[1] "This" "is" "a" "sentence" "This"
[6] "is" "another" "sentence"
3.6 定位匹配内容
str_locate(string, pattern)
可以给出每个匹配位置的开始和结束, 结合 str_sub()
函数提取或修改匹配的内容。