stringr包字符串处理

这是stringr包中常用的字符串处理函数，字符串处理好了，简直是利器在手。

str_length获得字符串长度

> b<-c('abc','456','aini')
> str_length(b)
[1] 3 3 4
> length(b)
[1] 3

str_sub截断字符串

使用str_sub()访问单个字符或提取子串。它有三个参数：字符向量、起始位置和结束位置。两个位置都可以是正整数（从左边开始计数），也可以是负整数（从右边开始计数）。

x <- c("abcdef", "ghifjk")
str_sub(x, 3, 3)
#> [1] "c" "i"
str_sub(x, 2, -2)
#> [1] "bcde" "hifj"

str_sub可以修改字符

str_sub(x, 3, 3) <- "X"
x
#> [1] "abXdef" "ghXfjk"

str_pad通过添加空格将字符串填充到指定长度

x <- c("abc", "defghi")
str_pad(x, 10)
#> [1] "       abc" "    defghi"
str_pad(x, 10, "both")
#> [1] "   abc    " "  defghi  "
#str_pad不会将字符长度缩短
str_pad(x, 4)
#> [1] " abc"   "defghi"
x <- c("Short", "This is a long string")
x %>% 
  str_trunc(10) %>% 
  str_pad(10, "right")
#> [1] "Short     " "This is..."

str_trim与str_pad相反，删除空格

x <- c("  a   ", "b   ",  "   c")
str_trim(x)
#> [1] "a" "b" "c"
str_trim(x, "left")
#> [1] "a   " "b   " "c"

str_wrap对一段文字自动换行，使每一行的长度尽可能相近。

jabberwocky <- str_c(
  "`Twas brillig, and the slithy toves ",
  "did gyre and gimble in the wabe: ",
  "All mimsy were the borogoves, ",
  "and the mome raths outgrabe. "
)
cat(str_wrap(jabberwocky, width = 40))
#> `Twas brillig, and the slithy toves did
#> gyre and gimble in the wabe: All mimsy
#> were the borogoves, and the mome raths
#> outgrabe.

str_to_upper、str_to_lower转换大小写，str_to_title将每个单词首字母大写

x <- "I like horses."
str_to_upper(x)
#> [1] "I LIKE HORSES."
str_to_title(x)
#> [1] "I Like Horses."

str_to_lower(x)
#> [1] "i like horses."
# Turkish has two sorts of i: with and without the dot
str_to_lower(x, "tr")
#> [1] "ı like horses."

str_order()和str_sort()对字符向量排序

str_order和str_sort的区别在于前者返回排序后的索引（下标），后者返回排序后的实际值

x <- c("y", "i", "k")
str_order(x)
#> [1] 2 3 1
str_sort(x)
#> [1] "i" "k" "y"
# In Lithuanian, y comes between i and k
str_sort(x, locale = "lt")
#> [1] "i" "y" "k"

str_detect()和str_subset()检测字符串中是否存在某种匹配模式

str_detect()检测模式是否存在，并返回逻辑向量（类似于grepl()）。str_subset()返回字符向量中与正则表达式匹配的元素（类似于带value = TRUE的grep()）。

fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(fruit, "a")
[1] TRUE TRUE TRUE TRUE
str_detect(fruit, "^a")
[1]  TRUE FALSE FALSE FALSE
str_detect("aecfg", letters)
[1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[23] FALSE FALSE FALSE FALSE
> str_subset(fruit, "a")
[1] "apple"    "banana"   "pear"     "pinapple"
> str_subset(fruit, "a$")
[1] "banana"

str_count计算匹配数

fruit <- c("apple", "banana", "pear", "pineapple")
str_count(fruit, "a")
## [1] 1 3 1 1
str_count(fruit, "p")
## [1] 2 0 1 3
str_count(fruit, c("a", "b", "p", "p"))
## [1] 1 1 1 3

str_locate和str_locate_all()定位

str_locate()定位模式的第一个位置，并返回一个包含列start和end的数字矩阵。str_locate_all()查找所有匹配项，返回一个数字矩阵列表。类似于regexpr()和gregexpr()。

> x <- c("abcdef", "ghifjk")
> str_locate(x, "def")
     start end
[1,]     4   6
[2,]    NA  NA
> str_locate(x, "fjk")
     start end
[1,]    NA  NA
[2,]     4   6
str_locate_all(c("abcdefabc", "ghifjkabc"), "abc")
[[1]]
     start end
[1,]     1   3
[2,]     7   9

[[2]]
     start end
[1,]     7   9

str_extract和str_extract_all提取匹配字符串

str_extract()提取与第一个匹配项对应的文本，返回一个字符向量。str_extract_all()提取所有匹配项并返回字符向量列表。

shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
str_extract(shopping_list, "\\d")
 ## [1] "4" NA  NA  "2"
 str_extract(shopping_list, "[a-z]+")
## [1] "apples" "bag"    "bag"    "milk"
str_extract(shopping_list, "[a-z]{1,4}")
## [1] "appl" "bag"  "bag"  "milk"
str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
## [1] NA     "bag"  "bag"  "milk"
> str_extract_all(shopping_list, "[a-z]+")
[[1]]
[1] "apples" "x"     

[[2]]
[1] "bag"   "of"    "flour"

[[3]]
[1] "bag"   "of"    "sugar"

[[4]]
[1] "milk" "x"

str_replace和str_replace_all字符串替换

str_replace()替换第一个匹配的模式并返回一个字符向量。str_replace_all()替换所有匹配项。类似于sub()和gsub()。

> string<-str_replace('1989.03.24','\\.','-')
> string
[1] "1989-03.24"
> string<-str_replace_all('1989.03.24','\\.','-')
> string
[1] "1989-03-24"

str_split和str_split_fixed字符串分割

str_split_fixed()根据模式将字符串分割成固定数量的片段，并返回一个字符矩阵。str_split()将字符串分割成可变数量的片段，并返回一个字符向量列表。

str_split("a-b-c", "-")
#> [[1]]
#> [1] "a" "b" "c"
str_split_fixed("a-b-c", "-", n = 2)
#>      [,1] [,2] 
#> [1,] "a"  "b-c"