R字符串处理2

1 、连接字符串:str_c()

str_c(..., sep = "", collapse = NULL)

sep:连接两个字符之间插入的符号
collapse:若不为NULL,则用该字符把结果向量合并成一个字符串

示例

> str_c("Letter: ", letters)
 [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" "Letter: f" "Letter: g" "Letter: h"
 [9] "Letter: i" "Letter: j" "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o" "Letter: p"
[17] "Letter: q" "Letter: r" "Letter: s" "Letter: t" "Letter: u" "Letter: v" "Letter: w" "Letter: x"
[25] "Letter: y" "Letter: z"

> str_c("Letter", letters, sep = ": ")
 [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e" "Letter: f" "Letter: g" "Letter: h"
 [9] "Letter: i" "Letter: j" "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o" "Letter: p"
[17] "Letter: q" "Letter: r" "Letter: s" "Letter: t" "Letter: u" "Letter: v" "Letter: w" "Letter: x"
[25] "Letter: y" "Letter: z"

> str_c(letters, " is for", "...")
 [1] "a is for..." "b is for..." "c is for..." "d is for..." "e is for..." "f is for..." "g is for..."
 [8] "h is for..." "i is for..." "j is for..." "k is for..." "l is for..." "m is for..." "n is for..."
[15] "o is for..." "p is for..." "q is for..." "r is for..." "s is for..." "t is for..." "u is for..."
[22] "v is for..." "w is for..." "x is for..." "y is for..." "z is for..."

> str_c(letters[-26], " comes before ", letters[-1])
 [1] "a comes before b" "b comes before c" "c comes before d" "d comes before e" "e comes before f"
 [6] "f comes before g" "g comes before h" "h comes before i" "i comes before j" "j comes before k"
[11] "k comes before l" "l comes before m" "m comes before n" "n comes before o" "o comes before p"
[16] "p comes before q" "q comes before r" "r comes before s" "s comes before t" "t comes before u"
[21] "u comes before v" "v comes before w" "w comes before x" "x comes before y" "y comes before z"
> str_c(letters, collapse = "")
[1] "abcdefghijklmnopqrstuvwxyz"

> str_c(letters, collapse = ", ")
[1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"

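> str_c(c("a", NA, "b"), "-d")
[1] "a-d"  "NA-d" "b-d"
注:在 stringr 1.0.0 及以上版本中,缺失值会传播,上例的结果为 "a-d" NA "b-d";如需把 NA 当作字符串 "NA" 处理,请先对输入使用 str_replace_na()。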

2 、字符串计数:str_count()

str_count(string, pattern = "")

string:字符串
pattern:对哪个字符进行计数

示例

> fruit <- c("apple", "banana", "pear", "pineapple")
> str_count(fruit, "a")
[1] 1 3 1 1
> str_count(fruit, "p")
[1] 2 0 1 3
> str_count(fruit, "e")
[1] 1 0 1 2
> str_count(fruit, c("a", "b", "p", "p"))
[1] 1 1 1 3
> str_count(c("a.", "...", ".a.a"), ".")
[1] 2 3 4
> str_count(c("a.", "...", ".a.a"), fixed("."))
[1] 1 3 2

3 、字符串逻辑判断:str_detect()

str_detect(string, pattern = "")

string:字符串
pattern:对哪个字符进行逻辑判断

示例

> fruit <- c("apple", "banana", "pear", "pineapple")
> str_detect(fruit, "a")
[1] TRUE TRUE TRUE TRUE
> str_detect(fruit, "^a")
[1]  TRUE FALSE FALSE FALSE
> str_detect(fruit, "a$")
[1] FALSE  TRUE FALSE FALSE
> str_detect(fruit, "b")
[1] FALSE  TRUE FALSE FALSE
> str_detect(fruit, "[aeiou]")
[1] TRUE TRUE TRUE TRUE
> # Also vectorised over pattern
> str_detect("aecfg", letters)
 [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[17] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

4 、复制字符串:str_dup()

str_dup(string, times)

string:字符串
times:复制的次数

示例

> fruit <- c("apple", "pear", "banana")
> str_dup(fruit, 2)
[1] "appleapple"   "pearpear"     "bananabanana"
> str_dup(fruit, 1:3)
[1] "apple"              "pearpear"           "bananabananabanana"
> str_c("ba", str_dup("na", 0:5))
[1] "ba"           "bana"         "banana"       "bananana"     "banananana"   "bananananana"

5 、从字符串中提取匹配字符:str_extract()

str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)

string:字符串
pattern:要匹配的模式,默认按正则表达式解释;如需按字面字符匹配,请使用 fixed()

示例


> shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
> str_extract(shopping_list, "\\d")
[1] "4" NA  NA  "2"
> str_extract(shopping_list, "[a-z]+")
[1] "apples" "bag"    "bag"    "milk"  
> str_extract(shopping_list, "[a-z]{1,4}")
[1] "appl" "bag"  "bag"  "milk"
> str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
[1] NA     "bag"  "bag"  "milk"
> str_extract_all(shopping_list, "[a-z]+")
[[1]]
[1] "apples" "x"     

[[2]]
[1] "bag"   "of"    "flour"

[[3]]
[1] "bag"   "of"    "sugar"

[[4]]
[1] "milk" "x"   

> str_extract_all(shopping_list, "\\b[a-z]+\\b")
[[1]]
[1] "apples"

[[2]]
[1] "bag"   "of"    "flour"

[[3]]
[1] "bag"   "of"    "sugar"

[[4]]
[1] "milk"

> str_extract_all(shopping_list, "\\d")
[[1]]
[1] "4"

[[2]]
character(0)

[[3]]
character(0)

[[4]]
[1] "2"

6 、字符串的长度:str_length()

str_length(string)

string:字符串

示例


> str_length(letters)
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
> str_length(NA)
[1] NA
> str_length(factor("abc"))
[1] 3
> str_length(c("i", "like", "programming", NA))
[1]  1  4 11 NA
> # Two ways of representing a u with an umlaut
> u1 <- "\u00fc"
> u2 <- stringi::stri_trans_nfd(u1)
> # They print the same:
> u1
[1] "ü"
> u2
[1] "ü"
> # But have a different length
> str_length(u1)
[1] 1
> str_length(u2)
[1] 2

7 、找到匹配的字符串的位置:str_locate()

str_locate(string, pattern)
str_locate_all(string, pattern)

string:字符串
pattern:要匹配的模式,默认按正则表达式解释;如需按字面字符匹配,请使用 fixed()

示例

> fruit <- c("apple", "banana", "pear", "pineapple")
> str_locate(fruit, "$")
     start end
[1,]     6   5
[2,]     7   6
[3,]     5   4
[4,]    10   9
> str_locate(fruit, "a")
     start end
[1,]     1   1
[2,]     2   2
[3,]     3   3
[4,]     5   5
> str_locate(fruit, "e")
     start end
[1,]     5   5
[2,]    NA  NA
[3,]     2   2
[4,]     4   4
> str_locate(fruit, c("a", "b", "p", "p"))
     start end
[1,]     1   1
[2,]     1   1
[3,]     1   1
[4,]     1   1
> str_locate_all(fruit, "a")
[[1]]
     start end
[1,]     1   1

[[2]]
     start end
[1,]     2   2
[2,]     4   4
[3,]     6   6

[[3]]
     start end
[1,]     3   3

[[4]]
     start end
[1,]     5   5

> str_locate_all(fruit, "e")
[[1]]
     start end
[1,]     5   5

[[2]]
     start end

[[3]]
     start end
[1,]     2   2

[[4]]
     start end
[1,]     4   4
[2,]     9   9

> str_locate_all(fruit, "")
[[1]]
     start end
[1,]     1   0
[2,]     2   1
[3,]     3   2
[4,]     4   3
[5,]     5   4

[[2]]
     start end
[1,]     1   0
[2,]     2   1
[3,]     3   2
[4,]     4   3
[5,]     5   4
[6,]     6   5

[[3]]
     start end
[1,]     1   0
[2,]     2   1
[3,]     3   2
[4,]     4   3

[[4]]
      start end
 [1,]     1   0
 [2,]     2   1
 [3,]     3   2
 [4,]     4   3
 [5,]     5   4
 [6,]     6   5
 [7,]     7   6
 [8,]     8   7
 [9,]     9   8

8 、从字符串中提取匹配组:str_match()

str_match(string, pattern)
str_match_all(string, pattern)

string:字符串
pattern:要匹配的模式,默认按正则表达式解释;如需按字面字符匹配,请使用 fixed()

示例


> strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
+ "387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
+ "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
+ "Home: 543.355.3679")
> phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
> str_extract(strings, phone)
 [1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718" NA            
 [7] "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527" NA             "543.355.3679"
> str_match(strings, phone)
      [,1]           [,2]  [,3]  [,4]  
 [1,] "219 733 8965" "219" "733" "8965"
 [2,] "329-293-8753" "329" "293" "8753"
 [3,] NA             NA    NA    NA    
 [4,] "595 794 7569" "595" "794" "7569"
 [5,] "387 287 6718" "387" "287" "6718"
 [6,] NA             NA    NA    NA    
 [7,] "233.398.9187" "233" "398" "9187"
 [8,] "482 952 3315" "482" "952" "3315"
 [9,] "239 923 8115" "239" "923" "8115"
[10,] "579-499-7527" "579" "499" "7527"
[11,] NA             NA    NA    NA    
[12,] "543.355.3679" "543" "355" "3679"
> str_extract_all(strings, phone)
[[1]]
[1] "219 733 8965"

[[2]]
[1] "329-293-8753"

[[3]]
character(0)

[[4]]
[1] "595 794 7569"

[[5]]
[1] "387 287 6718"

[[6]]
character(0)

[[7]]
[1] "233.398.9187"

[[8]]
[1] "482 952 3315"

[[9]]
[1] "239 923 8115" "842 566 4692"

[[10]]
[1] "579-499-7527"

[[11]]
character(0)

[[12]]
[1] "543.355.3679"

> str_match_all(strings, phone)
[[1]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "219 733 8965" "219" "733" "8965"

[[2]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "329-293-8753" "329" "293" "8753"

[[3]]
character(0)

[[4]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "595 794 7569" "595" "794" "7569"

[[5]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "387 287 6718" "387" "287" "6718"

[[6]]
character(0)

[[7]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "233.398.9187" "233" "398" "9187"

[[8]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "482 952 3315" "482" "952" "3315"

[[9]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "239 923 8115" "239" "923" "8115"
[2,] "842 566 4692" "842" "566" "4692"

[[10]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "579-499-7527" "579" "499" "7527"

[[11]]
character(0)

[[12]]
     [,1]           [,2]  [,3]  [,4]  
[1,] "543.355.3679" "543" "355" "3679"

> x <- c("<a> <b>", "<a> <>", "<a>", "", NA)
> str_match(x, "<(.*?)> <(.*?)>")
     [,1]      [,2] [,3]
[1,] "<a> <b>" "a"  "b" 
[2,] "<a> <>"  "a"  ""  
[3,] NA        NA   NA  
[4,] NA        NA   NA  
[5,] NA        NA   NA  
> str_match_all(x, "<(.*?)>")
[[1]]
     [,1]  [,2]
[1,] "<a>" "a" 
[2,] "<b>" "b" 

[[2]]
     [,1]  [,2]
[1,] "<a>" "a" 
[2,] "<>"  ""  

[[3]]
     [,1]  [,2]
[1,] "<a>" "a" 

[[4]]
character(0)

[[5]]
character(0)

> str_extract(x, "<.*?>")
[1] "<a>" "<a>" "<a>" NA    NA   
> str_extract_all(x, "<.*?>")
[[1]]
[1] "<a>" "<b>"

[[2]]
[1] "<a>" "<>" 

[[3]]
[1] "<a>"

[[4]]
character(0)

[[5]]
character(0)

9 、字符串增加空字符:str_pad()

str_pad(string, width, side = c("left", "right", "both"), pad = " ")

string:字符向量
width:填充后字符串的最小宽度
side:左边,右边,还是两边增加空格
pad:用于填充的字符,默认为空格

示例

> str_pad("conan", 20, "left")
[1] "               conan"
> # 从右边补充空格,直到字符串长度为20
> str_pad("conan", 20, "right")
[1] "conan               "
> # 从左右两边各补充空格,直到字符串长度为20
> str_pad("conan", 20, "both")
[1] "       conan        "
> # 从左右两边各补充x字符,直到字符串长度为20
> str_pad("conan", 20, "both",'x')
[1] "xxxxxxxconanxxxxxxxx"

10 、替换字符串:str_replace()

str_replace(string, pattern, replacement)
str_replace_all(string, pattern, replacement)

string:字符向量
pattern:要匹配的模式,默认按正则表达式解释;如需按字面字符匹配,请使用 fixed()
replacement:用于替换的字符串,可用 \\1,\\2 等引用匹配到的分组

示例

> fruits <- c("one apple", "two pears", "three bananas")
> str_replace(fruits, "[aeiou]", "-")
[1] "-ne apple"     "tw- pears"     "thr-e bananas"
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
> str_replace(fruits, "([aeiou])", "")
[1] "ne apple"     "tw pears"     "thre bananas"
> str_replace(fruits, "([aeiou])", "\\1\\1")
[1] "oone apple"     "twoo pears"     "threee bananas"
> str_replace(fruits, "[aeiou]", c("1", "2", "3"))
[1] "1ne apple"     "tw2 pears"     "thr3e bananas"
> str_replace(fruits, c("a", "e", "i"), "-")
[1] "one -pple"     "two p-ars"     "three bananas"
> fruits <- c("one apple", "two pears", "three bananas")
> str_replace(fruits, "[aeiou]", "-")
[1] "-ne apple"     "tw- pears"     "thr-e bananas"
> str_replace_all(fruits, "[aeiou]", "-")
[1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
> str_replace_all(fruits, "([aeiou])", "")
[1] "n ppl"    "tw prs"   "thr bnns"
> str_replace_all(fruits, "([aeiou])", "\\1\\1")
[1] "oonee aapplee"      "twoo peeaars"       "threeee baanaanaas"
> str_replace_all(fruits, "[aeiou]", c("1", "2", "3"))
[1] "1n1 1ppl1"     "tw2 p22rs"     "thr33 b3n3n3s"
> str_replace_all(fruits, c("a", "e", "i"), "-")
[1] "one -pple"     "two p-ars"     "three bananas"

11 、分割字符串:str_split()

str_split(string, pattern, n = Inf, simplify = FALSE)
str_split_fixed(string, pattern, n)

string:字符向量
pattern:要匹配的模式,默认按正则表达式解释;如需按字面字符匹配,请使用 fixed()
n:最多分割成的段数,默认为Inf,即不限制
simplify:如果值为FALSE,返回由字符向量组成的列表;如果值为TRUE,返回字符矩阵

示例

> fruits <- c(
+ "apples and oranges and pears and bananas",
+ "pineapples and mangos and guavas"
+ )
> str_split(fruits, " and ")
[[1]]
[1] "apples"  "oranges" "pears"   "bananas"

[[2]]
[1] "pineapples" "mangos"     "guavas"

> str_split(fruits, " and ", n = 3)
[[1]]
[1] "apples"            "oranges"           "pears and bananas"

[[2]]
[1] "pineapples" "mangos"     "guavas"    

> str_split(fruits, " and ", n = 2)
[[1]]
[1] "apples"                        "oranges and pears and bananas"

[[2]]
[1] "pineapples"        "mangos and guavas"

> # If n greater than number of pieces, no padding occurs
> str_split(fruits, " and ", n = 5)
[[1]]
[1] "apples"  "oranges" "pears"   "bananas"

[[2]]
[1] "pineapples" "mangos"     "guavas"    

> # Use fixed to return a character matrix
> str_split_fixed(fruits, " and ", 3)
     [,1]         [,2]      [,3]               
[1,] "apples"     "oranges" "pears and bananas"
[2,] "pineapples" "mangos"  "guavas"           
> str_split_fixed(fruits, " and ", 4)
     [,1]         [,2]      [,3]     [,4]     
[1,] "apples"     "oranges" "pears"  "bananas"
[2,] "pineapples" "mangos"  "guavas" ""       

12 、截取字符串:str_sub()

str_sub(string, start = 1L, end = -1L)
str_sub(string, start = 1L, end = -1L) <- value

string:字符向量
start:开始位置
end: 结束位置

示例

> hw <- "Hadley Wickham"
> str_sub(hw, 1, 6)
[1] "Hadley"

> str_sub(hw, end = 6)
[1] "Hadley"

> str_sub(hw, 8, 14)
[1] "Wickham"

> str_sub(hw, 8)
[1] "Wickham"

> str_sub(hw, c(1, 8), c(6, 14))
[1] "Hadley"  "Wickham"

> str_sub(hw, -1)
[1] "m"

> str_sub(hw, -7)
[1] "Wickham"

> str_sub(hw, end = -7)
[1] "Hadley W"

> # Alternatively, you can pass in a two column matrix, as in the
> # output from str_locate_all
> pos <- str_locate_all(hw, "[aeio]")[[1]]
> str_sub(hw, pos)
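[1] "adley Wickham" "ey Wickham"    "ickham"        "am"            "adley Wickham" "ey Wickham"   
[7] "ickham"        "am"       
注:较新版本的 stringr 会把两列矩阵的第一列作为 start,第二列作为 end,此时 str_sub(hw, pos) 的结果与下面的 str_sub(hw, pos[, 1], pos[, 2]) 相同。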

> str_sub(hw, pos[, 1], pos[, 2])
[1] "a" "e" "i" "a"
> # Vectorisation

> str_sub(hw, seq_len(str_length(hw)))
 [1] "Hadley Wickham" "adley Wickham"  "dley Wickham"   "ley Wickham"    "ey Wickham"    
 [6] "y Wickham"      " Wickham"       "Wickham"        "ickham"         "ckham"         
[11] "kham"           "ham"            "am"             "m"       

> str_sub(hw, end = seq_len(str_length(hw)))
 [1] "H"              "Ha"             "Had"            "Hadl"           "Hadle"         
 [6] "Hadley"         "Hadley "        "Hadley W"       "Hadley Wi"      "Hadley Wic"    
[11] "Hadley Wick"    "Hadley Wickh"   "Hadley Wickha"  "Hadley Wickham"

> # Replacement form
> x <- "BBCDEF"
> str_sub(x, 1, 1) <- "A"; x
[1] "ABCDEF"

> str_sub(x, -1, -1) <- "K"; x
[1] "ABCDEK"

> str_sub(x, -2, -2) <- "GHIJ"; x
[1] "ABCDGHIJK"

> str_sub(x, 2, -2) <- ""; x
[1] "AK"

13 、删除字符串首尾的空白字符:str_trim()

str_trim(string, side = c("both", "left", "right"))

string:字符向量
side:删除左边,右边,还是两边的空白字符(空格,制表符,换行符等)

示例

> str_trim(" String with trailing and leading white space\t")
[1] "String with trailing and leading white space"

> str_trim("\n\nString with trailing and leading white space\n\n")
[1] "String with trailing and leading white space"

你可能感兴趣的:(R字符串处理2)