数据清洗篇之一些有用的知识点 | R for data science

数据清洗占了数据分析的80%的时间
本篇主要介绍tidyverse包中常用于数据清洗的一些函数:字符串处理,以及数据框的求交集、并集、差集等

Talk is cheap, show you the code. 改自Linux之父Linus Torvalds

拆分以及合并某列的数据

library("tidyverse")
table3
#> # A tibble: 6 x 3
#>   country      year rate             
#> * <chr>       <int> <chr>            
#> 1 Afghanistan  1999 745/19987071     
#> 2 Afghanistan  2000 2666/20595360    
#> 3 Brazil       1999 37737/172006362  
#> 4 Brazil       2000 80488/174504898  
#> 5 China        1999 212258/1272915272
#> 6 China        2000 213766/1280428583


table3 %>% 
  separate(rate, into = c("cases", "population"))
#> # A tibble: 6 x 4
#>   country      year cases  population
#>   <chr>       <int> <chr>  <chr>     
#> 1 Afghanistan  1999 745    19987071  
#> 2 Afghanistan  2000 2666   20595360  
#> 3 Brazil       1999 37737  172006362 
#> 4 Brazil       2000 80488  174504898 
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583


# separate()是以non-alphanumeric character(非数字或字母的字符)分割,但你也可以手动以sep指定分隔符。还支持正则,比如\D是非数字,注意R里面需要转义
 table3 %>% 
+     separate(rate, into = c("cases", "population"), sep = "\\D")
# A tibble: 6 × 4
  country      year cases  population
                 
1 Afghanistan  1999 745    19987071  
2 Afghanistan  2000 2666   20595360  
3 Brazil       1999 37737  172006362 
4 Brazil       2000 80488  174504898 
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583
(常见通用正则:
 \d 表示数字0-9, 
\D 表示非数字,
\s 表示空白字符(包括空格、制表符、换行符等),
\S 表示非空白字符,
\w 表示单词字符(字母、数字和下划线),
\W 表示非单词字符,
\< 和 \> 分别匹配单词的开头和结尾(词边界))


# 还可以给sep传递整数,代表从哪里开始截断;1代表最左边,-1代表最右边
table3 %>% 
  separate(year, into = c("century", "year"), sep = 2)
#> # A tibble: 6 x 4
#>   country     century year  rate             
#>   <chr>       <chr>   <chr> <chr>            
#> 1 Afghanistan 19      99    745/19987071     
#> 2 Afghanistan 20      00    2666/20595360    
#> 3 Brazil      19      99    37737/172006362  
#> 4 Brazil      20      00    80488/174504898  
#> 5 China       19      99    212258/1272915272
#> 6 China       20      00    213766/1280428583


# 有个参数extra,用于指定分割后多出来的片段怎么处理:默认是"warn"(丢弃多余片段并给出警告),"drop"(直接丢弃,不警告),或者"merge"(把多余片段合并到最后一列)
# 1, warn,默认的
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
+     separate(x, c("one", "two", "three"))
# A tibble: 3 × 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f    
3 h     i     j    
Warning message:
Expected 3 pieces. Additional pieces discarded in 1 rows [2]. 
# 2. drop
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
+     separate(x, c("one", "two", "three"), extra = "drop")
# A tibble: 3 × 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f    
3 h     i     j
# 3. merge
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
+     separate(x, c("one", "two", "three"), extra = "merge")
# A tibble: 3 × 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f,g  
3 h     i     j 


# 还有个参数是fill,用于片段不足时的填充:默认"warn"在右侧填充NA并给出警告;"right"在右侧填充且不警告;"left"在左侧填充
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
+     separate(x, c("one", "two", "three"))
# A tibble: 3 × 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     NA   
3 f     g     i    
Warning message:
Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2]. 
 tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
+     separate(x, c("one", "two", "three"), fill = "right")
# A tibble: 3 × 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     NA   
3 f     g     i    
 tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
+     separate(x, c("one", "two", "three"), fill = "left")
# A tibble: 3 × 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 NA    d     e    
3 f     g     i 

unite()是separate()的反向操作,即把多列合并为一列

table5 %>% 
  unite(new, century, year)
#> # A tibble: 6 x 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 19_99 745/19987071     
#> 2 Afghanistan 20_00 2666/20595360    
#> 3 Brazil      19_99 37737/172006362  
#> 4 Brazil      20_00 80488/174504898  
#> 5 China       19_99 212258/1272915272
#> 6 China       20_00 213766/1280428583


# 默认以"_"连接
# 可以用sep指定连接符
table5 %>% 
  unite(new, century, year, sep = "")
#> # A tibble: 6 x 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 1999  745/19987071     
#> 2 Afghanistan 2000  2666/20595360    
#> 3 Brazil      1999  37737/172006362  
#> 4 Brazil      2000  80488/174504898  
#> 5 China       1999  212258/1272915272
#> 6 China       2000  213766/1280428583

取交集、并集和差集

df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)
df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)


intersect(df1, df2) # 取交集
#> # A tibble: 1 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     1

# Note that we get 3 rows, not 4


union(df1, df2) # 取并集(重复的行只保留一份)
#> # A tibble: 3 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     1
#> 2     2     1
#> 3     1     2


setdiff(df1, df2) # 取在df1中有的,df2中没有的
#> # A tibble: 1 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     2     1
setdiff(df2, df1) # 取在df2中有的,df1中没有的
#> # A tibble: 1 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     2

连接字符串

str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"


# sep可指定分隔符
str_c("x", "y", sep = ", ")
#> [1] "x, y"


# 如果想要显示NA,需要用str_replace_na()函数
x <- c("abc", NA)
str_c("|-", x, "-|")
#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
#> [1] "|-abc-|" "|-NA-|" 


# 自动循环连接
str_c("prefix-", c("a", "b", "c"), "-suffix")
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"


# 长度为零的自动去除
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
#> [1] "Good morning Hadley."
# 要是换成TRUE:
birthday <- TRUE
 str_c(
+     "Good ", time_of_day, " ", name,
+     if (birthday) " and HAPPY BIRTHDAY",
+     "."
+ )
[1] "Good morning Hadley and HAPPY BIRTHDAY."


# 合并向量需要用collapse参数
str_c(c("x", "y", "z"), collapse = ", ")
#> [1] "x, y, z"

paste和paste0的区别?paste默认以空格连接,paste0则和str_c一样,默认直连

paste("foo", "bar")
#> [1] "foo bar"
paste0("foo", "bar")
#> [1] "foobar"
str_c("foo", "bar")
[1] "foobar"


# 但是str_c遇到NA就直接返回NA
str_c("foo",NA)
[1] NA

str_trim()用于去除空格,和python中的str.strip()一样,但是多了左边或右边的选项。与其相反的是str_pad()

str_trim(" abc ")
#> [1] "abc"
str_trim(" abc ", side = "left")
#> [1] "abc "
str_trim(" abc ", side = "right")
#> [1] " abc"


# 参数str_pad(string, width, side = c("left", "right", "both"), pad = " ")
str_pad("abc", width = 5, side = "both", pad = "i")
[1] "iabci"

写个函数把c("a", "b", "c")转化为字符串a, b, and c

# Join the elements of `x` into one English-style list string.
#
# x:     vector of items to join.
# delim: string appended to every element except the last when there are
#        three or more elements (default ",").
#
# Returns "" for zero-length input, `x` itself for a single element,
# "a and b" for two elements, and "a, b, and c" style output otherwise.
str_commasep <- function(x, delim = ",") {
  count <- length(x)

  # Nothing to join.
  if (count == 0) {
    return("")
  }

  # A single element is handed back untouched.
  if (count == 1) {
    return(x)
  }

  # Exactly two elements: plain "a and b", no delimiter.
  if (count == 2) {
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }

  # Three or more: delimiter after every element but the last ...
  leading <- str_c(x[-count], delim)
  # ... and "and" in front of the last one.
  closing <- str_c("and", x[[count]], sep = " ")

  # Glue all pieces together with single spaces.
  str_c(c(leading, closing), collapse = " ")
}
str_commasep("")
#> [1] ""
str_commasep("a")
#> [1] "a"
str_commasep(c("a", "b"))
#> [1] "a and b"
str_commasep(c("a", "b", "c"))
#> [1] "a, b, and c"
str_commasep(c("a", "b", "c", "d"))
#> [1] "a, b, c, and d"

OK,本篇完

ref: https://r4ds.had.co.nz/tidy-data.html

你可能感兴趣的:(数据清洗篇之一些有用的知识点 | R for data science)