R-tidyr 数据整理

Three interrelated rules which make a dataset tidy:

  1. Each variable must have its own column.
  2. Each observation must have its own row.
  3. Each value must have its own cell.

1. pivot

  1. One variable might be spread across multiple columns. --> pivot_longer()
  2. One observation might be scattered across multiple rows. --> pivot_wider()

1.1 pivot_longer

pivot_longer(data, cols, names_to = "name", values_to = "value", values.drop.na = FALSE)
data 需要转换的数据
cols 进行转换的列, the columns to pivot are specified with dplyr::select() style notation.
names_to 列名转换为
values_to 值放置于
values_drop_na 去掉缺失值

> table4a
# A tibble: 3 x 3
  country     `1999` `2000`
*           
1 Afghanistan    745   2666
2 Brazil       37737  80488
3 China       212258 213766
> table4a %>% 
+   pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
# A tibble: 6 x 3
  country     year   cases
           
1 Afghanistan 1999     745
2 Afghanistan 2000    2666
3 Brazil      1999   37737
4 Brazil      2000   80488
5 China       1999  212258
6 China       2000  213766

# table4a中列名1999与2000都不符合命名标准,所以需要用反括号 ’ 引用

1.2 pivot_wider

pivot_wider(data, names_from = 'name', values_from = 'values')

> table2
# A tibble: 12 x 4
   country      year type            count
                      
 1 Afghanistan  1999 cases             745
 2 Afghanistan  1999 population   19987071
 3 Afghanistan  2000 cases            2666
 4 Afghanistan  2000 population   20595360
 5 Brazil       1999 cases           37737
 6 Brazil       1999 population  172006362
 7 Brazil       2000 cases           80488
 8 Brazil       2000 population  174504898
 9 China        1999 cases          212258
10 China        1999 population 1272915272
11 China        2000 cases          213766
12 China        2000 population 1280428583
> table2 %>%
+   pivot_wider(names_from = type, values_from = count)
# A tibble: 6 x 4
  country      year  cases population
                 
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583

exercise:

> preg <- tribble(
+   ~pregnant, ~male, ~female,
+   "yes",     NA,    10,
+   "no",      20,    12
+ )
> preg
# A tibble: 2 x 3
  pregnant  male female
        
1 yes         NA     10
2 no          20     12

> preg %>% 
+   preg %>% 
  pivot_longer(cols = 2:3,
               # cols = male:female,
               # cols = c("male","female"),
               names_to = "gender", 
               values_to = "count")
# A tibble: 4 x 3
  pregnant gender count
        
1 yes      male      NA
2 yes      female    10
3 no       male      20
4 no       female    12

2 separate and unite

2.1 separate

separate(data, col, into, sep=, convert, remove)
col 需要拆分的列
into = 拆分为
sep = 拆分位置(特定符号或制定位置)
convert 是否转换数据类型(默认FALASE)
remove 删除原有列(默认TRUE)

> table3
# A tibble: 6 x 3
  country      year rate             
*                     
1 Afghanistan  1999 745/19987071     
2 Afghanistan  2000 2666/20595360    
3 Brazil       1999 37737/172006362  
4 Brazil       2000 80488/174504898  
5 China        1999 212258/1272915272
6 China        2000 213766/1280428583

> table3 %>% 
+   separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE)
# A tibble: 6 x 4
  country      year  cases population
                 
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583
  • extract() uses regluar expression to capture groups and turn groups into multiple columns.

2.2 separate_rows

separate_rows(data, ..., sep = "[^[:alnum:].]+", convert = FALSE)

> teacher <- data.frame(
+   teacher = c("a", "b"),
+   class = c("1,2", "1,3,5")
+ )
> teacher 
  teacher class
1       a   1,2
2       b 1,3,5
> teacher %>% separate_rows("class", sep = ",", convert = TRUE)
  teacher class
1       a     1
2       a     2
3       b     1
4       b     3
5       b     5

2.3 unite

unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)

> table5
# A tibble: 6 x 4
  country     century year  rate             
*                        
1 Afghanistan 19      99    745/19987071     
2 Afghanistan 20      00    2666/20595360    
3 Brazil      19      99    37737/172006362  
4 Brazil      20      00    80488/174504898  
5 China       19      99    212258/1272915272
6 China       20      00    213766/1280428583

> table5 %>%
+   unite(col = new,century, year, sep = "")
# A tibble: 6 x 3
  country     new   rate             
                      
1 Afghanistan 1999  745/19987071     
2 Afghanistan 2000  2666/20595360    
3 Brazil      1999  37737/172006362  
4 Brazil      2000  80488/174504898  
5 China       1999  212258/1272915272
6 China       2000  213766/1280428583

你可能感兴趣的:(R-tidyr 数据整理)