Three interrelated rules which make a dataset tidy:
- Each variable must have its own column.
- Each observation must have its own row.
- Each value must have its own cell.
1. pivot
- One variable might be spread across multiple columns. -->
pivot_longer()
- One observation might be scattered across multiple rows. -->
pivot_wider()
1.1 pivot_longer
pivot_longer(data, cols, names_to = "name", values_to = "value", values.drop.na = FALSE)
data
需要转换的数据
cols
进行转换的列, the columns to pivot are specified with dplyr::select()
style notation.
names_to
列名转换为
values_to
值放置于
values_drop_na
去掉缺失值
> table4a
# A tibble: 3 x 3
country `1999` `2000`
*
1 Afghanistan 745 2666
2 Brazil 37737 80488
3 China 212258 213766
> table4a %>%
+ pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
# A tibble: 6 x 3
country year cases
1 Afghanistan 1999 745
2 Afghanistan 2000 2666
3 Brazil 1999 37737
4 Brazil 2000 80488
5 China 1999 212258
6 China 2000 213766
# table4a中列名1999与2000都不符合命名标准,所以需要用反括号 ’ 引用
1.2 pivot_wider
pivot_wider(data, names_from = 'name', values_from = 'values')
> table2
# A tibble: 12 x 4
country year type count
1 Afghanistan 1999 cases 745
2 Afghanistan 1999 population 19987071
3 Afghanistan 2000 cases 2666
4 Afghanistan 2000 population 20595360
5 Brazil 1999 cases 37737
6 Brazil 1999 population 172006362
7 Brazil 2000 cases 80488
8 Brazil 2000 population 174504898
9 China 1999 cases 212258
10 China 1999 population 1272915272
11 China 2000 cases 213766
12 China 2000 population 1280428583
> table2 %>%
+ pivot_wider(names_from = type, values_from = count)
# A tibble: 6 x 4
country year cases population
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
exercise:
> preg <- tribble(
+ ~pregnant, ~male, ~female,
+ "yes", NA, 10,
+ "no", 20, 12
+ )
> preg
# A tibble: 2 x 3
pregnant male female
1 yes NA 10
2 no 20 12
> preg %>%
+ preg %>%
pivot_longer(cols = 2:3,
# cols = male:female,
# cols = c("male","female"),
names_to = "gender",
values_to = "count")
# A tibble: 4 x 3
pregnant gender count
1 yes male NA
2 yes female 10
3 no male 20
4 no female 12
2 separate and unite
2.1 separate
separate(data, col, into, sep=, convert, remove)
col
需要拆分的列
into =
拆分为
sep =
拆分位置(特定符号或制定位置)
convert
是否转换数据类型(默认FALASE)
remove
删除原有列(默认TRUE)
> table3
# A tibble: 6 x 3
country year rate
*
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583
> table3 %>%
+ separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE)
# A tibble: 6 x 4
country year cases population
1 Afghanistan 1999 745 19987071
2 Afghanistan 2000 2666 20595360
3 Brazil 1999 37737 172006362
4 Brazil 2000 80488 174504898
5 China 1999 212258 1272915272
6 China 2000 213766 1280428583
-
extract()
uses regluar expression to capture groups and turn groups into multiple columns.
2.2 separate_rows
separate_rows(data, ..., sep = "[^[:alnum:].]+", convert = FALSE)
> teacher <- data.frame(
+ teacher = c("a", "b"),
+ class = c("1,2", "1,3,5")
+ )
> teacher
teacher class
1 a 1,2
2 b 1,3,5
> teacher %>% separate_rows("class", sep = ",", convert = TRUE)
teacher class
1 a 1
2 a 2
3 b 1
4 b 3
5 b 5
2.3 unite
unite(data, col, ..., sep = "_", remove = TRUE, na.rm = FALSE)
> table5
# A tibble: 6 x 4
country century year rate
*
1 Afghanistan 19 99 745/19987071
2 Afghanistan 20 00 2666/20595360
3 Brazil 19 99 37737/172006362
4 Brazil 20 00 80488/174504898
5 China 19 99 212258/1272915272
6 China 20 00 213766/1280428583
> table5 %>%
+ unite(col = new,century, year, sep = "")
# A tibble: 6 x 3
country new rate
1 Afghanistan 1999 745/19987071
2 Afghanistan 2000 2666/20595360
3 Brazil 1999 37737/172006362
4 Brazil 2000 80488/174504898
5 China 1999 212258/1272915272
6 China 2000 213766/1280428583