CHAPTER 3 Data Transformation with dplyr
library(nycflights13)
library(tidyverse)
查看冲突信息发现dplyr与基本包的函数冲突,如想用基本包的函数,可以这样写:stats::filter(),stats::lag()。
本次演示数据为nycflights13::flights,包括336,776 flights that departed from New York City in 2013,数据来自US Bureau of Transportation Statistics。
查看具体信息:
?flights
class(flights)
dim(flights)
head(flights)
View(flights)
可以发现该数据是一个Tibbles,属于数据框,但是在tidyverse里更方便。
dplyr包的核心函数:
- filter 按行筛选
- arrange 给行排序
- select 按列筛选
- mutate 根据原有列生成新列
- summarize 摘要统计
以上函数都可以与group_by()连用,用于分组执行。
执行模板为:第一个参数为数据框,后面的参数为要做的事情,然后返回新数据框。
Filter Rows with filter()
filter(flights, month == 1, day == 1)
# 如果想保存结果,需要另外赋值给一个变量
jan1 <- filter(flights, month == 1, day == 1)
# 如果想保存同时打印,需要在外层用括号包裹起来。
(jan1 <- filter(flights, month == 1, day == 1))
Comparisons
比较运算符:>, >=, <, <=, != (not equal), and == (equal)。
几个易犯的错误:
- 判断相等是“==” 而不是“=”;
- 浮点型和整型数据不相等,用near判断。
sqrt(2) ^ 2 == 2
#> [1] FALSE
1/49 * 49 == 1
#> [1] FALSE
near(sqrt(2) ^ 2, 2)
#> [1] TRUE
near(1 / 49 * 49, 1)
#> [1] TRUE
Logical Operators
或 | ,与 & ,非 !
# 找出11月或12月出发的航班
filter(flights, month == 11 | month == 12)
用管道符简化选择 x%in%y,指x是y中的一个
(nov_dec <- filter(flights, month %in% c(11,12)))
其他简化操作:!(x & y)等价于!x | !y,!(x | y)等价于!x & !y
# 这两个结果相同
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)
Missing Values 缺失值
缺失值用NAs表示 (“not availables”)
NULL表示空值,无
NA > 5
#> [1] NA
10 == NA
#> [1] NA
NA + 10
#> [1] NA
NA == NA
#> [1] NA
is.na()检查是否是缺失值
注意filter只选择TRUE的值,FALSE 和 NA 的会排除掉,
如果想保留缺失值
df <- tibble(x=c(1,NA,3))
filter(df,x > 1)
#> # A tibble: 1 × 1
#> x
#>
#> 1 3
filter(df, is.na(x) | x > 1)
#> # A tibble: 2 × 1
#> x
#>
#> 1 NA
#> 2 3
练习题:
1.a选择延误到达大于等于两个小时的航班
View(flights)
filter(flights, arr_delay >= 120)
b.The flights that flew to Houston were are those flights where the destination (dest) is either “IAH” or “HOU”.
filter(flights,dest=='IAH' | dest=='HOU')
# 或者用%in% 选择
filter(flights,dest %in% c('IAH','HOU'))
c. Were operated by United, American, or Delta
filter(flights, carrier %in% c("AA", "DL", "UA"))
d.Departed in summer (July, August, and September)
filter(flights, month >= 7, month <= 9)
e. Arrived more than two hours late, but didn’t leave late
filter(flights, dep_delay <= 0, arr_delay > 120)
f. Were delayed by at least an hour, but made up over 30 minutes in flight
filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)
g. Departed between midnight and 6 a.m. (inclusive)
filter(flights, dep_time <= 600 | dep_time == 2400)
- between()的作用
between(x, left, right) 与 x >= left & x <= right 相同
filter(flights, between(month, 7, 9))
# month >= 7 & month <= 9
- missing dep_time 缺失值
filter(flights, is.na(dep_time))
- Why is NA ^ 0 not missing? Why is NA | TRUE not missing?
Why is FALSE & NA not missing? Can you figure out the general
rule? (NA * 0 is a tricky counterexample!)
NA ^ 0
#> [1] 1
NA | TRUE
#> [1] TRUE
NA & FALSE
#> [1] FALSE
NA | FALSE
#> [1] NA
NA & TRUE
#> [1] NA
NA * 0
#> [1] NA
Inf * 0
#> [1] NaN
-Inf * 0
#> [1] NaN
Arrange Rows with arrange() 对列排序
arrange(flights, year, month, day)
- desc()函数逆序排序
arrange(flights, desc(arr_delay))
df <- tibble(x = c(5, 2, NA))
arrange(df, x) #缺失值排到最后
arrange(df, desc(x))
Select Columns with select() 按列选择
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
select(flights, ends_with("y"))
- select支持正则表达式:
starts_with("abc"),ends_with("xyz") ,contains("ijk"),matches("(.)\1"),num_range("x", 1:3)
rename() 重命名变量
rename(flights, tail_num = tailnum)
select(flights, time_hour, air_time, everything())
练习题:
Exercise 5.4.1.1 Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.
select(flights, dep_time, dep_delay, arr_time, arr_delay)
select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")
select(flights, 4, 5, 6, 9)
select(flights, one_of(c("dep_time", "dep_delay", "arr_time", "arr_delay")))
variables <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
select(flights, one_of(variables))
select(flights, starts_with("dep_"), starts_with("arr_"))
select(flights, matches("^(dep|arr)_(time|delay)$"))
select(flights, ends_with("arr_time"), ends_with("dep_time"))
select(flights, contains("_time"), contains("arr_"))
Exercise 5.4.1.2 What happens if you include the name of a variable multiple times in a select() call?
# select忽略重复项,只选第一个。
select(flights, year, month, day, year, year)
# everything
select(flights, arr_delay, everything())
Exercise 5.4.1.3 What does the one_of() function do? Why might it be helpful in conjunction with this vector?
# one_of 可以将一个向量传入
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))
Exercise 5.4.1.4 Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?
select(flights, contains("TIME"))
# contains忽略了大小写,有个参数可以改变
select(flights, contains("TIME", ignore.case = FALSE))
Add New Variables with mutate() 添加新列
flights_sml <- select(flights,
year:day,
ends_with("delay"),
distance,
air_time
)
mutate(flights_sml,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60)
mutate(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
# transmute
transmute(flights,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
该函数支持输入函数,注意输入和输出必须为向量。常用创建函数运算为:
- 算数运算符:+ - * / ^
- 模运算符:%/% 取整,%% 取余
- 对数函数:log(),log2(),log10()
- 偏移函数lead(),lag(),帮助向左或者向右延伸一个变量
(x <- 1:10)
lead(x)
lag(x)
- 累加和积等 cumsum(), cumprod(), cummin(), cummax()
cumsum(x)
cummean(x)
- 逻辑比较:<, <=, >, >=, !=
- 排序:min_rank(),row_number(), dense_rank(), percent_rank(), cume_dist(),
and ntile()
5.6 Grouped summaries with summarise() 折叠数据框,一般与group_by()连用
by_day <- group_by(flights, year, month, day) # 设置分组
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE)) #summarize设置函数
Combining Multiple Operations with the Pipe
head(flights)
# 1. 对dest进行分组
by_dest <- group_by(flights, dest)
# 2.计算距离,平均延误时间,飞机数量
delay <- summarize(by_dest,
count = n(),dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
# 3.对数据进行过滤
delay <- filter(delay, count > 20, dest != "HNL")
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
# 用管道符 %>% 连接前后数据
delays <- flights %>%
group_by(dest) %>%
summarize(
count = n(),dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
Missing Values 的处理 na.rm 参数
flights %>%
group_by(year, month, day) %>%
summarize(mean = mean(dep_delay)) # 如果有一个NA,那么结果就为NA,需要先去掉
flights %>%
group_by(year, month, day) %>%
summarize(mean = mean(dep_delay, na.rm = TRUE))
# 可以先把数据中的空值去掉
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarize(mean = mean(dep_delay))
Counts
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(delay=mean(arr_delay))
ggplot(delays,aes(delay))+geom_freqpoly(binwidth=10)
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarize(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
ggplot(data = delays, mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)
delays %>%
filter(n > 25) %>%
ggplot(mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)
library(Lahman)
batting <- as_tibble(Lahman::Batting)
batters <- batting %>% group_by(playerID) %>%
summarize(ba=sum(H,na.rm=TRUE)/sum(AB,na.rm = TRUE),
ab = sum(AB,na.rm = TRUE))
batters %>% filter(ab>100) %>% ggplot(aes(ab,ba)) + geom_point() +geom_smooth(se=F)
batters %>% arrange(desc(ba))
# 常用的分组函数:mean(x),median(x)
not_cancelled %>% group_by(year,month,day) %>% summarize(
avg_delay1 = mean(arr_delay),
avg_delay2 = mean(arr_delay[arr_delay > 0])
)
# sd(x), 四分位数IQR(x), 中位值偏差mad(x)
not_cancelled %>%
group_by(dest) %>%
summarize(distance_sd = sd(distance)) %>%
arrange(desc(distance_sd))
# min(x), quantile(x, 0.25), max(x)
# When do the first and last flights leave each day?
not_cancelled %>%
group_by(year, month, day) %>%
summarize(
first = min(dep_time),
last = max(dep_time)
)
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
# 计算非空值sum(!is.na(x)),计算唯一值:n_distinct(x)
# Which destinations have the most carriers?
not_cancelled %>%
group_by(dest) %>%
summarize(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
# 计数:n(),和count()
not_cancelled %>%
count(dest)
not_cancelled %>%
count(tailnum, wt = distance)
# How many flights left before 5am? (these usually
# indicate delayed flights from the previous day)
not_cancelled %>%
group_by(year, month, day) %>%
summarize(n_early = sum(dep_time < 500))
# 也可以对逻辑值进行计数:如sum(x>10)代表数多少个TRUE,mean(x)计算其比例。
not_cancelled %>%
group_by(year, month, day) %>%
summarize(hour_perc = mean(arr_delay > 60))
按多个变量分组
daily <- group_by(flights, year, month, day)
(per_day <- summarize(daily, flights = n()))
(per_month <- summarize(per_day, flights = sum(flights)))
(per_year <- summarize(per_month, flights = sum(flights)))
flights %>% group_by(day) %>% summarize(mean(dep_time,na.rm = T))
# ungrouping 取消分组
daily %>%
ungroup() %>% # no longer grouped by date
summarize(flights = n()) # all flights
3.6.7 练习
(2) 找出另外一种方法,这种方法要可以给出与 not_cancelled %>% count(dest) 和 not_
cancelled %>% count(tailnum, wt = distance) 同样的输出(不能使用 count())。
not_canceled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>% count(dest)
not_cancelled %>% count(tailnum, wt = distance)
# 可以先分组再求每组的长度。
not_cancelled %>%
group_by(dest) %>%
summarise(n = length(dest))
not_cancelled %>%
group_by(dest) %>%
summarise(n = n())
not_cancelled %>%
group_by(tailnum) %>%
summarise(n = sum(distance))
3.7 分组新变量(和筛选器)
# 找出每个分组中最差的成员
flights_sml %>% group_by(year,month,day) %>% filter(rank(desc(arr_delay))<10)
#找出大于某个阈值的所有分组:
popular_dests <- flights %>% group_by(dest) %>% filter(n()>365)
popular_dests
#对数据进行标准化以计算分组指标
popular_dests %>%
filter(arr_delay > 0) %>%
mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
select(year:day, dest, arr_delay, prop_delay)
head(flights)
filter(flights,origin %>% c('IAH'))
阅读推荐:
生信技能树公益视频合辑:学习顺序是linux,r,软件安装,geo,小技巧,ngs组学!
B站链接:https://m.bilibili.com/space/338686099
YouTube链接:https://m.youtube.com/channel/UC67sImqK7V8tSWHMG8azIVA/playlists
生信工程师入门最佳指南:https://mp.weixin.qq.com/s/vaX4ttaLIa19MefD86WfUA