stringr字符处理 - (jianshu.com)
dplyr表格操作 - (jianshu.com)
对于表格来说,一般行(row)代表不同的样本,列(column)代表不同角度的观测值。
tibble
是dplyr默认支持的一种表格类型,相比于R常规的data.frame
表格类型有些许不同。
本次基于dplyr
包,学习表格操作的知识点有:
1、表格数据统计
2、表格筛选
3、表格新增列
4、表格排序
5、表格的列名/行名
6、两个表格合并
library(tidyverse)
# -- Attaching packages ----------------------------------------------------- tidyverse 1.3.1 --
# √ ggplot2 3.3.5 √ purrr 0.3.4
# √ tibble 3.1.2 √ dplyr 1.0.7
# √ tidyr 1.1.3 √ stringr 1.4.0
# √ readr 2.0.0 √ forcats 0.5.1
# -- Conflicts -------------------------------------------------------- tidyverse_conflicts() --
# x dplyr::filter() masks stats::filter()
# x dplyr::lag() masks stats::lag()
0、示例数据
# Example data 1: mtcars, a base data.frame whose row names are the car models.
mtcars %>% head()
# mpg cyl disp hp drat wt qsec vs am gear carb
# Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
# Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
# Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
# Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
# Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Example data 2: iris. The built-in iris is a plain data.frame, so it is
# converted to a tibble here; the tibble-style printout below requires that.
iris %>% as_tibble() %>% head()
# # A tibble: 6 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3 1.4 0.2 setosa
# 3 4.7 3.2 1.3 0.2 setosa
# 4 4.6 3.1 1.5 0.2 setosa
# 5 5 3.6 1.4 0.2 setosa
# 6 5.4 3.9 1.7 0.4 setosa
1、表格数据统计
- 表格数据的统计一般指对感兴趣的列进行例如均值、最值等统计指标的计算。
- dplyr支持在表征分组信息列的基础上,对其它列进行统计;搭配管道符
%>%
操作尤其强大
关于管道操作符,可参考上一小节
1.1 group_by()
# group_by() returns a tibble that carries a "Groups" attribute
by_cyl <- mtcars %>% group_by(cyl)
by_cyl %>% head # grouped by cyl into 3 groups; note that the columns themselves are unchanged (the printout shows row numbers instead of car names)
# # A tibble: 6 x 11
# # Groups: cyl [3]
# mpg cyl disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
# 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
# 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
# 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
# 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
mtcars %>% group_by(cyl, am) # group by two categorical columns at once
# A tibble: 32 x 11
# Groups: cyl, am [6]
1.2 summarise()
# Simplest use: collapse the whole table into one summary row.
# The direct call and the piped call below are the same computation.
summarise(mtcars, mean_disp = mean(disp))
mtcars %>% summarise(mean_disp = mean(disp))
# mean_disp
# 1 230.7219
- 如上是最简单的用法。可以对分组后的tibble进行高效的统计操作;并且可以通过
?summarise
查看有哪些常用的统计函数
mtcars %>%
group_by(cyl) %>% # summarise within each group, then stack the per-group results row-wise
summarise(mean = mean(disp), n = n())
# # A tibble: 3 x 3
# cyl mean n
# <dbl> <dbl> <int>
# 1 4 105. 11
# 2 6 183. 7
# 3 8 353. 14
# n() gives the number of rows in each group, similar to table(). Adding it to grouped summaries is recommended, to keep an overview of the data.
# A summary expression may return several values per group; each group then contributes several rows (two per cyl level below).
# NOTE(review): dplyr >= 1.1.0 deprecates multi-row results from summarise() in favour of reframe() - confirm against the installed version.
mtcars %>%
group_by(cyl) %>%
summarise(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75), test = c("aa","bb"))
# # A tibble: 6 x 4
# # Groups: cyl [3]
# cyl qs prob test
# <dbl> <dbl> <dbl> <chr>
# 1 4 78.8 0.25 aa
# 2 4 121. 0.75 bb
# 3 6 160 0.25 aa
# 4 6 196. 0.75 bb
# 5 8 302. 0.25 aa
# 6 8 390 0.75 bb
# Combine with across() to summarise several columns at once
iris %>%
group_by(Species) %>% # the result is automatically a tibble
summarise(across(starts_with("Sepal"), list(mean = mean, sd = sd)))
# Same idea with a single function; .names controls the output column names
iris %>%
group_by(Species) %>%
summarise(across(starts_with("Sepal"), mean, .names = "mean_{.col}"))
1.3 n()
与 count()
# The two commands below are equivalent
mtcars %>% count(cyl)
mtcars %>%
  group_by(cyl) %>%
  summarise(n = n())
# # A tibble: 3 x 2
# cyl n
# <dbl> <int>
# 1 4 11
# 2 6 7
# 3 8 14
# More advanced usage
# (TRUE is spelled out: T is an ordinary variable that can be reassigned)
mtcars %>% count(cyl, sort = TRUE) # order by count, descending
mtcars %>% count(cyl, vs) # two grouping variables
mtcars %>% count(qsec_int = round(qsec)) # compute a new column and count by it
mtcars %>% add_count(cyl) # keep the original table and append the count of each row's level
2、表格筛选
2.1 filter()
筛选行
- 按列的属性值筛选感兴趣的样本
mtcars %>% filter(cyl == 6)
mtcars %>% filter(cyl == 6, disp > 200) # AND (comma-separated conditions)
mtcars %>% filter(cyl == 6 & disp > 200) # AND
mtcars %>% filter(cyl == 6 | disp > 200) # OR
# Combine with group_by() to filter within each group.
# This is useful when the criterion is computed over a whole group.
mtcars %>%
group_by(vs) %>%
filter(disp == max(disp)) # within each vs group, keep the row(s) with the largest disp
mtcars %>%
group_by(vs) %>%
filter(disp > median(disp)) # within each vs group, keep rows whose disp is above the group median
# top_n() conveniently selects the top X / bottom X rows of each group
# top_n(x, n, wt)
# NOTE(review): the dplyr reference marks top_n()/top_frac() as superseded by slice_max()/slice_min() - consider those for new code.
mtcars %>%
group_by(vs) %>%
top_n(3, wt = disp) # top 3 by disp
mtcars %>%
group_by(vs) %>%
top_n(-3, wt = disp) # bottom 3 by disp (negative n)
# top_frac(x, n, wt)
2.2 distinct()
与n_distinct
行去重
# Build a small table that contains repeated rows.
# No seed is set, so the values drawn by sample() - and therefore the
# printout below - differ from run to run.
# `replace` is spelled out in full instead of relying on partial matching.
df <- tibble(
  x = sample(3, 10, replace = TRUE),
  y = sample(3, 10, replace = TRUE)
)
# # A tibble: 10 x 2
# x y
# <int> <int>
# 1 3 1
# 2 1 1
# 3 2 1
# 4 1 2
# 5 3 2
# 6 3 2
# 7 2 2
# 8 2 3
# 9 3 1
# 10 3 3
nrow(df)
df %>% n_distinct() # number of fully unique rows
# df %>% distinct() %>% nrow()
nrow(distinct(df)) # by default, rows identical in every column are dropped
nrow(distinct(df, x)) # de-duplicate on the given column(s) only
2.3 挑选列
mtcars %>% select(mpg)
mtcars %>% select(mpg, am)
mtcars %>% select(mpg:hp)
mtcars %>% select(starts_with("c")) # column names starting with "c"
mtcars %>% select(contains("p")) # column names containing "p"
mtcars %>% select(starts_with("c") | contains("p") | vs)
mtcars %>% select(!mpg) # everything except mpg
mtcars %>% select(-mpg) # everything except mpg
mtcars %>% select(hp, everything()) # move the given column to the front
3、表格新增列
3.1 mutate
在原有表格基础上增添新的列
# Compute new columns from existing ones (a new column can be reused immediately, as mass2 is below)
starwars %>%
select(name, mass) %>%
mutate(
mass2 = mass * 2,
mass2_squared = mass2 * mass2
)
# Modify existing columns
starwars %>%
select(name, height, mass, homeworld) %>%
mutate(
mass = NULL, # assigning NULL removes the column
height = height * 0.0328084 # overwrite an existing column in place
)
# Apply the same operation to many columns at once
starwars %>%
select(name, homeworld, species) %>%
mutate(across(!name, as.factor))
iris %>%
as_tibble() %>%
mutate(across(c(Sepal.Length, Sepal.Width), round))
# When an expression uses an aggregate such as mean(), grouping changes the result:
# ungrouped, mean(mass) is taken over the whole table ...
starwars %>%
select(name, mass, species) %>%
mutate(mass_norm = mass / mean(mass, na.rm = TRUE))
# ... grouped, mean(mass) is taken within each species
starwars %>%
select(name, mass, species) %>%
group_by(species) %>%
mutate(mass_norm = mass / mean(mass, na.rm = TRUE))
# Add a column holding the rank within each group
starwars %>%
select(name, mass, homeworld) %>%
group_by(homeworld) %>%
mutate(rank = min_rank(mass)) # rank within group; ascending by default, i.e. the smallest value gets rank 1
starwars %>%
select(name, mass, homeworld) %>%
group_by(homeworld) %>%
mutate(rank = min_rank(desc(mass))) # rank within group, descending, i.e. the largest value gets rank 1
### About the different ranking functions ###
x <- c(5, 1, 3, 2, 2)
rank(x) # base R: ties receive the average of their ranks
#[1] 5.0 1.0 4.0 2.5 2.5
row_number(x) # no duplicated ranks; ties are broken by position (the earlier element gets the smaller rank), which may be unfair
#[1] 5 1 4 2 3
min_rank(x) # ties share the same rank, and a gap follows the tied ranks
#[1] 5 1 4 2 2
dense_rank(x) # ties share the same rank, with no gaps
#[1] 4 1 3 2 2
percent_rank(x) # a number between 0 and 1 computed by rescaling min_rank to [0, 1]
cume_dist(x) # a cumulative distribution function. Proportion of all values less than or equal to the current rank.
#################
# By default a new column is appended at the far right of the table
# Experimental: you can override with `.before` or `.after`
df <- tibble(x = 1, y = 2)
df %>% mutate(z = x + y)
df %>% mutate(z = x + y, .before = 3) # choose the target position by column number
df %>% mutate(z = x + y, .after = x) # place the new column right after the named column
?transmute # transmute() outputs only the newly created columns
df %>% transmute(z = x + y)
df %>% mutate(z = x + y, .keep = "none") # .keep = 'all' the default
4、表格排序
arrange()
mtcars %>% head
mtcars %>% arrange(disp) # ascending by disp (the default)
mtcars %>% arrange(desc(disp)) # desc(): descending order
arrange(mtcars, carb, disp) # ascending by carb first, then ascending by disp
# TRUE is spelled out: T is an ordinary variable that can be reassigned
mtcars %>% count(cyl, sort = TRUE) # counts in descending order
mtcars %>%
  count(cyl) %>%
  arrange(n) # counts in ascending order
5、表格的列名/行名
-
rename
: 修改列名
# Rename a column
# Note: this assignment creates a tibble named `iris` in the workspace
iris <- as_tibble(iris) # so it prints a little nicer
rename(iris, petal_length = Petal.Length) # new = old
-
rownames_to_column
、column_to_rownames
行名与列的转换
mtcars %>% head
# Turn the row names into a regular column named "car"
rownames_to_column(mtcars, var = "car") %>% head
# Same call, piped form
mtcars %>%
rownames_to_column(var = "car") %>% head
# Round trip: row names -> column "car" -> back to row names
mtcars %>%
rownames_to_column(var = "car") %>%
# turn the given column back into row names
column_to_rownames(var = "car") %>% head
6、两个表格合并
- 示例表格数据
# Three small data frames used by the join examples.
# df1: numeric ids 1..6 paired with the keys "a".."f".
df1 <- data.frame(
  x1 = as.numeric(1:6),
  y1 = letters[1:6]
)
# x1 y1
# 1 1 a
# 2 2 b
# 3 3 c
# 4 4 d
# 5 5 e
# 6 6 f
# df2: numeric ids 1..6 paired with the keys q, w, e, r, t, y.
df2 <- data.frame(
  x2 = as.numeric(1:6),
  y2 = strsplit("qwerty", split = "")[[1]]
)
# x2 y2
# 1 1 q
# 2 2 w
# 3 3 e
# 4 4 r
# 5 5 t
# 6 6 y
# df3: four rows with the keys a, s, d, f.
df3 <- data.frame(
  x3 = c(2, 4, 5, 6),
  y3 = strsplit("asdf", split = "")[[1]]
)
# x3 y3
# 1 2 a
# 2 4 s
# 3 5 d
# 4 6 f
根据不同的分析目的,有多种合并方法~
6.1、 内连接
- 取交集,即保留同时在两个表中的观测
# Base R counterpart: merge(df1, df3, by.x = "y1", by.y = "y3")
# Keys are matched as df1$y1 == df3$y3; only "a", "d" and "f" occur in both tables
inner_join(df1, df3, by=c("y1"="y3"))
# x1 y1 x3
# 1 1 a 2
# 2 4 d 5
# 3 6 f 6
注意:如果两个表格中用于匹配的列名相同,合并时直接指定该列名即可,例如
inner_join(df1_1, df3_1, by="y")
,merge(df1_1, df3_1, by = "y")
,下同~
6.2、外连接
- (1)左连接--保留左边表格的所有观测,缺失值用
NA
值代替
# Base R counterpart: merge(df1, df3, by.x = "y1", by.y = "y3", all.x = TRUE)
# Every row of df1 is kept; x3 is NA where df3 has no matching key
left_join(df1, df3, by=c("y1"="y3"))
# x1 y1 x3
# 1 1 a 2
# 2 2 b NA
# 3 3 c NA
# 4 4 d 5
# 5 5 e NA
# 6 6 f 6
- (2)右连接--保留右边表格的所有观测,缺失值用
NA
值代替
# Base R counterpart: merge(df1, df2, by.x = "y1", by.y = "y2", all.y = TRUE)
# Every row of df2 is kept; only the key "e" also exists in df1, so x1 is NA elsewhere
right_join(df1, df2, by=c("y1"="y2"))
# x1 y1 x2
# 1 5 e 3
# 2 NA q 1
# 3 NA w 2
# 4 NA r 4
# 5 NA t 5
# 6 NA y 6
- (3)全连接--保留两个表格里的所有观测,缺失值用
NA
值代替
# Base R counterpart: merge(df1, df2, by.x = "y1", by.y = "y2", all.x = TRUE, all.y = TRUE)
# Rows from both tables are kept; columns without a match are filled with NA
full_join(df1, df2, by=c("y1"="y2"))
# x1 y1 x2
# 1 1 a NA
# 2 2 b NA
# 3 3 c NA
# 4 4 d NA
# 5 5 e 3
# 6 6 f NA
# 7 NA q 1
# 8 NA w 2
# 9 NA r 4
# 10 NA t 5
# 11 NA y 6