总结之前经常用到的 `ggplot2::geom_bar()` 绘制柱状图的用法。
- 1、基础用法;
- 2、`position=` 参数:调整两种分组的展示方式;
- 3、`stat=` 参数:设置频数统计方式;
- 4、`geom_text()` 添加频数注释;
- 5、双向柱状图;
- 6、组内排序;
- 7、柱状图+误差棒。
1、基础用法
library(ggplot2)
library(patchwork)
library(carData) #示例数据
# Preview the first rows of the Salaries example data (faculty salary records).
head(Salaries) # faculty salary data
# rank discipline yrs.since.phd yrs.service sex salary
# 1 Prof B 19 18 Male 139750
# 2 Prof B 20 16 Male 173200
# 3 AsstProf B 4 3 Male 79750
# 4 Prof B 45 39 Male 115000
# 5 Prof B 40 41 Male 141500
# 6 AssocProf B 6 6 Male 97000
# Number of rows per academic rank.
table(Salaries$rank)
# AsstProf AssocProf Prof
# 67 64 266
# Cross-tabulation of rank (rows) by sex (columns).
table(Salaries$rank, Salaries$sex)
# Female Male
# AsstProf 11 56
# AssocProf 10 54
# Prof 18 248
# Three variants of the default count bar chart, one bar per rank.
rank_base <- ggplot(data = Salaries, mapping = aes(x = rank))

# p1: plain geom_bar() with ggplot2 defaults.
p1 <- rank_base + geom_bar()

# p2: same bars, with the default padding of the y scale removed.
p2 <- rank_base +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0))

# p3: rank is additionally mapped to the fill colour.
p3 <- ggplot(data = Salaries, mapping = aes(x = rank, fill = rank)) +
  geom_bar()

# Lay the three plots out side by side (patchwork).
p1 + p2 + p3
2、`position=` 参数:调整两种分组的展示方式
- 默认值(Default):`position="stack"`
# Compare the three `position` settings on a rank-by-sex count chart.
# All panels share the same data and aesthetics; only `position` differs.
sex_base <- ggplot(data = Salaries, mapping = aes(x = rank, fill = sex))

p1 <- sex_base +
  geom_bar(position = "stack") +
  labs(title = 'position="stack"')

p2 <- sex_base +
  geom_bar(position = "dodge") +
  labs(title = 'position="dodge"')

p3 <- sex_base +
  geom_bar(position = "fill") +
  labs(title = 'position="fill"')

# Combine the panels and merge their legends into a single one.
p1 + p2 + p3 + plot_layout(guides = "collect")
3、`stat=` 参数:设置频数统计方式
- 默认值(Default):`stat="count"`,表示从给定的数据里统计每个类别出现的次数;此时 `aes()` 只需要给定 `x` 参数即可;
- `stat="identity"` 表示直接指定每种类别的频数;此时 `aes()` 除了需要给定 `x` 参数交代类别,还需要指定 `y` 参数表示频数值。
library(tidyverse)

# stat = "identity": bar heights are read directly from a pre-computed
# column, so aes() needs both x (the category) and y (the count).

# Counts per rank. summarise() is called with an explicit dplyr:: prefix so
# it keeps working even if plyr (which also exports summarise) is attached.
dat <- Salaries %>%
  group_by(rank) %>%
  dplyr::summarise(n = n(), .groups = "drop") %>%
  as.data.frame()
dat
# rank n
# 1 AsstProf 67
# 2 AssocProf 64
# 3 Prof 266
p1 <- ggplot(dat, aes(x = rank, y = n, fill = rank)) +
  geom_bar(stat = "identity")

# Counts per rank/sex combination. `.groups = "drop"` returns an ungrouped
# result explicitly instead of relying on summarise()'s implicit regrouping
# (which also prints a "grouped output by 'rank'" message).
dat <- Salaries %>%
  group_by(rank, sex) %>%
  dplyr::summarise(n = n(), .groups = "drop") %>%
  as.data.frame()
dat
# rank sex n
# 1 AsstProf Female 11
# 2 AsstProf Male 56
# 3 AssocProf Female 10
# 4 AssocProf Male 54
# 5 Prof Female 18
# 6 Prof Male 248
p2 <- ggplot(dat, aes(x = rank, y = n, fill = sex)) +
  geom_bar(stat = "identity")

p1 + p2
4、`geom_text()` 添加频数注释
- 相对更适合于 `stat = "identity"` 的数据类型;如果不是,先把数据汇总成频数表再绘图即可。
# Annotate bars with their counts using geom_text().

# Single grouping: one bar per rank with the count printed above it.
dat <- Salaries %>%
  group_by(rank) %>%
  dplyr::summarise(n = n(), .groups = "drop")
p1 <- ggplot(dat, aes(x = rank, y = n)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.2)
# vjust < 0 moves the label up; vjust > 0 moves it down.

# Two groupings: dodged bars per rank/sex. `.groups = "drop"` makes sure the
# summary passed to ggplot is not left grouped by rank.
dat <- Salaries %>%
  group_by(rank, sex) %>%
  dplyr::summarise(n = n(), .groups = "drop")
p2 <- ggplot(dat, aes(x = rank, y = n, fill = sex)) +
  geom_bar(stat = "identity", position = "dodge") +
  # The labels must be dodged by the same width as the bars, otherwise both
  # labels of a rank would be drawn at the centre of the group.
  geom_text(
    aes(label = n),
    vjust = -0.2,
    position = position_dodge(width = 0.9)
  )

p1 + p2
5、双向柱状图
# Bidirectional bar chart: male counts are drawn upward and female counts
# downward from a shared zero line.
dat <- Salaries %>%
  group_by(rank, sex) %>%
  dplyr::summarise(n = n(), .groups = "drop")

# Male rows: the plotted height `lab` equals the count.
dat_m <- dat %>%
  filter(sex == "Male") %>%
  mutate(lab = n) %>%
  as.data.frame()
# rank sex n lab
# 1 AsstProf Male 56 56
# 2 AssocProf Male 54 54
# 3 Prof Male 248 248

# Female rows: the plotted height is negated so the bars point downward.
dat_f <- dat %>%
  filter(sex == "Female") %>%
  mutate(lab = -1 * n) %>%
  as.data.frame()
# rank sex n lab
# 1 AsstProf Female 11 -11
# 2 AssocProf Female 10 -10
# 3 Prof Female 18 -18

ggplot() +
  geom_bar(
    data = dat_m, aes(x = rank, y = lab, fill = sex),
    stat = "identity", position = "dodge"
  ) +
  # vjust is a constant, so it is set outside aes(); labels show the true
  # count `n`, placed just above the upward bars.
  geom_text(data = dat_m, aes(x = rank, y = lab, label = n), vjust = -0.25) +
  geom_bar(
    data = dat_f, aes(x = rank, y = lab, fill = sex),
    stat = "identity", position = "dodge"
  ) +
  # For the downward bars the label goes just below the bar end.
  geom_text(data = dat_f, aes(x = rank, y = lab, label = n), vjust = 1.25) +
  # Show absolute values on the axis so the negative side reads as a count.
  scale_y_continuous(breaks = c(200, 100, 0, -20), labels = abs) +
  scale_fill_manual(values = c("#0072B5", "#BC3C28"))
6、组内排序
- 如果只有一种分组方式,通过设置类别的因子水平即可调整柱子顺序。
- 更复杂的情况是组内排序。举个例子:5个学生的三门课程成绩,按学科分组后,在每门学科内将5个学生按成绩从低到高(或者从高到低)排序。
# Example data: scores of five students (A-E) in three subjects.
# NOTE: the subject label "Chineses" is kept exactly as in the original data;
# it is shown as a facet title in the plots below.
grade <- data.frame(
  subject = rep(c("Chineses", "Math", "English"), each = 5),
  name = rep(c("A", "B", "C", "D", "E"), times = 3),
  score = c(79, 65, 70, 94, 82, 76, 87, 80, 81, 89, 88, 79, 82, 95, 90)
)
# subject name score
# 1 Chineses A 79
# 2 Chineses B 65
# 3 Chineses C 70
# 4 Chineses D 94
# 5 Chineses E 82
# 6 Math A 76
# 7 Math B 87
# 8 Math C 80
# 9 Math D 81
# 10 Math E 89
# 11 English A 88
# 12 English B 79
# 13 English C 82
# 14 English D 95
# 15 English E 90

library(tidytext)

# Facet order: subjects by mean score, highest first.
# Within each subject: students ordered by score, lowest first.
# `.fun = mean` is given explicitly because fct_reorder() otherwise orders
# by the median, which is not what "mean score" describes.
grade$subject <- fct_reorder(
  grade$subject, grade$score,
  .fun = mean, .desc = TRUE
)
p1 <- ggplot(
  grade,
  aes(x = reorder_within(name, score, subject), y = score, fill = name)
) +
  geom_bar(stat = "identity") +
  # Needed together with reorder_within() so the axis shows plain names.
  scale_x_reordered() +
  facet_wrap(subject ~ ., scales = "free_x")

# Facet order: subjects by mean score, lowest first.
# Within each subject: students ordered by score, highest first
# (achieved by reordering on the negated score).
grade$subject <- fct_reorder(
  grade$subject, grade$score,
  .fun = mean, .desc = FALSE
)
p2 <- ggplot(
  grade,
  aes(x = reorder_within(name, -score, subject), y = score, fill = name)
) +
  geom_bar(stat = "identity") +
  scale_x_reordered() +
  facet_wrap(subject ~ ., scales = "free_x")

p1 + p2 + plot_layout(guides = "collect")
- 注意:使用 `reorder_within(个体, 值, 分组)` 时,还需要同时设置 `scale_x_reordered()` 和 `facet_wrap(variable~., scales = "free_x")`。
7、柱状图+误差棒
- 按离散型变量分组的连续型数值可以用带误差棒的柱状图可视化。其中柱子的高度表示组内均值,误差棒表示反映波动水平的标准差(sd)。
#' Compute the per-group mean and standard deviation of one column.
#'
#' @param data A data frame.
#' @param varname Name (string) of the column to summarise.
#' @param groupnames Character vector with the names of the grouping columns.
#' @return A data frame with one row per group: the grouping columns, the
#'   group mean stored in a column named after `varname`, and a column `sd`.
#'   Missing values are ignored in both statistics.
#'
#' plyr is called through `plyr::` instead of being attached with require():
#' attaching plyr after dplyr masks dplyr verbs such as summarise() and
#' mutate() for the rest of the session, and require() merely returns FALSE
#' (rather than stopping) when the package is not installed.
data_summary <- function(data, varname, groupnames) {
  # Summary applied to each group's subset of `data`.
  summary_func <- function(x, col) {
    c(
      mean = mean(x[[col]], na.rm = TRUE),
      sd = sd(x[[col]], na.rm = TRUE)
    )
  }
  data_sum <- plyr::ddply(data, groupnames, .fun = summary_func, varname)
  # The result columns are named after summary_func's vector names; rename
  # "mean" to the summarised column's own name.
  plyr::rename(data_sum, c("mean" = varname))
}
# Peek at the score data defined earlier in this file.
head(grade)
# subject name score
# 1 Chineses A 79
# 2 Chineses B 65
# 3 Chineses C 70
# 4 Chineses D 94
# 5 Chineses E 82
# 6 Math A 76

# Mean and sd of score for each subject (values follow from the `grade`
# data as defined in this file).
df1 <- data_summary(grade, varname = "score", groupnames = c("subject"))
# subject score sd
# 1 Chineses 78.0 11.247222
# 2 Math 82.6 5.319774
# 3 English 86.8 6.379655

# Bars whose height is the group mean; shared by both variants below.
score_bars <- ggplot(df1, aes(x = subject, y = score)) +
  geom_bar(stat = "identity", color = "black")

# Two-sided error bars: from mean - sd to mean + sd.
p1 <- score_bars +
  geom_errorbar(aes(ymin = score - sd, ymax = score + sd), width = .2)

# One-sided error bars: from the mean up to mean + sd.
p2 <- score_bars +
  geom_errorbar(aes(ymin = score, ymax = score + sd), width = .2)

p1 + p2
# Mean and sd of salary for every rank/sex combination.
df2 <- data_summary(
  Salaries,
  varname = "salary",
  groupnames = c("rank", "sex")
)
# rank sex salary sd
# 1 AsstProf Female 78049.91 9371.996
# 2 AsstProf Male 81311.46 7901.343
# 3 AssocProf Female 88512.80 17965.286
# 4 AssocProf Male 94869.70 12890.817
# 5 Prof Female 121967.61 19619.583
# 6 Prof Male 127120.82 28213.808

# Error-bar limits: one sd on either side of the group mean.
salary_limits <- aes(ymin = salary - sd, ymax = salary + sd)

# Dodged bars per rank, one per sex, with matching dodged error bars.
ggplot(df2, aes(x = rank, y = salary, fill = sex)) +
  geom_bar(
    stat = "identity",
    color = "black",
    position = position_dodge()
  ) +
  # The error bars are dodged by 0.9 so they line up with the dodged bars.
  geom_errorbar(
    salary_limits,
    width = .2,
    position = position_dodge(0.9)
  ) +
  theme_classic() +
  scale_fill_manual(values = c("#999999", "#E69F00"))