R-ggplot2-柱状图系列

总结之前经常用到的ggplot2::geom_bar()绘制柱状图的用法。

  • 1、基础用法;
  • 2、position=参数:调整两种分组的展示方式;
  • 3、stat=参数:设置频数统计方式;
  • 4、geom_text()添加频数注释;
  • 5、双向柱状图;
  • 6、组内排序;
  • 7、柱状图+误差棒

1、基础用法

library(ggplot2)
library(patchwork)
library(carData) #示例数据
# Preview the example data: faculty salaries (the Salaries data set).
head(Salaries)
#        rank discipline yrs.since.phd yrs.service  sex salary
# 1      Prof          B            19          18 Male 139750
# 2      Prof          B            20          16 Male 173200
# 3  AsstProf          B             4           3 Male  79750
# 4      Prof          B            45          39 Male 115000
# 5      Prof          B            40          41 Male 141500
# 6 AssocProf          B             6           6 Male  97000

# Row counts per rank, then per rank x sex; these are the bar heights below.
table(Salaries$rank)
# AsstProf AssocProf      Prof
#       67        64       266
table(Salaries$rank, Salaries$sex)
#           Female Male
# AsstProf      11   56
# AssocProf     10   54
# Prof          18  248

# p1: plain bar chart; geom_bar() counts the rows of each rank.
p1 <- ggplot(Salaries, aes(x = rank)) +
  geom_bar()

# p2: the same chart with the padding around the y range removed.
p2 <- p1 +
  scale_y_continuous(expand = c(0, 0))

# p3: bars additionally coloured by rank.
p3 <- ggplot(Salaries, aes(x = rank, fill = rank)) +
  geom_bar()

# Place the three plots side by side (patchwork).
p1 + p2 + p3

2、position=参数:调整两种分组的展示方式

  • 默认值:position="stack"(堆叠);另有 "dodge"(并列)和 "fill"(堆叠并按比例缩放到相同高度)。
# The same rank x sex counts drawn with each of the three `position` settings.

# "stack": the sexes are stacked on top of each other within a rank.
p1 <- ggplot(Salaries, aes(x = rank, fill = sex)) +
  geom_bar(position = "stack") +
  labs(title = 'position="stack"')

# "dodge": the sexes are drawn side by side within a rank.
p2 <- ggplot(Salaries, aes(x = rank, fill = sex)) +
  geom_bar(position = "dodge") +
  labs(title = 'position="dodge"')

# "fill": stacked bars rescaled to a common height, i.e. proportions.
p3 <- ggplot(Salaries, aes(x = rank, fill = sex)) +
  geom_bar(position = "fill") +
  labs(title = 'position="fill"')

# Side by side, sharing a single legend.
p1 + p2 + p3 +
  plot_layout(guides = "collect")

3、stat=参数:设置频数统计方式

  • Default:stat="count" 表示从给定的数据里,统计每个类别出现的次数;此时aes()只需要给定x参数即可;
  • stat="identity"表示直接指定每种类别的频数;此时aes()除了需要给定x参数交代类别,还需要指定y参数表示频数值。
library(tidyverse)
# With stat = "identity" the bar heights are read from a y column, so the
# counts have to be tabulated first. dplyr::count() does the
# group_by() + summarise(n = n()) step in one call, returns an ungrouped
# result and does not emit the "grouped output" message. The dplyr:: prefix
# keeps the call unambiguous if another attached package also defines count().
dat <- as.data.frame(dplyr::count(Salaries, rank))
dat
#        rank   n
# 1  AsstProf  67
# 2 AssocProf  64
# 3      Prof 266
p1 <- ggplot(dat, aes(x = rank, y = n, fill = rank)) +
  geom_bar(stat = "identity")

# Same idea with two grouping columns: one row per rank x sex combination.
dat <- as.data.frame(dplyr::count(Salaries, rank, sex))
dat
#        rank    sex   n
# 1  AsstProf Female  11
# 2  AsstProf   Male  56
# 3 AssocProf Female  10
# 4 AssocProf   Male  54
# 5      Prof Female  18
# 6      Prof   Male 248

# No position is given, so the sexes are stacked (the default).
p2 <- ggplot(dat, aes(x = rank, y = n, fill = sex)) +
  geom_bar(stat = "identity")

p1 + p2

4、geom_text()添加频数注释

  • geom_text()需要一个给出标注位置与内容的y(频数)列,因此更适合已汇总好的stat = "identity"型数据;如果手头是原始数据,先汇总成频数表即可。
# Count labels on a simple bar chart. dplyr::count() yields an ungrouped
# frequency table with the counts in column n.
dat <- dplyr::count(Salaries, rank)
p1 <- ggplot(dat, aes(x = rank, y = n)) +
  geom_bar(stat = "identity") +
  # vjust < 0 moves the label up; vjust > 0 moves it down.
  geom_text(aes(label = n), vjust = -0.2)

# Count labels on dodged bars. group_by(rank, sex) + summarise() would leave
# the result grouped by rank; count() avoids that.
dat <- dplyr::count(Salaries, rank, sex)
p2 <- ggplot(dat, aes(x = rank, y = n, fill = sex)) +
  geom_bar(stat = "identity", position = "dodge") +
  # The labels must be dodged too, otherwise both labels of a rank would be
  # drawn at the same x position.
  geom_text(
    aes(label = n),
    vjust = -0.2,
    position = position_dodge(width = 0.9)
  )

p1 + p2

5、双向柱状图

# Two-sided bar chart: male counts are drawn upwards, female counts downwards.
# dplyr::count() gives an ungrouped rank x sex frequency table.
dat <- dplyr::count(Salaries, rank, sex)

# Male half: bar height (lab) equals the count.
dat_m <- dat %>%
  dplyr::filter(sex == "Male") %>%
  dplyr::mutate(lab = n) %>%
  as.data.frame()
#        rank  sex   n lab
# 1  AsstProf Male  56  56
# 2 AssocProf Male  54  54
# 3      Prof Male 248 248

# Female half: the bar height is the negated count so the bars point down.
dat_f <- dat %>%
  dplyr::filter(sex == "Female") %>%
  dplyr::mutate(lab = -1 * n) %>%
  as.data.frame()
#        rank    sex  n lab
# 1  AsstProf Female 11 -11
# 2 AssocProf Female 10 -10
# 3      Prof Female 18 -18

ggplot() +
  geom_bar(data = dat_m, aes(x = rank, y = lab, fill = sex),
           stat = "identity", position = "dodge") +
  # vjust is a constant, so it is set outside aes(); the label shows the
  # count n and sits just above the bar.
  geom_text(data = dat_m, aes(x = rank, y = lab, label = n), vjust = -0.25) +
  geom_bar(data = dat_f, aes(x = rank, y = lab, fill = sex),
           stat = "identity", position = "dodge") +
  # Label with the positive count n, placed just below the downward bar.
  geom_text(data = dat_f, aes(x = rank, y = lab, label = n), vjust = 1.25) +
  # Negative breaks are labelled with their absolute value.
  scale_y_continuous(breaks = c(200, 100, 0, -20),
                     labels = c("200", "100", "0", "20")) +
  # Named values pin each colour to a level instead of relying on level order.
  scale_fill_manual(values = c(Female = "#0072B5", Male = "#BC3C28"))
image.png

6、组内排序

  • 如果只有一种分组方式,调整柱子顺序通过设置类别的因子水平即可。
  • 但还有更复杂的情况——组内排序。举个例子:5个学生的三门课程成绩,按学科分组(分面)后,在每门学科内部将5个学生按成绩从低到高(或从高到低)排序。
# Example data for within-group ordering: the scores of five students (A-E)
# in three subjects, in long format (one row per subject x student).
grade <- data.frame(
  subject = rep(c("Chineses", "Math", "English"), each = 5),
  name    = rep(LETTERS[1:5], times = 3),
  score   = c(
    79, 65, 70, 94, 82,
    76, 87, 80, 81, 89,
    88, 79, 82, 95, 90
  )
)
#     subject name score
# 1  Chineses    A    79
# 2  Chineses    B    65
# 3  Chineses    C    70
# 4  Chineses    D    94
# 5  Chineses    E    82
# 6      Math    A    76
# 7      Math    B    87
# 8      Math    C    80
# 9      Math    D    81
# 10     Math    E    89
# 11  English    A    88
# 12  English    B    79
# 13  English    C    82
# 14  English    D    95
# 15  English    E    90

library(tidytext) # reorder_within(), scale_x_reordered()

# Panels: subjects ordered by mean score, highest first.
# Within each panel: students ordered by score, lowest to highest.
# fct_reorder() summarises with the median unless .fun is given, so
# .fun = mean is passed to really order by the mean score.
grade$subject <- fct_reorder(grade$subject, grade$score,
                             .fun = mean, .desc = TRUE)
p1 <- ggplot(grade, aes(x = reorder_within(name, score, subject),
                        y = score, fill = name)) +
  geom_bar(stat = "identity") +
  # Restores the plain student names on the axis labels.
  scale_x_reordered() +
  # free_x lets every panel keep its own student order.
  facet_wrap(subject ~ ., scales = "free_x")

# Panels: subjects ordered by mean score, lowest first.
# Within each panel: students ordered by score, highest to lowest
# (achieved by ordering on the negated score).
grade$subject <- fct_reorder(grade$subject, grade$score,
                             .fun = mean, .desc = FALSE)
p2 <- ggplot(grade, aes(x = reorder_within(name, -score, subject),
                        y = score, fill = name)) +
  geom_bar(stat = "identity") +
  scale_x_reordered() +
  facet_wrap(subject ~ ., scales = "free_x")

p1 + p2 + plot_layout(guides = "collect")
  • 注意reorder_within(个体,值,分组),还需要设置scale_x_reordered() , facet_wrap(variable~. ,scales = "free_x")

7、柱状图+误差棒

  • 按离散变量分组的连续型变量,可以用带误差棒的柱状图可视化:柱子的高度表示组内均值,误差棒表示组内标准差(sd),反映数据的波动水平。
# Summarise one column by group: mean and standard deviation.
#
# Arguments:
#   data       a data frame.
#   varname    name (single string) of the column to summarise.
#   groupnames character vector with the names of the grouping columns.
#
# Returns a data frame with one row per observed combination of the grouping
# columns, sorted by them with the first grouping column varying slowest.
# The group mean is stored in a column named after `varname`, the group
# standard deviation in a column named `sd`. Missing values in the summarised
# column are ignored (na.rm = TRUE); a group with a single value has sd = NA.
#
# Implemented in base R on purpose: attaching plyr from inside the function
# would mask dplyr verbs such as summarise(), mutate(), rename() and count()
# for the rest of the session.
data_summary <- function(data, varname, groupnames) {
  stopifnot(
    is.data.frame(data),
    is.character(varname), length(varname) == 1L,
    is.character(groupnames), length(groupnames) >= 1L,
    all(c(varname, groupnames) %in% names(data))
  )

  # Sort by the grouping columns so every group forms one contiguous run.
  row_order <- do.call(order, unname(as.list(data[groupnames])))
  sorted <- data[row_order, , drop = FALSE]

  # A group starts at each row whose key combination has not been seen yet;
  # the running count of starts therefore numbers the groups.
  starts <- !duplicated(sorted[groupnames])
  values <- split(sorted[[varname]], cumsum(starts))

  data_sum <- sorted[starts, groupnames, drop = FALSE]
  rownames(data_sum) <- NULL
  data_sum[[varname]] <- unname(vapply(values, mean, numeric(1), na.rm = TRUE))
  data_sum[["sd"]] <- unname(vapply(values, sd, numeric(1), na.rm = TRUE))
  data_sum
}
# The printed values in this block are those of the `grade` data defined in
# section 6 (Chineses / student D = 94).
head(grade)
#    subject name score
# 1 Chineses    A    79
# 2 Chineses    B    65
# 3 Chineses    C    70
# 4 Chineses    D    94
# 5 Chineses    E    82
# 6     Math    A    76

# Mean and sd of the score per subject; the mean is stored in column `score`.
df1 <- data_summary(grade, varname = "score", groupnames = "subject")
#    subject score        sd
# 1 Chineses  78.0 11.247222
# 2     Math  82.6  5.319774
# 3  English  86.8  6.379655

# p1: two-sided error bars, mean - sd to mean + sd.
p1 <- ggplot(df1, aes(x = subject, y = score)) +
  geom_bar(stat = "identity", color = "black") +
  geom_errorbar(
    aes(ymin = score - sd, ymax = score + sd),
    width = .2
  )

# p2: one-sided error bars, from the mean up to mean + sd.
p2 <- ggplot(df1, aes(x = subject, y = score)) +
  geom_bar(stat = "identity", color = "black") +
  geom_errorbar(
    aes(ymin = score, ymax = score + sd),
    width = .2
  )

p1 + p2
# Mean and sd of the salary for every rank x sex combination; the mean is
# stored in column `salary`.
df2 <- data_summary(
  Salaries,
  varname = "salary",
  groupnames = c("rank", "sex")
)
#        rank    sex    salary        sd
# 1  AsstProf Female  78049.91  9371.996
# 2  AsstProf   Male  81311.46  7901.343
# 3 AssocProf Female  88512.80 17965.286
# 4 AssocProf   Male  94869.70 12890.817
# 5      Prof Female 121967.61 19619.583
# 6      Prof   Male 127120.82 28213.808

# Dodged bars per sex with mean +/- sd error bars. The error bars get an
# explicit dodge of 0.9 (presumably to line up with the dodged bars).
ggplot(df2, aes(x = rank, y = salary, fill = sex)) +
  geom_bar(
    stat = "identity",
    color = "black",
    position = position_dodge()
  ) +
  geom_errorbar(
    aes(ymin = salary - sd, ymax = salary + sd),
    width = .2,
    position = position_dodge(0.9)
  ) +
  scale_fill_manual(values = c("#999999", "#E69F00")) +
  theme_classic()

你可能感兴趣的:(R-ggplot2-柱状图系列)