library(ggplot2)
# Load the diamonds dataset and open its help page.
data("diamonds")
help("diamonds")

# Number of diamonds in each color grade.
summary(diamonds$color)

# Basic overview of the price distribution.
summary(diamonds$price)
# Histogram of price, with x-axis tick marks every 2000 from 0 to 19000.
qplot(
  x = price,
  data = diamonds,
  ylab = "Counts",
  fill = I("#099DD9"),
  color = I("black")
) +
  scale_x_continuous(breaks = seq(from = 0, to = 19000, by = 2000))

# Cheap diamonds: how many cost 500 or less (recorded result: 1749).
sum(diamonds$price <= 500)

# Outliers: the author treats prices of 15000 and above as outliers, citing
# the "third quartile + 1.5 * IQR" rule (recorded result: 1656).
# NOTE(review): confirm the 15000 threshold against summary(diamonds$price).
sum(diamonds$price >= 15000)
# Price histogram with a bin width of 50, drawn as one panel per cut.
qplot(x = price, data = diamonds, ylab = "Counts", binwidth = 50) +
  facet_wrap(~cut)

# The faceted histogram looks rather ugly, so switch to a frequency polygon
# with one colored line per cut.
qplot(
  x = price,
  data = diamonds,
  geom = "freqpoly",
  color = cut,
  ylab = "Counts"
)
# Price extremes (recorded results: minimum 326, maximum 18823).
min(diamonds$price)
max(diamonds$price)

# Zoom the price axis in on the low end (300 to 350, ticks every 2).
qplot(
  x = price,
  data = diamonds,
  geom = "freqpoly",
  color = cut,
  ylab = "Counts"
) +
  scale_x_continuous(
    breaks = seq(from = 300, to = 350, by = 2),
    limits = c(300, 350)
  )
# Author's observation: the lowest price is found among the Ideal-cut diamonds.

# Zoom the price axis in on the high end (18800 to 18823, ticks every 2).
qplot(
  x = price,
  data = diamonds,
  geom = "freqpoly",
  color = cut,
  ylab = "Counts"
) +
  scale_x_continuous(
    breaks = seq(from = 18800, to = 18823, by = 2),
    limits = c(18800, 18823)
  )
# Author's observation: surprisingly, the highest price is found among the
# Premium-cut diamonds.
# Mean price within each cut category.
by(diamonds$price, diamonds$cut, mean)
# Recorded results:
#   Fair: 4358.758; Good: 3928.864; Very Good: 3981.76;
#   Premium: 4584.258; Ideal: 3457.542
# Once again the Premium cut has the highest mean price.
# Overview of the carat values.
summary(diamonds$carat)
# Recorded output:
#   Min. 1st Qu. Median   Mean 3rd Qu.   Max.
# 0.2000  0.4000 0.7000 0.7979  1.0400 5.0100

# Histogram of carat, with x-axis tick marks every 0.2 starting at 0.2.
qplot(
  x = carat,
  data = diamonds,
  ylab = "Counts",
  fill = I("#F79420"),
  color = I("black")
) +
  scale_x_continuous(breaks = seq(from = 0.2, to = 5.01, by = 0.2))
# Most carat values are concentrated below 1.2, so price divided by carat is
# likely to be roughly long-tailed, which would leave much of the tail data
# poorly used; plan ahead and put the x axis on a log10 scale.
qplot(
  x = price / carat,
  data = diamonds,
  xlab = "price/carat (log10)",
  ylab = "Counts",
  fill = I("#CC99FF"),
  color = I("black")
) +
  scale_x_log10()
# Author's observation: on the log10 axis the shape is close to a normal
# distribution.
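# The plot below shows a long-tailed distribution, so the axis needs to be
# transformed; otherwise the "tail" loses its meaning, which would be a pity.
# After transforming the axis: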
# Box plot of price for each color grade, with price ticks every 2000.
qplot(
  x = color,
  y = price,
  geom = "boxplot",
  data = diamonds
) +
  scale_y_continuous(breaks = seq(from = 0, to = 18900, by = 2000)) +
  coord_flip() # flip the axes so the boxes run horizontally
# IQR of price for the diamonds with the best color grade ("D").
# The row filter must be the logical condition `color == "D"`; writing
# `color = "D"` only passes a named argument to subset(), so no rows were
# actually filtered. The IQR is taken on price rather than on the color factor
# itself, in line with the price-by-color analysis around this statement.
IQR(subset(diamonds, color == "D")$price)
# Run summary() on price (quantiles, mean, extremes and so on), grouped by
# the color grade given as the middle argument.
by(diamonds$price, diamonds$color, summary)

# Box plot of price per carat for each color grade.
qplot(
  x = color,
  y = price / carat,
  geom = "boxplot",
  data = diamonds
)
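# For the sake of speed, the quick qplot() interface is used here instead of a
# detailed ggplot() specification, so the plot has no colors for now; with
# ggplot(), harmonious fill colors could be assigned automatically from a
# variable mapping.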
# Overview of the carat values.
summary(diamonds$carat)

# Frequency polygon of carat with a bin width of 0.2.
# x: weight (carat); y: share of all diamonds falling in each bin.
qplot(
  x = carat,
  y = ..count.. / sum(..count..),
  data = diamonds,
  geom = "freqpoly",
  binwidth = 0.2,
  ylab = "probability"
) +
  scale_x_continuous(breaks = seq(from = 0.2, to = 5.01, by = 0.2))