What to Do First?
# Show the current working directory; the data file below is read relative to it.
getwd()
# List the files in the working directory. The call parentheses are required:
# a bare `list.files` only prints the function itself.
list.files()
# Read the tab-separated data file into the data frame `pf`.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
Histogram of Users' Birthdays
# Look at the column names, load ggplot2, and summarise the birth-day column.
names(pf)
library(ggplot2)
summary(pf$dob_day)

# qplot version: 31-bin histogram of dob_day with axis breaks at 1..31.
qplot(data = pf, x = dob_day, bins = 31) +
  scale_x_continuous(breaks = 1:31)

# The same plot built with ggplot().
ggplot(pf, aes(x = dob_day)) +
  geom_histogram(bins = 31) +
  scale_x_continuous(breaks = 1:31)
Faceting
# qplot version: the dob_day histogram, one panel per dob_month, 3 columns.
qplot(data = pf, x = dob_day, bins = 31) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~ dob_month, ncol = 3)

# The same plot built with ggplot().
ggplot(pf, aes(x = dob_day)) +
  geom_histogram(bins = 31) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~ dob_month, ncol = 3)

# Syntax reminder for facet_grid(): `vertical` and `horizontal` stand in for
# the row and column faceting variables.
facet_grid(vertical ~ horizontal)
传递两个或多个变量时使用facet_grid
Friend Count
# Histogram of friend_count with default binning: qplot, then ggplot.
qplot(data = pf, x = friend_count)
ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
Limiting the Axes
限制轴,避免长尾数据
# Restrict the x axis to 0-1000 friends, written three ways.
# 1. qplot's own xlim argument.
qplot(data = pf, x = friend_count, xlim = c(0, 1000))
# 2. qplot plus an explicit continuous x scale.
qplot(data = pf, x = friend_count) +
  scale_x_continuous(limits = c(0, 1000))
# 3. ggplot with an explicit histogram layer.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000))
Adjusting the Bin Width
# Friend-count histogram with bins of width 25 and axis breaks every 50
# over the range 0-1000: qplot, then ggplot.
qplot(data = pf, x = friend_count, binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50))
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50))
Faceting Friend Count
# Same friend-count histogram, split into one panel per value of gender.
qplot(data = pf, x = friend_count, binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~ gender)
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~ gender)
Omitting NA Values
R 将缺失值表现为NA
# Drop only the rows whose gender is NA before faceting by gender.
qplot(data = subset(pf, !is.na(gender)), x = friend_count, binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~ gender)
ggplot(subset(pf, !is.na(gender)), aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~ gender)
na.omit(pf)将去掉数据集中所有包含NA的条目
# Same plot, but na.omit() removes every row that has an NA in any column,
# not just in gender.
qplot(data = na.omit(pf), x = friend_count, binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~ gender)
ggplot(na.omit(pf), aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~ gender)
通过上述生成的直方图,很难判断哪个性别的平均好友数更多
Statistics 'by' Gender
# Count of rows per gender, then a summary() of friend_count within each gender.
table(pf$gender)
by(pf$friend_count, pf$gender, FUN = summary)
Tenure
Notes:
color为16进制颜色代码,参见https://en.wikipedia.org/wiki/Web_colors
# Tenure histogram with bins of width 30, black outlines and a blue fill.
# In qplot, I() passes the colours through as literal values rather than
# mapping them as data.
qplot(
  data = pf, x = tenure, binwidth = 30,
  color = I("Black"), fill = I("#099DD9")
)
ggplot(pf, aes(x = tenure)) +
  geom_histogram(binwidth = 30, color = "Black", fill = "#099DD9")
How would you create a histogram of tenure measured in years?
# Tenure divided by 365 (presumably days -> years; confirm the unit of
# `tenure`), plotted with bins of width 1.
qplot(
  data = pf, x = tenure/365, binwidth = 1,
  color = I("Black"), fill = I("#099DD9")
)
ggplot(pf, aes(x = tenure/365)) +
  geom_histogram(binwidth = 1, color = "Black", fill = "#099DD9")
Labeling Plots
# qplot accepts the axis labels directly through xlab / ylab.
qplot(x = tenure/365, data = pf,
      xlab = 'Number of years using Facebook',
      ylab = 'Number of users in sample',
      color = I('Black'), fill = I('#099DD9')) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))

# ggplot() has no xlab / ylab arguments, so labels passed to it are not
# applied to the plot; they have to be added as a labs() layer.
ggplot(aes(x = tenure/365), data = pf) +
  geom_histogram(color = 'Black', fill = '#099DD9') +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  labs(x = 'Number of years using Facebook',
       y = 'Number of users in sample')
User Ages
# Summarise age, then plot it with one-unit bins and axis breaks every 5
# over the range 0-113.
summary(pf$age)
qplot(
  data = pf, x = age, binwidth = 1,
  color = I("Black"), fill = I("#099DD9")
) +
  scale_x_continuous(breaks = seq(0, 113, by = 5), limits = c(0, 113))
ggplot(pf, aes(x = age)) +
  geom_histogram(binwidth = 1, color = "Black", fill = "#099DD9") +
  scale_x_continuous(breaks = seq(0, 113, by = 5), limits = c(0, 113))
Transforming Data
Notes:
# Raw friend counts.
p1 <- qplot(x = friend_count, data = pf)

# Compare the raw values with their log10 and square-root transforms.
# The +1 keeps users with zero friends finite under log10.
summary(pf$friend_count)
summary(log10(pf$friend_count + 1))
summary(sqrt(pf$friend_count))

# Refer to the column by its bare name: `data = pf` already supplies it, and
# ggplot2 discourages `pf$...` inside aesthetics.
p2 <- qplot(x = log10(friend_count + 1), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)

# Stack the three histograms in a single column.
library(gridExtra)
grid.arrange(p1, p2, p3, ncol = 1)
使用ggplot的版本
# ggplot version: build the base histogram once, then add a log10 or
# square-root x scale on top of it.
p1 <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
grid.arrange(p1, p2, p3, ncol = 1)
Add a Scaling Layer
# Left: transform the data itself, so the axis is labelled in log10 units.
# The column is named directly; `pf$...` inside an aesthetic is discouraged
# by ggplot2 because `data = pf` already supplies it.
logScale <- qplot(x = log10(friend_count), data = pf)

# Right: keep the data as is and transform the x scale instead.
countScale <- ggplot(aes(x = friend_count), data = pf) +
  geom_histogram() +
  scale_x_log10()

grid.arrange(logScale, countScale, ncol = 2)

# qplot with a log10 scaling layer.
qplot(x = friend_count, data = pf) +
  scale_x_log10()
上面两幅图的区别在于X轴上的标记不同
频数多边形
# Frequency polygon of friend_count coloured by gender, with y rescaled to
# count / sum(count). Rows with NA gender are dropped first.
# NOTE(review): `..count..` is the older notation; newer ggplot2 releases
# spell it after_stat(count). Confirm the installed version before changing.
qplot(x = friend_count, y = ..count../sum(..count..),
      data = subset(pf, !is.na(gender)),
      xlab = 'Friend count',
      ylab = 'Proportion of users with that friend count',
      binwidth = 10, geom = 'freqpoly', color = gender) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))

# ggplot() has no xlab / ylab arguments, so labels passed to it are not
# applied to the plot; they are added with labs() instead.
ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  labs(x = '好友数量',
       y = 'Percentage of users with that friend count')
# Frequency polygon of www_likes by gender on a log10 x axis.
# A plot keeps one x scale: a scale_x_continuous() placed before
# scale_x_log10() is simply replaced (with a message), so only the log10
# scale is added.
qplot(x = www_likes, data = subset(pf, !is.na(gender)),
      geom = 'freqpoly', color = gender) +
  scale_x_log10()

ggplot(aes(x = www_likes), data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender)) +
  scale_x_log10()
Likes on the Web
# Total www_likes within each gender.
by(pf$www_likes, pf$gender, FUN = sum)
Box Plots
# Box plot of friend_count for each gender (rows with NA gender removed).
qplot(
  data = subset(pf, !is.na(gender)),
  x = gender, y = friend_count,
  geom = "boxplot"
)
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot()
Adjust the code to focus on users who have friend counts between 0 and 1000.
# Limit the y axis to 0-1000 friends, three ways. Scale limits remove
# out-of-range observations before the box-plot statistics are computed.
# 1. qplot's ylim argument.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot', ylim = c(0, 1000))

# 2. qplot plus a y scale. The argument is spelled out as `limits`; the
#    abbreviated `lim` only worked through partial argument matching.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot') +
  scale_y_continuous(limits = c(0, 1000))

# 3. ggplot plus a y scale.
ggplot(aes(x = gender, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 1000))
使用coord_cartesian
# Same box plots, but the 0-1000 range is set on the coordinate system
# with coord_cartesian() rather than on the y scale.
qplot(
  data = subset(pf, !is.na(gender)),
  x = gender, y = friend_count,
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 1000))
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))
Box Plots, Quartiles, and Friendships
# View the 0-250 range of the friend_count box plots.
qplot(
  data = subset(pf, !is.na(gender)),
  x = gender, y = friend_count,
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))

# Numeric summary per gender, for comparison with the boxes.
by(pf$friend_count, pf$gender, FUN = summary)
coord_cartesian的结果和表输出的结果一致(包括中位数等)
# Mean friendships_initiated per gender, plus an overall summary.
names(pf)
by(pf$friendships_initiated, pf$gender, FUN = mean)
summary(pf$friendships_initiated)

# Box plots of friendships_initiated by gender, viewed over 0-200.
qplot(
  data = subset(pf, !is.na(gender)),
  x = gender, y = friendships_initiated,
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 200))
ggplot(subset(pf, !is.na(gender)),
       aes(x = gender, y = friendships_initiated)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 200))
箱线图帮助我们理解数据的分布,感知异常值
Getting Logical 符合逻辑
# Distribution of mobile_likes, and how many rows have at least one.
summary(pf$mobile_likes)
summary(pf$mobile_likes > 0)

# Derive a 0/1 factor flagging rows with at least one mobile like. The column
# is created directly; pre-filling it with NA first was a dead assignment.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
summary(pf$mobile_check_in)
What percent of users check in using mobile?
# Share of rows whose mobile_check_in is 1 (the mean of a logical vector is
# the fraction of TRUE values).
mean(pf$mobile_check_in == 1)
习题集
1.对数据的基本了解
# Load the diamonds data set, open it in the data viewer, print its
# structure, and open its help page.
data("diamonds")
View(diamonds)
str(diamonds)
help("diamonds")
2.价格直方图
# Price histogram with bins of width 300 and axis breaks every 2000
# over the range 0-20000.
qplot(x = price, data = diamonds, binwidth = 300) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, by = 2000))
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 300) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, by = 2000))
3.钻石数量
# Subsets by price threshold; the first element of dim() is the row count.
lessthan500 <- subset(diamonds, price < 500)
dim(lessthan500)

lessthan250 <- subset(diamonds, price < 250)
dim(lessthan250)

morethan15000 <- subset(diamonds, price >= 15000)
dim(morethan15000)
4.廉价钻石
# Price histogram restricted to 0-2000, with bins of width 100 and a break
# at every bin edge.
qplot(x = price, data = diamonds, binwidth = 100) +
  scale_x_continuous(limits = c(0, 2000), breaks = seq(0, 2000, by = 100))
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 100) +
  scale_x_continuous(limits = c(0, 2000), breaks = seq(0, 2000, by = 100))

# Write the most recently displayed plot to a PNG file.
ggsave("priceHistogram.png")
5. Histogram of diamond prices by cut
# Price histogram (bin width 1000) with one panel per cut, laid out in
# five columns.
qplot(x = price, data = diamonds, binwidth = 1000) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, by = 4000)) +
  facet_wrap(~ cut, ncol = 5)
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(limits = c(0, 20000), breaks = seq(0, 20000, by = 4000)) +
  facet_wrap(~ cut, ncol = 5)
6.切工-价格
# Highest, lowest and median price within each cut.
by(diamonds$price, diamonds$cut, FUN = max)
by(diamonds$price, diamonds$cut, FUN = min)
by(diamonds$price, diamonds$cut, FUN = median)
7.由切工决定的每克拉价格,使用scales,可使分隔后每个图的y轴标度不一样
# Histogram of price per carat on a log10 x axis, one panel per cut.
# scales = "free_y" lets each panel choose its own y-axis range.
ggplot(diamonds, aes(x = (price/carat))) +
  geom_histogram() +
  facet_wrap(~ cut, scales = "free_y") +
  scale_x_log10()
qplot(x = (price/carat), data = diamonds) +
  facet_wrap(~ cut, scales = "free_y") +
  scale_x_log10()
8.价格箱线图
# Box plot of price for each color, viewed over the 0-10000 range.
qplot(x = color, y = price, data = diamonds, geom = "boxplot") +
  coord_cartesian(ylim = c(0, 10000))
ggplot(diamonds, aes(x = color, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 10000))
9.四分位数以及IQR
# Price quantiles and interquartile range for color D and color J diamonds.
with(subset(diamonds, color == "D"), quantile(price))
with(subset(diamonds, color == "J"), quantile(price))
with(subset(diamonds, color == "D"), IQR(price))
with(subset(diamonds, color == "J"), IQR(price))
10.由颜色表示的每克拉价格箱线图
# Box plot of price per carat for each color, viewed over the 0-8000 range.
ggplot(diamonds, aes(x = color, y = price/carat)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 8000))
qplot(data = diamonds, x = color, y = price/carat, geom = "boxplot") +
  coord_cartesian(ylim = c(0, 8000))
11.克拉频率多边形
# Frequency polygon of carat with bins of width 0.01.
qplot(x = carat, data = diamonds,
      xlab = 'carat',
      ylab = 'frequency',
      binwidth = 0.01, geom = 'freqpoly') +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000))

# ggplot() has no xlab / ylab arguments, so labels passed to it are not
# applied to the plot; they are added with labs() instead.
ggplot(aes(x = carat), data = diamonds) +
  geom_freqpoly(binwidth = 0.01) +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) +
  labs(x = 'carat', y = 'frequency')

# Carat values that occur more than 2000 times.
table(diamonds$carat)[table(diamonds$carat) > 2000]