探索单一变量

What to Do First?

# Confirm the working directory and check that the data file is visible
# before trying to load it.
getwd()
# list.files must be *called*; a bare `list.files` only prints the function.
list.files()
# The data set is tab-separated, so the separator is passed explicitly.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")

Histogram of Users' Birthdays

# Load the plotting library, then look at the columns and at the
# distribution of the day-of-birth variable.
library(ggplot2)
names(pf)
summary(pf$dob_day)

# Quick-plot version: 31 bins, one axis break per day of the month.
qplot(dob_day, data = pf, bins = 31) +
  scale_x_continuous(breaks = seq_len(31))

# The same histogram spelled out with ggplot() and an explicit geom.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(bins = 31) +
  scale_x_continuous(breaks = seq_len(31))

Faceting

# Birthday histogram split into one panel per birth month, three panels
# per row (qplot version).
qplot(dob_day, data = pf, bins = 31) +
  scale_x_continuous(breaks = seq_len(31)) +
  facet_wrap(facets = ~dob_month, ncol = 3)

# Same faceted histogram using the ggplot() syntax.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(bins = 31) +
  scale_x_continuous(breaks = seq_len(31)) +
  facet_wrap(facets = ~dob_month, ncol = 3)

facet_grid(vertical~horizontal)

需要按两个或多个变量分面时使用 facet_grid:公式左侧的变量决定行(纵向),右侧的变量决定列(横向)

Friend Count

# Default histogram of friend counts (qplot version).
qplot(friend_count, data = pf)

# Equivalent ggplot() call with an explicit histogram layer.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()

Limiting the Axes

限制轴,避免长尾数据

# Restrict the x axis to 0-1000 friends via qplot's xlim argument.
qplot(friend_count, data = pf, xlim = c(0, 1000))

# The same restriction expressed as a scale layer.
qplot(friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))

# ggplot() version of the limited histogram.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000))

Adjusting the Bin Width

# Bins of width 25 on a 0-1000 axis with a tick every 50 friends (qplot).
qplot(friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

# Same plot with ggplot(): the bin width belongs to the histogram layer.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

Faceting Friend Count

# Friend-count histogram with one panel per gender value (qplot).
qplot(friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(facets = ~gender)

# ggplot() version of the gender-faceted histogram.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(facets = ~gender)

Omitting NA Values

R 将缺失值表现为NA

# Keep only rows whose gender is known so that no NA panel is drawn (qplot).
qplot(friend_count,
      data = subset(pf, !is.na(gender)),
      binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(facets = ~gender)

# ggplot() version using the same filtered data.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(facets = ~gender)

na.omit(pf)将去掉数据集中所有包含NA的条目(任意一列为NA的行都会被删除,而不仅仅是gender为NA的行)

# Alternative filter: na.omit() removes every row that has an NA in any
# column, not only rows with a missing gender (qplot version).
qplot(friend_count, data = na.omit(pf), binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(facets = ~gender)

# ggplot() version on the same na.omit()-filtered data.
ggplot(data = na.omit(pf), mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000)) +
  facet_wrap(facets = ~gender)

通过上述生成的直方图,很难判断哪个性别的平均好友数更多


Statistics 'by' Gender

# Number of users per gender, followed by a numeric summary of friend_count
# computed separately for each gender.
table(pf$gender)
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)

Tenure

Notes:

color 和 fill 既可以使用颜色名称(如 'Black'),也可以使用16进制颜色代码(如 '#099DD9'),参见https://en.wikipedia.org/wiki/Web_colors

# Tenure histogram in bins of 30. I() passes the colours through as
# literal values rather than as mapped variables (qplot version).
qplot(tenure,
      data = pf,
      binwidth = 30,
      fill = I("#099DD9"),
      color = I("Black"))

# ggplot() version: fixed colours are set directly on the geom.
ggplot(data = pf, mapping = aes(x = tenure)) +
  geom_histogram(binwidth = 30, fill = "#099DD9", color = "Black")
  

How would you create a histogram of tenure measured in years instead of days?

# Divide tenure by 365 and use one bin per unit of the result (qplot).
qplot(tenure / 365,
      data = pf,
      binwidth = 1,
      fill = I("#099DD9"),
      color = I("Black"))

# ggplot() version of the same rescaled histogram.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(binwidth = 1, fill = "#099DD9", color = "Black")

Labeling Plots

# Tenure-in-years histogram with descriptive axis titles; the x axis is
# limited to 0-7 with a tick at each whole number from 1 to 7.
qplot(tenure / 365,
      data = pf,
      fill = I("#099DD9"),
      color = I("Black"),
      xlab = "Number of years using Facebook",
      ylab = "Number of users in sample") +
  scale_x_continuous(limits = c(0, 7), breaks = seq(1, 7, 1))

# ggplot() version of the labelled tenure histogram.
# Unlike qplot(), ggplot() has no xlab/ylab arguments: extra arguments are
# silently ignored, so the axis titles must be added as their own layers.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(color = "Black", fill = "#099DD9") +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  xlab("Number of years using Facebook") +
  ylab("Number of users in sample")
  

User Ages

# Numeric summary of age; the axis limits below reuse its 0-113 range.
summary(pf$age)

# One bin per year of age, ticks every 5 years (qplot version).
qplot(age,
      data = pf,
      binwidth = 1,
      fill = I("#099DD9"),
      color = I("Black")) +
  scale_x_continuous(limits = c(0, 113), breaks = seq(0, 113, 5))

# ggplot() version of the age histogram.
ggplot(data = pf, mapping = aes(x = age)) +
  geom_histogram(binwidth = 1, fill = "#099DD9", color = "Black") +
  scale_x_continuous(limits = c(0, 113), breaks = seq(0, 113, 5))
  

Transforming Data

Notes:

# Compare the raw friend_count distribution with its log10 and square-root
# transforms.
p1 <- qplot(x = friend_count, data = pf)
summary(pf$friend_count)
# +1 keeps users with zero friends finite, since log10(0) is -Inf.
summary(log10(pf$friend_count + 1))
summary(sqrt(pf$friend_count))

# Refer to the column by name: `data = pf` already supplies it, and using
# `pf$` inside the aesthetics bypasses the data argument (ggplot2 discourages
# it and it misbehaves once the data is subset or faceted).
p2 <- qplot(x = log10(friend_count + 1), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)
library(gridExtra)
# Stack the three histograms in a single column for comparison.
grid.arrange(p1, p2, p3, ncol = 1)

使用ggplot的版本

# Build the base histogram once, then derive the transformed variants by
# adding a scale layer instead of transforming the variable itself.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()

# Stack the three plots in one column.
grid.arrange(p1, p2, p3, ncol = 1)

Add a Scaling Layer

# Two ways to get a log-scaled histogram:
#   logScale   transforms the variable, so the axis shows log10 values;
#   countScale transforms the axis, so the ticks stay in friend-count units.
# The column is referenced by name rather than as `pf$friend_count`, since
# `data = pf` already supplies it.
logScale <- qplot(x = log10(friend_count), data = pf)
countScale <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  scale_x_log10()
# Show both side by side.
grid.arrange(logScale, countScale, ncol = 2)

# qplot equivalent of countScale.
qplot(x = friend_count, data = pf) +
  scale_x_log10()

上面两幅图的区别在于X轴上的标记不同:logScale 的刻度是取 log10 之后的值,而 countScale 的刻度仍是原始的好友数


频数多边形

# Frequency polygon of friend counts, one coloured line per gender, with
# counts rescaled by their sum so that the y axis shows proportions.
qplot(friend_count,
      ..count.. / sum(..count..),
      data = subset(pf, !is.na(gender)),
      geom = "freqpoly",
      binwidth = 10,
      color = gender,
      xlab = "Friend count",
      ylab = "Proportion of users with that friend count") +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))

# ggplot() version of the per-gender frequency polygon.
# ggplot() ignores xlab/ylab arguments, so the axis titles are added as
# separate layers; otherwise the default aesthetic labels would be shown.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = friend_count, y = ..count.. / sum(..count..))) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab('好友数量') +
  ylab('Percentage of users with that friend count')

# Frequency polygon of www_likes per gender on a log10 x axis (qplot).
# A plot has a single x scale: adding scale_x_continuous() first was
# redundant, because scale_x_log10() replaced it (with a message).
qplot(x = www_likes,
      data = subset(pf, !is.na(gender)),
      geom = "freqpoly",
      color = gender) +
  scale_x_log10()

# ggplot() version of the same plot.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = www_likes)) +
  geom_freqpoly(aes(color = gender)) +
  scale_x_log10()
  

Likes on the Web

# Total number of www_likes within each gender group.
by(data = pf$www_likes, INDICES = pf$gender, FUN = sum)

Box Plots

# Box plot of friend_count for each known gender (qplot version).
qplot(gender, friend_count,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot")

# ggplot() version of the same box plot.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot()


Adjust the code to focus on users who have friend counts between 0 and 1000.

# Box plot restricted to friend counts between 0 and 1000 via qplot's
# ylim argument.
qplot(gender, friend_count,
      data = subset(pf, !is.na(gender)),
      ylim = c(0, 1000),
      geom = "boxplot")

# Same restriction expressed as a scale layer. The argument is spelled out
# as `limits`; the abbreviated `lim` only worked through partial argument
# matching.
# Note: scale limits discard observations outside the range before the box
# plot statistics are computed, so the quartiles shown describe only the
# users with 0-1000 friends.
qplot(x = gender, y = friend_count,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot") +
  scale_y_continuous(limits = c(0, 1000))

# ggplot() version with the same scale limits.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 1000))

使用coord_cartesian

# Zoom the y axis to 0-1000 with coord_cartesian() instead of scale limits
# (qplot version).
qplot(gender, friend_count,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot") +
  coord_cartesian(ylim = c(0, 1000))

# ggplot() version of the zoomed box plot.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))

Box Plots, Quartiles, and Friendships

# Zoom further, to 0-250 friends, to make the boxes easier to read (qplot).
qplot(gender, friend_count,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot") +
  coord_cartesian(ylim = c(0, 250))

# ggplot() version of the 0-250 zoom.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))

# Per-gender numeric summary to compare against the boxes.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)

coord_cartesian的结果和表输出的结果一致(包括中位数等)

# List the columns again, then look at friendships_initiated: its mean per
# gender and its overall summary.
names(pf)
by(data = pf$friendships_initiated, INDICES = pf$gender, FUN = mean)
summary(pf$friendships_initiated)

# Box plot per gender, zoomed to 0-200 (qplot version).
qplot(gender, friendships_initiated,
      data = subset(pf, !is.na(gender)),
      geom = "boxplot") +
  coord_cartesian(ylim = c(0, 200))

# ggplot() version of the same box plot.
ggplot(data = subset(pf, !is.na(gender)),
       mapping = aes(x = gender, y = friendships_initiated)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 200))

箱线图帮助我们理解数据的分布,感知异常值


Getting Logical 符合逻辑

# Distribution of mobile_likes, and how many users have at least one.
summary(pf$mobile_likes)
summary(pf$mobile_likes > 0)

# Derive a two-level factor: 1 when the user has any mobile like, else 0.
pf$mobile_check_in <- NA
pf$mobile_check_in <- ifelse(test = pf$mobile_likes > 0, yes = 1, no = 0)
pf$mobile_check_in <- factor(pf$mobile_check_in)
summary(pf$mobile_check_in)

What percentage of users check in using mobile?

# Fraction of all users whose mobile_check_in flag equals 1.
sum(pf$mobile_check_in == 1) / nrow(pf)

习题集
1.对数据的基本了解

# Load the diamonds data set and get a first impression of it: open it in
# the data viewer, print its structure and read its help page.
data("diamonds")
View(diamonds)
str(diamonds)
help("diamonds")

2.价格直方图

# Price histogram in bins of 300 over 0-20000, ticks every 2000 (qplot).
qplot(price, data = diamonds, binwidth = 300) +
  scale_x_continuous(breaks = seq(0, 20000, 2000), limits = c(0, 20000))

# ggplot() version of the price histogram.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 300) +
  scale_x_continuous(breaks = seq(0, 20000, 2000), limits = c(0, 20000))

3.钻石数量

# Count diamonds in three price brackets; the first element printed by
# dim() is the number of matching rows.
lessthan500 <- subset(diamonds, subset = price < 500)
dim(lessthan500)

lessthan250 <- subset(diamonds, subset = price < 250)
dim(lessthan250)

morethan15000 <- subset(diamonds, subset = price >= 15000)
dim(morethan15000)

4.廉价钻石

# Focus on prices from 0 to 2000 with bins and ticks of 100 (qplot).
qplot(price, data = diamonds, binwidth = 100) +
  scale_x_continuous(breaks = seq(0, 2000, 100), limits = c(0, 2000))

# ggplot() version of the same histogram.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 100) +
  scale_x_continuous(breaks = seq(0, 2000, 100), limits = c(0, 2000))

# Write the most recently displayed plot to a PNG file.
ggsave(filename = 'priceHistogram.png')

5.the histogram of diamond prices by cut.

# Price histograms split by cut, all five panels in a single row (qplot).
qplot(price, data = diamonds, binwidth = 1000) +
  scale_x_continuous(breaks = seq(0, 20000, 4000), limits = c(0, 20000)) +
  facet_wrap(facets = ~cut, ncol = 5)

# ggplot() version of the faceted price histogram.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(breaks = seq(0, 20000, 4000), limits = c(0, 20000)) +
  facet_wrap(facets = ~cut, ncol = 5)

6.切工-价格

# Highest, lowest and median price within each cut category.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = max)
by(data = diamonds$price, INDICES = diamonds$cut, FUN = min)
by(data = diamonds$price, INDICES = diamonds$cut, FUN = median)

7.由切工决定的每克拉价格,使用scales,可使分隔后每个图的y轴标度不一样

# Price-per-carat histograms per cut on a log10 x axis. scales = 'free_y'
# gives each panel its own y-axis range (ggplot version).
ggplot(mapping = aes(x = (price/carat)), data = diamonds) +
  geom_histogram() +
  facet_wrap(facets = ~cut, scales = 'free_y') +
  scale_x_log10()

# qplot version of the same faceted histogram.
qplot((price/carat), data = diamonds) +
  facet_wrap(facets = ~cut, scales = 'free_y') +
  scale_x_log10()

8.价格箱线图

# Box plot of price for each colour grade, zoomed to 0-10000 (qplot).
qplot(color, price, data = diamonds, geom = "boxplot") +
  coord_cartesian(ylim = c(0, 10000))

# ggplot() version of the same box plot.
ggplot(data = diamonds, mapping = aes(x = color, y = price)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 10000))

9.四分位数以及IQR

# Price quantiles for the colour grades D and J.
quantile(diamonds$price[diamonds$color == "D"])
quantile(diamonds$price[diamonds$color == "J"])

# Interquartile range of price for the same two grades.
IQR(diamonds$price[diamonds$color == "D"])
IQR(diamonds$price[diamonds$color == "J"])

10.由颜色表示的每克拉价格箱线图

# Box plot of price per carat for each colour grade, zoomed to 0-8000
# (ggplot version).
ggplot(data = diamonds, mapping = aes(x = color, y = price / carat)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 8000))

# qplot version of the same box plot.
qplot(color, price / carat, data = diamonds, geom = "boxplot") +
  coord_cartesian(ylim = c(0, 8000))

11.克拉频率多边形


# Frequency polygon of carat in bins of 0.01, with x ticks every 0.2 and
# y ticks every 2000 (qplot version).
qplot(carat,
      data = diamonds,
      geom = "freqpoly",
      binwidth = 0.01,
      xlab = "carat",
      ylab = "frequency") +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000))

# ggplot() version of the carat frequency polygon.
# ggplot() ignores xlab/ylab arguments, so the axis titles are added as
# separate layers.
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01) +
  scale_x_continuous(breaks = seq(0, 5, 0.2)) +
  scale_y_continuous(breaks = seq(0, 12000, 2000)) +
  xlab("carat") +
  ylab("frequency")

# Carat values that occur more than 2000 times; the frequency table is
# built once and reused for both the values and the filter.
carat_counts <- table(diamonds$carat)
carat_counts[carat_counts > 2000]

你可能感兴趣的:(探索单一变量)