R语言笔记六

去除NA值

方案一
# Histogram of friend_count, one panel per gender, keeping only the rows
# whose gender is not NA.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = friend_count)
) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~gender)
方案二
# Histogram of friend_count, one panel per gender, after na.omit().
# na.omit() drops every row that has an NA in ANY column, not only in gender,
# so this can keep fewer rows than filtering on gender alone.
# The parentheses around na.omit(pf) are balanced so the ggplot() call parses.
ggplot(aes(x = friend_count), data = na.omit(pf)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)

同时总结两组变量

> by(pf$www_likes, pf$gender, sum)
pf$gender: female
[1] 3507665
--------------------------------------------------------- 
pf$gender: male
[1] 1430175

> by(pf$friend_count,pf$gender,summary)

pf$gender: female
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      37      96     242     244    4923 
--------------------------------------------------------- 
pf$gender: male
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      0      27      74     165     182    4917 

填充颜色,按年计算

# Histogram of tenure / 365 with a black bar outline and a custom fill colour.
# NOTE(review): presumably tenure is stored in days, so x is in years — confirm.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(binwidth = 0.1, color = "black", fill = "#099DD9") +
  scale_x_continuous(limits = c(0, 7), breaks = seq(1, 7, by = 0.5))

自定义lab

# Per-gender histogram of friend_count with custom axis titles.
# labs() sets both axis titles in a single call.
ggplot(data = subset(pf, !is.na(gender)), mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 10, color = "black", fill = "green") +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  labs(x = "朋友数", y = "满足该朋友数的人数") +
  facet_wrap(~gender)

# The same per-gender histogram written with qplot().
# I() marks the colours as literal constants rather than values to be mapped
# through a scale.
# NOTE(review): qplot() is deprecated in recent ggplot2 releases — confirm
# against the installed version before relying on it.
qplot(
  x = friend_count,
  data = subset(pf, !is.na(gender)),
  xlab = "朋友数",
  ylab = "满足该朋友数的人数",
  fill = I("green"),
  color = I("black")
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50)) +
  facet_wrap(~gender)

同时打印多图

# Install gridExtra only when it is not already available, so re-running these
# notes does not download and reinstall the package every time.
if (!requireNamespace("gridExtra", quietly = TRUE)) {
  install.packages("gridExtra")
}
library(gridExtra)

# Three histograms of the same variable: raw, log10-transformed and
# square-root-transformed, stacked in a single column.
p1 <- qplot(x = friend_count, data = pf)
p2 <- qplot(x = log10(friend_count), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)
grid.arrange(p1, p2, p3, ncol = 1)

另一种方法,使用ggplot()
# Build the base histogram once, then derive the log10 and sqrt views by
# adding a transformed x scale to it instead of transforming the variable.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
grid.arrange(p1, p2, p3, ncol = 1)

频率折线图

# Frequency polygon of friend_count coloured by gender, with y rescaled from
# raw bin counts to count / sum(count).
# y = ..density.. is an alternative normalisation: it scales each curve so
# that it integrates to 1, which is not the same as a share of the total.
# The ggplot() call is closed right after the data argument so that the
# layers below are added to the plot rather than to the data frame.
ggplot(aes(x = friend_count, y = ..count.. / sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender)) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab("pengyoushu") +
  ylab("zhanzongpengyoushubaifenbi")

# qplot() version of the frequency polygon: bins of width 10, one line per
# gender, y expressed as count / sum(count).
qplot(
  x = friend_count,
  y = ..count.. / sum(..count..),
  data = subset(pf, !is.na(gender)),
  geom = "freqpoly",
  binwidth = 10,
  color = gender
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50))

剔除长尾时使用coord_cartesian(), 而不用scale_y_continuous()

# Boxplot of friendships_initiated per gender, zoomed to y in [0, 150].
# coord_cartesian() only changes the visible window; the boxplot statistics
# are still computed from all rows, unlike setting limits on the y scale.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friendships_initiated)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 150))

sum不能用于factor

> pf$mobile_check_in <- NA
> pf$mobile_check_in <- ifelse(pf$mobile_likes > 0, 1, 0)
> pf$mobile_check_in <- factor(pf$mobile_check_in)
> summary(pf$mobile_check_in)
    0     1 
35056 63947 

> sum(pf$mobile_check_in)  ##不能用于因子
Error in Summary.factor(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,  : 
  ‘sum’ not meaningful for factors
> length(pf$mobile_check_in) ##用length测量长度
[1] 99003

> by(summary(pf$mobile_check_in)/length(pf$mobile_check_in))
Error in by.default(summary(pf$mobile_check_in)/length(pf$mobile_check_in)) : 
  argument "INDICES" is missing, with no default
> summary(pf$mobile_check_in)/length(pf$mobile_check_in)
        0         1 
0.3540903 0.6459097 

> sum(pf$mobile_check_in == 1)/length(pf$mobile_check_in) ##此法甚棒!
[1] 0.6459097

使用max、min比summary要精准(在较旧版本的R中,summary的输出会四舍五入到4位有效数字,例如18574显示为18570)

> by(diamonds$price, diamonds$cut, summary)
diamonds$cut: Fair
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    337    2050    3282    4359    5206   18570 
----------------------------------------------------------- 
diamonds$cut: Good
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    327    1145    3050    3929    5028   18790 
----------------------------------------------------------- 
diamonds$cut: Very Good
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    336     912    2648    3982    5373   18820 
----------------------------------------------------------- 
diamonds$cut: Premium
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    326    1046    3185    4584    6296   18820 
----------------------------------------------------------- 
diamonds$cut: Ideal
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    326     878    1810    3458    4678   18810 
> by(diamonds$price, diamonds$cut, max)
diamonds$cut: Fair
[1] 18574
------------------------------------------------------ 
diamonds$cut: Good
[1] 18788
------------------------------------------------------ 
diamonds$cut: Very Good
[1] 18818
------------------------------------------------------ 
diamonds$cut: Premium
[1] 18823
------------------------------------------------------ 
diamonds$cut: Ideal
[1] 18806

facet_wrap(~cut, scales = "free")

# Price histogram per cut; scales = "free_x" lets each panel have its own
# x-axis range while the y axis stays shared across panels.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 100) +
  scale_x_continuous(breaks = seq(0, 18000, by = 1000)) +
  facet_wrap(~cut, scales = "free_x")

四分位间距

> by(diamonds$price, diamonds$color, IQR)
diamonds$color: D
[1] 3302.5
------------------------------------------------------ 
diamonds$color: E
[1] 3121
------------------------------------------------------ 
diamonds$color: F
[1] 3886.25
------------------------------------------------------ 
diamonds$color: G
[1] 5117
------------------------------------------------------ 
diamonds$color: H
[1] 4996.25
------------------------------------------------------ 
diamonds$color: I
[1] 6081.25
------------------------------------------------------ 
diamonds$color: J
[1] 5834.5

设置level

# Print the levels of each column. Called without an assignment, levels()
# only reads the levels; changing them requires `levels(x) <- value`.
# NOTE(review): assumes age.range and income.range are factors — confirm.
levels(data$age.range)
levels(data$income.range)

你可能感兴趣的:(R语言笔记)