R基础

R基础

1、查看当前工作目录

getwd()

2、更改目录

setwd("~/Downloads")
注意:当无法读取文件时,通常是因为当前工作目录不是文件所在的位置,此时会报错:cannot open file 'data': No such file or directory

3、读取文件

statesInfo <- read.csv('stateDate.csv')  # 读取文件,并把结果保存到变量 statesInfo

4、查找表中的数据

# 通用格式:statesInfo[rows,columns]
statesInfo[statesInfo$state.region==1, ]

5、取数据集中前2行和打印数据的大小

head(data,2)
dim(data)

6、查看数据信息;快捷键option+cmd+I

?cars
str(cars)

7、查找数据

subset(data,mpg>=30|hp<60)

8、把变量制成表,观察每个组的人数

table(data$employment.status)  # 把"employment.status"这列数据制成表:显示该列中各个取值出现的次数,相当于SQL中的group by + count

9、查看统计信息

summary(reddit)

10、查看一个变量的级别

levels(data$columns)

11、画直方图,图片不显示在plots要先运行dev.off()

library(ggplot2)
qplot(data=reddit,x=age.range)  # 显示reddit数据集中age.range各个取值的分布(适用于只有少数几个离散值的列)

12、对直方图列上面的级别排序

reddit$age.range <- factor(data$columns,levels=c('Under 18','18-24','25-34','35-44','45-54','55-64','65 or Above'),ordered=TRUE)  # ordered=TRUE:生成有序因子,级别按levels给出的顺序排列

13、画直方图的两种方式

# 第一种方式
qplot(x = dob_day, data = pf)+
  scale_x_continuous(breaks = 1:31)
# 第二种方式
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous(breaks = 1:31)  # binwidth的调整很重要:binwidth越小(即bins越多),细节越清晰
## 调整bins
ggplot(aes(price),data = diamonds)+
  geom_histogram(bins = 300)+
  scale_x_log10()
## 默认的参数(不指定bins时默认为30组)
ggplot(aes(price),data = diamonds)+
  geom_histogram()+
  scale_x_log10()

 - 总体来说,更大的binwidth(更少的bins)更能反映数据的整体趋势;更小的binwidth(更多的bins,即组数更多)能展示数据中的细节

14、直方图按按列dob_month的值分别画图,为每个类别变量创建相同类型的图形

ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 0.5) +
  scale_x_continuous(breaks = 1:31)+
  facet_wrap(~dob_month,ncol=3)#ncol列数
facet_wrap(formula):分面包裹,写法为 facet_wrap(~variable),按一个变量分面
facet_grid(formula):分面网格,写法为 facet_grid(vertical~horizontal),~左边是垂直方向分割的变量,右边是水平方向分割的变量
教程链接

15、设置直方图x轴的起点位置和终点位置

qplot(data=pf,x=friend_count,xlim=c(0,1000))#设置x轴位置的方法
qplot(x=friend_count,data=pf)+
    scale_x_continuous(limits=c(0,1000))#设置图层的方法

16、忽略na值,

qplot(x=friend_count,data=subset(pf,!is.na(gender)),binwidth=25)+
 scale_x_continuous(limits=c(0,1000),breaks=seq(0,1000,25))+
  facet_wrap(~gender,ncol=2)
1.subset(pf,!is.na(gender))忽略gender列中的na值;
2.binwidth调组距;
3.scale_x_continuous(limits=c(0,1000))添加图层,设置X轴上起始和终点位置;
4.breaks=seq(0,1000,25)设置X轴刻度:从0到1000,间距为25;
5.facet_wrap(~gender,ncol=2)按gender的类别分面,排成两列

17、查看统计数据

table(pf$gender)#查看数据集pf中gender字段各个值有多少

18、查看统计值

by(pf$friend_count,pf$gender,summary)
查看gender各类别的friend_count统计值(summary)

19、设置直方图的颜色

qplot(x=tenure,data=pf,binwidth=30,
      color=I('black'),fill=I('#099DD9'))

ggplot(aes(x=price,fill=cut),data=diamonds)+
  geom_histogram()+
  facet_wrap(~color)+
  scale_x_log10()+
  scale_fill_brewer(type="qual")
1. fill=cut 设置填充颜色

20、

qplot(x = age,data=pf,binwidth=1,
      color=I('black'),fill=I('#099DD9'))+
  scale_x_continuous(breaks= seq(0,113,5))
#scale_x_continuous 是在X轴上设置断点

21、对变量取对数转为正态分布

##summary(log10(pf$friend_count+1))
以10为底的对数;对变量取对数是常用的变换,可使长尾(偏态)分布更接近正态分布
qplot(x=(price/carat+1),data=diamonds,binwidth=50)+
  facet_wrap(~cut)+
  scale_x_log10()

22、取对数、平方根画图

p1 <- qplot(x=friend_count,data=pf)
p2<- qplot(x=log10(friend_count+1),data=pf)
p3<- qplot(x=sqrt(friend_count),data=pf)
grid.arrange(p1,p2,p3,ncol=1)

plot1 <- qplot(data=diamonds,x=price,binwidth=100,fill=I('#099DD9')) + 
  ggtitle('Price')

plot2 <- qplot(data=diamonds,x=price,binwidth=0.01,fill=I('#F79420')) +
  scale_x_log10()+
  ggtitle('Price (log10)')

library(gridExtra)
library(grid)
grid.arrange(plot1,plot2,ncol=2)        

23、创建频数多边形

ggplot(aes(x = friend_count, y = ..count../sum(..count..)), data = subset(pf, !is.na(gender))) + 
  geom_freqpoly(aes(color = gender), binwidth=10) + 
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) + 
  xlab('好友数量') + 
  ylab('Percentage of users with that friend count')

24、求变量中类别的和

by(pf$www_likes,pf$gender,sum)

25、箱线图

qplot(x=gender,y=friend_count,
      data=subset(pf,!is.na(gender)),
      geom='boxplot',ylim=c(0,1000))


qplot(x=gender,y=friend_count,
      data=subset(pf,!is.na(gender)),
      geom='boxplot')+
  scale_y_continuous(limits=c(0,1000))

  
qplot(x=gender,y=friend_count,
      data=subset(pf,!is.na(gender)),
      geom='boxplot')+
  coord_cartesian(ylim= c(0,1000))
#将y轴显示范围限制在0-1000;coord_cartesian只缩放视图、不剔除数据,箱线图的统计量仍基于全部数据(ylim/scale_y_continuous会先剔除范围外的数据再计算)

26、ifelse和转换成因子变量

pf$mobile_check_in <- NA
pf$mobile_check_in <- ifelse(pf$mobile_likes>0,1,0)
pf$mobile_check_in <- factor(pf$mobile_check_in)#转换成因子变量
summary(pf$mobile_check_in)

27、研究两个连续变量之间的关系,做散点图

ggplot(aes(x=age,y=friend_count),data=pf)+
  geom_point(alpha=1/20)+
  xlim(13,90)+
  coord_trans(y='sqrt')
#alpha=1/20表示20个点重叠在一起才显示为一个完全不透明的点;coord_trans(y='sqrt')对y轴做平方根变换,也可以换成log10

ggplot(aes(x=age,y=friend_count),data=subset(pf,!is.na(gender)))+
  geom_jitter(alpha=1/20,aes(colour=gender),height=0)+
  xlim(13,90)+
  coord_trans(y='sqrt',limy=c(0,3000))
#geom_jitter给数据添加随机抖动;height=0表示不在垂直方向抖动,避免friend_count被抖成负值(负值无法取平方根)

ggplot(aes(x=table,y=price),data=diamonds)+
  geom_point(alpha=1/5,aes(color=cut))+
  scale_x_continuous(breaks=seq(50,80,2))

28、数据分组、取每组里面的均值、中位数

age_groups <- group_by(pf,age)
pf.fc_by_age <- summarise(age_groups,fried_coun_mean=mean(friend_count),
          friend_count_median=median(friend_count),
          n=n())#建立一个新的表
qplot(x=fried_coun_mean,data=pf.fc_by_age)
pf.fc_by_age <- arrange(pf.fc_by_age,age)#排序
head(pf.fc_by_age)

ggplot(aes(x=age,y=friend_count),data=pf)+
  xlim(13,90)+
  geom_point(alpha=0.05,
             position=position_jitter(h=0),
             color='orange')+
  coord_trans(y='sqrt')+
  geom_line(stat='summary',fun.y=mean)+
  geom_line(stat="summary",fun.y=quantile,fun.args=list(probs= .9),
            linetype=2,color='blue')

  fun.y=quantile,fun.args=list(probs= .9)是设置分位数

29、散点图中添加中位数、分位数图层

ggplot(aes(x=age,y=friend_count),data=pf)+
  coord_cartesian(xlim=c(13,70),ylim=c(0,1000))+
  geom_point(alpha=0.05,
             position=position_jitter(h=0),
             color='orange')+
  #coord_trans(y='sqrt')+
  geom_line(stat='summary',fun.y=mean)+
  geom_line(stat="summary",fun.y=quantile,fun.args=list(probs= .9),
            linetype=2,color='blue')+
  geom_line(stat="summary",fun.y=quantile,fun.args=list(probs= .5),
            linetype=2,color='blue')+
  geom_line(stat="summary",fun.y=quantile,fun.args=list(probs= .1),
            linetype=2,color='blue')
1. coord_cartesian设置x、y轴的起始和终点位置
2. alpha=0.05(即1/20)表示20个点重叠才显示为一个完全不透明的点;
2.1:position=position_jitter(h=0)抖动数据,h=0表示垂直方向不抖动,避免y出现负值
3.coord_trans(y='sqrt')对y轴取平方根,也可以取对数;
4.取y 轴的均值geom_line(stat='summary',fun.y=mean);
5.取y轴的分位数geom_line(stat="summary",fun.y=quantile,fun.args=list(probs= .5),
            linetype=2,color='blue')
  

30、查看两个变量的相关系数


with(pf,cor.test(age,friend_count))

with(subset(pf,age<=70),cor.test(age,friend_count,
                                 method="pearson"))
1.pearson 积矩相关系数衡量两个变量之间线性相关的强度(cor.test的默认方法)


31、散点图+相关系数图+划分数据子集

ggplot(aes(x=www_likes_received,y=likes_received),data=pf)+
  geom_point()+
  xlim(0,quantile(pf$www_likes_received,0.95))+
  ylim(0,quantile(pf$likes_received,0.95))+
  geom_smooth(method='lm',color='red')
1.设置x、y的上下限用xlim、ylim,这里上限取95%分位数,即只保留95%的数据;
2.geom_smooth(method='lm')添加线性回归拟合线,用来直观显示两个变量的相关趋势

32、平滑数据

p1<- ggplot(aes(x=age,y=fried_coun_mean),
       data=subset(pf.fc_by_age,age<71))+
  geom_line()

p2<-ggplot(aes(x=age_with_months,y=friend_count_mean),
       data=subset(pf.fc_by_age_months,age_with_months<71))+
  geom_line()+
  geom_smooth()
p3<- ggplot(aes(x=round(age / 5)*5,y=friend_count),
            data=subset(pf,age<71))+
  geom_line(stat='summary',fun.y=mean)
library(gridExtra)
grid.arrange(p2,p1,p3,ncol=1)

分析更多变量

library(dplyr)
pf.fc_age_gender<- pf%>%
  filter(!is.na(gender))%>%
  group_by(age,gender)%>%
  summarise(mean_friend_count=mean(friend_count),
            median_friend_count=median(friend_count),
            n=n())%>%
  ungroup()%>%
  arrange(age)
1.按性别、年龄、分组的数据框;
names(pf.fc_age_gender)
ggplot(aes(x=age,y=mean_friend_count),
       data=pf.fc_age_gender)+
  geom_line(aes(color=gender))
2.画折线图(geom_line),按gender着色

2.长格式转换成宽格式

pf.fc_age_gender<- pf%>%
  filter(!is.na(gender))%>%
  group_by(age,gender)%>%
  summarise(mean_friend_count=mean(friend_count),
            median_friend_count=median(friend_count),
            n=n())%>%
  ungroup()%>%
  arrange(age)

library(reshape2)
pf.fc_by_age_gender.wide<- dcast(pf.fc_age_gender,
                                 age~gender,
                                 value.var='median_friend_count')

3.比率图

ggplot(aes(x=age,y=female/male),
       data=pf.fc_by_age_gender.wide)+
  geom_line()+
  geom_hline(yintercept=1,alpha=0.3,linetype=2)

4.切割一个变量函数cut

pf$year_joined.bucket<- cut(pf$year_joined,
                            c(1994,1998,2001,2004))
1.这里的c(1994,1998,2001,2004)指定切割点(区间边界);如果写breaks=4,则表示把取值范围等宽地切成4个区间

5.探索加入时间与好友数量的关系

ggplot(aes(x=age,y=friend_count),
     data=subset(pf,!is.na(year_joined.bucket)))+
geom_line(aes(color=year_joined.bucket),stat="summary",fun.y=mean)+
geom_line(stat="summary",fun.y=mean,linetype=2)
1.geom_line(aes(color=year_joined.bucket),stat="summary",fun.y=mean)画出year_joined.bucket每个组内各年龄的好友数均值
2.geom_line(stat="summary",fun.y=mean,linetype=2)
画出整体(不分组)的均值

6.建立友谊与使用时长之间关系的线图

需要利用的变量有年龄、使用时长、建立的友谊和year_joined.bucket

ggplot(aes(x=30*round(tenure/30),y=friendships_initiated/tenure),
       data=subset(pf,tenure>=1))+
  geom_line(aes(color=year_joined.bucket),stat="summary",fun.y=mean)
1.30*round(tenure/30)是调整组距,降低噪声

ggplot(aes(x=30*round(tenure/30),y=friendships_initiated/tenure),
       data=subset(pf,tenure>=1))+
  geom_smooth(aes(color=year_joined.bucket))
geom_smooth平滑

7.创建散点图 矩阵


library(GGally)
theme_set(theme_minimal(20))
set.seed(1836)
pf_subset<- pf[,c(2:15)]
names(pf_subset)
ggpairs(pf_subset[sample.int(nrow(pf_subset),1000),])

带有分面和颜色的价格直方图

ggplot(aes(x=price,fill=cut),data=diamonds)+
  geom_histogram()+
  facet_wrap(~color)+
  scale_x_log10()+
  scale_fill_brewer(type="qual")

8.画散点图、选取99%的数据、按某离散变量分类、轴log10

diamonds$volume<- diamonds$x*diamonds$y*diamonds$z
ggplot(aes(x=volume,y=price,fill=clarity,color=clarity),
       data=diamonds)+
  xlim(0,quantile(diamonds$volume,0.99))+
  geom_point()+
  scale_y_log10()+
  scale_color_brewer(type='div')

9.散点图,取X轴的立方根,取Y轴的log10(log10_trans来自scales包;cuberoot_trans不是内置函数,需要先用scales::trans_new自行定义)

ggplot(aes(carat,price),data=diamonds)+
  geom_point()+
  scale_x_continuous(trans=cuberoot_trans(),limits=c(0.2,3),
                     breaks=c(0.2,0.5,1,2,3))+
  scale_y_continuous(trans=log10_trans(),limits=c(350,15000),
                     breaks=c(350,1000,5000,10000,15000))+
  ggtitle('Price(log10) by Cube-Root of Carat')

ggplot(aes(carat, price), data = diamonds) + 
  geom_point(alpha=0.5,position='jitter',size=0.75) + 
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat')        

library('RColorBrewer')
ggplot(aes(x = carat, y = price,colour=clarity), data = diamonds) + 
  geom_point(alpha = 0.5, size = 1, position = 'jitter') +
  scale_color_brewer(type = 'div',
                     guide = guide_legend(title = 'Clarity', reverse = T,
                                          override.aes = list(alpha = 1, size = 2))) +  
  scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
                     breaks = c(0.2, 0.5, 1, 2, 3)) + 
  scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
                     breaks = c(350, 1000, 5000, 10000, 15000)) +
  ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
1.设置颜色参数是净度-colour=clarity
2.显示的时候,最好的在最上面

创建线性模型并预测

要安装并加载memisc包(下面用到的mtable函数来自该包)

bigdiamonds$logprice=log(bigdiamonds$price)
建立模型,模型有5个输入特征
m1<- lm(log(price)~I(carat^(1/3)),
        data=bigdiamonds[bigdiamonds$price<10000&
                           bigdiamonds$cert=='GIA',])
m2<- update(m1,~ . + carat)
m3<- update(m2,~ . + cut)
m4<- update(m3,~ . + color)
m5<- update(m4,~ . + clarity)
mtable(m1,m2,m3,m4,m5)

##输入5个特征的值
thisDiamond=data.frame(carat=2.00,cut='V.Good',
                       color='I',clarity='VS1')
##调整置信区间level
modelEstimate=predict(m5,newdata=thisDiamond,
                      interval='prediction',level= .95)
##查看预测的结果
exp(modelEstimate)


m1<-lm(I(quality)~I(alcohol),data=pp_subset)
m2<-update(m1,~ . + pH)
m3<-update(m2,~ . + volatile.acidity)
m4<-update(m3,~ . + citric.acid)
m5<-update(m4,~ . +residual.sugar)
mtable(m1,m2,m3,m4,m5)

thisDiamond=data.frame(alcohol=0.5,pH=0.4,volatile.acidity=5,citric.acid=2,residual.sugar=5)
modelEstimate=predict(m5,newdata=thisDiamond,
                      interval='prediction')
#预测:该模型的因变量quality没有取对数,所以直接查看预测值,不需要再用exp()还原
modelEstimate

调整字体大小和水平位置

theme(axis.title.x=element_text(size=60),axis.title.y=element_text(size=60))+theme(plot.title=element_text(hjust=0.5))

你可能感兴趣的:(R基础)