Mastering Data Analysis with R-笔记

  • 导入数据
    大数据集的导入以及处理一些特殊的文件类型。
# Work from the book's data directory, then dump the hflights data set to a
# CSV file that the import examples below read back in.
setwd("D:\\Rdata\\Books学R语言\\R语言数据分析美盖尔盖伊")
library(hflights)
write.csv(hflights, file = "hflights.csv", row.names = FALSE)

# Inspect the structure of the data set.
str(hflights)

导入
使用system.time测试导入文件的时间

# Baseline: time a plain read.csv() with every other argument at its default.
system.time(read.csv("hflights.csv"))

# Pass the column classes explicitly, taken from the in-memory hflights
# data frame, and time the import again.
colClasses <- sapply(hflights, class)
system.time(read.csv("hflights.csv", colClasses = colClasses))

对同一任务重复 n 次测试,然后再对仿真结果进行汇总,
得到关于数据的多种观测结果,分析确定结果中统计的显著差异

# Benchmark the two import variants against each other.
library(microbenchmark)

# f: read.csv() with default arguments.
f <- function() {
  read.csv("hflights.csv")
}

# g: read.csv() with column classes, row count and comment handling given
# explicitly.
g <- function() {
  read.csv(
    "hflights.csv",
    colClasses = colClasses,
    nrows = 227496,
    comment.char = ""
  )
}

# `times` sets how often each expression is evaluated
# (microbenchmark runs 100 evaluations by default).
res <- microbenchmark(f(), g(), times = 10)
res
# Print the results again, this time passing `digits = 20` to print().
print(res, digits = 20)

结果分析

# Box plot of the benchmark results. The title is a plotmath expression:
# expression() builds it and italic() marks "read.table" as italic text.
boxplot(
  res,
  xlab = "",
  main = expression(paste("Benchmarking ", italic("read.table")))
)

规模大于物理内存的数据集

# sqldf: time read.csv.sql() on the CSV file.
library(sqldf)
system.time(read.csv.sql("hflights.csv"))

# ff: time read.csv.ffdf() on the same file.
library(ff)
system.time(read.csv.ffdf(file = "hflights.csv"))

# bigmemory: time read.big.matrix(); the file's first line is a header.
library(bigmemory)
system.time(read.big.matrix("hflights.csv", header = TRUE))

## 或者data.table

文本文件解析器的基准测试

# Time data.table's fread() and keep the imported table in `dt`.
library(data.table)
system.time(dt <- fread("hflights.csv"))

# Convert the data.table object into a plain data.frame.
df <- as.data.frame(dt)
# NOTE(review): this tests `dt`, not the converted `df` -- confirm which one
# was intended.
is.data.frame(dt)

导入文本文件的子集
数据导入时进行筛选

# Filter while importing: the SQL statement selects the rows to load. The
# value compared against Dest includes literal double quotes ("BNA").
df <- read.csv.sql(
  "hflights.csv",
  sql = "select * from file where Dest = '\"BNA\"'"
)

注意:sql 参数默认为 "select * from file";read.csv.sql 不会自动去掉字段两侧的双引号,因此筛选条件中的取值必须连同双引号一起写出(如 '"BNA"')

在导入到R会话前筛选平面文件
从数据库中导入数据

  • 数据筛选
    subset which [ [[ sqldf
# Row filter on mtcars written as SQL ...
library(sqldf)
sqldf("select * from mtcars where am = 1 and vs = 1")

# ... and the same condition with base R's subset().
subset(mtcars, subset = am == 1 & vs == 1)

sqldf的row.names参数默认为FALSE

# Compare the sqldf result (row names kept via row.names = TRUE) with the
# subset() result.
identical(
  sqldf(
    "select * from mtcars where am = 1 and vs = 1",
    row.names = TRUE
  ),
  subset(mtcars, subset = am == 1 & vs == 1)
)

筛选指定列

# Same row filter, keeping only the columns from hp through wt.
subset(mtcars, subset = am == 1 & vs == 1, select = hp:wt)

较大数据集//筛选
dplyr

# Time the same row filter on hflights with sqldf and with base subset().
system.time(
  sqldf("select * from hflights where Dest == 'BNA'", row.names = TRUE)
)
system.time(subset(hflights, Dest == "BNA"))

# ... and with dplyr's filter().
library(dplyr)
system.time(filter(hflights, Dest == "BNA"))

# Filter the rows, then keep the columns DepTime through ArrTime.
str(select(filter(hflights, Dest == "BNA"), DepTime:ArrTime))

行的名称在dplyr结果中不保留

# Copy the row names into an ordinary column so they can be selected as data.
mtcars$rownames <- rownames(mtcars)
head(mtcars)

# Cars with more than 300 hp, showing the stored name and hp columns.
select(filter(mtcars, hp > 300), c(rownames, hp))

data.table方法

library(data.table)
hflights_dt <- data.table(hflights)
# Row filter
system.time(hflights_dt[Dest == "BNA"])
head(hflights_dt)
# Column selection with the .() shorthand
head(hflights_dt[Dest == "BNA", .(Year, Dest)])
# ... and with list()
head(hflights_dt[Dest == "BNA", list(Year, Dest)])
# Column names as a character vector built with c(), as for a data.frame;
# here combined with with = FALSE
head(hflights_dt[Dest == "BNA", c("Year", "ArrTime"), with = FALSE])

聚集
aggregate函数

# Mean of Diverted for each DayOfWeek value.
aggregate(x = hflights$Diverted, by = list(hflights$DayOfWeek), FUN = mean)

使用with函数

# Same aggregation; with() lets the columns be named without the hflights$ prefix.
with(hflights, aggregate(x = Diverted, by = list(DayOfWeek), FUN = mean))

公式化标记

# Same aggregation through the formula interface: Diverted grouped by DayOfWeek.
aggregate(Diverted ~ DayOfWeek, data = hflights, FUN = mean)

使用基础的R命令实现快速聚集

# tapply: apply mean() to Diverted within each DayOfWeek group
tapply(X = hflights$Diverted, INDEX = hflights$DayOfWeek, FUN = mean)

plyr包的ddply函数

# plyr: split hflights by DayOfWeek and apply an anonymous function to each piece.
library(plyr)
ddply(hflights, .(DayOfWeek), function(piece) mean(piece$Diverted))

plyr包的.函数为用户提供了一种方便的引用变量(名称)的方法。

显式指定相应的列名

# Use the summarise helper instead of the anonymous function above; the
# result column is named explicitly.
ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted))

dplyr

# Group the flights by DayOfWeek and look at the attributes of the grouped
# object.
hflights_DayofWeek <- group_by(hflights, DayOfWeek)
str(attributes(hflights_DayofWeek))

# Per-group mean, calling summarise through the dplyr:: namespace explicitly.
dplyr::summarise(hflights_DayofWeek, mean(Diverted))

使用data.table实现聚集

# Mean of Diverted per DayOfWeek in data.table syntax.
hflights_dt[, mean(Diverted), by = DayOfWeek]

# Same aggregation with a named result column
hflights_dt[, .("mean" = mean(Diverted)), by = DayOfWeek]

测试

汇总函数

统计子分组样例数

# Number of rows per DayOfWeek, computed several ways.
ddply(hflights, .(DayOfWeek), summarise, n = length(Diverted))

ddply(hflights, .(DayOfWeek), nrow)

table(hflights$DayOfWeek)

# count() from plyr
count(hflights, "DayOfWeek")

# dplyr
dplyr::summarise(hflights_DayofWeek, n())

# Group sizes of hflights_DayofWeek. group_size() is dplyr's public accessor
# for them; the "group_sizes" attribute read here before is an internal detail
# of old dplyr releases and is missing from current grouped data frames.
dplyr::group_size(hflights_DayofWeek)

# data.table: .N is the number of rows in each group
hflights_dt[, .N, by = .(DayOfWeek)]
  • 数据重构
# Data restructuring

library(data.table)
library(dplyr)
# 1. Transposing a matrix
(m <- matrix(1:9, nrow = 3))
t(m)
# t() can also be applied to a data.frame
head(iris)
t(head(iris))

# 2. Selecting columns by matching their names
library(dplyr)
library(hflights)
str(select(hflights, ends_with("delay")))
# ignore.case controls whether the match ignores letter case
str(select(hflights, contains("T", ignore.case = FALSE)))

# Regular expressions via matches():
# column names consisting of exactly 5 or 6 letters
str(select(hflights, matches("^[[:alpha:]]{5,6}$")))

# A leading minus selects every column that does NOT match the expression.
# Distribution of the column-name lengths
table(nchar(names(hflights)))
names(hflights)
colnames(hflights)
# Drop the columns whose names are 7 or 8 letters long
names(select(hflights, -matches("^[[:alpha:]]{7,8}$")))

# 3. Reordering rows
str(arrange(hflights, ActualElapsedTime))
# The same steps chained with the pipe operator
hflights %>%
  arrange(ActualElapsedTime) %>%
  str()

# dplyr: sort by elapsed time, keep two columns, drop the flights to AUS and
# show the structure of the first rows. The destination code is compared in
# upper case ("AUS"), matching the data.table variants of this example; the
# mixed-case "Aus" used before never matches an upper-case code.
hflights %>%
  arrange(ActualElapsedTime) %>%
  select(ActualElapsedTime, Dest) %>%
  subset(Dest != "AUS") %>%
  head() %>%
  str()
# data.table: build the table with ActualElapsedTime as its key, filter the
# rows and pick the two columns by name.
str(head(
  data.table(hflights, key = "ActualElapsedTime")[
    Dest != "AUS",
    c("ActualElapsedTime", "Dest"),
    with = FALSE
  ]
))

# Same query with na.omit() applied first and the .() column shorthand.
str(head(
  na.omit(data.table(hflights, key = "ActualElapsedTime"))[
    Dest != "AUS",
    .(ActualElapsedTime, Dest)
  ]
))

# Note where na.omit() is called: on the keyed table, before the [ ] filter.

# Speed comparison: adding a column with $<- versus data.table's := operator.
# Distance is divided by 0.62137 to obtain DistanceKMs.
system.time(
  hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
)
system.time(
  hflights_dt[, DistanceKMs := Distance / 0.62137]
)

# dplyr and data.table: memory usage.
# Print the object's memory address (the pointer value) before and after
# adding a column.
library(pryr)
hflights_dt <- data.table(hflights)
address(hflights_dt)
# Does the traditional assignment operator change the object's address?
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
address(hflights_dt)

# The same check using data.table's := operator
hflights_dt <- data.table(hflights)
address(hflights_dt)
hflights_dt[, DistanceKMs := Distance / 0.62137]
address(hflights_dt)

# Time the same column creation done through within()
system.time(
  within(hflights_dt, DistanceKMs <- Distance / 0.62137)
)

# Create several variables in a single := call
hflights_dt[, c("DistanceKMs", "DistanceFeets") := .(
  Distance / 0.62137,
  Distance * 5280
)]

# One numeric 0/1 column per distinct carrier, named carrier_<code>
carriers <- unique(hflights_dt$UniqueCarrier)
carriers
hflights_dt[, paste("carrier", carriers, sep = "_") := lapply(
  carriers,
  function(carrier_code) as.numeric(UniqueCarrier == carrier_code)
)]
str(hflights_dt[, grep("^carrier", names(hflights_dt)), with = FALSE])

# Regular expression: select the iris columns whose names start with "P"
# (grep() returns their positions).
select(iris, grep("^P", colnames(iris)))

# Create the new variable with mutate() and store the result back in hflights
hflights <- mutate(hflights, DistanceKMs = Distance / 0.62137)

# Merging data sets
# (see also dplyr's join functions and the `mult` argument of data.table's
# [ operator)
#
# Lookup table from the numeric DayOfWeek code to the name of the day.
# hflights codes Monday as 1 and Sunday as 7 (its record for 2011-01-01, a
# Saturday, carries DayOfWeek 6), so the names have to start with Monday.
(wdays <- data.frame(
  DayOfWeek = 1:7,
  DayOfWeekString = c(
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
    "Sunday"
  )
))

# Time merging the day-name lookup table into the data.frame and into the
# data.table version of the flights data.
system.time(
  merge(hflights, wdays)
)
system.time(
  merge(hflights_dt, wdays, by = "DayOfWeek")
)

# Objects with the same structure: rbind / cbind
# Sparse matrices: rBind / cBind
# do.call() runs rbind or cbind over all elements of a list
# rbindlist() combines data.table objects

# 4. Reshaping data
# Wide table to long table: melt()
library(reshape2)
hflights_melted <- melt(
  hflights,
  id.vars = 0,
  measure.vars = c("ActualElapsedTime", "AirTime")
)
str(hflights_melted)
# This long layout is what plotting with ggplot2 needs

# Long table to wide table: cast
hflights_melted <- melt(
  hflights,
  id.vars = "Month",
  measure.vars = c("ActualElapsedTime", "AirTime")
)
head(hflights_melted)

# One row per Month, one column per variable, each cell the mean with NAs
# removed.
(df <- dcast(
  hflights_melted,
  Month ~ variable,
  fun.aggregate = mean,
  na.rm = TRUE
))

# Line chart of the monthly values: df is melted back to long form so that
# each variable gets its own coloured line.
library(ggplot2)
ggplot(melt(df, id.vars = "Month")) +
  geom_line(
    aes(x = Month, y = value, color = variable)
  ) +
  scale_x_continuous(breaks = 1:12) +
  theme_bw() +
  theme(legend.position = "top")
  

# tidyr: gather() and spread()
# NOTE: tidyr documents gather()/spread() as superseded by
# pivot_longer()/pivot_wider().
library(tidyr)
str(gather(
  hflights[, c("Month", "ActualElapsedTime", "AirTime")],
  key = variable,
  value = value,
  -Month
))
  • 建模

你可能感兴趣的:(Mastering Data Analysis with R-笔记)