导入数据之前
明确需求
理解数据
数据质量
导入数据
支持多种数据源
文本文件
read.table、read.csv、read.delim
scan
Excel文件
csv、prn格式+read.csv
剪贴板+read.delim
xlsx扩展包
RODBC包
数据清洗
缺失值处理
NA、NaN、Inf、-Inf
识别缺失数据
先把不合理值重编码为缺失值
is.na、complete.cases
mice::md.pattern
VIM::aggr、VIM::matrixplot
处理缺失值
缺失值是否有业务含义
删除na.omit、行删除、配对删除
替换
异常值处理
数据转换
数据规范化:将数据按比例缩放,使之落入一个较小的特定区间
极差化
标准化scale
小数定标规范化
改善分布的转换
取对数、平方、倒数、指数
数值变量离散化
ifelse()
ifelse的延伸
cut和Hmisc::cut2
数据筛选
随机采样
sample()
set.seed
按条件过滤
subset、which
SQL爱好者的福音
需要安装sqldf包
install.packages('sqldf')
library(sqldf)
变量筛选
选取有用变量
消除变量间的相关性
注:数据筛选和变量筛选也称为数据规约
导出数据
支持多种数据源
文本文件
write.table、write.csv
# 查看当前工作目录
> getwd()
[1] "C:/Users/Administrator/Documents"
# 修改当前工作目录
> setwd('G:/work/R')
> getwd()
[1] "G:/work/R"
# 读取数据示例
> x = read.table('G:/work/R/lastsave1.txt',header = F,sep = ',')
> x = read.csv('G:/work/R/lastsave1.txt',header = F)
> x = read.delim('G:/work/R/lastsave1.txt',header = F,sep = ',')
> x = read.delim('clipboard',header = F)
当what参数为列表时,scan读取的结果是一个列表,可用as.data.frame将其转换成数据框
> x = scan('G:/work/R/lastsave1.txt',what = list(date = "",pv = 0,uv = 0),sep = ",")
> x = as.data.frame(x)
文件写入示例:
> write.table(x,'G:/work/R/lastsave2.txt',sep = ',',quote = F,col.names = T)
缺失值检查示例:
> (vx = c(1,3,NA,6,8))
[1] 1 3 NA 6 8
> (y1 = is.na(vx))
[1] FALSE FALSE TRUE FALSE FALSE
> (y2 = complete.cases(vx))
[1] TRUE TRUE FALSE TRUE TRUE
is.na和complete.cases的差异:
> (mx = matrix(1:12,3,4))
[,1] [,2] [,3] [,4]
[1,] 1 4 7 10
[2,] 2 5 8 11
[3,] 3 6 9 12
> mx[1,2:4] = NA
> mx
[,1] [,2] [,3] [,4]
[1,] 1 NA NA NA
[2,] 2 5 8 11
[3,] 3 6 9 12
> (y3 = is.na(mx))
[,1] [,2] [,3] [,4]
[1,] FALSE TRUE TRUE TRUE
[2,] FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE
> (y4 = complete.cases(mx))
[1] FALSE TRUE TRUE
缺失值处理:
> vx[y1] =10
> vx
[1] 1 3 10 6 8
> mx[y3] = 100
> mx
[,1] [,2] [,3] [,4]
[1,] 1 100 100 100
[2,] 2 5 8 11
[3,] 3 6 9 12
在R中安装mice和VIM包
> install.packages(c('mice','VIM'))
加载VIM包
> library(VIM)
> str(sleep)
> summary(sleep)
> sum(is.na(sleep$Dream))
> apply(is.na(sleep),2,sum)
加载mice包
> library(mice)
> md.pattern(sleep)
> library(VIM)
图形化表现缺失值:
> aggr(sleep,prop=F,number=TRUE) # 绘制各变量缺失值数量的直方图和缺失模式组合图
> matrixplot(sleep) # 红色代表缺失值,颜色从浅到深表示数值从小到大
缺失值处理
# 删除带有缺失值的数据
> newsleep = na.omit(sleep)
> str(newsleep)
> md.pattern(newsleep)
> cor(sleep,use = "pairwise.complete.obs") # 配对删除
# 用均值填充缺失值 也可以用中位数median填充缺失值
> x = c(1,2,NA,100,NA,2,4)
> mean(x,na.rm = T)
[1] 21.8
> x[is.na(x)] = mean(x,na.rm = T)
> x
[1] 1.0 2.0 21.8 100.0 21.8 2.0 4.0
# 按照列或者行相关性填充
数据转换
> str(airquality)
> airquality
# 排序
> airquality = airquality[order(airquality$Temp),]
> head(airquality,5)
Ozone Solar.R Wind Temp Month Day
5 NA NA 14.3 56 5 5
18 6 78 18.4 57 5 18
25 NA 66 16.6 57 5 25
27 NA NA 8.0 57 5 27
15 18 65 13.2 58 5 15
> quantile(airquality$Temp,probs = c(0,0.3,0.6,1.0))
0% 30% 60% 100%
56 74 81 97
> airquality$isHot = ifelse(airquality$Temp>80,T,F)
> head(airquality,5)
Ozone Solar.R Wind Temp Month Day isHot
5 NA NA 14.3 56 5 5 FALSE
18 6 78 18.4 57 5 18 FALSE
25 NA 66 16.6 57 5 25 FALSE
27 NA NA 8.0 57 5 27 FALSE
15 18 65 13.2 58 5 15 FALSE
> tail(airquality,3)
Ozone Solar.R Wind Temp Month Day isHot
123 85 188 6.3 94 8 31 TRUE
122 84 237 6.3 96 8 30 TRUE
120 76 203 9.7 97 8 28 TRUE
> airquality = within(airquality,{TempL = NA
+ TempL[Temp>80] = 'Hot'
+ TempL[Temp > 70 & Temp <= 80] = 'Warm'
+ TempL[Temp <= 70] = 'Cold'
+ })
> head(airquality,5)
Ozone Solar.R Wind Temp Month Day isHot TempL
5 NA NA 14.3 56 5 5 FALSE Cold
18 6 78 18.4 57 5 18 FALSE Cold
25 NA 66 16.6 57 5 25 FALSE Cold
27 NA NA 8.0 57 5 27 FALSE Cold
15 18 65 13.2 58 5 15 FALSE Cold
> airquality$TempL
# 转换成因子类型
> airquality$TempL = factor(airquality$TempL,levels = c('Cold','Warm','Hot'),ordered = TRUE)
> airquality$TempL
> unclass(airquality$TempL)
> airquality = within(airquality,{
+ TempL1 = cut(Temp,breaks = c(56,73,81,97),include.lowest = T)
+ })
> head(airquality,5)
> airquality$TempL1
> airquality = within(airquality,{
+ TempL2 = cut(Temp,breaks = quantile(Temp,probs = c(0.0,0.3,0.7,1.0)),include.lowest = T)
+ })
> table(airquality$TempL2) # 每个区间含有多少样本
> prop.table(table(airquality$TempL2)) # 每个区间样本所占比例
# Hmisc包里有cut2函数
> install.packages('Hmisc')
> library(Hmisc)
> airquality = within(airquality,{
+ TempL3 = cut2(Temp,g = 4)
+ })
> head(airquality,5)
Ozone Solar.R Wind Temp Month Day isHot TempL TempL1 TempL2 TempL3
5 NA NA 14.3 56 5 5 FALSE Cold [56,73] [56,74] [56,73)
18 6 78 18.4 57 5 18 FALSE Cold [56,73] [56,74] [56,73)
25 NA 66 16.6 57 5 25 FALSE Cold [56,73] [56,74] [56,73)
27 NA NA 8.0 57 5 27 FALSE Cold [56,73] [56,74] [56,73)
15 18 65 13.2 58 5 15 FALSE Cold [56,73] [56,74] [56,73)
> as.numeric(airquality$TempL3)
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
> unclass(airquality$TempL3)
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
attr(,"levels")
[1] "[56,73)" "[73,80)" "[80,86)" "[86,97]"
数据规约
> subset(airquality,Month %in% c(7,9))
> subset(airquality,Month %in% c(7,9) & Day %in% 10:15)
> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80)
> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = c(Month,Day,Temp))
> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = Temp:Day)
> x = sample(1:nrow(airquality),size = 10,replace = T)
> x
[1] 80 119 4 51 55 114 73 23 117 12
> airSam = airquality[x,]
> airSam
Ozone Solar.R Wind Temp Month Day isHot TempL TempL1 TempL2 TempL3
116 45 212 9.7 79 8 24 FALSE Warm (73,81] (74,83] [73,80)
97 35 NA 7.4 85 8 5 TRUE Hot (81,97] (83,97] [80,86)
27 NA NA 8.0 57 5 27 FALSE Cold [56,73] [56,74] [56,73)
132 21 230 10.9 75 9 9 FALSE Warm (73,81] (74,83] [73,80)
53 NA 59 1.7 76 6 22 FALSE Warm (73,81] (74,83] [73,80)
129 32 92 15.5 84 9 6 TRUE Hot (81,97] (83,97] [80,86)
131 23 220 10.3 78 9 8 FALSE Warm (73,81] (74,83] [73,80)
28 23 13 12.0 67 5 28 FALSE Cold [56,73] [56,74] [56,73)
81 63 220 11.5 85 7 20 TRUE Hot (81,97] (83,97] [80,86)
4 18 313 11.5 62 5 4 FALSE Cold [56,73] [56,74] [56,73)
将随机种子设定为10(set.seed(10)),可以保证每次运行得到相同的随机结果,便于复现和验证前次结果
> set.seed(10)
> x1 = runif(10,10,100)
> x2 = runif(10,10,100)
> x1
[1] 55.67304 37.60917 48.42169 72.37919 17.66224 30.28930 34.70775 34.50746 65.42464
[10] 48.67044
> x2
[1] 68.64901 61.09640 20.21581 63.63328 42.22450 48.59285 14.67130 33.77599 45.89117
[10] 85.25207
SQL语句数据筛选
> install.packages('sqldf')
> library(sqldf)
> newdf = sqldf('select * from airquality where Ozone>30')
> newdf
> newdf = sqldf('select Month,avg(Temp) as avTemp,count(*) as rCnt from airquality where Month in (7,9) group by Month')
> newdf
Month avTemp rCnt
1 7 83.90323 31
2 9 76.90000 30