R语言学习笔记_数据清理1

导入数据之前
明确需求
理解数据
数据质量
导入数据
支持多种数据源
文本文件
read.table、read.csv、read.delim
scan
Excel文件
csv、prn格式+read.csv
剪贴板+read.delim
xlsx扩展包
RODBC包
数据清洗
缺失值处理
NA、NaN、Inf、-Inf
识别缺失数据
先把不合理值重编码为缺失值
is.na、complete.cases
mice::md.pattern
VIM::aggr、VIM::matrixplot
处理缺失值
缺失值是否有业务含义
删除na.omit、行删除、配对删除
替换
异常值处理
数据转换
数据规范化:将数据按比例缩放,落入一个小的特定区间
极差化
标准化scale
小数定标规范化
改善分布的转换
取对数、平方、倒数、指数
数值变量离散化
ifelse()
ifelse的延伸
cut和Hmisc::cut2
数据筛选
随机采样
sample()
set.seed
按条件过滤
subset、which
SQL爱好者的福音
需要安装sqldf包
install.packages('sqldf')
library(sqldf)
变量筛选
选取有用变量
消除变量间的相关性
注:数据筛选和变量筛选也称为数据归约
导出数据
支持多种数据源
文本文件
write.table、write.csv

# 查看当前工作目录
> getwd()
[1] "C:/Users/Administrator/Documents"
# 修改当前工作目录
> setwd('G:/work/R')
> getwd()
[1] "G:/work/R"
# 读取数据示例
> x = read.table('G:/work/R/lastsave1.txt',header = F,sep = ',')
> x = read.csv('G:/work/R/lastsave1.txt',header = F)
> x = read.delim('G:/work/R/lastsave1.txt',header = F,sep = ',')
> x = read.delim('clipboard',header = F)

用scan读取的数据是一个列表,用as.data.frame转换成数据框
> x = scan('G:/work/R/lastsave1.txt',what = list(date = "",pv = 0,uv = 0),sep = ",")
> x = as.data.frame(x)

文件写入示例:
> write.table(x,'G:/work/R/lastsave2.txt',sep = ',',quote = F,col.names = T)

缺失值检查示例:
> (vx = c(1,3,NA,6,8))
[1]  1  3 NA  6  8
> (y1 = is.na(vx))
[1] FALSE FALSE  TRUE FALSE FALSE
> (y2 = complete.cases(vx))
[1]  TRUE  TRUE FALSE  TRUE  TRUE

is.na和complete.cases的差异:is.na逐个元素判断是否缺失,返回与原对象同形状的逻辑值;complete.cases按行判断,只有整行都没有缺失值时才返回TRUE。
> (mx = matrix(1:12,3,4))
     [,1] [,2] [,3] [,4]
[1,]    1    4    7   10
[2,]    2    5    8   11
[3,]    3    6    9   12
> mx[1,2:4] = NA
> mx
     [,1] [,2] [,3] [,4]
[1,]    1   NA   NA   NA
[2,]    2    5    8   11
[3,]    3    6    9   12
> (y3 = is.na(mx))
      [,1]  [,2]  [,3]  [,4]
[1,] FALSE  TRUE  TRUE  TRUE
[2,] FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE
> (y4 = complete.cases(mx))
[1] FALSE  TRUE  TRUE

缺失值处理:
> vx[y1] =10
> vx
[1]  1  3 10  6  8
> mx[y3] = 100
> mx
     [,1] [,2] [,3] [,4]
[1,]    1  100  100  100
[2,]    2    5    8   11
[3,]    3    6    9   12

在R中安装mice和VIM包
> install.packages(c('mice','VIM'))

加载VIM包(下文使用的sleep是VIM包自带的、含缺失值的哺乳动物睡眠数据集,而不是datasets包中的同名数据集)
> library(VIM)

> str(sleep)
> summary(sleep)
> sum(is.na(sleep$Dream))
> apply(is.na(sleep),2,sum)

加载mice包
> library(mice)

> md.pattern(sleep)

> library(VIM)
图形化表现缺失值:
> aggr(sleep,prop=F,number=TRUE)        # 绘制缺失值直方图和缺失模式组合图
> matrixplot(sleep)     # 红色代表缺失值 从浅到深表示数值变化

缺失值处理
# 删除带有缺失值的数据
> newsleep = na.omit(sleep)
> str(newsleep)
> md.pattern(newsleep)

> cor(sleep,use = "pairwise.complete.obs")  # 配对删除

# 用均值填充缺失值 也可以用中位数median填充缺失值
> x = c(1,2,NA,100,NA,2,4)
> mean(x,na.rm = T)
[1] 21.8
> x[is.na(x)] = mean(x,na.rm = T)
> x
[1]   1.0   2.0  21.8 100.0  21.8   2.0   4.0

# 按照列或者行相关性填充


数据转换
> str(airquality)
> airquality
# 排序
> airquality = airquality[order(airquality$Temp),]
> head(airquality,5)

   Ozone Solar.R Wind Temp Month Day
5     NA      NA 14.3   56     5   5
18     6      78 18.4   57     5  18
25    NA      66 16.6   57     5  25
27    NA      NA  8.0   57     5  27
15    18      65 13.2   58     5  15
> quantile(airquality$Temp,probs = c(0,0.3,0.6,1.0))
  0%  30%  60% 100%
  56   74   81   97
> airquality$isHot = ifelse(airquality$Temp>80,T,F)
> head(airquality,5)
   Ozone Solar.R Wind Temp Month Day isHot
5     NA      NA 14.3   56     5   5 FALSE
18     6      78 18.4   57     5  18 FALSE
25    NA      66 16.6   57     5  25 FALSE
27    NA      NA  8.0   57     5  27 FALSE
15    18      65 13.2   58     5  15 FALSE
> tail(airquality,3)
    Ozone Solar.R Wind Temp Month Day isHot
123    85     188  6.3   94     8  31  TRUE
122    84     237  6.3   96     8  30  TRUE
120    76     203  9.7   97     8  28  TRUE

> airquality = within(airquality,{TempL = NA
+     TempL[Temp>80] = 'Hot'
+     TempL[Temp > 70 & Temp <= 80] = 'Warm'
+     TempL[Temp <= 70] = 'Cold'
+ })

> head(airquality,5)
   Ozone Solar.R Wind Temp Month Day isHot TempL
5     NA      NA 14.3   56     5   5 FALSE  Cold
18     6      78 18.4   57     5  18 FALSE  Cold
25    NA      66 16.6   57     5  25 FALSE  Cold
27    NA      NA  8.0   57     5  27 FALSE  Cold
15    18      65 13.2   58     5  15 FALSE  Cold
> airquality$TempL

# 转换成因子类型
> airquality$TempL = factor(airquality$TempL,levels = c('Cold','Warm','Hot'),ordered = TRUE)
> airquality$TempL
> unclass(airquality$TempL)

> airquality = within(airquality,{
+     TempL1 = cut(Temp,breaks = c(56,73,81,97),include.lowest = T)
+ })

> head(airquality,5)
> airquality$TempL1

> airquality = within(airquality,{
+     TempL2 = cut(Temp,breaks = quantile(Temp,probs = c(0.0,0.3,0.7,1.0)),include.lowest = T)
+ })

> table(airquality$TempL2)      # 每个区间含有多少样本
> prop.table(table(airquality$TempL2))      # 每个区间样本所占比例

# Hmisc包里有cut2函数
> install.packages('Hmisc')
> library(Hmisc)

> airquality = within(airquality,{
+     TempL3 = cut2(Temp,g = 4)
+ })

> head(airquality,5)
   Ozone Solar.R Wind Temp Month Day isHot TempL  TempL1  TempL2  TempL3
5     NA      NA 14.3   56     5   5 FALSE  Cold [56,73] [56,74] [56,73)
18     6      78 18.4   57     5  18 FALSE  Cold [56,73] [56,74] [56,73)
25    NA      66 16.6   57     5  25 FALSE  Cold [56,73] [56,74] [56,73)
27    NA      NA  8.0   57     5  27 FALSE  Cold [56,73] [56,74] [56,73)
15    18      65 13.2   58     5  15 FALSE  Cold [56,73] [56,74] [56,73)

> as.numeric(airquality$TempL3)
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4

> unclass(airquality$TempL3)
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [40] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 [79] 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[118] 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
attr(,"levels")
[1] "[56,73)" "[73,80)" "[80,86)" "[86,97]"

数据归约
> subset(airquality,Month %in% c(7,9))

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15)

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80)

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = c(Month,Day,Temp))

> subset(airquality,Month %in% c(7,9) & Day %in% 10:15 | Temp > 80 , select = Temp:Day)

> x = sample(1:nrow(airquality),size = 10,replace = T)
> x
 [1]  80 119   4  51  55 114  73  23 117  12
> airSam = airquality[x,]
> airSam
    Ozone Solar.R Wind Temp Month Day isHot TempL  TempL1  TempL2  TempL3
116    45     212  9.7   79     8  24 FALSE  Warm (73,81] (74,83] [73,80)
97     35      NA  7.4   85     8   5  TRUE   Hot (81,97] (83,97] [80,86)
27     NA      NA  8.0   57     5  27 FALSE  Cold [56,73] [56,74] [56,73)
132    21     230 10.9   75     9   9 FALSE  Warm (73,81] (74,83] [73,80)
53     NA      59  1.7   76     6  22 FALSE  Warm (73,81] (74,83] [73,80)
129    32      92 15.5   84     9   6  TRUE   Hot (81,97] (83,97] [80,86)
131    23     220 10.3   78     9   8 FALSE  Warm (73,81] (74,83] [73,80)
28     23      13 12.0   67     5  28 FALSE  Cold [56,73] [56,74] [56,73)
81     63     220 11.5   85     7  20  TRUE   Hot (81,97] (83,97] [80,86)
4      18     313 11.5   62     5   4 FALSE  Cold [56,73] [56,74] [56,73)

将随机数种子设为10(set.seed(10)),可以保证每次运行生成相同的随机数序列,便于复现和验证前次结果
> set.seed(10)
> x1 = runif(10,10,100)
> x2 = runif(10,10,100)
> x1
 [1] 55.67304 37.60917 48.42169 72.37919 17.66224 30.28930 34.70775 34.50746 65.42464
[10] 48.67044
> x2
 [1] 68.64901 61.09640 20.21581 63.63328 42.22450 48.59285 14.67130 33.77599 45.89117
[10] 85.25207

SQL语句数据筛选
> install.packages('sqldf')
> library(sqldf)

> newdf = sqldf('select * from airquality where Ozone>30')
> newdf

> newdf = sqldf('select Month,avg(Temp) as avTemp,count(*) as rCnt from airquality where Month in (7,9) group by Month')
> newdf
  Month   avTemp rCnt
1     7 83.90323   31
2     9 76.90000   30

你可能感兴趣的:(R语言学习)