数据清洗主要是删除原始数据集中的无关数据、重复数据,平滑噪声数据,筛选掉与挖掘主题无关的数据,并处理缺失值、异常值等。
检测重复观测
duplicated:返回的是Series,输出为bool值,值为True表示该行重复。
data.duplicated()
处理重复观测
drop_duplicates:返回的是DataFrame,默认保留第一个遇到的序列,将后续重复行去掉
data.drop_duplicates(['列名'],keep='last')
import pandas as pd

# Demo: detecting and removing duplicate rows in a DataFrame.
# Rows 3 and 6 repeat rows 1 and 5, so they are the duplicates reported below.
data = pd.DataFrame({
    'x1': ['fruit', 'sale'] * 3 + ['sale'],
    'x2': ['appl1', 3, 'banana', 3, 'water', 4, 4],
})
print("原始数据:\n", data)
# Printed frame (column alignment collapsed):
#   x1 x2
#   0 fruit appl1
#   1 sale 3
#   2 fruit banana
#   3 sale 3
#   4 fruit water
#   5 sale 4
#   6 sale 4

# duplicated() returns a boolean Series; True marks a row that repeats an
# earlier row.
print("判断各行是否是重复行:\n", data.duplicated())
# Printed mask (column alignment collapsed):
#   0 False
#   1 False
#   2 False
#   3 True
#   4 False
#   5 False
#   6 True
#   dtype: bool

# drop_duplicates() returns a new DataFrame (here `data` itself is never
# reassigned); with default arguments the first occurrence of each repeated
# row is the one that is kept.
print("删除重复行,返回删除后的数据:\n", data.drop_duplicates())
# Printed frame (column alignment collapsed):
#   x1 x2
#   0 fruit appl1
#   1 sale 3
#   2 fruit banana
#   4 fruit water
#   5 sale 4

# Compare on column x1 only: the first 'fruit' row and the first 'sale' row
# survive.
print("根据x1列删除重复数据:\n", data.drop_duplicates(['x1']))
# Printed frame (column alignment collapsed):
#   x1 x2
#   0 fruit appl1
#   1 sale 3

# keep='last' retains the final occurrence of each x1 value instead.
data1 = data.drop_duplicates(['x1'], keep='last')
print("根据x1列删除重复数据并保留最后一项:\n", data1)
# Printed frame (column alignment collapsed):
#   x1 x2
#   4 fruit water
#   6 sale 4
isnull:是空(缺失值)返回True,非空返回False。
notnull:与isnull相反
import numpy as np
import pandas as pd

# Numeric Series with missing entries at positions 2 and 4.
raw_values = [2312, 13213, np.nan, 33.21, np.nan, -9]
dat = pd.Series(raw_values)
print(dat)
# Printed Series (column alignment collapsed):
#   0 2312.00
#   1 13213.00
#   2 NaN
#   3 33.21
#   4 NaN
#   5 -9.00
#   dtype: float64

# isnull() yields a boolean mask that is True exactly where a value is missing.
null_mask = dat.isnull()
print(null_mask)
# Printed mask (column alignment collapsed):
#   0 False
#   1 False
#   2 True
#   3 False
#   4 True
#   5 False
#   dtype: bool
dropna(axis=0,how='any',thresh=None,subset=None,inplace=False)
axis 0为行,1为列,默认为0(即按行删除)
how any删除带有nan的行,all删除全为nan的行
thresh int型,只保留至少含有int个非nan值的行
subset list,只在指定的列中检查缺失值
inplace bool,是否直接修改原对象(默认False,返回新对象)
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd

# Labelled Series with a single missing value under label 'F'.
# np.nan is the supported spelling of the NaN constant (the np.NaN alias is
# not available in NumPy 2.x).
data = pd.Series(
    [1, 2, 3, 4, 5, np.nan, 321],
    index=['A', 'B', 'C', 'D', 'E', 'F', 'G'],
)
print(data)
# Printed Series (column alignment collapsed):
#   A 1.0
#   B 2.0
#   C 3.0
#   D 4.0
#   E 5.0
#   F NaN
#   G 321.0
#   dtype: float64

# dropna() returns a new Series without the missing entries; `data` itself is
# not modified because the result is only printed.
print(data.dropna())
# Printed Series (column alignment collapsed):
#   A 1.0
#   B 2.0
#   C 3.0
#   D 4.0
#   E 5.0
#   G 321.0
#   dtype: float64
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd

# Small frame in which 'blue' and 'green' each contain one missing cell.
cell_values = [
    [np.nan, 4, 6],
    [3, np.nan, 5],
    [2, 9, 5],
]
df = pd.DataFrame(
    cell_values,
    index=['blue', 'green', 'red'],
    columns=['ball', 'mug', 'pen'],
)
print(df)
# Printed frame (column alignment collapsed):
#   ball mug pen
#   blue NaN 4.0 6
#   green 3.0 NaN 5
#   red 2.0 9.0 5
print("--------------")

# With default arguments dropna() discards every row holding at least one
# NaN, so only 'red' is left.
complete_rows = df.dropna()
print(complete_rows)
# Printed frame (column alignment collapsed):
#   ball mug pen
#   red 2.0 9.0 5
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd

# Frame whose 'green' row is entirely missing, while 'blue' and 'red' are only
# missing the 'mug' value.
cell_values = [
    [6, np.nan, 6],
    [np.nan, np.nan, np.nan],
    [2, np.nan, 5],
]
data = pd.DataFrame(
    cell_values,
    index=['blue', 'green', 'red'],
    columns=['ball', 'mug', 'pen'],
)
print(data)
# Printed frame (column alignment collapsed):
#   ball mug pen
#   blue 6.0 NaN 6.0
#   green NaN NaN NaN
#   red 2.0 NaN 5.0
print("--------------")

# how='all' removes a row only when every cell in it is NaN, so just 'green'
# disappears.
partly_filled = data.dropna(how='all')
print(partly_filled)
# Printed frame (column alignment collapsed):
#   ball mug pen
#   blue 6.0 NaN 6.0
#   red 2.0 NaN 5.0
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd

# 'blue' and 'red' have two non-missing values each; 'green' has three.
cell_values = [
    [6, np.nan, 6],
    [432, 8, 34],
    [2, np.nan, 5],
]
data = pd.DataFrame(
    cell_values,
    index=['blue', 'green', 'red'],
    columns=['ball', 'mug', 'pen'],
)
print(data)
# Printed frame (column alignment collapsed):
#   ball mug pen
#   blue 6 NaN 6
#   green 432 8.0 34
#   red 2 NaN 5
print("--------------")

# thresh=k keeps the rows that contain at least k non-NaN values.
print("只保留至少2个非NaN值的行")
at_least_two = data.dropna(thresh=2)
print(at_least_two)
# Printed frame (column alignment collapsed) - every row qualifies:
#   ball mug pen
#   blue 6 NaN 6
#   green 432 8.0 34
#   red 2 NaN 5

print("只保留至少3个非NaN值的行")
at_least_three = data.dropna(thresh=3)
print(at_least_three)
# Printed frame (column alignment collapsed) - only 'green' qualifies:
#   ball mug pen
#   green 432 8.0 34
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd

# 'blue' lacks a 'ball' value and 'red' lacks a 'pen' value; 'green' is complete.
cell_values = [
    [np.nan, np.nan, 6],
    [432, 8, 34],
    [2, np.nan, np.nan],
]
data = pd.DataFrame(
    cell_values,
    index=['blue', 'green', 'red'],
    columns=['ball', 'mug', 'pen'],
)
print(data)
# Printed frame (column alignment collapsed):
#   ball mug pen
#   blue NaN NaN 6.0
#   green 432.0 8.0 34.0
#   red 2.0 NaN NaN
print("--------------")

# subset limits the NaN check to the listed columns: a row is dropped when it
# is missing 'ball' or 'pen', whatever its 'mug' value is.
checked_columns = ['ball', 'pen']
print(data.dropna(subset=checked_columns))
# Printed frame (column alignment collapsed):
#   ball mug pen
#   green 432.0 8.0 34.0
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd

# Labelled Series with a single missing value under label 'F'.
# np.nan is the supported spelling of the NaN constant (the np.NaN alias is
# not available in NumPy 2.x).
data = pd.Series(
    [1, 2, 3, 4, 5, np.nan, 321],
    index=['A', 'B', 'C', 'D', 'E', 'F', 'G'],
)
print(data)
# Printed Series (column alignment collapsed):
#   A 1.0
#   B 2.0
#   C 3.0
#   D 4.0
#   E 5.0
#   F NaN
#   G 321.0
#   dtype: float64

# notnull() is True for every present value; indexing with that boolean mask
# keeps only the non-missing entries.
print(data[data.notnull()])
# Printed Series (column alignment collapsed):
#   A 1.0
#   B 2.0
#   C 3.0
#   D 4.0
#   E 5.0
#   G 321.0
#   dtype: float64
语法格式
fillna(value=None,method=None,axis=None,inplace=False)
value 用固定值填充(也可以传入字典,为不同的列指定不同的填充值)
method "ffill"用前一个非缺失值填充,"bfill"用后一个非缺失值填充
axis 填充方向:0或"index"沿行索引方向(在每一列内上下)填充,1或"columns"沿列方向(在每一行内左右)填充
inplace bool,是否直接修改原对象(默认False,返回新对象)
import numpy as np
import pandas as pd

# Demo: filling missing values with fillna() / ffill().
# numpy is imported because the frame below is built from np.nan.
# Column C is missing everywhere; column D has no missing values.
df = pd.DataFrame(
    [[np.nan, 2, np.nan, 0],
     [3, 4, np.nan, 1],
     [np.nan, np.nan, np.nan, 5],
     [np.nan, 3, np.nan, 4]],
    columns=list('ABCD'),
)
print(df)
# Printed frame (column alignment collapsed):
#   A B C D
#   0 NaN 2.0 NaN 0
#   1 3.0 4.0 NaN 1
#   2 NaN NaN NaN 5
#   3 NaN 3.0 NaN 4

# Fill every missing cell with one constant.
df1 = df.fillna(10)
print(df1)
# Printed frame (column alignment collapsed):
#   A B C D
#   0 10.0 2.0 10.0 0
#   1 3.0 4.0 10.0 1
#   2 10.0 10.0 10.0 5
#   3 10.0 3.0 10.0 4

# Fill with a different constant per column: the dict maps column -> value.
val = {'A': 11, 'B': 22, 'C': 33, 'D': 44}
df2 = df.fillna(val)
print(df2)
# Printed frame (column alignment collapsed):
#   A B C D
#   0 11.0 2.0 33.0 0
#   1 3.0 4.0 33.0 1
#   2 11.0 22.0 33.0 5
#   3 11.0 3.0 33.0 4

# Forward fill: each missing cell takes the previous non-missing value in its
# column.  Cells with nothing above them (row 0 of A, all of C) stay NaN.
# ffill() is used because the `method` keyword of fillna() is deprecated.
df3 = df.ffill()
print(df3)
# Printed frame (column alignment collapsed):
#   A B C D
#   0 NaN 2.0 NaN 0
#   1 3.0 4.0 NaN 1
#   2 3.0 4.0 NaN 5
#   3 3.0 3.0 NaN 4

# Other common strategies (shown for reference, not executed here):
#   df.fillna(value=df.mean())     # fill with each column's mean
#   df.fillna(value=df.median())   # fill with each column's median
#   df.fillna(value=df.max())      # fill with each column's maximum
#   df.fillna(value=df.min())      # fill with each column's minimum
#   df.dropna()                    # drop the rows that contain missing values
即利用已知点建立合适的插值函数f(x),未知值由对应点x求出近似的函数值f(x)来代替。常见的插值法有拉格朗日插值法和牛顿插值法。
(1)删除含有异常值的记录
(2)将异常值视为缺失值,用缺失值方法处理
(3)可用前后两个观测值的平均值修正
'''
#中位数填充
df.fillna(value=df.median())
#最大值填充
df.fillna(value=df.max())
#最小值填充
df.fillna(value=df.min())
#删除空缺值
df.dropna()
'''
#### 2.2.3插值法
即利用已知点建立合适的插值函数f(x),未知值由对应点x求出近似的函数值f(x)来代替。常见的插值法有拉格朗日插值法和牛顿插值法。
[外链图片转存中...(img-zgGRD1Xa-1659860904240)]
[外链图片转存中...(img-G8mrtNe8-1659860904256)]
## 3.异常值处理
(1)删除含有异常值的记录
(2)将异常值视为缺失值,用缺失值方法处理
(3)可用前后两个观测值的平均值修正
(4)不处理,因为有些异常值可能蕴含有用的信息
参考书籍《Python数据分析与挖掘实战》