数据清洗
-
数据清洗是数据分析关键的一步,直接影响之后的处理工作
-
数据需要修改吗?有什么需要修改的吗?数据应该怎么调整才能适用于接下来的分析和挖掘?
-
数据清洗是一个迭代的过程,实际项目中可能需要不止一次地执行这些清洗操作
-
处理缺失数据:DataFrame/Series的fillna()、dropna()方法(例如df.fillna()、df.dropna())
数据连接(pd.merge)
-
pd.merge
-
根据单个或多个键将不同DataFrame的行连接起来
-
类似数据库的连接操作
import pandas as pd
import numpy as np
# Left table: seven rows with repeating 'key' labels and a random integer payload in [0, 10).
left_columns = {
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': np.random.randint(0, 10, 7),
}
df_obj1 = pd.DataFrame(left_columns)

# Right table: one row per key; 'd' never appears in the left table and 'c' never appears here.
right_columns = {
    'key': ['a', 'b', 'd'],
    'data2': np.random.randint(0, 10, 3),
}
df_obj2 = pd.DataFrame(right_columns)

# Show both inputs before merging them.
for frame in (df_obj1, df_obj2):
    print(frame)
1. 默认将重叠列的列名作为“外键”进行连接
# With no key given, merge uses the column name shared by both frames as the join key.
merged_default = pd.merge(df_obj1, df_obj2)
print(merged_default)
2. on显式指定“外键”
# Name the join key explicitly with `on`.
merged_on_key = pd.merge(df_obj1, df_obj2, on='key')
print(merged_on_key)
3. left_on,左侧数据的“外键”,right_on,右侧数据的“外键”
# left_on / right_on name the join key separately for the left and the right frame.
# Rename the key columns first so the two frames no longer share a column name.
left_key, right_key = 'key1', 'key2'
df_obj1 = df_obj1.rename(columns={'key': left_key})
df_obj2 = df_obj2.rename(columns={'key': right_key})
merged_inner = pd.merge(df_obj1, df_obj2, left_on=left_key, right_on=right_key)
print(merged_inner)
默认是“内连接”(inner),即结果中的键是交集
how
指定连接方式
4. “外连接”(outer),结果中的键是并集
# Outer join: the result carries the union of the keys from both frames.
merged_outer = pd.merge(
    df_obj1,
    df_obj2,
    left_on='key1',
    right_on='key2',
    how='outer',
)
print(merged_outer)
5. “左连接”(left)
# Left join.
merged_left = pd.merge(
    df_obj1,
    df_obj2,
    left_on='key1',
    right_on='key2',
    how='left',
)
print(merged_left)
6. “右连接”(right)
# Right join.
# (This label was previously a bare identifier without '#', which raises NameError when run.)
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right'))
整体代码:
# Complete pd.merge walkthrough: default key, explicit key, per-side keys, and the join types.
# One statement per line: the collapsed one-line form is a SyntaxError, and its inline '#'
# comments would have commented out everything that followed them.
import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2': np.random.randint(0, 10, 3)})
print(df_obj1)
print(df_obj2)

# With no key given, the overlapping column name is used as the join key.
print(pd.merge(df_obj1, df_obj2))

# `on` names the join key explicitly.
print(pd.merge(df_obj1, df_obj2, on='key'))

# left_on / right_on name the join key for the left and right frame respectively.
# Rename the key columns so the frames no longer share a column name.
df_obj1 = df_obj1.rename(columns={'key': 'key1'})
df_obj2 = df_obj2.rename(columns={'key': 'key2'})
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2'))

# Outer join.
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer'))

# Left join.
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left'))

# Right join.
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right'))
效果
key data1 0 b 0 1 b 3 2 a 8 3 c 9 4 a 4 5 a 4 6 b 2 key data2 0 a 7 1 b 0 2 d 1 key data1 data2 0 b 0 0 1 b 3 0 2 b 2 0 3 a 8 7 4 a 4 7 5 a 4 7 key data1 data2 0 b 0 0 1 b 3 0 2 b 2 0 3 a 8 7 4 a 4 7 5 a 4 7 key1 data1 key2 data2 0 b 0 b 0 1 b 3 b 0 2 b 2 b 0 3 a 8 a 7 4 a 4 a 7 5 a 4 a 7 key1 data1 key2 data2 0 b 0.0 b 0.0 1 b 3.0 b 0.0 2 b 2.0 b 0.0 3 a 8.0 a 7.0 4 a 4.0 a 7.0 5 a 4.0 a 7.0 6 c 9.0 NaN NaN 7 NaN NaN d 1.0 key1 data1 key2 data2 0 b 0 b 0.0 1 b 3 b 0.0 2 a 8 a 7.0 3 c 9 NaN NaN 4 a 4 a 7.0 5 a 4 a 7.0 6 b 2 b 0.0 key1 data1 key2 data2 0 b 0.0 b 0 1 b 3.0 b 0 2 b 2.0 b 0 3 a 8.0 a 7 4 a 4.0 a 7 5 a 4.0 a 7 6 NaN NaN d 1
7. 处理重复列名
suffixes,默认为_x, _y
# Handling duplicate column names: both frames have a non-key column called 'data', so
# `suffixes` controls how the two copies are renamed in the merged result.
# One statement per line: the collapsed one-line form is a SyntaxError.
import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data': np.random.randint(0, 10, 3)})
print(pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right')))
效果:
key data_left data_right
0 b 0 4
1 b 2 4
2 b 6 4
3 a 2 4
4 a 1 4
5 a 7 4
8. 按索引连接
left_index=True或right_index=True
# Join on the index: the left frame's 'key' column is matched against the right frame's index.
left_columns = {
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': np.random.randint(0, 10, 7),
}
df_obj1 = pd.DataFrame(left_columns)
df_obj2 = pd.DataFrame(
    {'data2': np.random.randint(0, 10, 3)},
    index=['a', 'b', 'd'],
)
merged_by_index = pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
print(merged_by_index)
数据合并(pd.concat)
import numpy as np
import pandas as pd
# Two random 3x4 integer arrays with values in [0, 10).
shape = (3, 4)
arr1 = np.random.randint(0, 10, shape)
arr2 = np.random.randint(0, 10, shape)
print(arr1)
print(arr2)

# Concatenate along the default axis (rows, giving 6x4), then along axis=1 (columns, 3x8).
pair = [arr1, arr2]
print(np.concatenate(pair))
print(np.concatenate(pair, axis=1))
效果(对应上文“8. 按索引连接”的输出):
key data1 data2 0 b 2 1 1 b 1 1 6 b 9 1 2 a 9 0 4 a 1 0 5 a 5 0
2. pd.concat
-
注意指定轴方向,默认axis=0
-
join指定合并方式,默认为outer
-
Series合并时查看行索引有无重复
1) index 没有重复的情况
# Case: the three Series have disjoint index labels (0-4, 5-8, 9-11).
bounds = [(0, 5), (5, 9), (9, 12)]
ser_obj1, ser_obj2, ser_obj3 = (
    pd.Series(np.random.randint(0, 10, stop - start), index=range(start, stop))
    for start, stop in bounds
)
print(ser_obj1)
print(ser_obj2)
print(ser_obj3)

# Default axis=0 stacks them end to end; axis=1 places each Series in its own column.
series_list = [ser_obj1, ser_obj2, ser_obj3]
print(pd.concat(series_list))
print(pd.concat(series_list, axis=1))
效果:
0 5 1 9 2 1 3 9 4 8 dtype: int32 5 2 6 1 7 0 8 8 dtype: int32 9 8 10 3 11 4 dtype: int32 0 5 1 9 2 1 3 9 4 8 5 2 6 1 7 0 8 8 9 8 10 3 11 4 dtype: int32 0 1 2 0 5.0 NaN NaN 1 9.0 NaN NaN 2 1.0 NaN NaN 3 9.0 NaN NaN 4 8.0 NaN NaN 5 NaN 2.0 NaN 6 NaN 1.0 NaN 7 NaN 0.0 NaN 8 NaN 8.0 NaN 9 NaN NaN 8.0 10 NaN NaN 3.0 11 NaN NaN 4.0
2) index 有重复的情况
# Case: the three Series share index labels (0-4, 0-3, 0-2).
ser_obj1, ser_obj2, ser_obj3 = (
    pd.Series(np.random.randint(0, 10, size), index=range(size))
    for size in (5, 4, 3)
)
print(ser_obj1)
print(ser_obj2)
print(ser_obj3)

# Concatenating along axis=0 keeps every original label, so labels repeat in the result.
stacked_series = pd.concat([ser_obj1, ser_obj2, ser_obj3])
print(stacked_series)
效果:
0 5 1 5 2 5 3 7 4 2 dtype: int32 0 0 1 7 2 5 3 0 dtype: int32 0 3 1 5 2 9 dtype: int32 0 5 1 5 2 5 3 7 4 2 0 0 1 7 2 5 3 0 0 3 1 5 2 9 dtype: int32
3) DataFrame合并时同时查看行索引和列索引有无重复
# Concatenating DataFrames: both the row labels and the column labels decide how rows align.
# One statement per line: the collapsed one-line form is a SyntaxError.
import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)),
                       index=['a', 'b', 'c'], columns=['A', 'B'])
df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)),
                       index=['a', 'b'], columns=['C', 'D'])
print(df_obj1)
print(df_obj2)

# axis=0 (default) with the default outer join: the columns are unioned and gaps become NaN.
print(pd.concat([df_obj1, df_obj2], sort=False))

# axis=1 with join='inner': only row labels present in both frames ('a', 'b') are kept.
print(pd.concat([df_obj1, df_obj2], axis=1, join='inner'))
效果
A B a 8 4 b 5 2 c 5 0 C D a 2 8 b 8 4 A B C D a 8.0 4.0 NaN NaN b 5.0 2.0 NaN NaN c 5.0 0.0 NaN NaN a NaN NaN 2.0 8.0 b NaN NaN 8.0 4.0 A B C D a 8 4 2 8 b 5 2 8 4
数据重构
1. stack
-
将列索引旋转为行索引,完成层级索引
-
DataFrame->Series
- 沿轴方向将多个对象合并到一起
1. NumPy的concat
np.concatenate
import numpy as np
import pandas as pd
# A 5x2 frame of random integers in [0, 10).
random_values = np.random.randint(0, 10, (5, 2))
df_obj = pd.DataFrame(random_values, columns=['data1', 'data2'])
print(df_obj)

# stack() moves the column labels into an inner index level, producing a Series.
stacked = df_obj.stack()
print(stacked)
效果:
data1 data2 0 5 2 1 0 4 2 0 4 3 3 2 4 6 0 0 data1 5 data2 2 1 data1 0 data2 4 2 data1 0 data2 4 3 data1 3 data2 2 4 data1 6 data2 0 dtype: int32
2. unstack
-
将层级索引展开
-
Series->DataFrame
默认操作内层索引,即level=-1
# stack / unstack round trip.
# One statement per line: the collapsed one-line form is a SyntaxError, and its inline '#'
# comments would have commented out everything that followed them.
import numpy as np
import pandas as pd

df_obj = pd.DataFrame(np.random.randint(0, 10, (5, 2)), columns=['data1', 'data2'])
print(df_obj)

stacked = df_obj.stack()

# By default unstack() operates on the innermost index level.
print(stacked.unstack())

# `level` selects which index level to unstack; level=0 is the outer one.
print(stacked.unstack(level=0))
效果:
data1 data2 0 9 8 1 8 5 2 1 7 3 5 8 4 9 9 data1 data2 0 9 8 1 8 5 2 1 7 3 5 8 4 9 9 0 1 2 3 4 data1 9 8 1 5 9 data2 8 5 7 8 9
数据转换
一、 处理重复数据
1 duplicated()
返回布尔型Series表示每行是否为重复行
import numpy as np
import pandas as pd
# Eight rows: four 'a' then four 'b', paired with random integers in [0, 4) so repeats are likely.
labels = ['a'] * 4 + ['b'] * 4
df_obj = pd.DataFrame({'data1': labels, 'data2': np.random.randint(0, 4, 8)})
print(df_obj)

# duplicated() returns a boolean Series flagging rows that repeat an earlier row.
duplicate_mask = df_obj.duplicated()
print(duplicate_mask)
2 drop_duplicates()
过滤重复行
默认判断全部列
可指定按某些列判断
# Drop rows that are duplicates across all columns (the default).
deduplicated_all = df_obj.drop_duplicates()
print(deduplicated_all)

# Drop rows judged duplicate by the 'data2' column alone.
deduplicated_by_data2 = df_obj.drop_duplicates(subset='data2')
print(deduplicated_by_data2)
3. 根据map传入的函数对数据进行转换
- Series根据map传入的函数对每个元素进行转换(DataFrame按行或按列转换应使用apply)
# Ten random integers in [0, 10).
ser_obj = pd.Series(np.random.randint(0, 10, 10))
print(ser_obj)

# map() applies the function to each element; here every value is squared.
squared = ser_obj.map(lambda value: value ** 2)
print(squared)
二、数据替换
replace
根据值的内容进行替换
# Replace one value with one value.
print(ser_obj.replace(to_replace=1, value=-100))

# Replace several values with the same value.
print(ser_obj.replace(to_replace=[6, 8], value=-100))

# Replace several values pairwise: 4 -> -100, 7 -> -200.
print(ser_obj.replace(to_replace=[4, 7], value=[-100, -200]))
整体代码
# Complete walkthrough of duplicate handling, Series.map and Series.replace.
# One statement per line: the collapsed one-line form is a SyntaxError, and its inline '#'
# comments would have commented out everything that followed them.
# The printed label strings are kept exactly as they were so they match the recorded output.
import numpy as np
import pandas as pd

df_obj = pd.DataFrame({'data1': ['a'] * 4 + ['b'] * 4,
                       'data2': np.random.randint(0, 4, 8)})
print("df_obj")
print(df_obj)

print("df_obj.duplicated()")
print(df_obj.duplicated())

print("df_obj.drop_duplicates()")
print(df_obj.drop_duplicates())

print("df_obj.drop_duplicates('data2')")
print(df_obj.drop_duplicates('data2'))

ser_obj = pd.Series(np.random.randint(0, 10, 10))
print("ser_obj")
print(ser_obj)

print("ser_obj.map(lambda x : x ** 2")
print(ser_obj.map(lambda x: x ** 2))

print("ser_obj.replace(1, -100)")
# Replace one value with one value.
print(ser_obj.replace(1, -100))

print("ser_obj.replace([6, 8], -100)")
# Replace several values with the same value.
print(ser_obj.replace([6, 8], -100))

print("ser_obj.replace([4, 7], [-100, -200])")
# Replace several values pairwise.
print(ser_obj.replace([4, 7], [-100, -200]))
效果
df_obj data1 data2 0 a 2 1 a 2 2 a 2 3 a 1 4 b 3 5 b 3 6 b 0 7 b 0 df_obj.duplicated() 0 False 1 True 2 True 3 False 4 False 5 True 6 False 7 True dtype: bool df_obj.drop_duplicates() data1 data2 0 a 2 3 a 1 4 b 3 6 b 0 df_obj.drop_duplicates('data2') data1 data2 0 a 2 3 a 1 4 b 3 6 b 0 ser_obj 0 1 1 2 2 3 3 8 4 8 5 9 6 5 7 2 8 6 9 6 dtype: int32 ser_obj.map(lambda x : x ** 2 0 1 1 4 2 9 3 64 4 64 5 81 6 25 7 4 8 36 9 36 dtype: int64 ser_obj.replace(1, -100) 0 -100 1 2 2 3 3 8 4 8 5 9 6 5 7 2 8 6 9 6 dtype: int32 ser_obj.replace([6, 8], -100) 0 1 1 2 2 3 3 -100 4 -100 5 9 6 5 7 2 8 -100 9 -100 dtype: int32 ser_obj.replace([4, 7], [-100, -200]) 0 1 1 2 2 3 3 8 4 8 5 9 6 5 7 2 8 6 9 6 dtype: int32