使用NumPy中的np.nan表示缺失值(np.NaN是旧别名,已在NumPy 2.0中移除,不建议再使用)
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
# Build a labelled Series that contains one missing value. ``np.nan`` is the
# canonical spelling; the ``np.NaN`` alias was removed in NumPy 2.0.
data = pd.Series([1, 2, 3, 4, 5, np.nan, 321],
                 index=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
print(data)
# Assigning None to an element of a float Series stores it as NaN.
data['B'] = None
print(data)
dropna(axis=0,how='any',thresh=None,subset=None,inplace=False)
axis 0为行,1为列,默认为0
how any删除带有nan的行,all删除全为nan的行
thresh int型,只保留至少含有thresh个非NaN值的行
subset list,在特定列缺失值处理
inplace bool,是否直接在原对象上修改(默认False,返回处理后的新对象)
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
# Series with a single missing entry. ``np.nan`` replaces the ``np.NaN``
# alias, which no longer exists in NumPy 2.0 and later.
data = pd.Series([1, 2, 3, 4, 5, np.nan, 321],
                 index=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
print(data)
# dropna() returns a new Series without the NaN entries; ``data`` is unchanged.
print(data.dropna())
'''
A 1.0
B 2.0
C 3.0
D 4.0
E 5.0
F NaN
G 321.0
dtype: float64
A 1.0
B 2.0
C 3.0
D 4.0
E 5.0
G 321.0
dtype: float64
'''
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
df = pd.DataFrame([[np.nan,4,6], [3,np.nan,5], [2,9,5]],
index=['blue', 'green', 'red'],
columns=['ball', 'mug', 'pen'])
print(df)
print("--------------")
print(df.dropna())
'''
ball mug pen
blue NaN 4.0 6
green 3.0 NaN 5
red 2.0 9.0 5
--------------
ball mug pen
red 2.0 9.0 5
'''
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data = pd.DataFrame([[6,np.nan,6], [np.nan,np.nan,np.nan], [2,np.nan,5]],
index=['blue', 'green', 'red'],
columns=['ball', 'mug', 'pen'])
print(data)
print("--------------")
print(data.dropna(how='all'))
'''
ball mug pen
blue 6.0 NaN 6.0
green NaN NaN NaN
red 2.0 NaN 5.0
--------------
ball mug pen
blue 6.0 NaN 6.0
red 2.0 NaN 5.0
'''
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data = pd.DataFrame([[6,np.nan,6], [432,8,34], [2,np.nan,5]],
index=['blue', 'green', 'red'],
columns=['ball', 'mug', 'pen'])
print(data)
print("--------------")
print("只保留至少2个非NaN值的行")
print(data.dropna(thresh=2))
print("只保留至少3个非NaN值的行")
print(data.dropna(thresh=3))
'''
ball mug pen
blue 6 NaN 6
green 432 8.0 34
red 2 NaN 5
--------------
只保留至少2个非NaN值的行
ball mug pen
blue 6 NaN 6
green 432 8.0 34
red 2 NaN 5
只保留至少3个非NaN值的行
ball mug pen
green 432 8.0 34
'''
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data = pd.DataFrame([[np.nan,np.nan,6], [432,8,34], [2,np.nan,np.nan]],
index=['blue', 'green', 'red'],
columns=['ball', 'mug', 'pen'])
print(data)
print("--------------")
print(data.dropna(subset=['ball','pen']))
'''
ball mug pen
blue NaN NaN 6.0
green 432.0 8.0 34.0
red 2.0 NaN NaN
--------------
ball mug pen
green 432.0 8.0 34.0
'''
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
# Series with a single missing entry. ``np.nan`` replaces the ``np.NaN``
# alias, which no longer exists in NumPy 2.0 and later.
data = pd.Series([1, 2, 3, 4, 5, np.nan, 321],
                 index=['A', 'B', 'C', 'D', 'E', 'F', 'G'])
print(data)
# Boolean-mask filtering: keep only the entries whose value is not NaN.
print(data[data.notnull()])
'''
A 1.0
B 2.0
C 3.0
D 4.0
E 5.0
F NaN
G 321.0
dtype: float64
A 1.0
B 2.0
C 3.0
D 4.0
E 5.0
G 321.0
dtype: float64
'''
相当于数据库SQL中的 JOIN 操作,即用几个表共有的引用值(键)从不同的表获取数据。以这些键为基础,我们能够获取列表形式的新数据,这些数据是对几个表的数据进行组合得到的。
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data1=pd.DataFrame({'id':['spring','summer','autumn','winter'],
'temperature':['12','34','23','6']})
data2=pd.DataFrame({'id':['spring','summer','autumn','winter'],
'color':['flower','green','yellow','white']})
print(data1)
print("*********************************")
print(data2)
print("*********************************")
print(pd.merge(data1,data2))
'''
id temperature
0 spring 12
1 summer 34
2 autumn 23
3 winter 6
*********************************
color id
0 flower spring
1 green summer
2 yellow autumn
3 white winter
*********************************
id temperature color
0 spring 12 flower
1 summer 34 green
2 autumn 23 yellow
3 winter 6 white
'''
指定基于哪个列进行合并
如果两个DataFrame对象有多个同名列,且未指定on,merge()会把所有同名列一起作为键;只要没有任何一行在这些列上同时相等,合并结果就是一个空DataFrame对象。
例如:
# -*- coding: UTF-8 -*-
import pandas as pd
data1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
'color':['white', 'red', 'red', 'black','green'],
'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})
data2 = pd.DataFrame({'id':['pencil', 'pencil', 'ball', 'pen'],
'brand':['OMG', 'POD', 'ABC', 'POD']})
print(data1)
print('------------')
print(data2)
print('-----------')
print(pd.merge(data1, data2))
'''
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
------------
brand id
0 OMG pencil
1 POD pencil
2 ABC ball
3 POD pen
-----------
Empty DataFrame
Columns: [brand, color, id]
Index: []
'''
因此我们需要指定基准列
# -*- coding: UTF-8 -*-
import pandas as pd
data1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
'color':['white', 'red', 'red', 'black','green'],
'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})
data2 = pd.DataFrame({'id':['pencil', 'pencil', 'ball', 'pen'],
'brand':['OMG', 'POD', 'ABC', 'POD']})
print(data1)
print('------------')
print(data2)
print('-----------')
print(pd.merge(data1, data2,on='id'))
print("*************************")
print(pd.merge(data1,data2,on='brand'))
'''
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
------------
brand id
0 OMG pencil
1 POD pencil
2 ABC ball
3 POD pen
-----------
brand_x color id brand_y
0 OMG white ball ABC
1 ABC red pencil OMG
2 ABC red pencil POD
3 ABC red pen POD
*************************
brand color id_x id_y
0 OMG white ball pencil
1 ABC red pencil ball
2 ABC red pen ball
3 POD black mug pencil
4 POD black mug pen
5 POD green ashtray pencil
6 POD green ashtray pen
'''
假如基准列名称不一致,仍需要合并,则可使用left_on 和 right_on进行指定基准列进行合并
# -*- coding: UTF-8 -*-
import pandas as pd
data1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
'color':['white', 'red', 'red', 'black','green'],
'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})
data2 = pd.DataFrame({'did':['pencil', 'pencil', 'ball', 'pen'],
'brand':['OMG', 'POD', 'ABC', 'POD']})
print(data1)
print('------------')
print(data2)
print('-----------')
print(pd.merge(data1, data2,left_on='id',right_on='did'))
'''
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
------------
brand did
0 OMG pencil
1 POD pencil
2 ABC ball
3 POD pen
-----------
brand_x color id brand_y did
0 OMG white ball ABC ball
1 ABC red pencil OMG pencil
2 ABC red pencil POD pencil
3 ABC red pen POD pen
'''
merge()默认执行内连接,外连接相当于左连接、右连接效果之和,用how指定
要合并多个键,同样可以使用on选项
# -*- coding: UTF-8 -*-
import pandas as pd
data1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
'color':['white', 'red', 'red', 'black','green'],
'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})
data2 = pd.DataFrame({'id':['pencil', 'pencil', 'ball', 'pen'],
'brand':['OMG', 'POD', 'ABC', 'POD']})
print(pd.merge(data1,data2,how='outer'))
print('*************************************')
print(pd.merge(data1,data2,how='left'))
print('*************************************')
print(pd.merge(data1,data2,how='right'))
'''
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
5 OMG NaN pencil
6 POD NaN pencil
7 ABC NaN ball
8 POD NaN pen
*************************************
brand color id
0 OMG white ball
1 ABC red pencil
2 ABC red pen
3 POD black mug
4 POD green ashtray
*************************************
brand color id
0 OMG NaN pencil
1 POD NaN pencil
2 ABC NaN ball
3 POD NaN pen
'''
把left_index和right_index选项的值置为True
# -*- coding: UTF-8 -*-
import pandas as pd
data1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
'color':['white', 'red', 'red', 'black','green'],
'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})
data2 = pd.DataFrame({'id':['pencil', 'pencil', 'ball', 'pen'],
'brand':['OMG', 'POD', 'ABC', 'POD']})
print(pd.merge(data1,data2,right_index=True,left_index=True))
'''
brand_x color id_x brand_y id_y
0 OMG white ball OMG pencil
1 ABC red pencil POD pencil
2 ABC red pen ABC ball
3 POD black mug POD pen
'''
也可使用join()函数进行合并,用于合并多个索引相同列不同的DataFrame对象。但是如果两个对象的列名称有重合,会报错。
# -*- coding: UTF-8 -*-
import pandas as pd
data1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
'color':['white', 'red', 'red', 'black','green'],
'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})
data2 = pd.DataFrame({'id2':['pencil', 'pencil', 'ball', 'pen'],
'brand2':['OMG', 'POD', 'ABC', 'POD']})
print(data1.join(data2))
'''
brand color id brand2 id2
0 OMG white ball OMG pencil
1 ABC red pencil POD pencil
2 ABC red pen ABC ball
3 POD black mug POD pen
4 POD green ashtray NaN NaN
'''
numpy的函数用于数组拼接,拼接的行列要严格对齐,否则会报错。
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data1=np.array([[1,2,3],[4,5,6],[7,8,9]])
data2=data1+9
print(np.concatenate([data1,data2],axis=0))
'''
[[ 1 2 3]
[ 4 5 6]
[ 7 8 9]
[10 11 12]
[13 14 15]
[16 17 18]]
'''
Pandas的concat()函数实现了按索引拼接
默认按行,即axis=0
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data1=pd.Series(['A','B','C','D'],index=[1,2,3,4])
data2=pd.Series(['E','F','G','H'],index=[5,6,7,8])
print(data1)
print("******************")
print(data2)
print("******************")
print(pd.concat([data1,data2]))
'''
1 A
2 B
3 C
4 D
dtype: object
******************
5 E
6 F
7 G
8 H
dtype: object
******************
1 A
2 B
3 C
4 D
5 E
6 F
7 G
8 H
dtype: object
'''
也可按列,axis=1,返回的是DataFrame对象
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data1=pd.Series(['A','B','C','D'],index=[1,2,3,4])
data2=pd.Series(['E','F','G','H'],index=[5,6,7,8])
print(data1)
print("******************")
print(data2)
print("******************")
print(pd.concat([data1,data2],axis=1))
'''
1 A
2 B
3 C
4 D
dtype: object
******************
5 E
6 F
7 G
8 H
dtype: object
******************
0 1
1 A NaN
2 B NaN
3 C NaN
4 D NaN
5 NaN E
6 NaN F
7 NaN G
8 NaN H
'''
默认为外连接,可通过设置join为inner,执行内连接操作
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data1=pd.Series(['A','B','C','D'],index=[1,2,3,4])
data2=pd.Series(['E','F','G','H'],index=[5,6,7,8])
data=pd.concat([data1,data2],axis=1)
print(data1)
print("******************")
print(data2)
print("******************")
print(pd.concat([data1,data2],axis=1))
print("******************")
print(pd.concat([data1,data],axis=1,join='inner'))
'''
1 A
2 B
3 C
4 D
dtype: object
******************
5 E
6 F
7 G
8 H
dtype: object
******************
0 1
1 A NaN
2 B NaN
3 C NaN
4 D NaN
5 NaN E
6 NaN F
7 NaN G
8 NaN H
******************
0 0 1
1 A A NaN
2 B B NaN
3 C C NaN
4 D D NaN
'''
借助keys完成
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data1=pd.Series(['A','B','C','D'],index=[1,2,3,4])
data2=pd.Series(['E','F','G','H'],index=[5,6,7,8])
data=pd.concat([data1,data2],axis=1)
print(data1)
print("******************")
print(data2)
print("******************")
print(pd.concat([data1,data2],keys=[1,2]))
'''
1 A
2 B
3 C
4 D
dtype: object
******************
5 E
6 F
7 G
8 H
dtype: object
******************
1 1 A
2 B
3 C
4 D
2 5 E
6 F
7 G
8 H
dtype: object
'''
对列一样
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data1=pd.Series(['A','B','C','D'],index=[1,2,3,4])
data2=pd.Series(['E','F','G','H'],index=[5,6,7,8])
data=pd.concat([data1,data2],axis=1)
print(data1)
print("******************")
print(data2)
print("******************")
print(pd.concat([data1,data2],axis=1,keys=[1,2]))
'''
1 A
2 B
3 C
4 D
dtype: object
******************
5 E
6 F
7 G
8 H
dtype: object
******************
1 2
1 A NaN
2 B NaN
3 C NaN
4 D NaN
5 NaN E
6 NaN F
7 NaN G
8 NaN H
'''
对于DataFrame对象的拼接与Series对象拼接方法一样。
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data = pd.Series(['1','2','3','4'], index=['A','B','C','D'])
print(data)
print('------------')
print(data.drop('C'))
print("*************")
print("删除多项:")
print(data.drop(['A','C']))
'''
A 1
B 2
C 3
D 4
dtype: object
------------
A 1
B 2
D 4
dtype: object
*************
删除多项:
B 2
D 4
dtype: object
'''
删除DataFrame对象与上述类似,默认删除行,可指定axis=1来删除列
也可以使用del语句删除列,但一次只能删除一列
import pandas as pd
import numpy as np
data = pd.DataFrame(np.arange(16).reshape((4,4)),
index=['A','B','C','D'],
columns=['yebi','kuka','masha','gula'])
print(data)
print('------------')
del data['yebi']
print(data)
'''
yebi kuka masha gula
A 0 1 2 3
B 4 5 6 7
C 8 9 10 11
D 12 13 14 15
------------
kuka masha gula
A 1 2 3
B 5 6 7
C 9 10 11
D 13 14 15
'''
用来查重,如果DataFrame对象中某行与前面的行重复则返回True,否则返回False
支持从前向后查找(first)和从后向前查找(last)
查询重复值的位置,返回布尔型 data.duplicated()
查询有多少重复值 data.duplicated().sum()
打印重复值 data[data.duplicated()]
打印非重复值 data[~data.duplicated()]
# -*- coding: UTF-8 -*-
import pandas as pd
data=pd.DataFrame({'lan':['yebi','mana','yebi','kuki','rara','kuki'],
'value':[6,5,4,3,5,3]})
print(data)
print("**************")
print(data.duplicated())
'''
lan value
0 yebi 6
1 mana 5
2 yebi 4
3 kuki 3
4 rara 5
5 kuki 3
**************
0 False
1 False
2 False
3 False
4 False
5 True
dtype: bool
'''
返回去重后的DataFrame对象
data.drop_duplicates(subset=['a','b','c'],keep='first',inplace=True)
subset: 表示要进行去重的列名,默认为 None。
keep: 有三个可选参数,分别是 first、last、False,默认为 first,表示只保留第一次出现的重复项,删除其余重复项,last 表示只保留最后一次出现的重复项,False 则表示删除所有重复项。
inplace: 布尔值参数,默认为 False 表示删除重复项后返回一个副本,若为 True 则表示直接在原数据上删除重复项。
# -*- coding: UTF-8 -*-
import pandas as pd
data=pd.DataFrame({'lan':['yebi','mana','yebi','kuki','yebi','kuki'],
'value':[6,5,4,3,6,3]})
print(data)
print("**************")
print(data.drop_duplicates())
'''
lan value
0 yebi 6
1 mana 5
2 yebi 4
3 kuki 3
4 yebi 6
5 kuki 3
**************
lan value
0 yebi 6
1 mana 5
2 yebi 4
3 kuki 3
'''
行列变换,通过T完成
# -*- coding: UTF-8 -*-
import pandas as pd
data=pd.DataFrame({'lan':['yebi','mana','yebi','kuki','rara','kuki'],
'value':[6,5,4,3,5,3]})
print(data)
print("**************")
print(data.T)
'''
lan value
0 yebi 6
1 mana 5
2 yebi 4
3 kuki 3
4 rara 5
5 kuki 3
**************
0 1 2 3 4 5
lan yebi mana yebi kuki rara kuki
value 6 5 4 3 5 3
'''
入栈(Stacking):旋转数据结构,列转行
出栈(unStacking):行转列
对DataFrame()对象应用stack()函数,列转行,得到一个Series对象
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data=pd.DataFrame(np.arange(9).reshape(3,3),
index=['yebi','kuki','muya'],
columns=['A','B','C'])
print(data)
print("**************")
print(data.stack())
'''
A B C
yebi 0 1 2
kuki 3 4 5
muya 6 7 8
**************
yebi A 0
B 1
C 2
kuki A 3
B 4
C 5
muya A 6
B 7
C 8
dtype: int64
'''
在具有等级索引结构的Series对象上执行unstack()操作,可以重建之前的DataFrame对象
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data=pd.DataFrame(np.arange(9).reshape(3,3),
index=['yebi','kuki','muya'],
columns=['A','B','C'])
data1=data.stack()
print(data1)
print("**************")
data2=data1.unstack()
print(data2)
'''
yebi A 0
B 1
C 2
kuki A 3
B 4
C 5
muya A 6
B 7
C 8
dtype: int64
**************
A B C
yebi 0 1 2
kuki 3 4 5
muya 6 7 8
'''
出栈操作可以应用到不同的层级,为unstack()函数传入表示层级的编号或名称,即可对相应层级进行操作
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
data=pd.DataFrame(np.arange(9).reshape(3,3),
index=['yebi','kuki','muya'],
columns=['A','B','C'])
data1=data.stack()
print(data1)
print("**************")
data2=data1.unstack(0)
print(data2)
print(data1.unstack(1))
'''
yebi A 0
B 1
C 2
kuki A 3
B 4
C 5
muya A 6
B 7
C 8
dtype: int64
**************
yebi kuki muya
A 0 3 6
B 1 4 7
C 2 5 8
A B C
yebi 0 1 2
kuki 3 4 5
muya 6 7 8
'''
pivot()函数以可用作键的一列或多列作为参数(index、columns),把长格式数据转换为宽格式
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
# Long-format table: one row per (color, item) pair with a random value.
longframe = pd.DataFrame({
    'color': ['white', 'white', 'white', 'red', 'red', 'red',
              'black', 'black', 'black'],
    'item': ['ball', 'pen', 'mug', 'ball', 'pen', 'mug',
             'ball', 'pen', 'mug'],
    'value': np.random.rand(9),
})
print(longframe)
print('------------')
# Reshape to wide format: one row per color, one column per item.
# The arguments are passed by keyword because DataFrame.pivot() no longer
# accepts them positionally from pandas 2.0 onwards.
wideframe = longframe.pivot(index='color', columns='item')
print(wideframe)
'''
color item value
0 white ball 0.380597
1 white pen 0.888624
2 white mug 0.589127
3 red ball 0.334201
4 red pen 0.145386
5 red mug 0.326554
6 black ball 0.411483
7 black pen 0.558148
8 black mug 0.950048
------------
value
item ball mug pen
color
black 0.411483 0.950048 0.558148
red 0.334201 0.326554 0.145386
white 0.380597 0.589127 0.888624
'''
三个工作阶段。
分组 将数据集分成多个组
用函数处理 用函数处理每一个组
合并 把不同组得到的结果合并起来
import pandas as pd
# Sample table: five items, each with a color and two price columns.
df = pd.DataFrame({
    'color': ['white', 'red', 'green', 'red', 'green'],
    'object': ['pen', 'pencil', 'pencil', 'ashtray', 'pen'],
    'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
    'price2': [4.75, 4.12, 1.60, 0.75, 3.15],
})
# Split step: rows sharing the same color land in the same group, and only
# the price1 column is selected for the aggregations below.
group = df.groupby('color')['price1']
# Mapping of each color to the row labels that belong to it.
print(group.groups)
# Per-color mean of price1.
print(group.mean())
# Per-color sum of price1.
print(group.sum())
'''
{'white': Int64Index([0], dtype='int64'), 'green': Int64Index([2, 4], dtype='int64'), 'red': Int64Index([1, 3], dtype='int64')}
color
green 2.025
red 2.380
white 5.560
Name: price1, dtype: float64
color
green 4.05
red 4.76
white 5.56
Name: price1, dtype: float64
'''
前面介绍了用一列元素作为键为数据分组,同理,也可以使用多列,也就是使用多个键,按照等级关系分组。
import pandas as pd
# Sample table: five items, each with a color and two price columns.
df = pd.DataFrame({
    'color': ['white', 'red', 'green', 'red', 'green'],
    'object': ['pen', 'pencil', 'pencil', 'ashtray', 'pen'],
    'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
    'price2': [4.75, 4.12, 1.60, 0.75, 3.15],
})
# Hierarchical grouping: first by color, then by object, on price1 only.
group = df.groupby(['color', 'object'])['price1']
# Mapping of each (color, object) pair to its row labels.
print(group.groups)
# Sum of price1 for every (color, object) pair.
print(group.sum())
# Per-color mean of both price columns at once.
print(df.groupby('color')[['price1', 'price2']].mean())
'''
{('green', 'pen'): Int64Index([4], dtype='int64'), ('red', 'pencil'): Int64Index([1], dtype='int64'), ('green', 'pencil'): Int64Index([2], dtype='int64'), ('red', 'ashtray'): Int64Index([3], dtype='int64'), ('white', 'pen'): Int64Index([0], dtype='int64')}
color object
green pen 2.75
pencil 1.30
red ashtray 0.56
pencil 4.20
white pen 5.56
Name: price1, dtype: float64
price1 price2
color
green 2.025 2.375
red 2.380 2.435
white 5.560 4.750
'''