Pandas库网址:https://pandas.pydata.org/
import pandas as pd
Pandas基于Numpy实现,常与Numpy和Matplotlib一同使用
请在Anaconda的IPython平台运行代码
import pandas as pd
d = pd.Series(range(20))
d
Out[3]:
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
17 17
18 18
19 19
dtype: int64
d.cumsum() #计算前N项累加和
Out[4]:
0 0
1 1
2 3
3 6
4 10
5 15
6 21
7 28
8 36
9 45
10 55
11 66
12 78
13 91
14 105
15 120
16 136
17 153
18 171
19 190
dtype: int64
Series类型由一组数据及与之相关的数据索引组成
import pandas as pd
a = pd.Series([9,8,7,6])
a
Out[3]:
0 9
1 8
2 7
3 6
dtype: int64 #NumPy中数据类型
import pandas as pd
b = pd.Series([9,8,7,6],index=['a','b','c','d']) #作为第二个参数时,可以省略index=
b
Out[6]:
a 9
b 8
c 7
d 6
dtype: int64
import pandas as pd
s = pd.Series(25,index=['a','b','c']) #从标量创建时需要提供索引,Series的长度由index决定
s
Out[9]:
a 25
b 25
c 25
dtype: int64
import pandas as pd
d = pd.Series({
'a':9,'b':8,'c':7})
d
Out[12]:
a 9
b 8
c 7
dtype: int64
e = pd.Series({
'a':9,'b':8,'c':7},index=['c','a','b','d'])
#index从字典中进行选择操作
e
Out[14]:
c 7.0
a 9.0
b 8.0
d NaN
dtype: float64
import pandas as pd
import numpy as np
n = pd.Series(np.arange(5))
n
Out[4]:
0 0
1 1
2 2
3 3
4 4
dtype: int32
m = pd.Series(np.arange(5),index=np.arange(9,4,-1))
m
Out[6]:
9 0
8 1
7 2
6 3
5 4
dtype: int32
import pandas as pd
b = pd.Series([9,8,7,6],['a','b','c','d'])
b
Out[3]:
a 9
b 8
c 7
d 6
dtype: int64
b.index #.index 获得索引
Out[4]: Index(['a', 'b', 'c', 'd'], dtype='object')
b.values #.values 获得数据
Out[5]: array([9, 8, 7, 6], dtype=int64)
b['b'] #自定义索引
Out[6]: 8
b[1] #自动索引
Out[7]: 8
b[['c','d',0]] #两套索引并存,但不能混用
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
b[['c','d','a']]
Out[10]:
c 7
d 6
a 9
dtype: int64
import pandas as pd
b = pd.Series([9,8,7,6],['a','b','c','d'])
b
Out[3]:
a 9
b 8
c 7
d 6
dtype: int64
b[3]
Out[4]: 6
b[:3]
Out[5]:
a 9
b 8
c 7
dtype: int64
b[b > b.median()]
Out[6]:
a 9
b 8
dtype: int64
np.exp(b)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-7-a3137cbb6ea4> in <module>
----> 1 np.exp(b)
NameError: name 'np' is not defined #本次会话未执行 import numpy as np,需先导入NumPy后再调用np.exp
import pandas as pd
b = pd.Series([9,8,7,6],['a','b','c','d'])
b['b']
Out[3]: 8
'c' in b
Out[4]: True
0 in b
Out[5]: False
b.get('f',100)
Out[6]: 100
Series + Series
import pandas as pd
a = pd.Series([1,2,3],['c','d','e'])
b = pd.Series([9,8,7,6],['a','b','c','d'])
a + b
Out[10]:
a NaN
b NaN
c 8.0
d 8.0
e NaN
dtype: float64
Series类型在运算中会自动对齐不同索引的数据
import pandas as pd
b = pd.Series([9,8,7,6],['a','b','c','d'])
b.name
b.name = 'Series对象'
b.index.name = '索引列'
b
Out[16]:
索引列
a 9
b 8
c 7
d 6
Name: Series对象, dtype: int64
Series对象可以随时修改并即刻生效
import pandas as pd
b = pd.Series([9,8,7,6],['a','b','c','d'])
b.name = "Series"
b
Out[20]:
a 9
b 8
c 7
d 6
Name: Series, dtype: int64
b.name = "New Series"
b[['b','c']] = 20
b
Out[23]:
a 9
b 20
c 20
d 6
Name: New Series, dtype: int64
import pandas as pd
import numpy as np
d = pd.DataFrame(np.arange(10).reshape(2,5))
d
Out[4]:
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
import pandas as pd
dl = {
'one':[1,2,3,4],'two':[9,8,7,6]}
d = pd.DataFrame(dl,index = ['a','b','c','d'])
d
Out[7]:
one two
a 1 9
b 2 8
c 3 7
d 4 6
import pandas as pd
import numpy as np
a = pd.DataFrame(np.arange(12).reshape(3,4))
a
Out[4]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
b = pd.DataFrame(np.arange(20).reshape(4,5))
b
Out[6]:
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
a + b
Out[7]:
0 1 2 3 4
0 0.0 2.0 4.0 6.0 NaN
1 9.0 11.0 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
a * b
Out[8]:
0 1 2 3 4
0 0.0 1.0 4.0 9.0 NaN
1 20.0 30.0 42.0 56.0 NaN
2 80.0 99.0 120.0 143.0 NaN
3 NaN NaN NaN NaN NaN #自动补齐,缺项补NaN
import pandas as pd
import numpy as np
a = pd.DataFrame(np.arange(12).reshape(3,4))
a
Out[12]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
b = pd.DataFrame(np.arange(20),.reshape(4,5))
File "" , line 1
b = pd.DataFrame(np.arange(20),.reshape(4,5))
^
SyntaxError: invalid syntax
b = pd.DataFrame(np.arange(20).reshape(4,5))
b
Out[15]:
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
b.add(a,fill_value = 100)
Out[16]:
0 1 2 3 4
0 0.0 2.0 4.0 6.0 104.0
1 9.0 11.0 13.0 15.0 109.0
2 18.0 20.0 22.0 24.0 114.0
3 115.0 116.0 117.0 118.0 119.0
a.mul(b,fill_value = 0)
Out[17]:
0 1 2 3 4
0 0.0 1.0 4.0 9.0 0.0
1 20.0 30.0 42.0 56.0 0.0
2 80.0 99.0 120.0 143.0 0.0
3 0.0 0.0 0.0 0.0 0.0
#fill_value参数替代NaN,替代后参与运算
不同维度间为广播运算,一维Series默认在轴1参与运算
import pandas as pd
import numpy as np
b = pd.DataFrame(np.arange(20).reshape(4,5))
b
Out[21]:
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
c = pd.Series(np.arange(4))
c
Out[23]:
0 0
1 1
2 2
3 3
dtype: int32
c -10
Out[24]:
0 -10
1 -9
2 -8
3 -7
dtype: int32
b - c
Out[25]:
0 1 2 3 4
0 0.0 0.0 0.0 0.0 NaN
1 5.0 5.0 5.0 5.0 NaN
2 10.0 10.0 10.0 10.0 NaN
3 15.0 15.0 15.0 15.0 NaN
使用运算方法可以令一维Series参与轴0运算
import pandas as pd
import numpy as np
b = pd.DataFrame(np.arange(20).reshape(4,5))
b
Out[29]:
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
c = pd.Series(np.arange(4))
c
Out[31]:
0 0
1 1
2 2
3 3
dtype: int32
b.sub(c,axis=0)
Out[32]:
0 1 2 3 4
0 0 1 2 3 4
1 4 5 6 7 8
2 8 9 10 11 12
3 12 13 14 15 16
同维度运算,尺寸一致
import pandas as pd
import numpy as np
a = pd.DataFrame(np.arange(12).reshape(3,4))
a
Out[36]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
d = pd.DataFrame(np.arange(12,0,-1).reshape(3,4))
d
Out[38]:
0 1 2 3
0 12 11 10 9
1 8 7 6 5
2 4 3 2 1
a > d
Out[39]:
0 1 2 3
0 False False False False
1 False False False True
2 True True True True
a == d
Out[40]:
0 1 2 3
0 False False False False
1 False False True False
2 False False False False
不同维度,广播运算,默认在1轴
import pandas as pd
import numpy as np
a = pd.DataFrame(np.arange(12).reshape(3,4))
a
Out[44]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
c = pd.Series(np.arange(4))
c
Out[46]:
0 0
1 1
2 2
3 3
dtype: int32
a > c
Out[47]:
0 1 2 3
0 False False False False
1 True True True True
2 True True True True
c >0
Out[48]:
0 False
1 True
2 True
3 True
dtype: bool