pandas入门
源码点这里
from pandas import Series,DataFrame
import pandas as pd
pandas的两个主要数据结构:Series和DataFrame
Series 是一中类似于一维数组的对象,它由一组数据以及一组与之相关的数据标签(即索引)组成。
obj=Series([4,7,-5,3])
obj
0 4 1 7 2 -5 3 3 dtype: int64
obj.values
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2=Series([4,7,-5,3],index=['d','b','a','c'])
obj2
d 4 b 7 a -5 c 3 dtype: int64
obj2['a']
-5
obj2['d']
4
obj2[['c','a','d']]
c 3 a -5 d 4 dtype: int64
obj2[obj2>0]
d 4 b 7 c 3 dtype: int64
obj2*2
d 8 b 14 a -10 c 6 dtype: int64
import numpy as np
np.exp(obj2)
d 54.598150 b 1096.633158 a 0.006738 c 20.085537 dtype: float64
sdata={'0hio':35000,'Texta':79300,'Ohjsodf':16000,'Jsdf':5000}
obj3=Series(sdata)
obj3
0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64
states=['California','0hio','Texta','Ohjsodf']
obj4=Series(sdata,index=states)
obj4
California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64
pd.isnull(obj4)
California True 0hio False Texta False Ohjsodf False dtype: bool
pd.notnull(obj4)
California False 0hio True Texta True Ohjsodf True dtype: bool
obj4.isnull()
California True 0hio False Texta False Ohjsodf False dtype: bool **下面重点关注如何处理缺失数据**
obj3
0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64
obj4
California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64
obj3+obj4
0hio 70000.0 California NaN Jsdf NaN Ohjsodf 32000.0 Texta 158600.0 dtype: float64
obj4.name='population'
obj4.index.name='state'
obj4
state California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 Name: population, dtype: float64
obj
0 4 1 7 2 -5 3 3 dtype: int64
obj.index=['Bob','Steve','Jeff','Ryan']
obj
Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64
DataFrame
####DataFrame是一个表格型的数据结构,它含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等) ####DataFrame既又行索引也有列索引,它可以被看做Series组成的字典(共同用一个索引)
data={'state':['0hi0','0hio','0hio','Nevada','Nevada'],
'year':[2000,2001,2002,2001,2002],
'pop':[1.5,1.7,3.6,2.4,2.9]
}
frame=DataFrame(data)
frame
|
pop |
state |
year |
0 |
1.5 |
0hi0 |
2000 |
1 |
1.7 |
0hio |
2001 |
2 |
3.6 |
0hio |
2002 |
3 |
2.4 |
Nevada |
2001 |
4 |
2.9 |
Nevada |
2002 |
DataFrame(data,columns=['year','state','pop'])
|
year |
state |
pop |
0 |
2000 |
0hi0 |
1.5 |
1 |
2001 |
0hio |
1.7 |
2 |
2002 |
0hio |
3.6 |
3 |
2001 |
Nevada |
2.4 |
4 |
2002 |
Nevada |
2.9 |
frame2=DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three','four','five'])
frame2
|
year |
state |
pop |
debt |
one |
2000 |
0hi0 |
1.5 |
NaN |
two |
2001 |
0hio |
1.7 |
NaN |
three |
2002 |
0hio |
3.6 |
NaN |
four |
2001 |
Nevada |
2.4 |
NaN |
five |
2002 |
Nevada |
2.9 |
NaN |
frame2.columns
Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)
frame2['state']
one 0hi0 two 0hio three 0hio four Nevada five Nevada Name: state, dtype: object
frame2.year
one 2000 two 2001 three 2002 four 2001 five 2002 Name: year, dtype: int64
frame2.ix['three']
year 2002 state 0hio pop 3.6 debt NaN Name: three, dtype: object
frame2['debt']=16.
frame2
frame2['debt']=np.arange(5.)
frame2
|
year |
state |
pop |
debt |
one |
2000 |
0hi0 |
1.5 |
0.0 |
two |
2001 |
0hio |
1.7 |
1.0 |
three |
2002 |
0hio |
3.6 |
2.0 |
four |
2001 |
Nevada |
2.4 |
3.0 |
five |
2002 |
Nevada |
2.9 |
4.0 |
val=Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt']=val
frame2
|
year |
state |
pop |
debt |
one |
2000 |
0hi0 |
1.5 |
NaN |
two |
2001 |
0hio |
1.7 |
-1.2 |
three |
2002 |
0hio |
3.6 |
NaN |
four |
2001 |
Nevada |
2.4 |
-1.5 |
five |
2002 |
Nevada |
2.9 |
-1.7 |
frame2['eastern']=frame2.state=='0hi0'
frame2
|
year |
state |
pop |
debt |
eastern |
one |
2000 |
0hi0 |
1.5 |
NaN |
True |
two |
2001 |
0hio |
1.7 |
-1.2 |
False |
three |
2002 |
0hio |
3.6 |
NaN |
False |
four |
2001 |
Nevada |
2.4 |
-1.5 |
False |
five |
2002 |
Nevada |
2.9 |
-1.7 |
False |
del frame2['eastern']
frame2.columns
Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)
pop={'Nevada':{2001:2.4,2002:2.9},'0hio':{2000:1.5,2001:1.7,2002:3.6}}
frame3=DataFrame(pop)
frame3
|
0hio |
Nevada |
2000 |
1.5 |
NaN |
2001 |
1.7 |
2.4 |
2002 |
3.6 |
2.9 |
frame3.T
|
2000 |
2001 |
2002 |
0hio |
1.5 |
1.7 |
3.6 |
Nevada |
NaN |
2.4 |
2.9 |
DataFrame(pop,index=[2001,2002,2003])
|
0hio |
Nevada |
2001 |
1.7 |
2.4 |
2002 |
3.6 |
2.9 |
2003 |
NaN |
NaN |
frame3.index.name='year'
frame3.columns.name='state'
frame3
state |
0hio |
Nevada |
year |
|
|
2000 |
1.5 |
NaN |
2001 |
1.7 |
2.4 |
2002 |
3.6 |
2.9 |
frame3.values
array([[ 1.5, nan], [ 1.7, 2.4], [ 3.6, 2.9]])
frame2.values
array([[2000, ‘0hi0’, 1.5, nan], [2001, ‘0hio’, 1.7, -1.2], [2002, ‘0hio’, 3.6, nan], [2001, ‘Nevada’, 2.4, -1.5], [2002, ‘Nevada’, 2.9, -1.7]], dtype=object)
索引对象
obj=Series(range(3),index=['a','b','c'])
index=obj.index
index
Index([‘a’, ‘b’, ‘c’], dtype=’object’)
index[1:]
Index([‘b’, ‘c’], dtype=’object’)
index[1]='d'
————————————————————————— TypeError Traceback (most recent call last) in () 1 #Index对象是不能修改的(immutable),因此用户不能对其进行修改 —-> 2 index[1]=’d’ C:\Users\ZJL\AppData\Local\Programs\Python\Python35\lib\site-packages\pandas\indexes\base.py in __setitem__(self, key, value) 1402 1403 def __setitem__(self, key, value): -> 1404 raise TypeError(“Index does not support mutable operations”) 1405 1406 def __getitem__(self, key): TypeError: Index does not support mutable operations
index=pd.Index(np.arange(3))
obj2=Series([1.5,-2.5,0],index=index)
obj2.index is index
True
frame3
'0hio' in frame3.columns
True
2002in frame3.index
True ##重新索引
obj=Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj
d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64
obj2=obj.reindex(['a','b','c','d','e'])
obj2
a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64
obj.reindex(['a','b','c','d','e'],fill_value=0)
a -5.3 b 7.2 c 3.6 d 4.5 e 0.0 dtype: float64
obj3=Series(['blue','purple','yellow'],index=[0,2,4])
obj3.reindex(range(6),method='ffill')
0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','c'],columns=['0hio','Texas','California'])
frame
|
0hio |
Texas |
California |
a |
0 |
1 |
2 |
b |
3 |
4 |
5 |
c |
6 |
7 |
8 |
frame2=frame.reindex(['a','b','c','d'])
frame2
|
0hio |
Texas |
California |
a |
0.0 |
1.0 |
2.0 |
b |
3.0 |
4.0 |
5.0 |
c |
6.0 |
7.0 |
8.0 |
d |
NaN |
NaN |
NaN |
states=['Texas','Utah','California']
frame.reindex(columns=states)
|
Texas |
Utah |
California |
a |
1 |
NaN |
2 |
b |
4 |
NaN |
5 |
c |
7 |
NaN |
8 |
frame.reindex(index=['a','b','c','d'],method='ffill',columns=states)
|
Texas |
Utah |
California |
a |
1 |
NaN |
2 |
b |
4 |
NaN |
5 |
c |
7 |
NaN |
8 |
d |
7 |
NaN |
8 |
frame.ix[['a','b','c','d'],states]
|
Texas |
Utah |
California |
a |
1.0 |
NaN |
2.0 |
b |
4.0 |
NaN |
5.0 |
c |
7.0 |
NaN |
8.0 |
d |
NaN |
NaN |
NaN |
丢弃指定轴上的项
drop方法返回一个在指定轴上删除了指定值的新对象
obj=Series(np.arange(5.0),index=['a','b','c','d','e'])
new_obj=obj.drop('c')
new_obj
a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64
obj.drop(['d','c'])
a 0.0 b 1.0 e 4.0 dtype: float64
data=DataFrame(np.arange(16).reshape((4,4)),index=['0hio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data
|
one |
two |
three |
four |
0hio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data.drop(['Colorado','0hio'])
data.drop('two',axis=1)
|
one |
three |
four |
0hio |
0 |
2 |
3 |
Colorado |
4 |
6 |
7 |
Utah |
8 |
10 |
11 |
New York |
12 |
14 |
15 |
data.drop(['two','four'],axis=1)
|
one |
three |
0hio |
0 |
2 |
Colorado |
4 |
6 |
Utah |
8 |
10 |
New York |
12 |
14 |
索引、选取和过滤
obj=Series(np.arange(4.),index=['a','b','c','d'])
obj['b']
1.0
obj[1]
1.0
obj[2:4]
c 2.0 d 3.0 dtype: float64
obj[['b','a','d']]
b 1.0 a 0.0 d 3.0 dtype: float64
obj[[1,3]]
b 1.0 d 3.0 dtype: float64
obj[obj<2]
a 0.0 b 1.0 dtype: float64
obj['b':'c']
b 1.0 c 2.0 dtype: float64
obj['b':'c']=5
obj
a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64
data=DataFrame(np.arange(16).reshape((4,4)),index=['0hio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data
|
one |
two |
three |
four |
0hio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data['two']
0hio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int32
data[['two','three']]
|
two |
three |
0hio |
1 |
2 |
Colorado |
5 |
6 |
Utah |
9 |
10 |
New York |
13 |
14 |
data[:2]
|
one |
two |
three |
four |
0hio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
data[data['three']>5]
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data<5
|
one |
two |
three |
four |
0hio |
True |
True |
True |
True |
Colorado |
True |
False |
False |
False |
Utah |
False |
False |
False |
False |
New York |
False |
False |
False |
False |
data[data<5]=0
data
|
one |
two |
three |
four |
0hio |
0 |
0 |
0 |
0 |
Colorado |
0 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data.ix['Colorado',['two','three']]
two 5 three 6 Name: Colorado, dtype: int32
data.ix[['Colorado','Utah'],[3,0,1]]
|
four |
one |
two |
Colorado |
7 |
0 |
5 |
Utah |
11 |
8 |
9 |
data.ix[data.three>5,:3]
|
one |
two |
three |
Colorado |
0 |
5 |
6 |
Utah |
8 |
9 |
10 |
New York |
12 |
13 |
14 |
算术运算和数据对齐
s1=Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2=Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1
a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64
s2
a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64
s1+s2
a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
df1=DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['0hio','Texas','Colorado'])
df2=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','0hio','Texas','Oregon'])
df1
|
b |
c |
d |
0hio |
0.0 |
1.0 |
2.0 |
Texas |
3.0 |
4.0 |
5.0 |
Colorado |
6.0 |
7.0 |
8.0 |
df2
|
b |
d |
e |
Utah |
0.0 |
1.0 |
2.0 |
0hio |
3.0 |
4.0 |
5.0 |
Texas |
6.0 |
7.0 |
8.0 |
Oregon |
9.0 |
10.0 |
11.0 |
df1+df2
|
b |
c |
d |
e |
0hio |
3.0 |
NaN |
6.0 |
NaN |
Colorado |
NaN |
NaN |
NaN |
NaN |
Oregon |
NaN |
NaN |
NaN |
NaN |
Texas |
9.0 |
NaN |
12.0 |
NaN |
Utah |
NaN |
NaN |
NaN |
NaN |
df1=DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2=DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))
df1
|
a |
b |
c |
d |
0 |
0.0 |
1.0 |
2.0 |
3.0 |
1 |
4.0 |
5.0 |
6.0 |
7.0 |
2 |
8.0 |
9.0 |
10.0 |
11.0 |
df2
|
a |
b |
c |
d |
e |
0 |
0.0 |
1.0 |
2.0 |
3.0 |
4.0 |
1 |
5.0 |
6.0 |
7.0 |
8.0 |
9.0 |
2 |
10.0 |
11.0 |
12.0 |
13.0 |
14.0 |
3 |
15.0 |
16.0 |
17.0 |
18.0 |
19.0 |
df1+df2
|
a |
b |
c |
d |
e |
0 |
0.0 |
2.0 |
4.0 |
6.0 |
NaN |
1 |
9.0 |
11.0 |
13.0 |
15.0 |
NaN |
2 |
18.0 |
20.0 |
22.0 |
24.0 |
NaN |
3 |
NaN |
NaN |
NaN |
NaN |
NaN |
df1.add(df2,fill_value=0)
|
a |
b |
c |
d |
e |
0 |
0.0 |
2.0 |
4.0 |
6.0 |
4.0 |
1 |
9.0 |
11.0 |
13.0 |
15.0 |
9.0 |
2 |
18.0 |
20.0 |
22.0 |
24.0 |
14.0 |
3 |
15.0 |
16.0 |
17.0 |
18.0 |
19.0 |
df1.reindex(columns=df2.columns,fill_value=0)
|
a |
b |
c |
d |
e |
0 |
0.0 |
1.0 |
2.0 |
3.0 |
0 |
1 |
4.0 |
5.0 |
6.0 |
7.0 |
0 |
2 |
8.0 |
9.0 |
10.0 |
11.0 |
0 |
DataFrame和Series之间的运算
arr=np.arange(12.).reshape((3,4))
arr
array([[ 0., 1., 2., 3.], [ 4., 5., 6., 7.], [ 8., 9., 10., 11.]])
arr[0]
array([ 0., 1., 2., 3.])
arr-arr[0]
array([[ 0., 0., 0., 0.], [ 4., 4., 4., 4.], [ 8., 8., 8., 8.]])
frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','0hio','Texas','Oregon'])
frame
|
b |
d |
e |
Utah |
0.0 |
1.0 |
2.0 |
0hio |
3.0 |
4.0 |
5.0 |
Texas |
6.0 |
7.0 |
8.0 |
Oregon |
9.0 |
10.0 |
11.0 |
series=frame.ix[0]
series
b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64
frame-series
|
b |
d |
e |
Utah |
0.0 |
0.0 |
0.0 |
0hio |
3.0 |
3.0 |
3.0 |
Texas |
6.0 |
6.0 |
6.0 |
Oregon |
9.0 |
9.0 |
9.0 |
series2=Series(range(3),index=['b','e','f'])
frame+series2
|
b |
d |
e |
f |
Utah |
0.0 |
NaN |
3.0 |
NaN |
0hio |
3.0 |
NaN |
6.0 |
NaN |
Texas |
6.0 |
NaN |
9.0 |
NaN |
Oregon |
9.0 |
NaN |
12.0 |
NaN |
series3=frame['d']
frame
|
b |
d |
e |
Utah |
0.0 |
1.0 |
2.0 |
0hio |
3.0 |
4.0 |
5.0 |
Texas |
6.0 |
7.0 |
8.0 |
Oregon |
9.0 |
10.0 |
11.0 |
series3
Utah 1.0 0hio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64
frame.sub(series3,axis=0)
|
b |
d |
e |
Utah |
-1.0 |
0.0 |
1.0 |
0hio |
-1.0 |
0.0 |
1.0 |
Texas |
-1.0 |
0.0 |
1.0 |
Oregon |
-1.0 |
0.0 |
1.0 |
函数应用和映射
frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','0hio','Texas','Oregon'])
frame
|
b |
d |
e |
Utah |
-0.191031 |
-0.004688 |
-0.329970 |
0hio |
0.708249 |
0.265398 |
-2.346897 |
Texas |
1.064349 |
-1.811846 |
-0.899921 |
Oregon |
0.334061 |
-1.058506 |
0.655632 |
np.abs(frame)
|
b |
d |
e |
Utah |
0.191031 |
0.004688 |
0.329970 |
0hio |
0.708249 |
0.265398 |
2.346897 |
Texas |
1.064349 |
1.811846 |
0.899921 |
Oregon |
0.334061 |
1.058506 |
0.655632 |
f=lambda x:x.max()-x.min()
frame.apply(f)
b 1.255380 d 2.077245 e 3.002529 dtype: float64
frame.apply(f,axis=1)
Utah 0.325281 0hio 3.055145 Texas 2.876195 Oregon 1.714138 dtype: float64
def f(x):
return Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)
|
b |
d |
e |
min |
-0.191031 |
-1.811846 |
-2.346897 |
max |
1.064349 |
0.265398 |
0.655632 |
format=lambda x:'%.2f' %x
frame.applymap(format)
|
b |
d |
e |
Utah |
-0.19 |
-0.00 |
-0.33 |
0hio |
0.71 |
0.27 |
-2.35 |
Texas |
1.06 |
-1.81 |
-0.90 |
Oregon |
0.33 |
-1.06 |
0.66 |
frame['e'].map(format)
Utah -0.33 0hio -2.35 Texas -0.90 Oregon 0.66 Name: e, dtype: object ###排序和排名
obj=Series(range(4),index=['d','a','b','c'])
obj.sort_index()
a 1 b 2 c 3 d 0 dtype: int32
obj=Series(range(4),index=list('bacd'))
obj.sort_index()
a 1 b 0 c 2 d 3 dtype: int32
frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])
frame.sort_index()
|
d |
a |
b |
c |
one |
4 |
5 |
6 |
7 |
three |
0 |
1 |
2 |
3 |
frame.sort_index(axis=1)
|
a |
b |
c |
d |
three |
1 |
2 |
3 |
0 |
one |
5 |
6 |
7 |
4 |
frame.sort_index(axis=1,ascending=False)
|
d |
c |
b |
a |
three |
0 |
3 |
2 |
1 |
one |
4 |
7 |
6 |
5 |
obj=Series([4,7,-3,2])
obj.sort_values()
2 -3 3 2 0 4 1 7 dtype: int64
obj=Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()
4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64
frame=DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame
|
a |
b |
0 |
0 |
4 |
1 |
1 |
7 |
2 |
0 |
-3 |
3 |
1 |
2 |
frame.sort_values(by='b')
|
a |
b |
2 |
0 |
-3 |
3 |
1 |
2 |
0 |
0 |
4 |
1 |
1 |
7 |
frame.sort_values(by=['a','b'])
|
a |
b |
2 |
0 |
-3 |
0 |
0 |
4 |
3 |
1 |
2 |
1 |
1 |
7 |
obj=Series([7,-5,7,4,2,0,4])
obj.rank()
0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
obj.rank(method='first')
0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64
obj.rank(ascending=False,method='max')
0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64
frame=DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame
|
a |
b |
c |
0 |
0 |
4.3 |
-2.0 |
1 |
1 |
7.0 |
5.0 |
2 |
0 |
-3.0 |
8.0 |
3 |
1 |
2.0 |
-2.5 |
frame.rank(axis=1)
|
a |
b |
c |
0 |
2.0 |
3.0 |
1.0 |
1 |
1.0 |
3.0 |
2.0 |
2 |
2.0 |
1.0 |
3.0 |
3 |
2.0 |
3.0 |
1.0 |
带有重复值的轴索引
obj=Series(range(5),index=['a','a','b','b','c'])
obj
a 0 a 1 b 2 b 3 c 4 dtype: int32
obj.index.is_unique
False
obj['a']
a 0 a 1 dtype: int32
obj['c']
4
df=DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df
|
0 |
1 |
2 |
a |
-0.524361 |
-0.145395 |
-1.322196 |
a |
-0.666326 |
-0.496612 |
1.486401 |
b |
-0.395841 |
-0.921194 |
0.260437 |
b |
-0.187285 |
-0.456014 |
1.434571 |
df.ix['b']
|
0 |
1 |
2 |
b |
-0.395841 |
-0.921194 |
0.260437 |
b |
-0.187285 |
-0.456014 |
1.434571 |
汇总和计算描述统计
df=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df
|
one |
two |
a |
1.40 |
NaN |
b |
7.10 |
-4.5 |
c |
NaN |
NaN |
d |
0.75 |
-1.3 |
df.sum()
one 9.25 two -5.80 dtype: float64
df.sum(axis=1)
a 1.40 b 2.60 c NaN d -0.55 dtype: float64
df.mean(axis=1,skipna=False)
a NaN b 1.300 c NaN d -0.275 dtype: float64
df.idxmax()
one b two d dtype: object
df.cumsum()
|
one |
two |
a |
1.40 |
NaN |
b |
8.50 |
-4.5 |
c |
NaN |
NaN |
d |
9.25 |
-5.8 |
df.describe()
|
one |
two |
count |
3.000000 |
2.000000 |
mean |
3.083333 |
-2.900000 |
std |
3.493685 |
2.262742 |
min |
0.750000 |
-4.500000 |
25% |
1.075000 |
-3.700000 |
50% |
1.400000 |
-2.900000 |
75% |
4.250000 |
-2.100000 |
max |
7.100000 |
-1.300000 |
obj=Series(['a','a','b','c']*4)
obj.describe()
count 16 unique 3 top a freq 8 dtype: object ##相关系数与协方差
import pandas_datareader.data as web
all_data={}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
price=DataFrame({tic:data['Adj Close'] for tic,data in all_data.items()})
volume=DataFrame({tic:data['Volume'] for tic,data in all_data.items()})
returns=price.pct_change()
returns.tail()
|
AAPL |
GOOG |
IBM |
MSFT |
Date |
|
|
|
|
2009-12-24 |
0.034339 |
0.011117 |
0.004385 |
0.002587 |
2009-12-28 |
0.012294 |
0.007098 |
0.013326 |
0.005484 |
2009-12-29 |
-0.011861 |
-0.005571 |
-0.003477 |
0.007058 |
2009-12-30 |
0.012147 |
0.005376 |
0.005461 |
-0.013699 |
2009-12-31 |
-0.004300 |
-0.004416 |
-0.012597 |
-0.015504 |
returns.MSFT.corr(returns.IBM)
0.49597963862836764
returns.corr()
|
AAPL |
GOOG |
IBM |
MSFT |
AAPL |
1.000000 |
0.470676 |
0.410011 |
0.424305 |
GOOG |
0.470676 |
1.000000 |
0.390689 |
0.443587 |
IBM |
0.410011 |
0.390689 |
1.000000 |
0.495980 |
MSFT |
0.424305 |
0.443587 |
0.495980 |
1.000000 |
returns.cov()
|
AAPL |
GOOG |
IBM |
MSFT |
AAPL |
0.001027 |
0.000303 |
0.000252 |
0.000309 |
GOOG |
0.000303 |
0.000580 |
0.000142 |
0.000205 |
IBM |
0.000252 |
0.000142 |
0.000367 |
0.000216 |
MSFT |
0.000309 |
0.000205 |
0.000216 |
0.000516 |
returns.corrwith(returns.IBM)
AAPL 0.410011 GOOG 0.390689 IBM 1.000000 MSFT 0.495980 dtype: float64
returns.corrwith(volume)
————————————————————————— NameError Traceback (most recent call last) in () 1 #传入一个DataFrame则会计算按列名配对的相关系数。这里,计算百分比变化与成交量的相关系数 —-> 2 returns.corrwith(volume) NameError: name ‘returns’ is not defined
唯一值、值计数以及成员资格
obj=Series(['c','a','d','a','a','b','b','c','c'])
uniques=obj.unique()
uniques
array([‘c’, ‘a’, ‘d’, ‘b’], dtype=object)
obj.value_counts()
c 3 a 3 b 2 d 1 dtype: int64
pd.value_counts(obj.values,sort=False)
d 1 b 2 a 3 c 3 dtype: int64
mask=obj.isin(['b','c'])
mask
0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool
obj[mask]
0 c 5 b 6 b 7 c 8 c dtype: object
data=DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})
data
|
Qu1 |
Qu2 |
Qu3 |
0 |
1 |
2 |
1 |
1 |
3 |
3 |
5 |
2 |
4 |
1 |
2 |
3 |
3 |
2 |
4 |
4 |
4 |
3 |
4 |
result=data.apply(pd.value_counts).fillna(0)
result
|
Qu1 |
Qu2 |
Qu3 |
1 |
1.0 |
1.0 |
1.0 |
2 |
0.0 |
2.0 |
1.0 |
3 |
2.0 |
2.0 |
0.0 |
4 |
2.0 |
0.0 |
2.0 |
5 |
0.0 |
0.0 |
1.0 |
处理缺失数据
string_data=Series(['aaedvark','artichoke',np.nan,'avocado'])
string_data
0 aaedvark 1 artichoke 2 NaN 3 avocado dtype: object
string_data.isnull()
0 False 1 False 2 True 3 False dtype: bool
string_data[0]=None
string_data.isnull()
0 True 1 False 2 True 3 False dtype: bool ###滤除缺失数据
from numpy import nan as NA
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
data=Series([1,NA,3.5,NA,7])
data.dropna()
0 1.0 2 3.5 4 7.0 dtype: float64
data[data.notnull()]
0 1.0 2 3.5 4 7.0 dtype: float64
data=DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
clearned=data.dropna()
data
|
0 |
1 |
2 |
0 |
1.0 |
6.5 |
3.0 |
1 |
1.0 |
NaN |
NaN |
2 |
NaN |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
clearned
data.dropna(how='all')
|
0 |
1 |
2 |
0 |
1.0 |
6.5 |
3.0 |
1 |
1.0 |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
data[4]=NA
data
|
0 |
1 |
2 |
4 |
0 |
1.0 |
6.5 |
3.0 |
NaN |
1 |
1.0 |
NaN |
NaN |
NaN |
2 |
NaN |
NaN |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
NaN |
data.dropna(axis=1,how='all')
|
0 |
1 |
2 |
0 |
1.0 |
6.5 |
3.0 |
1 |
1.0 |
NaN |
NaN |
2 |
NaN |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
df=DataFrame(np.random.randn(7,3))
df.ix[:4,1]=NA
df.ix[:2,2]=NA
df
|
0 |
1 |
2 |
0 |
-1.637463 |
NaN |
NaN |
1 |
-1.259674 |
NaN |
NaN |
2 |
-0.284635 |
NaN |
NaN |
3 |
0.818905 |
NaN |
-1.878244 |
4 |
-2.402401 |
NaN |
-0.533942 |
5 |
-0.623351 |
-1.472599 |
-0.860614 |
6 |
-0.194565 |
-1.757851 |
-1.251312 |
df.dropna(thresh=3)
|
0 |
1 |
2 |
5 |
-0.623351 |
-1.472599 |
-0.860614 |
6 |
-0.194565 |
-1.757851 |
-1.251312 |
填充缺失数据
df.fillna(0)
|
0 |
1 |
2 |
0 |
-1.637463 |
0.000000 |
0.000000 |
1 |
-1.259674 |
0.000000 |
0.000000 |
2 |
-0.284635 |
0.000000 |
0.000000 |
3 |
0.818905 |
0.000000 |
-1.878244 |
4 |
-2.402401 |
0.000000 |
-0.533942 |
5 |
-0.623351 |
-1.472599 |
-0.860614 |
6 |
-0.194565 |
-1.757851 |
-1.251312 |
df.fillna({1:0.5,2:-1})
|
0 |
1 |
2 |
0 |
-1.637463 |
0.500000 |
-1.000000 |
1 |
-1.259674 |
0.500000 |
-1.000000 |
2 |
-0.284635 |
0.500000 |
-1.000000 |
3 |
0.818905 |
0.500000 |
-1.878244 |
4 |
-2.402401 |
0.500000 |
-0.533942 |
5 |
-0.623351 |
-1.472599 |
-0.860614 |
6 |
-0.194565 |
-1.757851 |
-1.251312 |
_=df.fillna(0,inplace=True)
df
|
0 |
1 |
2 |
0 |
-1.637463 |
0.000000 |
0.000000 |
1 |
-1.259674 |
0.000000 |
0.000000 |
2 |
-0.284635 |
0.000000 |
0.000000 |
3 |
0.818905 |
0.000000 |
-1.878244 |
4 |
-2.402401 |
0.000000 |
-0.533942 |
5 |
-0.623351 |
-1.472599 |
-0.860614 |
6 |
-0.194565 |
-1.757851 |
-1.251312 |
df=DataFrame(np.random.randn(6,3))
df.ix[2:,1]=NA
df.ix[4:,2]=NA
df
|
0 |
1 |
2 |
0 |
0.173799 |
0.267422 |
0.480141 |
1 |
1.303258 |
-0.429756 |
-0.790661 |
2 |
-0.110613 |
NaN |
0.878062 |
3 |
1.188953 |
NaN |
-0.125561 |
4 |
-0.512800 |
NaN |
NaN |
5 |
-0.383978 |
NaN |
NaN |
df.fillna(method='ffill')
|
0 |
1 |
2 |
0 |
0.173799 |
0.267422 |
0.480141 |
1 |
1.303258 |
-0.429756 |
-0.790661 |
2 |
-0.110613 |
-0.429756 |
0.878062 |
3 |
1.188953 |
-0.429756 |
-0.125561 |
4 |
-0.512800 |
-0.429756 |
-0.125561 |
5 |
-0.383978 |
-0.429756 |
-0.125561 |
df.fillna(method='ffill',limit=2)
|
0 |
1 |
2 |
0 |
0.173799 |
0.267422 |
0.480141 |
1 |
1.303258 |
-0.429756 |
-0.790661 |
2 |
-0.110613 |
-0.429756 |
0.878062 |
3 |
1.188953 |
-0.429756 |
-0.125561 |
4 |
-0.512800 |
NaN |
-0.125561 |
5 |
-0.383978 |
NaN |
-0.125561 |
data=Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())
0 1.000000 1 3.833333 2 3.500000 3 3.833333 4 7.000000 dtype: float64
层次化索引
data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data
a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64
data.index
MultiIndex(levels=[[‘a’, ‘b’, ‘c’, ‘d’], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
data['b']
1 0.501535 2 1.547647 3 -0.038850 dtype: float64
data['b':'c']
b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64
data.ix[['b','c']]
b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64
data[:,2]
a 0.276982 b 1.547647 c -0.905470 d -1.697117 dtype: float64
data.unstack()
|
1 |
2 |
3 |
a |
-2.059265 |
0.276982 |
-1.771092 |
b |
0.501535 |
1.547647 |
-0.038850 |
c |
1.963156 |
-0.905470 |
NaN |
d |
NaN |
-1.697117 |
-0.659792 |
data.unstack().stack()
a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64
frame=DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],
columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame
|
|
Ohio |
Colorado |
|
|
Green |
Red |
Green |
a |
1 |
0 |
1 |
2 |
2 |
3 |
4 |
5 |
b |
1 |
6 |
7 |
8 |
2 |
9 |
10 |
11 |
frame.index.names=['key1','key2']
frame.columns.names=['state','color']
frame
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key1 |
key2 |
|
|
|
a |
1 |
0 |
1 |
2 |
2 |
3 |
4 |
5 |
b |
1 |
6 |
7 |
8 |
2 |
9 |
10 |
11 |
frame['Ohio']
|
color |
Green |
Red |
key1 |
key2 |
|
|
a |
1 |
0 |
1 |
2 |
3 |
4 |
b |
1 |
6 |
7 |
2 |
9 |
10 |
重排分级顺序
frame.swaplevel('key1','key2')
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key2 |
key1 |
|
|
|
1 |
a |
0 |
1 |
2 |
2 |
a |
3 |
4 |
5 |
1 |
b |
6 |
7 |
8 |
2 |
b |
9 |
10 |
11 |
frame.sortlevel(1)
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key1 |
key2 |
|
|
|
a |
1 |
0 |
1 |
2 |
b |
1 |
6 |
7 |
8 |
a |
2 |
3 |
4 |
5 |
b |
2 |
9 |
10 |
11 |
frame.swaplevel(0,1).sortlevel(0)
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key2 |
key1 |
|
|
|
1 |
a |
0 |
1 |
2 |
b |
6 |
7 |
8 |
2 |
a |
3 |
4 |
5 |
b |
9 |
10 |
11 |
根据级别汇总统计
frame.sum(level='key2')
state |
Ohio |
Colorado |
color |
Green |
Red |
Green |
key2 |
|
|
|
1 |
6 |
8 |
10 |
2 |
12 |
14 |
16 |
frame.sum(level='color',axis=1)
|
color |
Green |
Red |
key1 |
key2 |
|
|
a |
1 |
2 |
1 |
2 |
8 |
4 |
b |
1 |
14 |
7 |
2 |
20 |
10 |
使用DataFrame的列
frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],
'd':[0,1,2,0,1,2,3]})
frame
|
a |
b |
c |
d |
0 |
0 |
7 |
one |
0 |
1 |
1 |
6 |
one |
1 |
2 |
2 |
5 |
one |
2 |
3 |
3 |
4 |
two |
0 |
4 |
4 |
3 |
two |
1 |
5 |
5 |
2 |
two |
2 |
6 |
6 |
1 |
two |
3 |
frame2=frame.set_index(['c','d'])
frame2
|
|
a |
b |
c |
d |
|
|
one |
0 |
0 |
7 |
1 |
1 |
6 |
2 |
2 |
5 |
two |
0 |
3 |
4 |
1 |
4 |
3 |
2 |
5 |
2 |
3 |
6 |
1 |
frame.set_index(['c','d'],drop=False)
|
|
a |
b |
c |
d |
c |
d |
|
|
|
|
one |
0 |
0 |
7 |
one |
0 |
1 |
1 |
6 |
one |
1 |
2 |
2 |
5 |
one |
2 |
two |
0 |
3 |
4 |
two |
0 |
1 |
4 |
3 |
two |
1 |
2 |
5 |
2 |
two |
2 |
3 |
6 |
1 |
two |
3 |
frame2.reset_index()
|
c |
d |
a |
b |
0 |
one |
0 |
0 |
7 |
1 |
one |
1 |
1 |
6 |
2 |
one |
2 |
2 |
5 |
3 |
two |
0 |
3 |
4 |
4 |
two |
1 |
4 |
3 |
5 |
two |
2 |
5 |
2 |
6 |
two |
3 |
6 |
1 |
其他有关pandas的话题
整数索引
ser=Series(np.arange(3.))
ser
0 0.0
1 1.0
2 2.0
dtype: float64
ser[1]
1.0
ser
ser2=Series(np.arange(3.),index=['a','b','c'])
ser2[-1]
2.0
ser.ix[:1]
0 0.0
1 1.0
dtype: float64
ser3=Series(range(3),index=[-5,1,3])
ser3.iloc[2]
2
frame=DataFrame(np.arange(6).reshape(3,2),index=[2,0,1])
frame.iloc[0]
0 0
1 1
Name: 2, dtype: int32
面板数据
from pandas_datareader import data as web
pdata=pd.Panel(dict((stk,web.get_data_yahoo(stk,'1/1/2009','6/1/2012') ) for stk in ['AAPL','GOOG','MSFT','DELL']))
pdata