pandas的数据结构介绍
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
Series
(索引在左边,值在右边。可看作是一个定长的有序字典)
obj = Series([4,7,-5,3])
obj
0 4
1 7
2 -5
3 3
dtype: int64
obj.values
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = Series([4,7,-5,3],index=['d','b','a','c'])
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
obj2[obj2 > 0]
obj2 * 2
np.exp(obj2)
'b' in obj2
'e' in obj2
False
sdata = {'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
obj3 = Series(sdata)
states = ['California','Ohio','Oregon','Texas']
obj4 = Series(sdata,index=states)
obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
pd.isnull(obj4)
pd.notnull(obj4)
California False
Ohio True
Oregon True
Texas True
dtype: bool
obj3 + obj4
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
obj.index = ['Bob','Steve','Jeff','Ryan']
obj
Bob 4
Steve 7
Jeff -5
Ryan 3
dtype: int64
DataFrame
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
'year':[2000,2001,2002,2001,2002],
'pop':[1.5,1.7,3.6,2.4,2.9]}
frame = DataFrame(data)
frame
|
state |
year |
pop |
0 |
Ohio |
2000 |
1.5 |
1 |
Ohio |
2001 |
1.7 |
2 |
Ohio |
2002 |
3.6 |
3 |
Nevada |
2001 |
2.4 |
4 |
Nevada |
2002 |
2.9 |
DataFrame(data,columns=['year','state','pop'])
|
year |
state |
pop |
0 |
2000 |
Ohio |
1.5 |
1 |
2001 |
Ohio |
1.7 |
2 |
2002 |
Ohio |
3.6 |
3 |
2001 |
Nevada |
2.4 |
4 |
2002 |
Nevada |
2.9 |
frame2 = DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three','four','five'])
frame2
|
year |
state |
pop |
debt |
one |
2000 |
Ohio |
1.5 |
NaN |
two |
2001 |
Ohio |
1.7 |
NaN |
three |
2002 |
Ohio |
3.6 |
NaN |
four |
2001 |
Nevada |
2.4 |
NaN |
five |
2002 |
Nevada |
2.9 |
NaN |
frame2['state']
frame2.year
frame2.loc['three']
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
frame2['debt'] = 16.5
frame2['debt'] = np.arange(5.)
frame2
|
year |
state |
pop |
debt |
one |
2000 |
Ohio |
1.5 |
0.0 |
two |
2001 |
Ohio |
1.7 |
1.0 |
three |
2002 |
Ohio |
3.6 |
2.0 |
four |
2001 |
Nevada |
2.4 |
3.0 |
five |
2002 |
Nevada |
2.9 |
4.0 |
val = Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt'] = val
frame2
|
year |
state |
pop |
debt |
one |
2000 |
Ohio |
1.5 |
NaN |
two |
2001 |
Ohio |
1.7 |
-1.2 |
three |
2002 |
Ohio |
3.6 |
NaN |
four |
2001 |
Nevada |
2.4 |
-1.5 |
five |
2002 |
Nevada |
2.9 |
-1.7 |
frame2['eastern'] = frame2.state == 'Ohio'
frame2
|
year |
state |
pop |
debt |
eastern |
one |
2000 |
Ohio |
1.5 |
NaN |
True |
two |
2001 |
Ohio |
1.7 |
-1.2 |
True |
three |
2002 |
Ohio |
3.6 |
NaN |
True |
four |
2001 |
Nevada |
2.4 |
-1.5 |
False |
five |
2002 |
Nevada |
2.9 |
-1.7 |
False |
del frame2['eastern']
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
pop = {'Nevada':{2001:2.4,2002:2.9},'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
frame3 = DataFrame(pop)
frame3.T
|
2001 |
2002 |
2000 |
Nevada |
2.4 |
2.9 |
NaN |
Ohio |
1.7 |
3.6 |
1.5 |
#有个知识点
frame3.index.name = 'year';frame3.columns.name = 'state'
frame3.values
frame2.values
array([[2000, 'Ohio', 1.5, nan],
[2001, 'Ohio', 1.7, -1.2],
[2002, 'Ohio', 3.6, nan],
[2001, 'Nevada', 2.4, -1.5],
[2002, 'Nevada', 2.9, -1.7]], dtype=object)
索引对象
Index对象是不可修改的
obj =Series(range(3),index=['a','b','c'])
index = obj.index
index[1:]
Index(['b', 'c'], dtype='object')
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0],index=index)
obj2.index is index
True
#又有个知识点
'Ohio' in frame3.columns
2003 in frame3.index
False
#有个知识点
重新索引
obj = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
obj.reindex(['a','b','c','d','e'],fill_value=0)
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
obj3 = Series(['blue','purple','yellow'],index=[0,2,4])
obj3.reindex(range(6),method='ffill')
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
frame = DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Ohio','Texas','California'])
frame2 = frame.reindex(['a','b','c','d'])
frame2
|
Ohio |
Texas |
California |
a |
0.0 |
1.0 |
2.0 |
b |
NaN |
NaN |
NaN |
c |
3.0 |
4.0 |
5.0 |
d |
6.0 |
7.0 |
8.0 |
states = ['Texas','Utah','California']
frame.reindex(columns=states)
|
Texas |
Utah |
California |
a |
1 |
NaN |
2 |
c |
4 |
NaN |
5 |
d |
7 |
NaN |
8 |
frame.reindex(index=['a','b','c','d'],columns=states).ffill()
|
Texas |
Utah |
California |
a |
1.0 |
NaN |
2.0 |
b |
1.0 |
NaN |
2.0 |
c |
4.0 |
NaN |
5.0 |
d |
7.0 |
NaN |
8.0 |
frame = frame.reindex(['a','b','c','d'])
frame = frame.reindex(columns=states)
frame.loc[['a','b','c','d'],states]
|
Texas |
Utah |
California |
a |
1.0 |
NaN |
2.0 |
b |
NaN |
NaN |
NaN |
c |
4.0 |
NaN |
5.0 |
d |
7.0 |
NaN |
8.0 |
#又又又有个知识点
丢弃指定轴上的项
obj = Series(np.arange(5.),index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
data =DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data.drop(['Colorado','Ohio'])
|
one |
two |
three |
four |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data.drop('two',axis=1)
|
one |
three |
four |
Ohio |
0 |
2 |
3 |
Colorado |
4 |
6 |
7 |
Utah |
8 |
10 |
11 |
New York |
12 |
14 |
15 |
索引、选取和过滤
obj = Series(np.arange(4.),index=['a','b','c','d'])
obj['b']
obj[1]
1.0
obj[obj<2]
a 0.0
b 1.0
dtype: float64
obj['b':'c']
b 1.0
c 2.0
dtype: float64
data = DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data[['three','one']]
|
three |
one |
Ohio |
2 |
0 |
Colorado |
6 |
4 |
Utah |
10 |
8 |
New York |
14 |
12 |
data[:2]
|
one |
two |
three |
four |
Ohio |
0 |
1 |
2 |
3 |
Colorado |
4 |
5 |
6 |
7 |
data[data['three']>5]
|
one |
two |
three |
four |
Colorado |
4 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data[data<5] = 0
data
|
one |
two |
three |
four |
Ohio |
0 |
0 |
0 |
0 |
Colorado |
0 |
5 |
6 |
7 |
Utah |
8 |
9 |
10 |
11 |
New York |
12 |
13 |
14 |
15 |
data.loc['Colorado',['two','three']]
two 5
three 6
Name: Colorado, dtype: int32
data.loc[data.three>5,:'three']
|
one |
two |
three |
Colorado |
0 |
5 |
6 |
Utah |
8 |
9 |
10 |
New York |
12 |
13 |
14 |
#又又又又有个知识点
算术运算和数据对齐
s1 = Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1+s2
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
df1+df2
|
b |
c |
d |
e |
Colorado |
NaN |
NaN |
NaN |
NaN |
Ohio |
3.0 |
NaN |
6.0 |
NaN |
Oregon |
NaN |
NaN |
NaN |
NaN |
Texas |
9.0 |
NaN |
12.0 |
NaN |
Utah |
NaN |
NaN |
NaN |
NaN |
在算术方法中填充值
add 加法
sub 减法
div 除法
mul 乘法
df1 = DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))
df1.add(df2,fill_value=0)
df1.reindex(columns=df2.columns,fill_value=0)
|
a |
b |
c |
d |
e |
0 |
0.0 |
1.0 |
2.0 |
3.0 |
0 |
1 |
4.0 |
5.0 |
6.0 |
7.0 |
0 |
2 |
8.0 |
9.0 |
10.0 |
11.0 |
0 |
DataFrame和Series之间的运算
arr = np.arange(12.).reshape((3,4))
arr-arr[0]
array([[0., 0., 0., 0.],
[4., 4., 4., 4.],
[8., 8., 8., 8.]])
frame = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
series = frame.loc['Utah']
frame-series
|
b |
d |
e |
Utah |
0.0 |
0.0 |
0.0 |
Ohio |
3.0 |
3.0 |
3.0 |
Texas |
6.0 |
6.0 |
6.0 |
Oregon |
9.0 |
9.0 |
9.0 |
series2 = Series(range(3),index=['b','e','f'])
frame+series2
|
b |
d |
e |
f |
Utah |
0.0 |
NaN |
3.0 |
NaN |
Ohio |
3.0 |
NaN |
6.0 |
NaN |
Texas |
6.0 |
NaN |
9.0 |
NaN |
Oregon |
9.0 |
NaN |
12.0 |
NaN |
series3 = frame['d']
frame.sub(series3,axis=0)
|
b |
d |
e |
Utah |
-1.0 |
0.0 |
1.0 |
Ohio |
-1.0 |
0.0 |
1.0 |
Texas |
-1.0 |
0.0 |
1.0 |
Oregon |
-1.0 |
0.0 |
1.0 |
函数应用和映射
frame = DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
np.abs(frame)
f = lambda x:x.max()-x.min()
frame.apply(f)
frame.apply(f,axis=1)
Utah 2.341364
Ohio 1.889609
Texas 1.479165
Oregon 1.122892
dtype: float64
def f(x):
return Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)
|
b |
d |
e |
min |
-2.121313 |
-0.231704 |
-0.309807 |
max |
1.255592 |
0.670467 |
1.786023 |
format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)
Utah 1.79
Ohio -0.31
Texas 1.42
Oregon 0.13
Name: e, dtype: object
排序和排名
obj = Series(range(4),index=['d','a','b','c'])
obj.sort_index()
a 1
b 2
c 3
d 0
dtype: int64
frame = DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])
frame.sort_index()
|
d |
a |
b |
c |
one |
4 |
5 |
6 |
7 |
three |
0 |
1 |
2 |
3 |
frame.sort_index(axis=1)
|
a |
b |
c |
d |
three |
1 |
2 |
3 |
0 |
one |
5 |
6 |
7 |
4 |
frame.sort_index(axis=1,ascending=False)
|
d |
c |
b |
a |
three |
0 |
3 |
2 |
1 |
one |
4 |
7 |
6 |
5 |
obj = Series([4,7,-3,2])
obj.sort_values()
2 -3
3 2
0 4
1 7
dtype: int64
obj = Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()
4 -3.0
5 2.0
0 4.0
2 7.0
1 NaN
3 NaN
dtype: float64
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame.sort_values(by='b')
|
b |
a |
2 |
-3 |
0 |
3 |
2 |
1 |
0 |
4 |
0 |
1 |
7 |
1 |
frame.sort_values(by=['a','b'])
|
b |
a |
2 |
-3 |
0 |
0 |
4 |
0 |
3 |
2 |
1 |
1 |
7 |
1 |
obj = Series([7,-5,7,4,2,0,4])
obj.rank()
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
obj.rank(method='first')
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
obj.rank(ascending=False,method='max')
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64
#又又又又又有个知识点
带有重复值的轴索引
obj = Series(range(5),index=['a','a','b','b','c'])
obj.index.is_unique
False
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df.loc['b']
|
0 |
1 |
2 |
b |
-0.994941 |
0.304769 |
0.930754 |
b |
0.918218 |
0.577393 |
2.664499 |
汇总和计算描述统计
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df.sum()
df.sum(axis=1)
a NaN
b 2.60
c NaN
d -0.55
dtype: float64
df.mean(axis=1,skipna=False)
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
#又又又又又又有个知识点
df.idxmax()
one b
two d
dtype: object
df.cumsum()
|
one |
two |
a |
1.40 |
NaN |
b |
8.50 |
-4.5 |
c |
NaN |
NaN |
d |
9.25 |
-5.8 |
df.describe()
|
one |
two |
count |
3.000000 |
2.000000 |
mean |
3.083333 |
-2.900000 |
std |
3.493685 |
2.262742 |
min |
0.750000 |
-4.500000 |
25% |
1.075000 |
-3.700000 |
50% |
1.400000 |
-2.900000 |
75% |
4.250000 |
-2.100000 |
max |
7.100000 |
-1.300000 |
obj = Series(['a','a','b','c']*4)
obj.describe()
count 16
unique 3
top a
freq 8
dtype: object
#又又又又又又又有个知识点
相关系数与协方差
import pandas_datareader.data as web
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
all_data[ticker] = web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
price = DataFrame({tic:data['Adj Close'] for tic,data in all_data.iteritems()})
volume = DataFrame({tic:data['Volume'] for tic,data in all_data.iteritems()})
报错。
price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
volume = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_volume.pkl')
price.head()
|
AAPL |
GOOG |
IBM |
MSFT |
Date |
|
|
|
|
2010-01-04 |
27.990226 |
313.062468 |
113.304536 |
25.884104 |
2010-01-05 |
28.038618 |
311.683844 |
111.935822 |
25.892466 |
2010-01-06 |
27.592626 |
303.826685 |
111.208683 |
25.733566 |
2010-01-07 |
27.541619 |
296.753749 |
110.823732 |
25.465944 |
2010-01-08 |
27.724725 |
300.709808 |
111.935822 |
25.641571 |
volume.head()
|
AAPL |
GOOG |
IBM |
MSFT |
Date |
|
|
|
|
2010-01-04 |
123432400 |
3927000 |
6155300 |
38409100 |
2010-01-05 |
150476200 |
6031900 |
6841400 |
49749600 |
2010-01-06 |
138040000 |
7987100 |
5605300 |
58182400 |
2010-01-07 |
119282800 |
12876600 |
5840600 |
50559700 |
2010-01-08 |
111902700 |
9483900 |
4197200 |
51197400 |
returns = price.pct_change()
returns.tail()
|
AAPL |
GOOG |
IBM |
MSFT |
Date |
|
|
|
|
2016-10-17 |
-0.000680 |
0.001837 |
0.002072 |
-0.003483 |
2016-10-18 |
-0.000681 |
0.019616 |
-0.026168 |
0.007690 |
2016-10-19 |
-0.002979 |
0.007846 |
0.003583 |
-0.002255 |
2016-10-20 |
-0.000512 |
-0.005652 |
0.001719 |
-0.004867 |
2016-10-21 |
-0.003930 |
0.003011 |
-0.012474 |
0.042096 |
returns.MSFT.corr(returns.IBM)
0.49976361144151155
returns.MSFT.cov(returns.IBM)
8.870655479703546e-05
returns.corr()
|
AAPL |
GOOG |
IBM |
MSFT |
AAPL |
1.000000 |
0.407919 |
0.386817 |
0.389695 |
GOOG |
0.407919 |
1.000000 |
0.405099 |
0.465919 |
IBM |
0.386817 |
0.405099 |
1.000000 |
0.499764 |
MSFT |
0.389695 |
0.465919 |
0.499764 |
1.000000 |
returns.cov()
|
AAPL |
GOOG |
IBM |
MSFT |
AAPL |
0.000277 |
0.000107 |
0.000078 |
0.000095 |
GOOG |
0.000107 |
0.000251 |
0.000078 |
0.000108 |
IBM |
0.000078 |
0.000078 |
0.000146 |
0.000089 |
MSFT |
0.000095 |
0.000108 |
0.000089 |
0.000215 |
returns.corrwith(returns.IBM)
AAPL 0.386817
GOOG 0.405099
IBM 1.000000
MSFT 0.499764
dtype: float64
returns.corrwith(volume)
AAPL -0.075565
GOOG -0.007067
IBM -0.204849
MSFT -0.092950
dtype: float64
唯一值、值计数以及成员资格
isin 计算一个表示“Series各值是否包含于传入的值序列中”的布尔型数组;
unique 计算Series中的唯一值数组,按发现的顺序返回;
value_counts 返回一个Series,其索引为唯一值,其值为频率,按计数值降序排列
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts()
c 3
a 3
b 2
d 1
dtype: int64
pd.value_counts(obj.values,sort=False)
c 3
a 3
d 1
b 2
dtype: int64
mask = obj.isin(['b','c'])
obj[mask]
0 c
5 b
6 b
7 c
8 c
dtype: object
data = DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})
result = data.apply(pd.value_counts).fillna(0)
result
|
Qu1 |
Qu2 |
Qu3 |
1 |
1.0 |
1.0 |
1.0 |
2 |
0.0 |
2.0 |
1.0 |
3 |
2.0 |
2.0 |
0.0 |
4 |
2.0 |
0.0 |
2.0 |
5 |
0.0 |
0.0 |
1.0 |
处理缺失数据
pandas对象上的所有描述统计都排除了缺失数据
string_data = Series(['aardvark','artichoke',np.nan,'avocado'])
string_data.isnull()
0 False
1 False
2 True
3 False
dtype: bool
string_data[0] = None
string_data.isnull()
0 True
1 False
2 True
3 False
dtype: bool
#又又又又又又又又有个知识点
滤除缺失值
from numpy import nan as NA
data = Series([1,NA,3.5,NA,7])
data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
data[data.notnull()]
0 1.0
2 3.5
4 7.0
dtype: float64
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
cleaned = data.dropna()
cleaned
data.dropna(how='all')
|
0 |
1 |
2 |
0 |
1.0 |
6.5 |
3.0 |
1 |
1.0 |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
data[4] = NA
data.dropna(axis=1,how='all')
|
0 |
1 |
2 |
0 |
1.0 |
6.5 |
3.0 |
1 |
1.0 |
NaN |
NaN |
2 |
NaN |
NaN |
NaN |
3 |
NaN |
6.5 |
3.0 |
df = DataFrame(np.random.randn(7,3))
df.loc[:4,1] = NA;df.loc[:2,2] = NA
df.dropna(thresh=3)
|
0 |
1 |
2 |
5 |
-1.991372 |
-1.644575 |
0.675400 |
6 |
1.718451 |
0.312742 |
-1.484959 |
填充缺失数据
df.fillna(0)
|
0 |
1 |
2 |
0 |
-0.292230 |
0.000000 |
0.000000 |
1 |
0.129826 |
0.000000 |
0.000000 |
2 |
-0.075307 |
0.000000 |
0.000000 |
3 |
0.280476 |
0.000000 |
-1.259970 |
4 |
-1.171738 |
0.000000 |
0.206481 |
5 |
-1.991372 |
-1.644575 |
0.675400 |
6 |
1.718451 |
0.312742 |
-1.484959 |
df.fillna({1:0.5,3:-1})
|
0 |
1 |
2 |
0 |
-0.292230 |
0.500000 |
NaN |
1 |
0.129826 |
0.500000 |
NaN |
2 |
-0.075307 |
0.500000 |
NaN |
3 |
0.280476 |
0.500000 |
-1.259970 |
4 |
-1.171738 |
0.500000 |
0.206481 |
5 |
-1.991372 |
-1.644575 |
0.675400 |
6 |
1.718451 |
0.312742 |
-1.484959 |
_ = df.fillna(0,inplace=True)
df
|
0 |
1 |
2 |
0 |
-0.292230 |
0.000000 |
0.000000 |
1 |
0.129826 |
0.000000 |
0.000000 |
2 |
-0.075307 |
0.000000 |
0.000000 |
3 |
0.280476 |
0.000000 |
-1.259970 |
4 |
-1.171738 |
0.000000 |
0.206481 |
5 |
-1.991372 |
-1.644575 |
0.675400 |
6 |
1.718451 |
0.312742 |
-1.484959 |
df = DataFrame(np.random.randn(6,3))
df.loc[2:,1] = NA;df.loc[4:,2] = NA
df.fillna(method='ffill',limit=2)
|
0 |
1 |
2 |
0 |
-0.814015 |
-1.672914 |
-0.437364 |
1 |
0.294209 |
0.038563 |
-0.141332 |
2 |
-0.337091 |
0.038563 |
-0.041438 |
3 |
0.698458 |
0.038563 |
-0.750640 |
4 |
-0.369432 |
NaN |
-0.750640 |
5 |
-0.437763 |
NaN |
-0.750640 |
data = Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000
dtype: float64
#又又又又又又又又又有个知识点
层次化索引
能使一个轴上拥有多个索引级别,能以低维度形式处理高维度数据
data = Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data
a 1 0.861096
2 0.613551
3 1.130427
b 1 -0.210724
2 0.962846
3 0.393051
c 1 -0.774183
2 0.456655
d 2 -0.824490
3 0.908530
dtype: float64
data.index
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 2),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
data['b']
1 -0.210724
2 0.962846
3 0.393051
dtype: float64
data.loc[['b','d']]
b 1 -0.210724
2 0.962846
3 0.393051
d 2 -0.824490
3 0.908530
dtype: float64
data[:,2]
a 0.613551
b 0.962846
c 0.456655
d -0.824490
dtype: float64
data.unstack()
|
1 |
2 |
3 |
a |
0.861096 |
0.613551 |
1.130427 |
b |
-0.210724 |
0.962846 |
0.393051 |
c |
-0.774183 |
0.456655 |
NaN |
d |
NaN |
-0.824490 |
0.908530 |
data.unstack().stack()
a 1 0.861096
2 0.613551
3 1.130427
b 1 -0.210724
2 0.962846
3 0.393051
c 1 -0.774183
2 0.456655
d 2 -0.824490
3 0.908530
dtype: float64
frame = DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key1 |
key2 |
|
|
|
a |
1 |
0 |
1 |
2 |
2 |
3 |
4 |
5 |
b |
1 |
6 |
7 |
8 |
2 |
9 |
10 |
11 |
frame['Ohio']
|
color |
Green |
Red |
key1 |
key2 |
|
|
a |
1 |
0 |
1 |
2 |
3 |
4 |
b |
1 |
6 |
7 |
2 |
9 |
10 |
MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color'])
大概这个意思
重排分级顺序
frame.swaplevel('key1','key2')
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key2 |
key1 |
|
|
|
1 |
a |
0 |
1 |
2 |
2 |
a |
3 |
4 |
5 |
1 |
b |
6 |
7 |
8 |
2 |
b |
9 |
10 |
11 |
frame.sort_values(axis=0,by='key2')
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key1 |
key2 |
|
|
|
a |
1 |
0 |
1 |
2 |
b |
1 |
6 |
7 |
8 |
a |
2 |
3 |
4 |
5 |
b |
2 |
9 |
10 |
11 |
frame.swaplevel(0,1).sort_index()
|
state |
Ohio |
Colorado |
|
color |
Green |
Red |
Green |
key2 |
key1 |
|
|
|
1 |
a |
0 |
1 |
2 |
b |
6 |
7 |
8 |
2 |
a |
3 |
4 |
5 |
b |
9 |
10 |
11 |
在层次化索引的对象上,如果索引是按字典方式从外到内排序(即调用sort_index),数据选取操作的性能要好很多。
根据级别汇总统计
frame.groupby(level='key2').sum()
state |
Ohio |
Colorado |
color |
Green |
Red |
Green |
key2 |
|
|
|
1 |
6 |
8 |
10 |
2 |
12 |
14 |
16 |
frame.groupby(level='color',axis=1).sum()
|
color |
Green |
Red |
key1 |
key2 |
|
|
a |
1 |
2 |
1 |
2 |
8 |
4 |
b |
1 |
14 |
7 |
2 |
20 |
10 |
使用DataFrame的列
frame = DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
frame2 = frame.set_index(['c','d'])
frame2
|
|
a |
b |
c |
d |
|
|
one |
0 |
0 |
7 |
1 |
1 |
6 |
2 |
2 |
5 |
two |
0 |
3 |
4 |
1 |
4 |
3 |
2 |
5 |
2 |
3 |
6 |
1 |
frame.set_index(['c','d'],drop=False)
frame2.reset_index()
|
c |
d |
a |
b |
0 |
one |
0 |
0 |
7 |
1 |
one |
1 |
1 |
6 |
2 |
one |
2 |
2 |
5 |
3 |
two |
0 |
3 |
4 |
4 |
two |
1 |
4 |
3 |
5 |
two |
2 |
5 |
2 |
6 |
two |
3 |
6 |
1 |
其他有关pandas的话题
整数索引
ser = Series(np.arange(3.))
ser2 = Series(np.arange(3.),index=['a','b','c'])
ser2[-1]
2.0
ser.loc[:1]
0 0.0
1 1.0
dtype: float64
ser3 = Series(range(3),index=[-5,1,3])
ser3.iat[2]
2
frame = DataFrame(np.arange(6).reshape(3,2),index=[2,0,1])
frame.iloc[0]
frame.iloc[0,:]
0 0
1 1
Name: 2, dtype: int32
面板数据
可以用一个由DataFrame对象组成的字典或一个三维ndarray来创建Panel对象
price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
type(price)
pandas.core.frame.DataFrame
price.loc['6/1/2012']
AAPL 73.371509
GOOG 285.205295
IBM 168.989059
MSFT 25.262972
Name: 2012-06-01 00:00:00, dtype: float64
stacked=price.loc['6/1/2012'].to_frame()
stacked
|
2012-06-01 |
AAPL |
73.371509 |
GOOG |
285.205295 |
IBM |
168.989059 |
MSFT |
25.262972 |
pd.Panel(stacked)
AttributeError: module 'pandas' has no attribute 'Panel'