The groupby facility in pandas computes group statistics and generates pivot tables, allowing a dataset to be sliced, diced, and summarized flexibly.
GroupBy mechanics
The core of the paradigm is "split-apply-combine".
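To make the three steps concrete before turning to real examples, here is a hand-rolled group mean (an illustrative sketch only; the toy frame and variable names are made up, and groupby does all of this for you):

import pandas as pd

toy = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1.0, 3.0, 5.0]})
# split: one sub-DataFrame per distinct key value
pieces = {k: toy[toy['key'] == k] for k in toy['key'].unique()}
# apply: reduce each piece to a scalar
applied = {k: piece['val'].mean() for k, piece in pieces.items()}
# combine: glue the per-group results back into a pandas object
result = pd.Series(applied)  # same as toy.groupby('key')['val'].mean()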
import numpy as np
from pandas import DataFrame,Series
df=DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df
      data1     data2 key1 key2
0  1.160760  0.360555    a  one
1 -0.992606 -0.120562    a  two
2 -0.616727  0.856179    b  one
3 -1.921879 -0.690846    b  two
4 -0.458540 -0.093610    a  one
grouped=df['data1'].groupby(df['key1'])
grouped
grouped.mean()
key1
a   -0.096796
b   -1.269303
Name: data1, dtype: float64
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means
key1  key2
a     one     0.351110
      two    -0.992606
b     one    -0.616727
      two    -1.921879
Name: data1, dtype: float64
means.unstack()
key2       one       two
key1                    
a     0.351110 -0.992606
b    -0.616727 -1.921879
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
California  2005   -0.992606
            2006   -0.616727
Ohio        2005   -0.380560
            2006   -0.458540
Name: data1, dtype: float64
df.groupby(['key1']).mean()
         data1     data2
key1                    
a    -0.096796  0.048794
b    -1.269303  0.082666
df.groupby(['key1','key2']).mean()
              data1     data2
key1 key2                    
a    one   0.351110  0.133473
     two  -0.992606 -0.120562
b    one  -0.616727  0.856179
     two  -1.921879 -0.690846
df.groupby(['key1','key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64
Iterating over groups
for name,group in df.groupby('key1'):
    print(name)
    print(group)
a
      data1     data2 key1 key2
0  1.160760  0.360555    a  one
1 -0.992606 -0.120562    a  two
4 -0.458540 -0.093610    a  one
b
      data1     data2 key1 key2
2 -0.616727  0.856179    b  one
3 -1.921879 -0.690846    b  two
for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)
a one
     data1     data2 key1 key2
0  1.16076  0.360555    a  one
4 -0.45854 -0.093610    a  one
a two
      data1     data2 key1 key2
1 -0.992606 -0.120562    a  two
b one
      data1     data2 key1 key2
2 -0.616727  0.856179    b  one
b two
      data1     data2 key1 key2
3 -1.921879 -0.690846    b  two
pieces=dict(list(df.groupby('key1')))
pieces['b']
      data1     data2 key1 key2
2 -0.616727  0.856179    b  one
3 -1.921879 -0.690846    b  two
df.dtypes
data1    float64
data2    float64
key1      object
key2      object
dtype: object
grouped=df.groupby(df.dtypes,axis=1)
dict(list(grouped))
{dtype('float64'):       data1     data2
 0  1.160760  0.360555
 1 -0.992606 -0.120562
 2 -0.616727  0.856179
 3 -1.921879 -0.690846
 4 -0.458540 -0.093610, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}
Selecting a column or subset of columns
df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
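The practical difference between the scalar and list forms is the type of the object returned, which in turn determines the shape of the aggregated result (a quick illustrative check; the variable names are made up):

g_series = df.groupby('key1')['data1']    # SeriesGroupBy: aggregation yields a Series
g_frame = df.groupby('key1')[['data2']]   # DataFrameGroupBy: aggregation yields a DataFrame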
df.groupby(['key1','key2'])[['data2']].mean()
              data2
key1 key2          
a    one   0.133473
     two  -0.120562
b    one   0.856179
     two  -0.690846
s_grouped=df.groupby(['key1','key2'])['data2']
s_grouped
s_grouped.mean()
key1  key2
a     one     0.133473
      two    -0.120562
b     one     0.856179
      two    -0.690846
Name: data2, dtype: float64
Grouping with dicts and Series
Grouping information can also exist in forms other than arrays.
people=DataFrame(np.random.randn(5,5),
columns=['a','b','c','d','e'],
index=['Joe','Steve','Wes','Jim','Travis'])
people.iloc[2:3, [1, 2]] = np.nan  # .ix is deprecated; row 2 is 'Wes', columns 1 and 2 are 'b' and 'c'
people
               a         b         c         d         e
Joe     0.246182  0.556642  0.530663  0.072457  0.769930
Steve  -0.735543 -0.046147  0.092191  0.659066  0.563112
Wes    -0.671631       NaN       NaN  0.351555  0.320022
Jim     0.730654 -0.554864 -0.013574 -0.238270 -1.276084
Travis -0.246124  0.494404  0.782177 -1.856125  0.838289
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column=people.groupby(mapping,axis=1)
by_column.sum()
            blue       red
Joe     0.603120  1.572753
Steve   0.751258 -0.218579
Wes     0.351555 -0.351610
Jim    -0.251844 -1.100294
Travis -1.073948  1.086570
map_series=Series(mapping)
map_series
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
people.groupby(map_series,axis=1).count()
        blue  red
Joe        2    3
Steve      2    3
Wes        1    2
Jim        2    3
Travis     2    3
Grouping with functions
people.groupby(len).sum()
          a         b         c         d         e
3  0.305204  0.001778  0.517089  0.185742 -0.186132
5 -0.735543 -0.046147  0.092191  0.659066  0.563112
6 -0.246124  0.494404  0.782177 -1.856125  0.838289
key_list=['one','one','one','two','two']
people.groupby([len,key_list]).min()
              a         b         c         d         e
3 one -0.671631  0.556642  0.530663  0.072457  0.320022
  two  0.730654 -0.554864 -0.013574 -0.238270 -1.276084
5 one -0.735543 -0.046147  0.092191  0.659066  0.563112
6 two -0.246124  0.494404  0.782177 -1.856125  0.838289
Grouping by index levels
import pandas as pd
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
[1,3,5,1,3]],names=['city','tenor'])
hier_df=DataFrame(np.random.randn(4,5),columns=columns)
hier_df
city          US                            JP          
tenor          1         3         5         1         3
0      -0.729876 -0.490356  1.200420 -1.594183 -0.571277
1      -1.336457 -2.033271 -0.356616  0.915616 -0.234895
2      -0.065620 -0.102485  0.605027 -0.518972  1.190415
3       0.985298  0.923531  1.784194  1.815795 -1.261107
hier_df.groupby(level='city',axis=1).count()
city  JP  US
0      2   3
1      2   3
2      2   3
3      2   3
Data aggregation
df
      data1     data2 key1 key2
0  1.160760  0.360555    a  one
1 -0.992606 -0.120562    a  two
2 -0.616727  0.856179    b  one
3 -1.921879 -0.690846    b  two
4 -0.458540 -0.093610    a  one
grouped=df.groupby('key1')
grouped['data1'].quantile(0.9)
key1
a    0.836900
b   -0.747242
Name: data1, dtype: float64
def peak_to_peak(arr):
    return arr.max()-arr.min()
grouped.agg(peak_to_peak)
         data1     data2
key1                    
a     2.153366  0.481117
b     1.305152  1.547025
grouped.describe()
               data1     data2
key1                          
a    count  3.000000  3.000000
     mean  -0.096796  0.048794
     std    1.121334  0.270329
     min   -0.992606 -0.120562
     25%   -0.725573 -0.107086
     50%   -0.458540 -0.093610
     75%    0.351110  0.133473
     max    1.160760  0.360555
b    count  2.000000  2.000000
     mean  -1.269303  0.082666
     std    0.922882  1.093912
     min   -1.921879 -0.690846
     25%   -1.595591 -0.304090
     50%   -1.269303  0.082666
     75%   -0.943015  0.469422
     max   -0.616727  0.856179
tips=pd.read_csv('ch08/tips.csv')
tips.head()
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
tips['tip_pct']=tips['tip']/tips['total_bill']
tips[:6]
   total_bill   tip     sex smoker  day    time  size   tip_pct
0       16.99  1.01  Female     No  Sun  Dinner     2  0.059447
1       10.34  1.66    Male     No  Sun  Dinner     3  0.160542
2       21.01  3.50    Male     No  Sun  Dinner     3  0.166587
3       23.68  3.31    Male     No  Sun  Dinner     2  0.139780
4       24.59  3.61  Female     No  Sun  Dinner     4  0.146808
5       25.29  4.71    Male     No  Sun  Dinner     4  0.186240
Column-wise and multiple function application
grouped=tips.groupby(['sex','smoker'])
grouped_pct=grouped['tip_pct']
grouped_pct.agg('mean')
sex     smoker
Female  No        0.156921
        Yes       0.182150
Male    No        0.160669
        Yes       0.152771
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean','std',peak_to_peak])
                   mean       std  peak_to_peak
sex    smoker                                  
Female No      0.156921  0.036421      0.195876
       Yes     0.182150  0.071595      0.360233
Male   No      0.160669  0.041849      0.220186
       Yes     0.152771  0.090588      0.674707
grouped_pct.agg([('foo','mean'),('bar',np.std)])
                    foo       bar
sex    smoker                    
Female No      0.156921  0.036421
       Yes     0.182150  0.071595
Male   No      0.160669  0.041849
       Yes     0.152771  0.090588
functions=['count','mean','max']
result=grouped[['tip_pct','total_bill']].agg(functions)
result
              tip_pct                     total_bill                  
                count      mean       max      count       mean    max
sex    smoker                                                         
Female No          54  0.156921  0.252672         54  18.105185  35.83
       Yes         33  0.182150  0.416667         33  17.977879  44.30
Male   No          97  0.160669  0.291990         97  19.791237  48.33
       Yes         60  0.152771  0.710345         60  22.284500  50.81
result['tip_pct']
               count      mean       max
sex    smoker                           
Female No         54  0.156921  0.252672
       Yes        33  0.182150  0.416667
Male   No         97  0.160669  0.291990
       Yes        60  0.152771  0.710345
ftuples=[('Durchschnitt','mean'),('Abweichung',np.var)]
grouped[['tip_pct','total_bill']].agg(ftuples)
                   tip_pct              total_bill           
              Durchschnitt Abweichung Durchschnitt Abweichung
sex    smoker                                                
Female No         0.156921   0.001327    18.105185  53.092422
       Yes        0.182150   0.005126    17.977879  84.451517
Male   No         0.160669   0.001751    19.791237  76.152961
       Yes        0.152771   0.008206    22.284500  98.244673
grouped.agg({'tip':np.max,'size':'sum'})
               size   tip
sex    smoker            
Female No       140   5.2
       Yes       74   6.5
Male   No       263   9.0
       Yes      150  10.0
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
              size   tip_pct                              
               sum       min       max      mean       std
sex    smoker                                             
Female No      140  0.056797  0.252672  0.156921  0.036421
       Yes      74  0.056433  0.416667  0.182150  0.071595
Male   No      263  0.071804  0.291990  0.160669  0.041849
       Yes     150  0.035638  0.710345  0.152771  0.090588
Returning aggregated data in "unindexed" form
tips.groupby(['sex','smoker'],as_index=False).mean()
      sex smoker  total_bill       tip      size   tip_pct
0  Female     No   18.105185  2.773519  2.592593  0.156921
1  Female    Yes   17.977879  2.931515  2.242424  0.182150
2    Male     No   19.791237  3.113402  2.711340  0.160669
3    Male    Yes   22.284500  3.051167  2.500000  0.152771
Group-wise operations and transformations
Aggregation is only one kind of grouped operation; it accepts functions that reduce a one-dimensional array to a scalar value. This section introduces the transform and apply methods, which can perform many other kinds of grouped operations.
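As a preview of the difference (a minimal sketch using df and np.mean from above): aggregation collapses each group to a single row, while transform broadcasts the result back to the shape of the original.

df.groupby('key1')['data1'].agg(np.mean)        # one value per group key
df.groupby('key1')['data1'].transform(np.mean)  # group mean repeated for all five rows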
df
      data1     data2 key1 key2
0  1.160760  0.360555    a  one
1 -0.992606 -0.120562    a  two
2 -0.616727  0.856179    b  one
3 -1.921879 -0.690846    b  two
4 -0.458540 -0.093610    a  one
k1_means=df.groupby('key1').mean().add_prefix('mean_')
k1_means
      mean_data1  mean_data2
key1                        
a      -0.096796    0.048794
b      -1.269303    0.082666
pd.merge(df,k1_means,left_on='key1',right_index=True)
      data1     data2 key1 key2  mean_data1  mean_data2
0  1.160760  0.360555    a  one   -0.096796    0.048794
1 -0.992606 -0.120562    a  two   -0.096796    0.048794
4 -0.458540 -0.093610    a  one   -0.096796    0.048794
2 -0.616727  0.856179    b  one   -1.269303    0.082666
3 -1.921879 -0.690846    b  two   -1.269303    0.082666
key=['one','two','one','two','one']
people.groupby(key).mean()
            a         b         c         d         e
one -0.223858  0.525523  0.656420 -0.477371  0.642747
two -0.002445 -0.300505  0.039309  0.210398 -0.356486
people.groupby(key).transform(np.mean)
               a         b         c         d         e
Joe    -0.223858  0.525523  0.656420 -0.477371  0.642747
Steve  -0.002445 -0.300505  0.039309  0.210398 -0.356486
Wes    -0.223858  0.525523  0.656420 -0.477371  0.642747
Jim    -0.002445 -0.300505  0.039309  0.210398 -0.356486
Travis -0.223858  0.525523  0.656420 -0.477371  0.642747
def demean(arr):
    return arr-arr.mean()
demeaned=people.groupby(key).transform(demean)
demeaned
               a         b         c         d         e
Joe     0.470039  0.031119 -0.125757  0.549828  0.127183
Steve  -0.733099  0.254358  0.052883  0.448668  0.919598
Wes    -0.447773       NaN       NaN  0.828926 -0.322725
Jim     0.733099 -0.254358 -0.052883 -0.448668 -0.919598
Travis -0.022266 -0.031119  0.125757 -1.378754  0.195542
demeaned.groupby(key).mean()
                a             b             c             d             e
one  1.850372e-17 -5.551115e-17  0.000000e+00  0.000000e+00  1.110223e-16
two  0.000000e+00  2.775558e-17 -3.469447e-18 -2.775558e-17  0.000000e+00
apply: general split-apply-combine
Like aggregate, transform is a specialized function with strict requirements: the passed function must either produce a scalar value that can be broadcast (like np.mean) or produce a result array of the same size. The most general-purpose GroupBy method is apply, which is the focus of this section. apply splits the object being manipulated into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces together.
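A minimal contrast of the two methods on df from earlier (illustrative; the lambda is made up): transform must hand back a scalar or a same-sized array, whereas apply may return an object of any shape.

df.groupby('key1')['data1'].transform(np.mean)                     # always 5 rows, like the input
df.groupby('key1')['data1'].apply(lambda g: g.sort_values()[-2:])  # up to 2 rows per group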
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips,n=6)
     total_bill   tip     sex smoker  day    time  size   tip_pct
109       14.31  4.00  Female    Yes  Sat  Dinner     2  0.279525
183       23.17  6.50    Male    Yes  Sun  Dinner     4  0.280535
232       11.61  3.39    Male     No  Sat  Dinner     2  0.291990
67         3.07  1.00  Female    Yes  Sat  Dinner     1  0.325733
178        9.60  4.00  Female    Yes  Sun  Dinner     2  0.416667
172        7.25  5.15    Male    Yes  Sun  Dinner     2  0.710345
tips.groupby('smoker').apply(top)
            total_bill   tip     sex smoker   day    time  size   tip_pct
smoker                                                                   
No     88        24.71  5.85    Male     No  Thur   Lunch     2  0.236746
       185       20.69  5.00    Male     No   Sun  Dinner     5  0.241663
       51        10.29  2.60  Female     No   Sun  Dinner     2  0.252672
       149        7.51  2.00    Male     No  Thur   Lunch     2  0.266312
       232       11.61  3.39    Male     No   Sat  Dinner     2  0.291990
Yes    109       14.31  4.00  Female    Yes   Sat  Dinner     2  0.279525
       183       23.17  6.50    Male    Yes   Sun  Dinner     4  0.280535
       67         3.07  1.00  Female    Yes   Sat  Dinner     1  0.325733
       178        9.60  4.00  Female    Yes   Sun  Dinner     2  0.416667
       172        7.25  5.15    Male    Yes   Sun  Dinner     2  0.710345
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
                 total_bill    tip     sex smoker   day    time  size   tip_pct
smoker day                                                                     
No     Fri  94        22.75   3.25  Female     No   Fri  Dinner     2  0.142857
       Sat  212       48.33   9.00    Male     No   Sat  Dinner     4  0.186220
       Sun  156       48.17   5.00    Male     No   Sun  Dinner     6  0.103799
       Thur 142       41.19   5.00    Male     No  Thur   Lunch     5  0.121389
Yes    Fri  95        40.17   4.73    Male    Yes   Fri  Dinner     4  0.117750
       Sat  170       50.81  10.00    Male    Yes   Sat  Dinner     3  0.196812
       Sun  182       45.35   3.50    Male    Yes   Sun  Dinner     3  0.077178
       Thur 197       43.11   5.00  Female    Yes  Thur   Lunch     4  0.115982
Beyond these basic usage mechanics, how much mileage you get out of apply is largely a matter of your creativity; what the passed function does is entirely up to you, as long as it returns a pandas object or a scalar value.
result=tips.groupby('smoker')['tip_pct'].describe()
result
smoker       
No      count    151.000000
        mean       0.159328
        std        0.039910
        min        0.056797
        25%        0.136906
        50%        0.155625
        75%        0.185014
        max        0.291990
Yes     count     93.000000
        mean       0.163196
        std        0.085119
        min        0.035638
        25%        0.106771
        50%        0.153846
        75%        0.195059
        max        0.710345
Name: tip_pct, dtype: float64
result.unstack('smoker')
smoker          No        Yes
count   151.000000  93.000000
mean      0.159328   0.163196
std       0.039910   0.085119
min       0.056797   0.035638
25%       0.136906   0.106771
50%       0.155625   0.153846
75%       0.185014   0.195059
max       0.291990   0.710345
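Inside GroupBy, calling a method like describe is effectively shorthand for the following: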
f=lambda x:x.describe()
grouped.apply(f)
Suppressing the group keys
As the examples above show, the group keys are combined with the indexes of the original objects to form a hierarchical index in the result. You can disable this by passing group_keys=False to groupby.
tips.groupby('smoker',group_keys=False).apply(top)
     total_bill   tip     sex smoker   day    time  size   tip_pct
88        24.71  5.85    Male     No  Thur   Lunch     2  0.236746
185       20.69  5.00    Male     No   Sun  Dinner     5  0.241663
51        10.29  2.60  Female     No   Sun  Dinner     2  0.252672
149        7.51  2.00    Male     No  Thur   Lunch     2  0.266312
232       11.61  3.39    Male     No   Sat  Dinner     2  0.291990
109       14.31  4.00  Female    Yes   Sat  Dinner     2  0.279525
183       23.17  6.50    Male    Yes   Sun  Dinner     4  0.280535
67         3.07  1.00  Female    Yes   Sat  Dinner     1  0.325733
178        9.60  4.00  Female    Yes   Sun  Dinner     2  0.416667
172        7.25  5.15    Male    Yes   Sun  Dinner     2  0.710345
Quantile and bucket analysis
pandas has some tools, such as cut and qcut, for slicing data into multiple buckets with bins of your choosing or by sample quantiles. Combining these functions with groupby makes it easy to perform bucket or quantile analysis on a dataset.
frame=DataFrame({'data1':np.random.randn(100),
'data2':np.random.randn(100)})
frame.head()
      data1     data2
0  1.421652 -0.133642
1  1.663593  1.570306
2  0.072588  1.445291
3 -1.117481  0.485219
4  0.673224 -0.565916
factor=pd.cut(frame.data1,4)
factor[:10]
0      (0.592, 1.913]
1      (0.592, 1.913]
2      (-0.73, 0.592]
3    (-2.0564, -0.73]
4      (0.592, 1.913]
5      (0.592, 1.913]
6      (0.592, 1.913]
7      (-0.73, 0.592]
8      (-0.73, 0.592]
9    (-2.0564, -0.73]
Name: data1, dtype: category
Categories (4, object): [(-2.0564, -0.73] < (-0.73, 0.592] < (0.592, 1.913] < (1.913, 3.234]]
def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}
grouped=frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
                  count       max      mean       min
data1                                                
(-2.0564, -0.73]   22.0  1.656349 -0.048650 -2.129592
(-0.73, 0.592]     50.0  2.118117  0.006637 -2.178494
(0.592, 1.913]     25.0  2.893815  0.525179 -2.531124
(1.913, 3.234]      3.0  1.423038 -0.643547 -2.888465
grouping=pd.qcut(frame.data1,10,labels=False)
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
       count       max      mean       min
data1                                     
0       10.0  1.656349  0.200332 -2.129592
1       10.0  0.892938 -0.161470 -1.448465
2       10.0  0.936595 -0.502936 -1.486019
3       10.0  1.225042 -0.136442 -1.471110
4       10.0  2.118117  0.223633 -2.178494
5       10.0  1.445291 -0.010989 -1.145924
6       10.0  1.166204  0.230245 -0.578312
7       10.0  1.582353  0.491112 -0.565916
8       10.0  2.893815  0.183646 -2.531124
9       10.0  2.215117  0.528908 -2.888465
Example: filling missing values with group-specific values
s=Series(np.random.randn(6))
s[::2]=np.nan
s
0         NaN
1    0.480519
2         NaN
3    0.994221
4         NaN
5    0.324907
dtype: float64
s.fillna(s.mean())
0    0.599882
1    0.480519
2    0.599882
3    0.994221
4    0.599882
5    0.324907
dtype: float64
states=['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
group_key=['East']*4+['West']*4
data=Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']]=np.nan
data
Ohio         -0.714495
New York     -0.484234
Vermont            NaN
Florida      -0.485962
Oregon        0.399898
Nevada             NaN
California   -0.956605
Idaho              NaN
dtype: float64
data.groupby(group_key).mean()
East   -0.561564
West   -0.278353
dtype: float64
fill_mean=lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio         -0.714495
New York     -0.484234
Vermont      -0.561564
Florida      -0.485962
Oregon        0.399898
Nevada       -0.278353
California   -0.956605
Idaho        -0.278353
dtype: float64
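The pieces passed into apply carry a name attribute set to the group key, which the next variant exploits to look up a predefined fill value for each group: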
fill_values={'East':0.5,'West':-1}
fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
Ohio -0.714495
New York -0.484234
Vermont 0.500000
Florida -0.485962
Oregon 0.399898
Nevada -1.000000
California -0.956605
Idaho -1.000000
dtype: float64
Example: random sampling and permutation
suits=['H','S','C','D']
card_val=( list(range(1,11))+ [10]*3)*4
base_names=['A']+list(range(2,11))+['J','Q','K']
cards=[]
for suit in suits:
    cards.extend(str(num)+suit for num in base_names)
deck=Series(card_val,index=cards)
deck[:13]
AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
QH     10
KH     10
dtype: int64
def draw(deck,n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
10D    10
5D      5
10H    10
3S      3
8C      8
dtype: int64
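Recent pandas versions (0.16+) also offer Series.sample, which makes draw a one-liner (an equivalent sketch, not the book's code; draw_sample is a made-up name):

def draw_sample(deck, n=5):
    # sample n cards without replacement, like the permutation-based draw above
    return deck.sample(n)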
get_suit=lambda card:card[-1]
deck.groupby(get_suit).apply(draw,n=2)
C  7C      7
   3C      3
D  10D    10
   KD     10
H  5H      5
   10H    10
S  10S    10
   4S      4
dtype: int64
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)
7C 7
10C 10
KD 10
QD 10
QH 10
JH 10
3S 3
KS 10
dtype: int64
Example: group weighted average and correlation
Under the split-apply-combine paradigm of groupby, operations between columns of a DataFrame or between two Series, such as a group weighted average, become routine. As an example, take this dataset containing group keys, values, and some weights:
df=DataFrame({'category':['a','a','a','a','b','b','b','b'],'data':np.random.randn(8),
'weights':np.random.rand(8)})
df
  category      data   weights
0        a -1.279366  0.262668
1        a -0.993197  0.124788
2        a -0.092631  0.644840
3        a  0.216670  0.413393
4        b -0.697899  0.621993
5        b -0.568083  0.190767
6        b -0.963962  0.587816
7        b  0.637361  0.522886
grouped=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
category
a   -0.297540
b   -0.403348
dtype: float64
close_px=pd.read_csv('ch09/stock_px.csv',parse_dates=True,index_col=0)
close_px[-4:]
              AAPL   MSFT    XOM      SPX
2011-10-11  400.29  27.00  76.27  1195.54
2011-10-12  402.19  26.96  77.16  1207.25
2011-10-13  408.43  27.18  76.37  1203.66
2011-10-14  422.00  27.27  78.11  1224.58
rets=close_px.pct_change().dropna()
spx_corr=lambda x:x.corrwith(x['SPX'])
by_year=rets.groupby(lambda x:x.year)
by_year.apply(spx_corr)
          AAPL      MSFT       XOM  SPX
2003  0.541124  0.745174  0.661265  1.0
2004  0.374283  0.588531  0.557742  1.0
2005  0.467540  0.562374  0.631010  1.0
2006  0.428267  0.406126  0.518514  1.0
2007  0.508118  0.658770  0.786264  1.0
2008  0.681434  0.804626  0.828303  1.0
2009  0.707103  0.654902  0.797921  1.0
2010  0.710105  0.730118  0.839057  1.0
2011  0.691931  0.800996  0.859975  1.0
by_year.apply(lambda g:g['AAPL'].corr(g['MSFT']))
2003 0.480868
2004 0.259024
2005 0.300093
2006 0.161735
2007 0.417738
2008 0.611901
2009 0.432738
2010 0.571946
2011 0.581987
dtype: float64
Example: group-wise linear regression
Continuing with the previous example, you can use groupby to perform more complex group-wise statistical analysis, as long as the function returns a pandas object or a scalar value. For example, we can define the following regress function (using the statsmodels library) to run an ordinary least squares (OLS) regression on each chunk of data:
import statsmodels.api as sm
def regress(data,yvar,xvars):
    Y=data[yvar]
    X=data[xvars]
    X['intercept']=1.
    result=sm.OLS(Y,X).fit()
    return result.params
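Note that regress adds the intercept by mutating the X it was handed; statsmodels' add_constant helper does the same without the side effect (an equivalent sketch; regress_const is a made-up name):

def regress_const(data, yvar, xvars):
    X = sm.add_constant(data[xvars])  # prepends a 'const' column of ones
    return sm.OLS(data[yvar], X).fit().params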
by_year.apply(regress,'AAPL',['SPX'])
           SPX  intercept
2003  1.195406   0.000710
2004  1.363463   0.004201
2005  1.766415   0.003246
2006  1.645496   0.000080
2007  1.198761   0.003438
2008  0.968016  -0.001110
2009  0.879103   0.002954
2010  1.052608   0.001261
2011  0.806605   0.001514
Pivot tables and cross-tabulation
A pivot table is a data summarization tool frequently found in spreadsheet programs and other data analysis software. It aggregates a table of data by one or more keys, arranging the data in a rectangle with some of the group keys along the rows and some along the columns. In Python with pandas, pivot tables are made possible through the groupby facility described in this chapter combined with reshape operations. DataFrame has a pivot_table method, and there is also a top-level pandas.pivot_table function. In addition to providing a convenience interface to groupby, pivot_table can add partial totals, also known as margins.
Returning to the tips dataset, suppose I want to compute a table of group means (the default pivot_table aggregation type) with sex and smoker on the rows:
tips.pivot_table(index=['sex','smoker'])
                   size       tip   tip_pct  total_bill
sex    smoker                                          
Female No      2.592593  2.773519  0.156921   18.105185
       Yes     2.242424  2.931515  0.182150   17.977879
Male   No      2.711340  3.113402  0.160669   19.791237
       Yes     2.500000  3.051167  0.152771   22.284500
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns=['smoker'])
             tip_pct                size          
smoker            No       Yes        No       Yes
sex    day                                        
Female Fri  0.165296  0.209129  2.500000  2.000000
       Sat  0.147993  0.163817  2.307692  2.200000
       Sun  0.165710  0.237075  3.071429  2.500000
       Thur 0.155971  0.163073  2.480000  2.428571
Male   Fri  0.138005  0.144730  2.000000  2.125000
       Sat  0.162132  0.139067  2.656250  2.629630
       Sun  0.158291  0.173964  2.883721  2.600000
       Thur 0.165706  0.164417  2.500000  2.300000
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns=['smoker'],margins=True)
             tip_pct                          size                    
smoker            No       Yes       All        No       Yes       All
sex    day                                                            
Female Fri  0.165296  0.209129  0.199388  2.500000  2.000000  2.111111
       Sat  0.147993  0.163817  0.156470  2.307692  2.200000  2.250000
       Sun  0.165710  0.237075  0.181569  3.071429  2.500000  2.944444
       Thur 0.155971  0.163073  0.157525  2.480000  2.428571  2.468750
Male   Fri  0.138005  0.144730  0.143385  2.000000  2.125000  2.100000
       Sat  0.162132  0.139067  0.151577  2.656250  2.629630  2.644068
       Sun  0.158291  0.173964  0.162344  2.883721  2.600000  2.810345
       Thur 0.165706  0.164417  0.165276  2.500000  2.300000  2.433333
All         0.159328  0.163196  0.160803  2.668874  2.408602  2.569672
tips.pivot_table('tip_pct',index=['sex','smoker'],columns=['day'],aggfunc=len,margins=True)
day             Fri   Sat   Sun  Thur    All
sex    smoker                               
Female No       2.0  13.0  14.0  25.0   54.0
       Yes      7.0  15.0   4.0   7.0   33.0
Male   No       2.0  32.0  43.0  20.0   97.0
       Yes      8.0  27.0  15.0  10.0   60.0
All            19.0  87.0  76.0  62.0  244.0
tips.pivot_table('size',index=['time','sex','smoker'],columns=['day'],aggfunc='sum',fill_value=0)
day                   Fri  Sat  Sun  Thur
time   sex    smoker                     
Dinner Female No        2   30   43     2
              Yes       8   33   10     0
       Male   No        4   85  124     0
              Yes      12   71   39     0
Lunch  Female No        3    0    0    60
              Yes       6    0    0    17
       Male   No        0    0    0    50
              Yes       5    0    0    23
Cross-tabulation: crosstab
A cross-tabulation (crosstab for short) is a special case of a pivot table that computes group frequencies.
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
smoker        No  Yes  All
time   day                
Dinner Fri     3    9   12
       Sat    45   42   87
       Sun    57   19   76
       Thur    1    0    1
Lunch  Fri     1    6    7
       Thur   44   17   61
All          151   93  244
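Since crosstab is just counting, the table above can equally be produced with pivot_table (an illustrative equivalent; any null-free column works as the values argument):

tips.pivot_table('total_bill', index=['time', 'day'], columns='smoker',
                 aggfunc=len, margins=True)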
Example: 2012 Federal Election Commission database
fec=pd.read_csv('ch09/P00000001-ALL.csv')
fec.head()
C:\Users\ZJL\AppData\Local\Programs\Python\Python35\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result
     cmte_id    cand_id             cand_nm           contbr_nm         contbr_city contbr_st   contbr_zip        contbr_employer      contbr_occupation  contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp  file_num
0  C00410118  P20002978  Bachmann, Michelle     HARVEY, WILLIAM              MOBILE        AL   3.6601e+08                RETIRED                RETIRED              250.0        20-JUN-11          NaN     NaN       NaN   SA17A    736166
1  C00410118  P20002978  Bachmann, Michelle     HARVEY, WILLIAM              MOBILE        AL   3.6601e+08                RETIRED                RETIRED               50.0        23-JUN-11          NaN     NaN       NaN   SA17A    736166
2  C00410118  P20002978  Bachmann, Michelle       SMITH, LANIER              LANETT        AL  3.68633e+08  INFORMATION REQUESTED  INFORMATION REQUESTED              250.0        05-JUL-11          NaN     NaN       NaN   SA17A    749073
3  C00410118  P20002978  Bachmann, Michelle    BLEVINS, DARONDA             PIGGOTT        AR  7.24548e+08                   NONE                RETIRED              250.0        01-AUG-11          NaN     NaN       NaN   SA17A    749073
4  C00410118  P20002978  Bachmann, Michelle  WARDENBURG, HAROLD  HOT SPRINGS NATION        AR  7.19016e+08                   NONE                RETIRED              300.0        20-JUN-11          NaN     NaN       NaN   SA17A    736166
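The DtypeWarning above comes from read_csv inferring types chunk by chunk on this large file; as the message itself suggests, reading in one pass avoids it (one of the options the warning lists):

fec = pd.read_csv('ch09/P00000001-ALL.csv', low_memory=False)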
fec.iloc[123456]  # .ix is deprecated; the rows carry the default integer index
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                          50
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object
unique_cands=fec.cand_nm.unique()
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick', 'Cain, Herman',
       'Gingrich, Newt', 'McCotter, Thaddeus G', 'Huntsman, Jon',
       'Perry, Rick'], dtype=object)
unique_cands[2]
'Obama, Barack'
parties={'Bachmann, Michelle':'Republican',
'Cain, Herman':'Republican',
'Gingrich, Newt':'Republican',
'Huntsman, Jon':'Republican',
'Johnson, Gary Earl':'Republican',
'McCotter, Thaddeus G':'Republican',
'Obama, Barack':'Democrat',
'Paul, Ron':'Republican',
'Pawlenty, Timothy':'Republican',
'Perry, Rick':'Republican',
"Roemer, Charles E. 'Buddy' III":'Republican',
'Romney, Mitt':'Republican',
'Santorum, Rick':'Republican'
}
fec.cand_nm[123456:123461]
123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(parties)
123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object
fec['party']=fec.cand_nm.map(parties)
fec['party'].value_counts()
Democrat      593746
Republican    403829
Name: party, dtype: int64
(fec.contb_receipt_amt>0).value_counts()
True     991475
False     10256
Name: contb_receipt_amt, dtype: int64
fec=fec[fec.contb_receipt_amt>0]
fec_mrbo=fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]
fec_mrbo.head()
       cmte_id    cand_id       cand_nm           contbr_nm contbr_city contbr_st   contbr_zip                   contbr_employer         contbr_occupation  contb_receipt_amt contb_receipt_dt receipt_desc memo_cd memo_text form_tp  file_num       party
411  C00431171  P80003353  Romney, Mitt  ELDERBAUM, WILLIAM         DPO        AA   3.4023e+08                     US GOVERNMENT   FOREIGN SERVICE OFFICER               25.0        01-FEB-12          NaN     NaN       NaN   SA17A    780124  Republican
412  C00431171  P80003353  Romney, Mitt  ELDERBAUM, WILLIAM         DPO        AA   3.4023e+08                     US GOVERNMENT   FOREIGN SERVICE OFFICER              110.0        01-FEB-12          NaN     NaN       NaN   SA17A    780124  Republican
413  C00431171  P80003353  Romney, Mitt    CARLSEN, RICHARD         APO        AE    9.128e+07       DEFENSE INTELLIGENCE AGENCY      INTELLIGENCE ANALYST              250.0        13-APR-12          NaN     NaN       NaN   SA17A    785689  Republican
414  C00431171  P80003353  Romney, Mitt      DELUCA, PIERRE         APO        AE    9.128e+07                             CISCO                  ENGINEER               30.0        21-AUG-11          NaN     NaN       NaN   SA17A    760261  Republican
415  C00431171  P80003353  Romney, Mitt    SARGENT, MICHAEL         APO        AE  9.01201e+07  RAYTHEON TECHNICAL SERVICES CORP  COMPUTER SYSTEMS ENGINEER              100.0        07-MAR-12          NaN     NaN       NaN   SA17A    780128  Republican
Donation statistics by occupation and employer
fec.contbr_occupation.value_counts()[:10]
RETIRED                                    233990
INFORMATION REQUESTED                       35107
ATTORNEY                                    34286
HOMEMAKER                                   29931
PHYSICIAN                                   23432
INFORMATION REQUESTED PER BEST EFFORTS      21138
ENGINEER                                    14334
TEACHER                                     13990
CONSULTANT                                  13273
PROFESSOR                                   12555
Name: contbr_occupation, dtype: int64
occ_mapping={'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'INFORMATION REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
'C.E.O':'CEO'}
f=lambda x:occ_mapping.get(x,x)
fec.contbr_occupation=fec.contbr_occupation.map(f)
emp_mapping={'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'INFORMATION REQUESTED':'NOT PROVIDED',
'SELF':'SELF-EMPLOYED',}
f=lambda x:emp_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)
by_occupation=fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
over_2mm=by_occupation[by_occupation.sum(1)>2000000]
over_2mm
party                 Democrat    Republican
contbr_occupation                           
ATTORNEY           11141982.97  7.333637e+06
C.E.O.                 1690.00  2.543933e+06
CEO                 2074284.79  1.598978e+06
CONSULTANT          2459912.71  2.502470e+06
ENGINEER             951525.55  1.812534e+06
EXECUTIVE           1355161.05  4.057645e+06
HOMEMAKER           4248875.80  1.338025e+07
INVESTOR             884133.00  2.356671e+06
LAWYER              3160478.87  3.394023e+05
MANAGER              762883.22  1.430643e+06
NOT PROVIDED        4866973.96  2.018997e+07
OWNER               1001567.36  2.376829e+06
PHYSICIAN           3735124.94  3.555067e+06
PRESIDENT           1878509.95  4.622173e+06
PROFESSOR           2165071.08  2.839557e+05
REAL ESTATE          528902.09  1.567552e+06
RETIRED            25305116.38  2.320195e+07
SELF-EMPLOYED        672393.40  1.621010e+06
import matplotlib.pyplot as plt
over_2mm.plot(kind='barh')
plt.show()
def get_top_amounts(group,key,n=5):
    totals=group.groupby(key)['contb_receipt_amt'].sum()
    return totals.sort_values(ascending=False)[:n]
grouped=fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm        contbr_occupation                     
Obama, Barack  RETIRED                                   25305116.38
               ATTORNEY                                  11141982.97
               INFORMATION REQUESTED                      4866973.96
               HOMEMAKER                                  4248875.80
               PHYSICIAN                                  3735124.94
               LAWYER                                     3160478.87
               CONSULTANT                                 2459912.71
Romney, Mitt   RETIRED                                   11508473.59
               INFORMATION REQUESTED PER BEST EFFORTS    11396894.84
               HOMEMAKER                                  8147446.22
               ATTORNEY                                   5364718.82
               PRESIDENT                                  2491244.89
               EXECUTIVE                                  2300947.03
               C.E.O.                                     1968386.11
Name: contb_receipt_amt, dtype: float64
grouped.apply(get_top_amounts,'contbr_employer',n=10)
cand_nm        contbr_employer                       
Obama, Barack  RETIRED                                   22694358.85
               SELF-EMPLOYED                             17080985.96
               NOT EMPLOYED                               8586308.70
               INFORMATION REQUESTED                      5053480.37
               HOMEMAKER                                  2605408.54
               SELF                                       1076531.20
               SELF EMPLOYED                               469290.00
               STUDENT                                     318831.45
               VOLUNTEER                                   257104.00
               MICROSOFT                                   215585.36
Romney, Mitt   INFORMATION REQUESTED PER BEST EFFORTS    12059527.24
               RETIRED                                   11506225.71
               HOMEMAKER                                  8147196.22
               SELF-EMPLOYED                              7409860.98
               STUDENT                                     496490.94
               CREDIT SUISSE                               281150.00
               MORGAN STANLEY                              267266.00
               GOLDMAN SACH & CO.                          238250.00
               BARCLAYS CAPITAL                            162750.00
               H.I.G. CAPITAL                              139500.00
Name: contb_receipt_amt, dtype: float64
Bucketing donation amounts
Use the cut function to discretize the contribution amounts into bins by size:
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels.head()
411     (10, 100]
412    (100, 1000]
413    (100, 1000]
414     (10, 100]
415     (10, 100]
Name: contb_receipt_amt, dtype: category
Categories (8, object): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                       493.0          77.0
(1, 10]                    40070.0        3681.0
(10, 100]                 372280.0       31853.0
(100, 1000]               153991.0       43357.0
(1000, 10000]              22284.0       26186.0
(10000, 100000]                2.0           1.0
(100000, 1000000]              3.0           NaN
(1000000, 10000000]            4.0           NaN
bucket_sums=grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                      318.24         77.00
(1, 10]                  337267.62      29819.66
(10, 100]              20288981.41    1987783.76
(100, 1000]            54798531.46   22363381.69
(1000, 10000]          51753705.67   63942145.42
(10000, 100000]           59100.00      12700.00
(100000, 1000000]       1490683.08           NaN
(1000000, 10000000]     7148839.76           NaN
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
cand_nm              Obama, Barack  Romney, Mitt
contb_receipt_amt                               
(0, 1]                    0.805182      0.194818
(1, 10]                   0.918767      0.081233
(10, 100]                 0.910769      0.089231
(100, 1000]               0.710176      0.289824
(1000, 10000]             0.447326      0.552674
(10000, 100000]           0.823120      0.176880
(100000, 1000000]         1.000000           NaN
(1000000, 10000000]       1.000000           NaN
normed_sums[:-2].plot(kind='barh',stacked=True)
plt.show()
Donation statistics by state
grouped=fec_mrbo.groupby(['cand_nm','contbr_st'])
totals=grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals=totals[totals.sum(1)>100000]
totals[:10]
cand_nm    Obama, Barack  Romney, Mitt
contbr_st                             
AK             281840.15      86204.24
AL             543123.48     527303.51
AR             359247.28     105556.00
AZ            1506476.98    1888436.23
CA           23824984.24   11237636.60
CO            2132429.49    1506714.12
CT            2068291.26    3499475.45
DC            4373538.80    1025137.50
DE             336669.14      82712.00
FL            7318178.58    8338458.81
percent=totals.div(totals.sum(1),axis=0)
percent[:10]
cand_nm    Obama, Barack  Romney, Mitt
contbr_st                             
AK              0.765778      0.234222
AL              0.507390      0.492610
AR              0.772902      0.227098
AZ              0.443745      0.556255
CA              0.679498      0.320502
CO              0.585970      0.414030
CT              0.371476      0.628524
DC              0.810113      0.189887
DE              0.802776      0.197224
FL              0.467417      0.532583
from mpl_toolkits.basemap import Basemap,cm
from matplotlib import rcParams
from matplotlib.collections import LineCollection
from shapelib import ShapeFile
import dbflib
obama=percent['Obama, Barack']
fig=plt.figure(figsize=(12,12))
ax=fig.add_axes([0.1,0.1,0.8,0.8])
lllat=21; urlat=53; lllon=-118; urlon=-62
m=Basemap(ax=ax,projection='stere',lon_0=(urlon+lllon)/2,lat_0=(urlat+lllat)/2,
          llcrnrlat=lllat,urcrnrlat=urlat,llcrnrlon=lllon,
          urcrnrlon=urlon,resolution='l')
m.drawcoastlines()
m.drawcountries()
shp=ShapeFile('../states/statesp020')
dbf=dbflib.open('../states/statesp020')
# state_to_code maps full state names to their two-letter abbreviations
# (e.g. 'ALASKA' -> 'AK'); it is assumed to be defined beforehand.
for npoly in range(shp.info()[0]):
    # draw each state polygon, shaded by Obama's share of donations
    shpsegs=[]
    shp_object=shp.read_object(npoly)
    verts=shp_object.vertices()
    rings=len(verts)
    for ring in range(rings):
        lons,lats=zip(*verts[ring])
        x,y=m(lons,lats)
        shpsegs.append(zip(x,y))
        if ring==0:
            shapedict=dbf.read_record(npoly)
            name=shapedict['STATE']
    lines=LineCollection(shpsegs,antialiaseds=(1,))
    try:
        per=obama[state_to_code[name.upper()]]
    except KeyError:
        continue
    lines.set_facecolors('k')
    lines.set_alpha(0.75*per)
    lines.set_edgecolors('k')
    lines.set_linewidth(0.3)
    ax.add_collection(lines)
plt.show()