pandas分组聚合实例

文中所用数据(tips.csv、stock_px_2.csv)来源:https://github.com/wesm/pydata-book/tree/2nd-edition/examples

import pandas as pd
import numpy as np

# Do not truncate the column list when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Load the tips data set and add the tip as a fraction of the total bill.
content = pd.read_csv('data/tips.csv')
content['tip_pct'] = content['tip'].div(content['total_bill'])

'''
以下实现美股三只股票(AAPL、MSFT、XOM)与标准普尔指数(SPX)的相关性计算
'''
# Price table; the first CSV column is used as the index and parsed as dates.
close_px = pd.read_csv('data/stock_px_2.csv', index_col=0, parse_dates=True)


def spx_corr(c):
    """Return the correlation of every column of ``c`` with its 'SPX' column."""
    return c.corrwith(c['SPX'])


def get_year(x):
    """Return the ``year`` attribute of ``x`` (used as the group key)."""
    return x.year


# Row-to-row percent changes, with rows containing missing values dropped.
rets = close_px.pct_change().dropna()

# groupby() also accepts a function: the value it returns is used as the
# grouping key.
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)
'''
          AAPL      MSFT       XOM  SPX
2003  0.541124  0.745174  0.661265  1.0
2004  0.374283  0.588531  0.557742  1.0
2005  0.467540  0.562374  0.631010  1.0
2006  0.428267  0.406126  0.518514  1.0
2007  0.508118  0.658770  0.786264  1.0
2008  0.681434  0.804626  0.828303  1.0
2009  0.707103  0.654902  0.797921  1.0
2010  0.710105  0.730118  0.839057  1.0
2011  0.691931  0.800996  0.859975  1.0
'''

# Mean of each numeric column for every (day, smoker) group -- the same result
# as grouping by those keys and aggregating with the mean.
# The value columns are listed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns (here the string column 'time') automatically and raises
# a TypeError when the default mean aggregation is applied to them.
table = content.pivot_table(
    index=['day', 'smoker'],
    values=['size', 'tip', 'tip_pct', 'total_bill'],
)
'''
                 size       tip   tip_pct  total_bill
day  smoker                                          
Fri  No      2.250000  2.812500  0.151650   18.420000
     Yes     2.066667  2.714000  0.174783   16.813333
Sat  No      2.555556  3.102889  0.158048   19.661778
     Yes     2.476190  2.875476  0.147906   21.276667
Sun  No      2.929825  3.167895  0.160113   20.506667
     Yes     2.578947  3.516842  0.187250   24.120000
Thur No      2.488889  2.673778  0.160298   17.113111
     Yes     2.352941  3.030000  0.163863   19.190588
'''
# Group sizes of 'tip_pct' by (time, smoker) rows and day columns.
#   aggfunc=len   -> count the entries in each group (the default is the mean)
#   margins=True  -> add an "All" row and column holding the aggfunc result
#   fill_value    -> value used for empty combinations
table1 = content.pivot_table(
    values='tip_pct',
    index=['time', 'smoker'],
    columns='day',
    aggfunc=len,
    fill_value=0.0,
    margins=True,
)
'''
day            Fri  Sat  Sun  Thur    All
time   smoker                            
Dinner No        3   45   57     1  106.0
       Yes       9   42   19     0   70.0
Lunch  No        1    0    0    44   45.0
       Yes       6    0    0    17   23.0
All             19   87   76    62  244.0
'''

# Two ways to count the rows for each time/day combination (with "All"
# totals): crosstab() directly, or pivot_table() with aggfunc=len.
stat_crosstab = pd.crosstab(
    index=content['time'],
    columns=content['day'],
    margins=True,
)
stat_pivottable = content.pivot_table(
    values='tip',
    index='time',
    columns='day',
    aggfunc=len,
    fill_value=0,
    margins=True,
)
'''
result:
day     Fri  Sat  Sun  Thur  All
time                            
Dinner   12   87   76     1  176
Lunch     7    0    0    61   68
All      19   87   76    62  244
'''

你可能感兴趣的:(数据分析)