本文介绍透视表pivot_table和交叉表crosstab。
import pandas as pd
import numpy as np
%matplotlib inline
%matplotlib notebook
import matplotlib.pyplot as plt
# 导入数据
tips = pd.read_csv('data/tips/tips2.csv')
tips['tip_pct'] = tips['tip']/tips['total_bill']
print(tips.head())
# total_bill tip sex smoker day time size tip_pct
#0 16.99 1.01 Female No Sun Dinner 2 0.059447
#1 10.34 1.66 Male No Sun Dinner 3 0.160542
#2 21.01 3.50 Female No Sun Dinner 3 0.166587
#3 23.68 3.31 Male No Sun Dinner 2 0.139780
#4 24.59 3.61 Female No Sun Dinner 4 0.146808
1.1 制作数据透视表方法1 : groupby + 聚合函数
tips.groupby(['sex','smoker']).mean()
1.2 制作数据透视表方法2 :pivot_table
# pivot_table1:整个dataframe
tips.pivot_table(index=['sex','smoker'])
# pivot_table2:dataframe中一部分数据
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')
# pivot_table3:传入小计margins=True
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True)
# pivot_table4:传入其他聚合函数aggfunc
tips.pivot_table('tip_pct',index=['sex','smoker'],columns='day',aggfunc=len,margins=True)
# pivot_table5:fill_value
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc=np.sum,fill_value=0)
# 交叉表1
data=pd.DataFrame({
'sample':[1,2,3,4,5,6,7,8,9,10],
'Gender': ['Female','Male','Female','Male','Male','Male','Female','Female','Male','Female'],
'Handedness':['Right-handed','Left-handed','Right-handed','Right-handed','Left-handed','Right-handed','Right-handed','Left-handed','Right-handed','Right-handed']})
data
# sample Gender Handedness
#0 1 Female Right-handed
#1 2 Male Left-handed
#2 3 Female Right-handed
#3 4 Male Right-handed
#4 5 Male Left-handed
#5 6 Male Right-handed
#6 7 Female Right-handed
#7 8 Female Left-handed
#8 9 Male Right-handed
#9 10 Female Right-handed
2.1 用groupby制作出crosstab
# 用groupby制作出crosstab
data.groupby(['Gender','Handedness']).count().unstack()
2.2 用pivot_table制作出crosstab
# 用pivot_table制作出crosstab
data.pivot_table('sample',index=['Gender'],columns='Handedness',aggfunc='count',margins=True)
2.3 crosstab函数
# crosstab1
pd.crosstab(data.Gender,data.Handedness,margins=True)
# crosstab2
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)