import numpy as np
import pandas as pd
df = pd.read_csv('./data/table.csv',index_col = 'ID')
df.head()
|
Unnamed: 0 |
School |
Class |
Gender |
Address |
Height |
Weight |
Math |
Physics |
ID |
|
|
|
|
|
|
|
|
|
1101 |
0 |
S_1 |
C_1 |
M |
street_1 |
173 |
63 |
34.0 |
A+ |
1102 |
1 |
S_1 |
C_1 |
F |
street_2 |
192 |
73 |
32.5 |
B+ |
1103 |
2 |
S_1 |
C_1 |
M |
street_2 |
186 |
82 |
87.2 |
B+ |
1104 |
3 |
S_1 |
C_1 |
F |
street_2 |
167 |
81 |
80.4 |
B- |
1105 |
4 |
S_1 |
C_1 |
F |
street_4 |
159 |
64 |
84.8 |
B+ |
聚合、过滤和变换
聚合(Aggregation)
常用聚合函数
- 聚合:将一系列数变成一个标量。e.g. mean/sum/size/count/std/var/sem/describe/first/last/nth/min/max
grouped_single = df.groupby('School')
group_m = grouped_single['Math']
group_m.std().values/np.sqrt(group_m.count().values)==group_m.sem().values
array([ True, True])
同时使用多个聚合函数
group_m.agg(['sum','mean','std'])
|
sum |
mean |
std |
School |
|
|
|
S_1 |
956.2 |
63.746667 |
23.077474 |
S_2 |
1191.1 |
59.555000 |
17.589305 |
利用元组进行重命名
group_m.agg([('rename_sum','sum'),('rename_mean','mean'),('rename_std','std')])
|
rename_sum |
rename_mean |
rename_std |
School |
|
|
|
S_1 |
956.2 |
63.746667 |
23.077474 |
S_2 |
1191.1 |
59.555000 |
17.589305 |
指定哪些函数作用哪些列
grouped_mul = df.groupby(['School','Class'])
grouped_mul.agg({'Math':['mean','max'],'Height':'var'})
|
|
Math |
Height |
|
|
mean |
max |
var |
School |
Class |
|
|
|
S_1 |
C_1 |
63.78 |
87.2 |
183.3 |
C_2 |
64.30 |
97.0 |
132.8 |
C_3 |
63.16 |
87.7 |
179.2 |
S_2 |
C_1 |
58.56 |
83.3 |
54.7 |
C_2 |
62.80 |
85.4 |
256.0 |
C_3 |
63.06 |
95.5 |
205.7 |
C_4 |
53.80 |
67.7 |
300.2 |
自定义函数
grouped_single['Math'].agg(lambda x:print(x.head(),'间隔'))
1101 34.0
1102 32.5
1103 87.2
1104 80.4
1105 84.8
Name: Math, dtype: float64 间隔
2101 83.3
2102 50.6
2103 52.5
2104 72.2
2105 34.2
Name: Math, dtype: float64 间隔
School
S_1 None
S_2 None
Name: Math, dtype: object
- RMK:官方没有提供计算极差的函数,可以用agg实现组内极差计算
grouped_single['Math'].agg(lambda x: x.max()-x.min())
School
S_1 65.5
S_2 62.8
Name: Math, dtype: float64
利用NamedAgg函数进行多个聚合
- RMK:不支持lambda函数,但是可以使用外置的def函数
def R1(x):
return x.max()-x.min()
def R2(x):
return x.max()-x.median()
grouped_single['Math'].agg(min_score1=pd.NamedAgg(column='col1',aggfunc=R1),max_score1=pd.NamedAgg(column='col2',aggfunc='max'),range_score2=pd.NamedAgg(column='col3',aggfunc=R2)).head()
|
min_score1 |
max_score1 |
range_score2 |
School |
|
|
|
S_1 |
65.5 |
97.0 |
33.5 |
S_2 |
62.8 |
95.5 |
39.4 |
带参数的聚合函数
def f(s,low,high):
return s.between(low,high).max()
grouped_single['Math'].agg(f,50,52)
School
S_1 False
S_2 True
Name: Math, dtype: bool
如果需要使用多个函数,并且其中至少有一个带参数,则使用wrap技巧
def f_test (s,low,high):
return s.between(low,high).max()
def agg_f(f_mul,name,*args,**kwargs):
def wrapper(x):
return f_mul(x,*args,**kwargs)
wrapper.__name__=name
return wrapper
new_f = agg_f(f_test,'at_least_one_in_50_52',50,52)
grouped_single['Math'].agg([new_f,'mean']).head()
|
at_least_one_in_50_52 |
mean |
School |
|
|
S_1 |
False |
63.746667 |
S_2 |
True |
59.555000 |
过滤(Filteration)
- filter函数:用来筛选某些组(结果为组的全体),故传入的值是布尔标量
grouped_single[['Math','Physics']].filter(lambda x: (x['Math']>30).all()).head()
|
Math |
Physics |
ID |
|
|
1101 |
34.0 |
A+ |
1102 |
32.5 |
B+ |
1103 |
87.2 |
B+ |
1104 |
80.4 |
B- |
1105 |
84.8 |
B+ |
变换(Transformation)
传入对象
- Transform函数中出传入的对象是组内的列,并且返回值需要与列长完全一致
grouped_single[['Math','Height']].transform(lambda x : x-x.min()).head()
|
Math |
Height |
ID |
|
|
1101 |
2.5 |
14 |
1102 |
1.0 |
33 |
1103 |
55.7 |
27 |
1104 |
48.9 |
8 |
1105 |
53.3 |
0 |
- 若返回了标量值,则组内的所有元素都会被广播为这个值
grouped_single[['Math','Height']].transform(lambda x:x.mean()).head()
|
Math |
Height |
ID |
|
|
1101 |
63.746667 |
175.733333 |
1102 |
63.746667 |
175.733333 |
1103 |
63.746667 |
175.733333 |
1104 |
63.746667 |
175.733333 |
1105 |
63.746667 |
175.733333 |
利用变换方法进行组内标准化
grouped_single[['Math','Height']].transform(lambda x:(x-x.mean())/x.std()).head()
|
Math |
Height |
ID |
|
|
1101 |
-1.288991 |
-0.214991 |
1102 |
-1.353990 |
1.279460 |
1103 |
1.016287 |
0.807528 |
1104 |
0.721627 |
-0.686923 |
1105 |
0.912289 |
-1.316166 |
利用变换方法进行组内缺失值的均值填充
df_nan_1 = df[['Math','School']].copy()
df_nan_1.head()
|
Math |
School |
ID |
|
|
1101 |
34.0 |
S_1 |
1102 |
32.5 |
S_1 |
1103 |
87.2 |
S_1 |
1104 |
80.4 |
S_1 |
1105 |
84.8 |
S_1 |
df_nan = df_nan_1.reset_index()
df_nan.loc[np.random.randint(0,df.shape[0],25),['Math']] = np.nan
df_nan.head()
|
ID |
Math |
School |
0 |
1101 |
NaN |
S_1 |
1 |
1102 |
NaN |
S_1 |
2 |
1103 |
87.2 |
S_1 |
3 |
1104 |
NaN |
S_1 |
4 |
1105 |
NaN |
S_1 |
df_nan.groupby('School').transform(lambda x:x.fillna(x.mean())).join(df.reset_index()['School']).head()
|
ID |
Math |
School |
0 |
1101 |
83.225 |
S_1 |
1 |
1102 |
83.225 |
S_1 |
2 |
1103 |
87.200 |
S_1 |
3 |
1104 |
83.225 |
S_1 |
4 |
1105 |
83.225 |
S_1 |
apply函数
apply函数的灵活性
df.groupby('School').apply(lambda x:print(x.head(1)))
Unnamed: 0 School Class Gender Address Height Weight Math Physics
ID
1101 0 S_1 C_1 M street_1 173 63 34.0 A+
Unnamed: 0 School Class Gender Address Height Weight Math Physics
ID
2101 15 S_2 C_1 M street_7 174 84 83.3 C
apply函数的灵活性很大程度上来源于其返回值的多样性
- 标量返回值
df[['School','Math','Height']].groupby('School').apply(lambda x:x.max())
|
School |
Math |
Height |
School |
|
|
|
S_1 |
S_1 |
97.0 |
195 |
S_2 |
S_2 |
95.5 |
194 |
- 列表返回值
df[['School','Math','Height']].groupby('School').apply(lambda x: x- x.min()).head()
|
Math |
Height |
ID |
|
|
1101 |
2.5 |
14.0 |
1102 |
1.0 |
33.0 |
1103 |
55.7 |
27.0 |
1104 |
48.9 |
8.0 |
1105 |
53.3 |
0.0 |
- 数据框返回值
df[['School','Math','Height']].groupby('School').apply(lambda x:pd.DataFrame({'col1':x['Math']-x['Math'].max()})).head()
|
col1 |
ID |
|
1101 |
-63.0 |
1102 |
-64.5 |
1103 |
-9.8 |
1104 |
-16.6 |
1105 |
-12.2 |
用apply同时统计多个指标
from collections import OrderedDict
def f(df):
data = OrderedDict()
data['M_sum'] = df['Math'].sum()
data['W_var'] = df['Weight'].var()
data['H_mean'] = df['Height'].mean()
return pd.Series(data)
grouped_single.apply(f)
|
M_sum |
W_var |
H_mean |
School |
|
|
|
S_1 |
956.2 |
117.428571 |
175.733333 |
S_2 |
1191.1 |
181.081579 |
172.950000 |