import pandas as pd
import numpy as np
创建一个8 * 4的DataFrame:
# Source data for the 8-row, 4-column demo frame: two string key columns used
# for grouping plus two columns of standard-normal random floats.
dict_obj = {
    'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
    'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'data1': np.random.randn(8),
    'data2': np.random.randn(8),
}
df_obj = pd.DataFrame(dict_obj)
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(df_obj)
data1 data2 key1 key2
0 0.922654 -0.555087 a one
1 1.419044 -1.686106 b one
2 0.312465 0.553175 a two
3 0.000348 1.212291 b three
4 -0.867338 0.051782 a two
5 0.187010 -1.447387 b two
6 0.463482 -0.132936 a one
7 -0.588881 0.330971 a three
groupBy对象有两类:DataFrameGroupBy,SeriesGroupBy
GroupBy对象本身并不进行实际的运算,只是保存了分组的中间数据;只有在其上调用聚合方法(如mean()、size())时才会真正计算。
把经过groupBy的DataFrame打印出来,是没有直接的数据的,如下,根据key1进行分组:
print df_obj.groupby('key1')
下面打印这个经过groupBy操作后的对象的类型,显示的类型是DataFrameGroupBy:
print type(df_obj.groupby('key1'))
可以指定DataFrame中的某一列根据某个列分组,如下,data1列根据key1进行分组,返回的类型是SeriesGroupBy
print type(df_obj['data1'].groupby(df_obj['key1']))
对DataFrame进行分组运算:以下例子中,整个DataFrame根据key1进行分组,并求出其他列在每组key1中的均值。因为key2是string类型的值,无法求均值,所以结果中只有data1、data2在a、b两个组中的均值。注意:较早版本的pandas会自动舍弃这类非数值列,而pandas 2.0及以上版本需要显式传入numeric_only=True,否则会报错。
# Group the whole frame by key1 and average each numeric column per group.
grouped1 = df_obj.groupby('key1')
# numeric_only=True keeps the string column key2 out of the mean. Older pandas
# dropped such columns silently; pandas 2.x raises TypeError instead.
print(grouped1.mean(numeric_only=True))
data1 data2
key1
a 0.048476 0.049581
b 0.535467 -0.640401
对DataFrame的某一列进行分组运算:
grouped2 = df_obj['data1'].groupby(df_obj['key1'])
print grouped2.mean()
key1
a 0.048476
b 0.535467
Name: data1, dtype: float64
.size是一种特殊的分组运算,它返回的是每个分组中的元素个数
# size
print grouped1.size()
print grouped2.size()
key1
a 5
b 3
dtype: int64
key1
a 5
b 3
dtype: int64
分组的方式可以归纳为以下几点:
# 1.按列名分组
print df_obj.groupby('key1')
# 2.按自定义key分组,列表
self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
print df_obj
print df_obj.groupby(self_def_key).size()
data1 data2 key1 key2
0 0.922654 -0.555087 a one
1 1.419044 -1.686106 b one
2 0.312465 0.553175 a two
3 0.000348 1.212291 b three
4 -0.867338 0.051782 a two
5 0.187010 -1.447387 b two
6 0.463482 -0.132936 a one
7 -0.588881 0.330971 a three
1 5
2 3
dtype: int64
# 按自定义key多层分组
print df_obj.groupby([df_obj['key1'], df_obj['key2']]).size()
key1 key2
a one 2
three 1
two 2
b one 1
three 1
two 1
dtype: int64
# 3.按列名多层分组
grouped2 = df_obj.groupby(['key1', 'key2'])
print grouped2.size()
key1 key2
a one 2
three 1
two 2
b one 1
three 1
two 1
dtype: int64
# 4.多层分组按key的顺序进行
grouped3 = df_obj.groupby(['key2', 'key1'])
print grouped3.mean()
print
print grouped3.mean().unstack()
data1 data2
key2 key1
one a 0.509819 -0.683087
b 0.379854 0.849821
three a 0.364664 0.014630
b -0.063710 -1.136585
two a -0.212407 -0.588067
b 0.854460 -0.474144
data1 data2
key1 a b a b
key2
one 0.509819 0.379854 -0.683087 0.849821
three 0.364664 -0.063710 0.014630 -1.136585
two -0.212407 0.854460 -0.588067 -0.474144
unstack会把行索引的最内层(这里是key1)转成列索引,使结果的行索引变为单层,相当于得到一张二维的交叉表。
groupBy对象支持迭代操作。每次迭代返回一个元组(group_name, group_data)。可用于分组数据的具体运算。
单层与多层的DataFrame都可以实现分组迭代,以下分别是两个例子:
# Single-level grouping: each iteration yields a (group_name, group_data) pair.
for group_name, group_data in grouped1:
    print(group_name)
    print(group_data)
a
data1 data2 key1 key2
0 -0.079203 -0.844599 a one
2 -0.961888 -1.502866 a two
4 0.537074 0.326732 a two
6 1.098841 -0.521574 a one
7 0.364664 0.014630 a three
b
data1 data2 key1 key2
1 0.379854 0.849821 b one
3 -0.063710 -1.136585 b three
5 0.854460 -0.474144 b two
# Multi-level grouping: group_name is a tuple holding one value per grouping key.
for group_name, group_data in grouped2:
    print(group_name)
    print(group_data)
('a', 'one')
data1 data2 key1 key2
0 -0.079203 -0.844599 a one
6 1.098841 -0.521574 a one
('a', 'three')
data1 data2 key1 key2
7 0.364664 0.01463 a three
('a', 'two')
data1 data2 key1 key2
2 -0.961888 -1.502866 a two
4 0.537074 0.326732 a two
('b', 'one')
data1 data2 key1 key2
1 0.379854 0.849821 b one
('b', 'three')
data1 data2 key1 key2
3 -0.06371 -1.136585 b three
('b', 'two')
data1 data2 key1 key2
5 0.85446 -0.474144 b two
# GroupBy对象转换list
print list(grouped1)
[('a', data1 data2 key1 key2
0 0.922654 -0.555087 a one
2 0.312465 0.553175 a two
4 -0.867338 0.051782 a two
6 0.463482 -0.132936 a one
7 -0.588881 0.330971 a three), ('b', data1 data2 key1 key2
1 1.419044 -1.686106 b one
3 0.000348 1.212291 b three
5 0.187010 -1.447387 b two)]
# GroupBy对象转换dict
print dict(list(grouped1))
{'a': data1 data2 key1 key2
0 0.922654 -0.555087 a one
2 0.312465 0.553175 a two
4 -0.867338 0.051782 a two
6 0.463482 -0.132936 a one
7 -0.588881 0.330971 a three, 'b': data1 data2 key1 key2
1 1.419044 -1.686106 b one
3 0.000348 1.212291 b three
5 0.187010 -1.447387 b two}
# 按列分组
print df_obj.dtypes
# 按数据类型分组
print df_obj.groupby(df_obj.dtypes, axis=1).size()
print df_obj.groupby(df_obj.dtypes, axis=1).sum()
data1 float64
data2 float64
key1 object
key2 object
dtype: object
float64 2
object 2
dtype: int64
float64 object
0 0.367567 aone
1 -0.267062 bone
2 0.865640 atwo
3 1.212639 bthree
4 -0.815557 atwo
5 -1.260377 btwo
6 0.330546 aone
7 -0.257910 athree
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
columns=['a', 'b', 'c', 'd', 'e'],
index=['A', 'B', 'C', 'D', 'E'])
print df_obj2
a b c d e
A 8 5 7 4 1
B 3 9 5 4 7
C 9 3 3 8 9
D 8 5 3 2 6
E 3 8 1 3 7
# Blank out part of the data: row position 1, column positions 1 to 3.
# .iloc replaces the removed .ix indexer (which fell back to positional lookup
# here because both axes carry string labels), and np.nan replaces the np.NaN
# alias that NumPy 2 no longer provides.
df_obj2.iloc[1, 1:4] = np.nan
print df_obj2
a b c d e
A 8 5.0 7.0 4.0 1
B 3 NaN NaN NaN 7
C 9 3.0 3.0 8.0 9
D 8 5.0 3.0 2.0 6
E 3 8.0 1.0 3.0 7
# 通过字典分组
mapping_dict = {'a':'python', 'b':'python', 'c':'java', 'd':'C', 'e':'java'}
print df_obj2.groupby(mapping_dict, axis=1).size()
print
print df_obj2.groupby(mapping_dict, axis=1).count() # 非NaN的个数
print
print df_obj2.groupby(mapping_dict, axis=1).sum()
C 1
java 2
python 2
dtype: int64
C java python
A 1 2 2
B 0 1 1
C 1 2 2
D 1 2 2
E 1 2 2
C java python
A 4.0 8.0 13.0
B NaN 7.0 3.0
C 8.0 12.0 12.0
D 2.0 9.0 13.0
E 3.0 8.0 11.0
df_obj3 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
columns=['a', 'b', 'c', 'd', 'e'],
index=['AA', 'BBB', 'CC', 'D', 'EE'])
print df_obj3
print
def group_key(idx):
    """Return the length of a single index label.

    When passed to ``groupby`` as the key, this is applied to each row or
    column label, so labels of equal length fall into the same group.

    Args:
        idx: one row or column index label (a string in this example).

    Returns:
        ``len(idx)``.
    """
    return len(idx)
print df_obj3.groupby(group_key).size()
print
# 以上自定义函数等价于
print df_obj3.groupby(len).size()
a b c d e
AA 9 1 8 1 4
BBB 3 2 9 3 2
CC 2 3 9 7 1
D 3 7 8 5 1
EE 9 9 9 4 1
1 1
2 3
3 1
dtype: int64
1 1
2 3
3 1
dtype: int64
# 创建一个多层索引的DataFrame
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
print df_obj4
language Python Java Python Java Python
index A A B C B
0 2 9 4 6 7
1 6 3 5 7 4
2 5 9 5 3 6
3 4 2 5 7 5
4 8 3 3 8 5
# 根据language进行分组
print df_obj4.groupby(level='language', axis=1).sum()
print
print df_obj4.groupby(level='index', axis=1).sum()
language Java Python
0 15 13
1 10 15
2 12 16
3 9 14
4 11 16
index A B C
0 11 11 6
1 9 9 7
2 14 11 3
3 6 10 7
4 11 8 8
聚合(aggregation)常用于对分组后的数据进行计算。
# 根据字典创建一个DataFrame
dict_obj = {'key1' : ['a', 'b', 'a', 'b',
'a', 'b', 'a', 'a'],
'key2' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'data1': np.random.randint(1,10, 8),
'data2': np.random.randint(1,10, 8)}
df_obj5 = pd.DataFrame(dict_obj)
print df_obj5
data1 data2 key1 key2
0 7 7 a one
1 2 1 b one
2 9 6 a two
3 3 7 b three
4 8 5 a two
5 2 2 b two
6 3 7 a one
7 9 6 a three
常用的有sum(), mean(), max(), min(), count(), size(), describe()
# Built-in aggregation functions.
# numeric_only=True limits sum/mean to data1 and data2. Without it, pandas 2.x
# concatenates the key2 strings in sum() and raises TypeError in mean().
# print('') emits a blank separator line on both Python 2 and Python 3.
print(df_obj5.groupby('key1').sum(numeric_only=True))
print('')
print(df_obj5.groupby('key1').max())
print('')
print(df_obj5.groupby('key1').min())
print('')
print(df_obj5.groupby('key1').mean(numeric_only=True))
print('')
print(df_obj5.groupby('key1').size())
print('')
print(df_obj5.groupby('key1').count())
print('')
print(df_obj5.groupby('key1').describe())
data1 data2
key1
a 36 31
b 7 10
data1 data2 key2
key1
a 9 7 two
b 3 7 two
data1 data2 key2
key1
a 3 5 one
b 2 1 one
data1 data2
key1
a 7.200000 6.200000
b 2.333333 3.333333
key1
a 5
b 3
dtype: int64
data1 data2 key2
key1
a 5 5 5
b 3 3 3
data1 data2
key1
a count 5.000000 5.000000
mean 7.200000 6.200000
std 2.489980 0.836660
min 3.000000 5.000000
25% 7.000000 6.000000
50% 8.000000 6.000000
75% 9.000000 7.000000
max 9.000000 7.000000
b count 3.000000 3.000000
mean 2.333333 3.333333
std 0.577350 3.214550
min 2.000000 1.000000
25% 2.000000 1.500000
50% 2.000000 2.000000
75% 2.500000 4.500000
max 3.000000 7.000000
可自定义函数,传入agg方法中
• grouped.agg(func)
• func的参数为groupby索引对应的记录
# Custom aggregation function
def peak_range(df):
    """Return the spread of the values: maximum minus minimum.

    Args:
        df: the records belonging to one group, as handed over by ``agg``.

    Returns:
        ``df.max() - df.min()``.
    """
    return df.max() - df.min()
# Aggregate only the numeric columns: max - min is undefined for the strings in
# key2, and recent pandas raises instead of silently dropping that column.
print(df_obj5.groupby('key1')[['data1', 'data2']].agg(peak_range))
# print('') emits a blank separator line on both Python 2 and Python 3.
print('')
print(df_obj.groupby('key1')[['data1', 'data2']].agg(lambda df: df.max() - df.min()))
data1 data2
key1
a 6 2
b 1 6
data1 data2
key1
a 1.789992 1.108262
b 1.418696 2.898396
# By default each result column is named after the function that produced it.
# Only the numeric columns are aggregated; mean/std are undefined for the
# strings in key2.
print(df_obj.groupby('key1')[['data1', 'data2']].agg(['mean', 'std', 'count', peak_range]))
data1 data2
mean std count peak_range mean std count peak_range
key1
a 0.048476 0.750174 5 1.789992 0.049581 0.427705 5 1.108262
b 0.535467 0.770871 3 1.418696 -0.640401 1.608911 3 2.898396
# A (name, function) tuple supplies a custom name for the result column.
# Only the numeric columns are aggregated; mean/std are undefined for the
# strings in key2.
print(df_obj.groupby('key1')[['data1', 'data2']].agg(['mean', 'std', 'count', ('range', peak_range)]))
data1 data2
mean std count range mean std count range
key1
a 0.191898 0.770756 5 2.060729 -0.505536 0.719919 5 1.829598
b 0.390201 0.459173 3 0.918170 -0.253636 1.011395 3 1.986406
# 每列作用不同的聚合函数
dict_mapping = {'data1':'mean',
'data2':'sum'}
print df_obj.groupby('key1').agg(dict_mapping)
data1 data2
key1
a 0.048476 0.247905
b 0.535467 -1.921202
dict_mapping = {'data1':['mean','max'],
'data2':'sum'}
print df_obj.groupby('key1').agg(dict_mapping)
data1 data2
mean max sum
key1
a 0.048476 0.922654 0.247905
b 0.535467 1.419044 -1.921202
注:部分例子来自于小象学院Robin课程