#coding=utf-8
import pandas as pd
import numpy as np
df=pd.read_csv("groupby.csv")
print(df)
'''
company salary age
0 A 13 40
1 A 49 18
2 C 15 16
3 B 40 49
4 A 48 40
5 A 12 27
6 B 17 18
7 B 5 39
8 B 37 19
9 B 15 18
'''
#df.groupby()按哪些字段进行组,返回一个DataFrameGroupBy对象
print(df.groupby("company"))
'''
'''
#通过list()查看DataFrameGroupBy对象的内部情况
print(list(df.groupby("company")))
'''
解释:groupby的过程就是将原有的DataFrame按照groupby的字段(这里是company),划分为若干个分组DataFrame对象
[
('A',
company salary age
0 A 13 40
1 A 49 18
4 A 48 40
5 A 12 27),
('B',
company salary age
3 B 40 49
6 B 17 18
7 B 5 39
8 B 37 19
9 B 15 18),
('C',
company salary age
2 C 15 16)
]
'''
#get_group()查看分组之后其中某个分组DataFrame
g=df.groupby("company")
print(g.get_group("A"))
'''
company salary age
0 A 13 40
1 A 49 18
4 A 48 40
5 A 12 27
'''
#通过groupby分组之后,可用通过agg()函数指定聚合操作
'''
聚合函数:
min 最小值
max 最大值
sum 求和
mean 求平均值
median 求中位数
count 计数
nunique 计算去重后个数(类似sql的 count distinct)
'''
#计算不同公司员工的平均年龄和平均薪资
g=df.groupby("company")
print(g.agg({"salary":"mean","age":"mean"}))
print(g.agg("mean"))
'''
age salary
company
A 31.25 30.5
B 28.60 22.8
C 16.00 15.0
salary age
company
A 30.5 31.25
B 22.8 28.60
C 15.0 16.00
'''
#指定列,行进行操作
#计算不同公司员工的平均年龄以及薪水的中位数
g=df.groupby("company")
print(g.agg({"salary":"median","age":"mean"}))
'''
age salary
company
A 31.25 30.5
B 28.60 17.0
C 16.00 15.0
'''
#计算不同公司员工的平均年龄以及薪水的中位数,均值,总人数
g=df.groupby("company")
print(g.agg({"age":"mean","salary":["median","mean","count"]}))
'''
salary age
median mean count mean
company
A 30.5 30.5 4 31.25
B 17.0 22.8 5 28.60
C 15.0 15.0 1 16.00
'''
'''
count:计数,不包含NaN的值
size: 计数,包含NaN的值
nuinque: 计算去重后的个数(类似sql count distinct)
'''
df=pd.DataFrame({"Name":["Alice","Bob","Mallory","Mallory","Bob","Mallory"],
"City":["Seattle","Seattle","Portland","Seattle","Seattle","Portland"],
"Val":[4,3,3,np.nan,np.nan,4]})
print(df)
'''
City Name Val
0 Seattle Alice 4.0
1 Seattle Bob 3.0
2 Portland Mallory 3.0
3 Seattle Mallory NaN
4 Seattle Bob NaN
5 Portland Mallory 4.0
'''
g=df.groupby("City")
print(g.agg({"Val":["count","size"]}))
'''
由结果得知:Seattle的count=2,size=4,因为size统计了两个NaN的记录
Val
count size
City
Portland 2 2
Seattle 2 4
'''
print(g.agg({"Val":["count","size"],"Name":"nunique"}))
'''
由结果得知:Seattle分组之后,去重的姓名有三个;Portland分组之后,去重的姓名只有一个
Val Name
count size nunique
City
Portland 2 2 1
Seattle 2 4 3
'''