import pandas as pd
import math
# Sample table: one row per (country, record). Both numeric columns contain
# a missing value (NaN) so the NaN handling of the aggregations is visible.
data = pd.DataFrame(
    data={
        "country": [
            "Russian",
            "China",
            "America",
            "Brazil",
            "China",
            "Japan",
            "America",
        ],
        "plane": [1170, 960, 980, 800, math.nan, 300, 3244],
        "Population": [1, 13, math.nan, 2, 14, 3, 5],
    }
)
# Bare expression: displays the frame when run in a notebook/REPL.
data
"""
country plane Population
0 Russian 1170.0 1.0
1 China 960.0 13.0
2 America 980.0 NaN
3 Brazil 800.0 2.0
4 China NaN 14.0
5 Japan 300.0 3.0
6 America 3244.0 5.0
"""
# Split the rows of data into one sub-DataFrame per distinct country value.
group = data.groupby(by="country")
# Materialize the groups as (country, sub-DataFrame) pairs to inspect them.
list(group)
"""
[('America',
country plane Population
2 America 980.0 NaN
6 America 3244.0 5.0),
('Brazil',
country plane Population
3 Brazil 800.0 2.0),
('China',
country plane Population
1 China 960.0 13.0
4 China NaN 14.0),
('Japan',
country plane Population
5 Japan 300.0 3.0),
('Russian',
country plane Population
0 Russian 1170.0 1.0)]
"""
## After groupby, statistics such as count, mean and unique can be applied to each sub-DataFrame.
# Group by country, then count the non-NaN plane values in each group
# (China has two rows but one plane value is NaN, so its count is 1).
data.groupby("country")["plane"].count()
# output
"""
country
America 2
Brazil 1
China 1
Japan 1
Russian 1
Name: plane, dtype: int64
"""
agg() 是 groupby() 返回的分组对象上最常用的方法之一。agg() 可以同时对分组后的数据应用多个不同的聚合函数。
# Compute the minimum and maximum plane value for each country.
# The aggregations are given as string aliases: passing the builtins min/max
# to agg() is deprecated since pandas 2.1 (FutureWarning), and the aliases
# keep the NaN-skipping groupby implementations. The resulting column names
# ("min", "max") are unchanged.
data.groupby("country").plane.agg(["min", "max"])
# output
"""
min max
country
America 980.0 3244.0
Brazil 800.0 800.0
China 960.0 960.0
Japan 300.0 300.0
Russian 1170.0 1170.0
"""
groupby() 的返回结果默认按分组键(即结果的 index)的值排序。我们可以使用 sort_values() 方法按某一列的值自定义行的顺序,
也可以使用 sort_index() 方法按 index 值的大小进行排列。
# Minimum plane value per country, as a one-column DataFrame named "min".
# The string alias "min" replaces the builtin min, whose use in agg() is
# deprecated since pandas 2.1 and emits a FutureWarning.
country_min = data.groupby("country").plane.agg(["min"])
print(country_min)
#output
"""
min
country
America 980.0
Brazil 800.0
China 960.0
Japan 300.0
Russian 1170.0
"""
# Order the rows by the "min" column, largest value first.
country_min.sort_values("min", ascending=False)
"""
min
country
Russian 1170.0
America 980.0
China 960.0
Brazil 800.0
Japan 300.0
"""
# Order the rows by their index labels (the country names), descending.
country_min.sort_index(axis=0, ascending=False)
# output
"""
min
country
Russian 1170.0
Japan 300.0
China 960.0
Brazil 800.0
America 980.0
"""
参考:
(1)https://www.kaggle.com/learn/pandas
(2)https://zhuanlan.zhihu.com/p/101284491