import pandas as pd
# Sample dataset: ten people, each with scores for four courses
# (Math_A, English_A, Math_B, English_B).
# 'Project_num' is the number of projects each person is responsible for
# and 'Sex' is recorded as 'F' or 'M'.
raw_columns = {
    'name': ['Alice', 'Bob', 'Cathy', 'Dany', 'Ella', 'Ford', 'Gary', 'Ham', 'Ico', 'Jack'],
    'Math_A': [1.1, 2.2, 3.3, 4.4, 5, 3.2, 2.4, 1.5, 4.3, 4.5],
    'English_A': [3, 2.6, 2, 1.7, 3, 3.3, 4.4, 5, 3.2, 2.4],
    'Math_B': [1.7, 2.5, 3.6, 2.4, 5, 2.2, 3.3, 4.4, 1.5, 4.3],
    'English_B': [5, 2.6, 2.4, 1.3, 3, 3.6, 2.4, 5, 2.2, 3.1],
    'Project_num': [2, 3, 0, 1, 7, 2, 1, 5, 3, 4],
    'Sex': ['F', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F', 'M'],
}
# Wrap every column in a Series so `data` stays a dict of Series objects.
data = {label: pd.Series(values) for label, values in raw_columns.items()}
df = pd.DataFrame(data)
print(df)
运行结果:
name Math_A English_A Math_B English_B Project_num Sex
0 Alice 1.1 3.0 1.7 5.0 2 F
1 Bob 2.2 2.6 2.5 2.6 3 M
2 Cathy 3.3 2.0 3.6 2.4 0 M
3 Dany 4.4 1.7 2.4 1.3 1 F
4 Ella 5.0 3.0 5.0 3.0 7 M
5 Ford 3.2 3.3 2.2 3.6 2 F
6 Gary 2.4 4.4 3.3 2.4 1 M
7 Ham 1.5 5.0 4.4 5.0 5 M
8 Ico 4.3 3.2 1.5 2.2 3 F
9 Jack 4.5 2.4 4.3 3.1 4 M
# Shift the whole table along the rows. Shifting down by one is like
# inserting an empty (NaN) row at the top; the last row has no index label
# left to land on, so it disappears from the result.
shifted_down = df.shift(periods=1)   # move every row down by 1
print(shifted_down)
print('\n')
shifted_up = df.shift(periods=-2)    # move every row up by 2
print(shifted_up)
print('\n')
运行结果:
name Math_A English_A Math_B English_B Project_num Sex
0 NaN NaN NaN NaN NaN NaN NaN
1 Alice 1.1 3.0 1.7 5.0 2.0 F
2 Bob 2.2 2.6 2.5 2.6 3.0 M
3 Cathy 3.3 2.0 3.6 2.4 0.0 M
4 Dany 4.4 1.7 2.4 1.3 1.0 F
5 Ella 5.0 3.0 5.0 3.0 7.0 M
6 Ford 3.2 3.3 2.2 3.6 2.0 F
7 Gary 2.4 4.4 3.3 2.4 1.0 M
8 Ham 1.5 5.0 4.4 5.0 5.0 M
9 Ico 4.3 3.2 1.5 2.2 3.0 F
name Math_A English_A Math_B English_B Project_num Sex
0 Cathy 3.3 2.0 3.6 2.4 0.0 M
1 Dany 4.4 1.7 2.4 1.3 1.0 F
2 Ella 5.0 3.0 5.0 3.0 7.0 M
3 Ford 3.2 3.3 2.2 3.6 2.0 F
4 Gary 2.4 4.4 3.3 2.4 1.0 M
5 Ham 1.5 5.0 4.4 5.0 5.0 M
6 Ico 4.3 3.2 1.5 2.2 3.0 F
7 Jack 4.5 2.4 4.3 3.1 4.0 M
8 NaN NaN NaN NaN NaN NaN NaN
9 NaN NaN NaN NaN NaN NaN NaN
# Shift along the column axis instead of the row axis.
# In the recorded output below, cells whose dtype is incompatible with the
# column they land in are shown as NaN.
# NOTE(review): cross-dtype behaviour of a column shift may differ between
# pandas versions -- confirm against the installed release.
shifted_right = df.shift(periods=1, axis='columns')   # shift right by 1 column
print(shifted_right)
print('\n')
shifted_left = df.shift(periods=-2, axis='columns')   # shift left by 2 columns
print(shifted_left)
print('\n')
运行结果:
name Math_A English_A Math_B English_B Project_num Sex
0 NaN NaN 1.1 3.0 1.7 NaN Alice
1 NaN NaN 2.2 2.6 2.5 NaN Bob
2 NaN NaN 3.3 2.0 3.6 NaN Cathy
3 NaN NaN 4.4 1.7 2.4 NaN Dany
4 NaN NaN 5.0 3.0 5.0 NaN Ella
5 NaN NaN 3.2 3.3 2.2 NaN Ford
6 NaN NaN 2.4 4.4 3.3 NaN Gary
7 NaN NaN 1.5 5.0 4.4 NaN Ham
8 NaN NaN 4.3 3.2 1.5 NaN Ico
9 NaN NaN 4.5 2.4 4.3 NaN Jack
name Math_A English_A Math_B English_B Project_num Sex
0 NaN 1.7 5.0 NaN NaN NaN NaN
1 NaN 2.5 2.6 NaN NaN NaN NaN
2 NaN 3.6 2.4 NaN NaN NaN NaN
3 NaN 2.4 1.3 NaN NaN NaN NaN
4 NaN 5.0 3.0 NaN NaN NaN NaN
5 NaN 2.2 3.6 NaN NaN NaN NaN
6 NaN 3.3 2.4 NaN NaN NaN NaN
7 NaN 4.4 5.0 NaN NaN NaN NaN
8 NaN 1.5 2.2 NaN NaN NaN NaN
9 NaN 4.3 3.1 NaN NaN NaN NaN
# Split the table by 'Sex' and shift each group down by one row on its own,
# so the first row of every group becomes NaN.
# The loop variables are named `sex` / `group` so the module-level `data`
# dict is not overwritten by the loop.
for sex, group in df.groupby(by='Sex'):
    print(sex)
    print(group.shift(1))
    print('\n')
运行结果:
F
name Math_A English_A Math_B English_B Project_num Sex
0 NaN NaN NaN NaN NaN NaN NaN
3 Alice 1.1 3.0 1.7 5.0 2.0 F
5 Dany 4.4 1.7 2.4 1.3 1.0 F
8 Ford 3.2 3.3 2.2 3.6 2.0 F
M
name Math_A English_A Math_B English_B Project_num Sex
1 NaN NaN NaN NaN NaN NaN NaN
2 Bob 2.2 2.6 2.5 2.6 3.0 M
4 Cathy 3.3 2.0 3.6 2.4 0.0 M
6 Ella 5.0 3.0 5.0 3.0 7.0 M
7 Gary 2.4 4.4 3.3 2.4 1.0 M
9 Ham 1.5 5.0 4.4 5.0 5.0 M
# Slide a window of length 3 from top to bottom, one row at a time, and sum
# the values covered by the window at each step.
# For 'Math_A' this gives, in order:
#   1.1, 1.1+2.2=3.3, 1.1+2.2+3.3=6.6, 2.2+3.3+4.4=9.9, 3.3+4.4+5=12.7, ...
# min_periods=1 lets the first windows produce a value before they are full.
# `center` controls whether the window is centred on the current row instead
# of ending at it; with center=True the sums would start
#   1.1+2.2=3.3, 1.1+2.2+3.3=6.6, 2.2+3.3+4.4=9.9, ...
# Typical use: a rolling total of traded volume over the past 30 days.
#
# Only numeric columns can be summed, so the string columns ('name', 'Sex')
# are excluded explicitly; recent pandas releases raise an error instead of
# silently dropping them. The rows axis is the default, so no `axis`
# argument is passed.
numeric_df = df.select_dtypes(include='number')
print(numeric_df.rolling(window=3, min_periods=1, center=False).sum())
print('\n')
运行结果:
Math_A English_A Math_B English_B Project_num
0 1.1 3.0 1.7 5.0 2.0
1 3.3 5.6 4.2 7.6 5.0
2 6.6 7.6 7.8 10.0 5.0
3 9.9 6.3 8.5 6.3 4.0
4 12.7 6.7 11.0 6.7 8.0
5 12.6 8.0 9.6 7.9 10.0
6 10.6 10.7 10.5 9.0 10.0
7 7.1 12.7 9.9 11.0 8.0
8 8.2 12.6 9.2 9.6 9.0
9 10.3 10.6 10.2 10.3 12.0
# Group by sex first, then walk through each group three people at a time
# and report the average number of projects they are responsible for.
# min_periods defaults to None, meaning a value is only produced once the
# window is full, so the first two entries of every group are NaN.
# This has little practical meaning for this dataset; rolling statistics are
# mostly used on time series, e.g. to compute moving averages.
projects_by_sex = df.groupby(['Sex'])['Project_num']
rolling_mean = projects_by_sex.rolling(window=3).mean()
print(rolling_mean)
print('\n')
运行结果:
Sex
F 0 NaN
3 NaN
5 1.666667
8 2.000000
M 1 NaN
2 NaN
4 3.333333
6 2.666667
7 4.333333
9 3.333333
Name: Project_num, dtype: float64
# Use each person's name as the row index.
df_2 = df.set_index('name')
print(df_2)
print('\n')

# Display the table one sex group at a time.
# Grouping by the scalar key 'Sex' (not the list ['Sex']) keeps the group
# label a plain string such as 'F' rather than a one-element tuple.
for sex, group in df_2.groupby('Sex'):
    print(sex)
    print(group)
    print('\n')

# Rank every person within each column, largest value first.
# Tied entries receive the average of the ranks they span, e.g. two entries
# tied for ranks 2 and 3 both get 2.5.
df_3 = df_2.rank(ascending=False)
print(df_3)
print('\n')
运行结果:
Math_A English_A Math_B English_B Project_num Sex
name
Alice 1.1 3.0 1.7 5.0 2 F
Bob 2.2 2.6 2.5 2.6 3 M
Cathy 3.3 2.0 3.6 2.4 0 M
Dany 4.4 1.7 2.4 1.3 1 F
Ella 5.0 3.0 5.0 3.0 7 M
Ford 3.2 3.3 2.2 3.6 2 F
Gary 2.4 4.4 3.3 2.4 1 M
Ham 1.5 5.0 4.4 5.0 5 M
Ico 4.3 3.2 1.5 2.2 3 F
Jack 4.5 2.4 4.3 3.1 4 M
Math_A English_A Math_B English_B Project_num Sex
name
Alice 10.0 5.5 9.0 1.5 6.5 8.5
Bob 8.0 7.0 6.0 6.0 4.5 3.5
Cathy 5.0 9.0 4.0 7.5 10.0 3.5
Dany 3.0 10.0 7.0 10.0 8.5 8.5
Ella 1.0 5.5 1.0 5.0 1.0 3.5
Ford 6.0 3.0 8.0 3.0 6.5 8.5
Gary 7.0 2.0 5.0 7.5 8.5 3.5
Ham 9.0 1.0 2.0 1.5 2.0 3.5
Ico 4.0 4.0 10.0 9.0 4.5 8.5
Jack 2.0 8.0 3.0 4.0 3.0 3.5
# Display the table one sex group at a time.
# Grouping by the scalar key 'Sex' (not the list ['Sex']) keeps the group
# label a plain string such as 'F' rather than a one-element tuple.
for sex, group in df_2.groupby('Sex'):
    print(sex)
    print(group)
    print('\n')

# Rank every person within their own sex group, column by column, largest
# value first. Sorting the ranks by 'English_A' in ascending order then lists
# people from the highest English_A score in their group to the lowest.
df_4 = df_2.groupby('Sex').rank(ascending=False).sort_values('English_A')
print(df_4)
print('\n')
运行结果:
F
Math_A English_A Math_B English_B Project_num Sex
name
Alice 1.1 3.0 1.7 5.0 2 F
Dany 4.4 1.7 2.4 1.3 1 F
Ford 3.2 3.3 2.2 3.6 2 F
Ico 4.3 3.2 1.5 2.2 3 F
M
Math_A English_A Math_B English_B Project_num Sex
name
Bob 2.2 2.6 2.5 2.6 3 M
Cathy 3.3 2.0 3.6 2.4 0 M
Ella 5.0 3.0 5.0 3.0 7 M
Gary 2.4 4.4 3.3 2.4 1 M
Ham 1.5 5.0 4.4 5.0 5 M
Jack 4.5 2.4 4.3 3.1 4 M
Math_A English_A Math_B English_B Project_num
name
Ford 3.0 1.0 2.0 2.0 2.5
Ham 6.0 1.0 2.0 1.0 2.0
Gary 4.0 2.0 5.0 5.5 5.0
Ico 2.0 2.0 4.0 3.0 1.0
Alice 4.0 3.0 3.0 1.0 2.5
Ella 1.0 3.0 1.0 3.0 1.0
Bob 5.0 4.0 6.0 4.0 4.0
Dany 1.0 4.0 1.0 4.0 4.0
Jack 2.0 5.0 3.0 2.0 3.0
Cathy 3.0 6.0 4.0 5.5 6.0
# Rank the English_A scores from smallest to largest (ascending=True).
# pct=True reports each rank as a fraction of the number of entries, so all
# values fall in the range 0~1.
english_a_scores = df_2['English_A']
df_5 = english_a_scores.rank(pct=True, ascending=True)
print(df_5)
print('\n')
运行结果:
name
Alice 0.55
Bob 0.40
Cathy 0.20
Dany 0.10
Ella 0.55
Ford 0.80
Gary 0.90
Ham 1.00
Ico 0.70
Jack 0.30
Name: English_A, dtype: float64
# Tie-breaking strategies for rank(). Alice and Ella both scored 3 in
# English_A, so they occupy ranks 5 and 6 between them:
#   method='first' - ties are ordered by appearance in the table, so Alice
#                    (who appears first) gets 5 and Ella gets 6.
#   method='min'   - every tied entry receives the lowest rank of the tie,
#                    i.e. both would get 5.
#   method='max'   - every tied entry receives the highest rank of the tie,
#                    i.e. both would get 6.
english_a_scores = df_2['English_A']
df_6 = english_a_scores.rank(ascending=True, method='first')
print(df_6)
print('\n')

# method='dense' - Alice and Ella share rank 5 and the next distinct score
# continues at 6 with no gap, so the ranks only run from 1 to 9 here.
df_6 = english_a_scores.rank(ascending=True, method='dense')
print(df_6)
print('\n')
运行结果:
name
Alice 5.0
Bob 4.0
Cathy 2.0
Dany 1.0
Ella 6.0
Ford 8.0
Gary 9.0
Ham 10.0
Ico 7.0
Jack 3.0
Name: English_A, dtype: float64
name
Alice 5.0
Bob 4.0
Cathy 2.0
Dany 1.0
Ella 5.0
Ford 7.0
Gary 8.0
Ham 9.0
Ico 6.0
Jack 3.0
Name: English_A, dtype: float64
# Running total: add each person's project count to the sum of everyone
# listed before them.
running_total = df_2['Project_num'].cumsum()
print(running_total)
print('\n')
运行结果:
name
Alice 2
Bob 5
Cathy 5
Dany 6
Ella 13
Ford 15
Gary 16
Ham 21
Ico 24
Jack 28
Name: Project_num, dtype: int64
project_counts = df_2['Project_num']

# Running maximum: the largest project count seen so far, row by row.
print(project_counts.cummax())
print('\n')

# Running minimum: the smallest project count seen so far, row by row.
print(project_counts.cummin())
print('\n')
运行结果:
name
Alice 2
Bob 3
Cathy 3
Dany 3
Ella 7
Ford 7
Gary 7
Ham 7
Ico 7
Jack 7
Name: Project_num, dtype: int64
name
Alice 2
Bob 2
Cathy 0
Dany 0
Ella 0
Ford 0
Gary 0
Ham 0
Ico 0
Jack 0
Name: Project_num, dtype: int64
# Running product: multiply each person's project count into the product of
# everyone listed before them.
running_product = df_2['Project_num'].cumprod()
print(running_product)
print('\n')
运行结果:
name
Alice 2
Bob 6
Cathy 0
Dany 0
Ella 0
Ford 0
Gary 0
Ham 0
Ico 0
Jack 0
Name: Project_num, dtype: int64
# Group by sex, then compute several summary statistics of 'Math_A' for
# each group in a single call.
math_a_by_sex = df_2.groupby('Sex')['Math_A']
df_agg = math_a_by_sex.agg(['min', 'mean', 'max'])
print(df_agg)
print('\n')
运行结果:
min mean max
Sex
F 1.1 3.25 4.4
M 1.5 3.15 5.0
# A dict selects several columns at once and assigns each column its own
# list of statistics to compute per sex group.
agg_column = {
    'Math_A': ['min', 'max'],
    'Math_B': ['max', 'mean'],
}
grouped_by_sex = df_2.groupby('Sex')
df_agg = grouped_by_sex.agg(agg_column)
print(df_agg)
print('\n')
运行结果:
Math_A Math_B
min max max mean
Sex
F 1.1 4.4 2.4 1.95
M 1.5 5.0 5.0 3.85