import numpy as np
import pandas as pd
# Sample data for the groupby walkthrough: one row per (team, year) entry
# carrying that entry's rank and points.
# NOTE(review): 'kings' (lower-case) appears once in the original data and is
# kept as-is; confirm whether it was meant to be 'Kings'.
ipl_data = {
    'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
             'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
    'Points': [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690],
}
df = pd.DataFrame(ipl_data)
print(df)

# Split the rows into groups keyed by the 'Year' column.
grouped = df.groupby('Year')
print(grouped)

# Inspect the grouping result: each year maps to the row labels in that group.
print(grouped.groups, '-------分组内容')
print('*' * 45)

# Iterate over the groups; every step yields (group key, sub-DataFrame).
# The loop body must be indented -- the flattened original raised
# IndentationError here.
for year, group in grouped:
    print(year)
    print(group)
print('*' * 45)

# Fetch a single group by its key (the argument is the group name).
g2014 = grouped.get_group(2014)

# Work on that one group in isolation, e.g. sort it by points (ascending).
# sort_values returns a new frame by default, so g2014 itself is unchanged.
g = g2014.sort_values('Points', ascending=True)
print(g)

# Aggregate per group: mean of 'Points' for every year.
# String aggregation names are used instead of NumPy callables, which recent
# pandas releases deprecate when passed to agg().
print(grouped['Points'].agg('mean'))

# Several aggregations at once; the result has one column per aggregation,
# labelled 'mean', 'max' and 'min' (the NumPy-callable form used previously
# produced the labels 'mean', 'amax' and 'amin').
print(grouped['Points'].agg(['mean', 'max', 'min']))
print(grouped)
E:\Anaconda\python.exe E:/Python达内/网络并发/data_analysis/6_pandas_study/demo16.py
Team Rank Year Points
0 Riders 1 2014 876
1 Riders 2 2015 789
2 Devils 2 2014 863
3 Devils 3 2015 673
4 Kings 3 2014 741
5 kings 4 2015 812
6 Kings 1 2016 756
7 Kings 1 2017 788
8 Riders 2 2016 694
9 Royals 4 2014 701
10 Royals 1 2015 804
11 Riders 2 2017 690
{2014: [0, 2, 4, 9], 2015: [1, 3, 5, 10], 2016: [6, 8], 2017: [7, 11]} -------分组内容
*********************************************
2014
Team Rank Year Points
0 Riders 1 2014 876
2 Devils 2 2014 863
4 Kings 3 2014 741
9 Royals 4 2014 701
2015
Team Rank Year Points
1 Riders 2 2015 789
3 Devils 3 2015 673
5 kings 4 2015 812
10 Royals 1 2015 804
2016
Team Rank Year Points
6 Kings 1 2016 756
8 Riders 2 2016 694
2017
Team Rank Year Points
7 Kings 1 2017 788
11 Riders 2 2017 690
*********************************************
Team Rank Year Points
9 Royals 4 2014 701
4 Kings 3 2014 741
2 Devils 2 2014 863
0 Riders 1 2014 876
Year
2014 795.25
2015 769.50
2016 725.00
2017 739.00
Name: Points, dtype: float64
mean amax amin
Year
2014 795.25 876 701
2015 769.50 812 673
2016 725.00 756 694
2017 739.00 788 690
Pandas具有功能全面的高性能内存中连接操作,与SQL等关系数据库非常相似。
Pandas提供了一个单独的merge()函数,作为DataFrame对象之间所有标准数据库连接操作的入口。
import pandas as pd
# Student table: one row per student, referring to a class through class_id.
student_columns = {
    'student_id': list(range(1, 21)),
    'student_name': [
        'Alex', 'Amy', 'Allen', 'Alice', 'Ayoung',
        'Billy', 'Brian', 'Bran', 'Bryce', 'Betty',
        'Emma', 'Marry', 'Allen', 'Jean', 'Rose',
        'David', 'Tom', 'Jack', 'Daniel', 'Andrew',
    ],
    'class_id': [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2],
    'gender': ['M', 'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M',
               'F', 'F', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'F'],
    'age': [20, 21, 22, 20, 21, 22, 23, 20, 21, 22,
            20, 21, 22, 23, 20, 21, 22, 20, 21, 22],
    'score': [98, 74, 67, 38, 65, 29, 32, 34, 85, 64,
              52, 38, 26, 89, 68, 46, 32, 78, 79, 87],
}
left = pd.DataFrame(student_columns)

# Class lookup table: class_id -> class_name. It has no entry for class 4
# and lists a class 5 that no student belongs to.
class_columns = {
    'class_id': [1, 2, 3, 5],
    'class_name': ['ClassA', 'ClassB', 'ClassC', 'ClassE'],
}
right = pd.DataFrame(class_columns)

# Merge the two DataFrames. With no arguments this is an inner join on the
# only column the frames share (class_id), so rows without a partner on the
# other side are dropped.
data = left.merge(right)
print(data)
E:\Anaconda\python.exe E:/Python达内/网络并发/data_analysis/6_pandas_study/demo17.py
student_id student_name class_id gender age score class_name
0 1 Alex 1 M 20 98 ClassA
1 2 Amy 1 M 21 74 ClassA
2 3 Allen 1 F 22 67 ClassA
3 11 Emma 1 F 20 52 ClassA
4 12 Marry 1 F 21 38 ClassA
5 13 Allen 1 M 22 26 ClassA
6 4 Alice 2 F 20 38 ClassB
7 5 Ayoung 2 M 21 65 ClassB
8 6 Billy 2 M 22 29 ClassB
9 14 Jean 2 M 23 89 ClassB
10 15 Rose 2 F 20 68 ClassB
11 16 David 2 F 21 46 ClassB
12 20 Andrew 2 F 22 87 ClassB
13 7 Brian 3 F 23 32 ClassC
14 8 Bran 3 F 20 34 ClassC
15 9 Bryce 3 M 21 85 ClassC
16 17 Tom 3 M 22 32 ClassC
17 18 Jack 3 M 20 78 ClassC
18 19 Daniel 3 F 21 79 ClassC
Process finished with exit code 0
# The four join flavours below all match rows on 'class_id', the only column
# the two frames share. (The key was previously given as 'subject_id', a
# column neither frame has, which makes pd.merge raise KeyError.)

# Left join: keep every row of `left`; class_name is NaN for students whose
# class_id has no entry in `right`.
rs = pd.merge(left, right, on='class_id', how='left')
print(rs)

# Right join: keep every row of `right`; the student columns are NaN for
# classes that no student in `left` refers to.
rs = pd.merge(left, right, on='class_id', how='right')
print(rs)

# Outer join: keep the rows of both frames, filling NaN on whichever side
# has no match.
rs = pd.merge(left, right, on='class_id', how='outer')
print(rs)

# Inner join: keep only the rows whose class_id appears in both frames.
rs = pd.merge(left, right, on='class_id', how='inner')
print(rs)
其余关于Pandas的基本操作