# 定义一个创建DataFrame某种形式的函数
def make_df(cols, ind):
data = {c:[str(c)+str(i) for i in ind] for c in cols}
return pd.DataFrame(data,ind)
## 通过pd.concat实现简易合并
ser1 = pd.Series(['A','B','C'],index=[1,2,3])
ser2 = pd.Series(['D','E','F'],index=[4,5,6])
print(pd.concat([ser1,ser2]))
'''
1 A
2 B
3 C
4 D
5 E
6 F
dtype: object
'''
df1 = make_df('AB',[1,2])
df2 = make_df('AB',[3,4])
print(df1)
'''
A B
1 A1 B1
2 A2 B2
'''
print(df2)
'''
A B
3 A3 B3
4 A4 B4
'''
print(pd.concat([df1,df2]))
'''
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
'''
df3 = make_df('AB',[0,1])
df4 = make_df('CD',[0,1])
print(df3)
'''
A B
0 A0 B0
1 A1 B1
'''
print(df4)
'''
C D
0 C0 D0
1 C1 D1
'''
print(pd.concat([df3,df4],axis=1))
'''
A B C D
0 A0 B0 C0 D0
1 A1 B1 C1 D1
'''
# 定义一个创建DataFrame某种形式的函数
def make_df(cols, ind):
data = {c:[str(c)+str(i) for i in ind] for c in cols}
return pd.DataFrame(data,ind)
# 索引重复
# np.concatrnate与pd.concat区别是Pandas会保留索引,即使索引是重复的
x = make_df('AB',[0,1])
y = make_df('AB',[2,3])
y.index = x.index # 复制索引
print(x)
'''
A B
0 A0 B0
1 A1 B1
'''
print(y)
'''
A B
0 A2 B2
1 A3 B3
'''
print(pd.concat([x,y]))
'''
A B
0 A0 B0
1 A1 B1
0 A2 B2
1 A3 B3
'''
结果索引是重复的,pd.concat提供了解决该问题的方法
设置参数verify_interity为True,合并有重复时会触发异常
try:
pd.concat([x,y], verify_integrity=True)
except ValueError as e:
print("ValueError:",e) # ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')
设置参数ignore_index为True,合并会创建新的整数索引
print(pd.concat([x,y],ignore_index=True))
'''
A B
0 A0 B0
1 A1 B1
2 A2 B2
3 A3 B3
'''
设置参数keys为数据源设置多级索引标签
print(pd.concat([x,y],keys=['x','y']))
'''
A B
x 0 A0 B0
1 A1 B1
y 0 A2 B2
1 A3 B3
'''
# 定义一个创建DataFrame某种形式的函数
def make_df(cols, ind):
data = {c:[str(c)+str(i) for i in ind] for c in cols}
return pd.DataFrame(data,ind)
x = make_df('ABC',[1,2])
y = make_df('BCD',[3,4])
y.index = x.index # 复制索引
print(x)
'''
A B C
1 A1 B1 C1
2 A2 B2 C2
'''
print(y)
'''
B C D
1 B3 C3 D3
2 B4 C4 D4
'''
print(pd.concat([x,y]))
'''
A B C D
1 A1 B1 C1 NaN
2 A2 B2 C2 NaN
1 NaN B3 C3 D3
2 NaN B4 C4 D4
'''
缺失的数据用NaN表示,如果不想这样,可以用join参数设置合并方式,默认方式是对所有列进行合并(join=‘outer’),join='inner’是对列的交集合并
print(pd.concat([x,y],join='inner'))
'''
B C
1 B1 C1
2 B2 C2
1 B3 C3
2 B4 C4
'''
x.append(y)效果与pd.concat([x,y])一样
# 定义一个创建DataFrame某种形式的函数
def make_df(cols, ind):
data = {c:[str(c)+str(i) for i in ind] for c in cols}
return pd.DataFrame(data,ind)
x = make_df('AB',[1,2])
y = make_df('AB',[3,4])
print(x)
'''
A B
1 A1 B1
2 A2 B2
'''
print(y)
'''
A B
3 A3 B3
4 A4 B4
'''
print(pd.concat([x,y]))
'''
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
'''
print(x.append(y))
'''
A B
1 A1 B1
2 A2 B2
3 A3 B3
4 A4 B4
'''
共同列的位置可以是不一样,merge会正确处理该问题,另外,merge会自动丢弃原来的行索引,可以自定义
df1 = pd.DataFrame({'employee':['Bob','Jake','Lisa','Sue'],
'group':['Accounting','Engineering','Engineering','HR']})
df2 = pd.DataFrame({'employee':['Lisa','Bob','Jake','Sue'],
'hirer_data':[2004,2008,2012,2014]})
print(df1)
'''
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
'''
print(df2)
'''
employee hirer_data
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
'''
df3 = pd.merge(df1, df2)
print(df3)
'''
employee group hirer_data
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
'''
指连接的两个列中有一列的值有重复,结果中的supervisor会因为group有重复而有重复
df4 = pd.DataFrame({'group':['Accounting','Engineering','HR'],
'supervisor':['Carly','Guido','Steve']})
print(df3)
'''
employee group hirer_data
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
'''
print(df4)
'''
group supervisor
0 Accounting Carly
1 Engineering Guido
2 HR Steve
'''
print(pd.merge(df3,df4))
'''
employee group hirer_data supervisor
0 Bob Accounting 2008 Carly
1 Jake Engineering 2012 Guido
2 Lisa Engineering 2004 Guido
3 Sue HR 2014 Steve
'''
两个输入的共同列都包含重复值,则为多对多
df5 = pd.DataFrame({'group':['Accounting','Accounting','Engineering','Engineering','HR','HR'],
'skills':['math','spreadsheets','coding','linux','spreadsheets','organization']})
print(df1)
'''
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
'''
print(df5)
'''
group skills
0 Accounting math
1 Accounting spreadsheets
2 Engineering coding
3 Engineering linux
4 HR spreadsheets
5 HR organization
'''
print(pd.merge(df1,df5))
'''
employee group skills
0 Bob Accounting math
1 Bob Accounting spreadsheets
2 Jake Engineering coding
3 Jake Engineering linux
4 Lisa Engineering coding
5 Lisa Engineering linux
6 Sue HR spreadsheets
7 Sue HR organization
'''
merge默认将两个输入的一个或多个共同列作为键进行合并。但由于两个输入要合并的列通常不是同名的,merge提供了一些参数处理
将参数on设置为一个列名字符串或者一个包含多个列名称的列表,这个参数只能在两个DataFrame有共同列名的时候才可以使用
df1 = pd.DataFrame({'employee':['Bob','Jake','Lisa','Sue'],
'group':['Accounting','Engineering','Engineering','HR']})
df2 = pd.DataFrame({'employee':['Lisa','Bob','Jake','Sue'],
'hirer_data':[2004,2008,2012,2014]})
print(df1)
'''
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
'''
print(df2)
'''
employee hirer_data
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
'''
print(pd.merge(df1,df2,on='employee'))
'''
employee group hirer_data
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
'''
合并两个列名不同的数据集
df3 = pd.DataFrame({'name':['Bob','Jake','Lisa','Sue'],
'salary':[70000,80000,120000,90000]})
print(df1)
'''
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
'''
print(df3)
'''
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
'''
print(pd.merge(df1,df3,left_on="employee",right_on="name"))
'''
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000
'''
print(pd.merge(df1,df3,left_on="employee",right_on="name").drop('name',axis=1))
'''
employee group salary
0 Bob Accounting 70000
1 Jake Engineering 80000
2 Lisa Engineering 120000
3 Sue HR 90000
'''
除了合并列之外,可能还需要合并索引
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a)
'''
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
'''
print(df2a)
'''
hirer_data
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
'''
print(pd.merge(df1a,df2a,left_index=True,right_index=True))
'''
group hirer_data
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
'''
为了方便考虑,DataFrame实现了join方法,可以按照索引进行数据合并
print(df1a.join(df2a))
'''
group hirer_data
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
'''
如果想要将索引与列混合使用,可以通过结合left_index和right_on,或者结合left_on与right_index来实现
print(df1a)
'''
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
'''
print(df3)
'''
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
'''
print(pd.merge(df1a,df3,left_index=True,right_on='name'))
'''
group name salary
0 Accounting Bob 70000
1 Engineering Jake 80000
2 Engineering Lisa 120000
3 HR Sue 90000
'''
当一个值出现在一列,却没有出现在另一列时,需要考虑集合操作规则
df6 = pd.DataFrame({'name':['Peter','Paul','Mary'],
'food':['fish','beans','bread']},
columns=['name','food'])
df7 = pd.DataFrame({'name':['Mary','Joseph'],
'drink':['wine','beer']},
columns=['name','drink'])
print(df6)
'''
name food
0 Peter fish
1 Paul beans
2 Mary bread
'''
print(df7)
'''
name drink
0 Mary wine
1 Joseph beer
'''
print(pd.merge(df6,df7))
'''
name food drink
0 Mary bread wine
'''
合并两个数据集,在name列中只有一个共同值Mary。默认结果只输出两个集合的交集,成为内连接
#how的值有inner(内连接),outer(外连接),left(左连接),right(有连接)
print(pd.merge(df6,df7,how='inner'))
'''
name food drink
0 Mary bread wine
'''
print(pd.merge(df6,df7,how='outer'))
'''
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
3 Joseph NaN beer
'''
print(pd.merge(df6,df7,how='left'))
'''
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
'''
print(pd.merge(df6,df7,how='right'))
'''
name food drink
0 Mary bread wine
1 Joseph NaN beer
'''
可能两个输入DataFrame有重名列的情况
df8 = pd.DataFrame({'name':['Bob','Jake','Lisa','Sue'],
'rank':[1,2,3,4]})
df9 = pd.DataFrame({'name':['Bob','Jake','Lisa','Sue'],
'rank':[3,1,4,2]})
print(df8)
'''
name rank
0 Bob 1
1 Jake 2
2 Lisa 3
3 Sue 4
'''
print(df9)
'''
name rank
0 Bob 3
1 Jake 1
2 Lisa 4
3 Sue 2
'''
print(pd.merge(df8,df9,on="name"))# 列名一样,会自动添加后缀rank_x rank_y
'''
name rank_x rank_y
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
'''
print(pd.merge(df8,df9,on="name",suffixes=["_L","_R"])) # suffixes定义后缀
'''
name rank_L rank_R
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
'''