一、concat
(1)axis和ignore_index
import pandas as pd
import numpy as np
# Two 2x3 frames sharing the columns A/B/C: df1 is all 0.0, df2 is all 1.0.
df1 = pd.DataFrame(np.zeros((2, 3)), columns=list('ABC'))
df2 = pd.DataFrame(np.ones((2, 3)), columns=list('ABC'))
frames = [df1, df2]

# axis=0 stacks the frames vertically; each frame keeps its own row labels,
# so the resulting index is 0, 1, 0, 1.
res = pd.concat(frames, axis=0)
print(res)

# ignore_index=True discards the original labels and renumbers rows 0..n-1.
print('重置index:')
res1 = pd.concat(frames, axis=0, ignore_index=True)
print(res1)
输出:
A B C
0 0.0 0.0 0.0
1 0.0 0.0 0.0
0 1.0 1.0 1.0
1 1.0 1.0 1.0
重置index:
A B C
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
(2)join合并:
join='outer'为默认值:按column合并,相同的column上下合并,只在一方出现的column单独成列,缺失的值填充为NaN;join='inner'时,只保留相同的column,不相同的column被丢弃。
# df1 has columns a/b/c and df2 has b/c/d, so only b and c are shared.
df1 = pd.DataFrame(np.ones((2, 3)) * 1, columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.ones((2, 3)) * 2, columns=['b', 'c', 'd'])

# Vertical outer concat (join='outer'): keep the union of the columns and
# fill the cells a frame does not provide with NaN.
# ignore_index must be the boolean True; the original passed the strings
# 'Ture'/'True', which only worked because non-empty strings are truthy.
res = pd.concat([df1, df2], axis=0, join='outer', ignore_index=True)
print('纵向外合并:')
print(res)

# Vertical inner concat (join='inner'): keep only the shared columns b and c.
res = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
print('纵向内合并:')
print(res)

# Horizontal concat restricted to df1's row labels.
# The join_axes keyword was removed from pd.concat in pandas 1.0; reindexing
# the concatenated result on df1.index yields the same rows.
df1 = pd.DataFrame(np.ones((2, 3)) * 1, columns=['a', 'b', 'c'], index=[1, 2])
df2 = pd.DataFrame(np.ones((2, 3)) * 2, columns=['a', 'b', 'c'], index=[2, 3])
res = pd.concat([df1, df2], axis=1).reindex(df1.index)  # rows of df1 only
print('join_axes合并:')
print(res)
输出:
纵向外合并:
a b c d
0 1.0 1.0 1.0 NaN
1 1.0 1.0 1.0 NaN
2 NaN 2.0 2.0 2.0
3 NaN 2.0 2.0 2.0
纵向内合并:
b c
0 1.0 1.0
1 1.0 1.0
2 2.0 2.0
3 2.0 2.0
join_axes合并:
a b c a b c
1 1.0 1.0 1.0 NaN NaN NaN
2 1.0 1.0 1.0 2.0 2.0 2.0
(3)append合并数据
append合并:append只有纵向合并,没有横向合并。(注意:DataFrame.append 在 pandas 1.4 中已被弃用,并在 pandas 2.0 中移除;在新版本中应使用 pd.concat 实现相同的纵向合并。)
# Third frame with the same columns as df1/df2 (defined in the snippet above).
df3 = pd.DataFrame(np.ones((2, 3)) * 3, columns=['a', 'b', 'c'], index=[3, 4])

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat along the default axis=0 performs the same row-wise stacking and
# likewise keeps each frame's original row labels.
print('append合并数据:')
res = pd.concat([df1, df2])  # formerly df1.append(df2)
print(res)

# Stacking several frames at once: pass them all in a single list.
print('append合并多个df:')
res = pd.concat([df1, df2, df3])  # formerly df1.append([df2, df3])
print(res)
输出:
append合并数据:
a b c
1 1.0 1.0 1.0
2 1.0 1.0 1.0
2 2.0 2.0 2.0
3 2.0 2.0 2.0
append合并多个df:
a b c
1 1.0 1.0 1.0
2 1.0 1.0 1.0
2 2.0 2.0 2.0
3 2.0 2.0 2.0
3 3.0 3.0 3.0
4 3.0 3.0 3.0
二、merge合并
import pandas as pd
# --- Merge on a single key column -----------------------------------------
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2'],
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
})
right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2'],
    'C': ['C0', 'C1', 'C2'],
    'd': ['d0', 'd1', 'd2'],
})
res = left.merge(right, on='key')
print('一组key合并')
print(res)

# --- Merge on two key columns ---------------------------------------------
left = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k1', 'k0', 'k1', 'k0'],
    'a': ['a1', 'a2', 'a3', 'a4'],
    'b': ['b1', 'b2', 'b3', 'b4'],
})
right = pd.DataFrame({
    'key1': ['k0', 'k1', 'k0', 'k2'],
    'key2': ['k0', 'k0', 'k1', 'k0'],
    'c': ['c0', 'c1', 'c2', 'c3'],
    'd': ['d0', 'd1', 'd2', 'd3'],
})
join_keys = ['key1', 'key2']

# inner: only the (key1, key2) pairs present in both frames.
res1 = left.merge(right, how='inner', on=join_keys)
print('inner:')
print(res1)

# outer: every pair from either frame; missing cells become NaN.
res2 = left.merge(right, how='outer', on=join_keys)
print('outer:')
print(res2)

# left: every row of `left`, with matching `right` data where available.
res3 = left.merge(right, how='left', on=join_keys)
print('left:')
print(res3)

# right: every row of `right`, with matching `left` data where available.
res4 = left.merge(right, how='right', on=join_keys)
print('right')
print(res4)
输出:
一组key合并
A B key C d
0 A0 B0 k0 C0 d0
1 A1 B1 k1 C1 d1
2 A2 B2 k2 C2 d2
inner:
a b key1 key2 c d
0 a1 b1 k0 k1 c2 d2
1 a2 b2 k0 k0 c0 d0
2 a4 b4 k2 k0 c3 d3
outer:
a b key1 key2 c d
0 a1 b1 k0 k1 c2 d2
1 a2 b2 k0 k0 c0 d0
2 a3 b3 k1 k1 NaN NaN
3 a4 b4 k2 k0 c3 d3
4 NaN NaN k1 k0 c1 d1
left:
a b key1 key2 c d
0 a1 b1 k0 k1 c2 d2
1 a2 b2 k0 k0 c0 d0
2 a3 b3 k1 k1 NaN NaN
3 a4 b4 k2 k0 c3 d3
right
a b key1 key2 c d
0 a1 b1 k0 k1 c2 d2
1 a2 b2 k0 k0 c0 d0
2 a4 b4 k2 k0 c3 d3
3 NaN NaN k1 k0 c1 d1
indicator:
# indicator=True appends a `_merge` column recording where each row came
# from; the sample output below shows the values `both` and `right_only`.
# `left` and `right` are the two-key frames built in the snippet above.
res4 = left.merge(right, how='right', on=['key1', 'key2'], indicator=True)
print(res4)
输出:
a b key1 key2 c d _merge
0 a1 b1 k0 k1 c2 d2 both
1 a2 b2 k0 k0 c0 d0 both
2 a4 b4 k2 k0 c3 d3 both
3 NaN NaN k1 k0 c1 d1 right_only
overlapping重叠问题
#解决重叠问题
import pandas as pd
# Both frames carry an `age` column, so the merged result would have two
# columns with the same name unless they are disambiguated.
boys = pd.DataFrame({
    'k': ['K0', 'K1', 'K2'],
    'age': [1, 2, 3],
})
girls = pd.DataFrame({
    'k': ['K0', 'K0', 'K3'],
    'age': [4, 5, 6],
})

# suffixes renames the overlapping `age` columns to age_boy / age_girl.
res = boys.merge(girls, how='outer', on='k', suffixes=('_boy', '_girl'))
print(res)
输出:
age_boy k age_girl
0 1.0 K0 4.0
1 1.0 K0 5.0
2 2.0 K1 NaN
3 3.0 K2 NaN
4 NaN K3 6.0