pandas中的merge和concat类似,但merge主要用于依据key column(键列)或索引,把两组数据按相同的键对齐后合并,类似于数据库(Database)中的join操作,因此也常被用在数据库式表格数据的处理当中.
import pandas as pd
# Two demo tables whose "key" columns hold the same four values.
left = pd.DataFrame(
    data={
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)
right = pd.DataFrame(
    data={
        "key": ["K0", "K1", "K2", "K3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)

# Show both inputs, each followed by the same blank-line separator.
for frame in (left, right):
    print(frame)
    print('\n')

# Merge the two tables on their shared "key" column and print the result.
res = left.merge(right, on="key")
print(res)
#输出
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
indicator=True会在合并结果中新增一列(默认列名为_merge),标明每一行记录的来源:left_only、right_only或both;也可以传入字符串来自定该列的名称。
# Sample inputs for the indicator demo. The snippet originally used df1/df2
# without defining them (NameError). These frames are reconstructed from the
# printed output that follows this snippet: col1 takes the values 0-3 and only
# key 1 is present on both sides.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 3], 'col_right': [2, 2, 2]})

# Outer-merge on col1. indicator=True appends a "_merge" column whose value is
# left_only, right_only or both, depending on where each row's key was found.
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
print('\n')

# Passing a string instead of True gives the indicator column a custom name.
res_self = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res_self)
#输出
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 3 NaN 2.0 right_only
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 3 NaN 2.0 right_only
import pandas as pd
# Two demo tables labelled by row index; they overlap only on K0 and K2.
left = pd.DataFrame(
    data={'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    data={'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)

# Both merges align rows on the index of each side rather than on a column.
index_join = {'left_index': True, 'right_index': True}

# how='outer' keeps every index label from either side (gaps become NaN).
res_outer = left.merge(right, how='outer', **index_join)
# how='inner' keeps only the index labels present on both sides.
res_inner = left.merge(right, how='inner', **index_join)

# Print the inputs and the outer result, each followed by a blank-line
# separator, then the inner result last with no trailing separator.
for frame in (left, right, res_outer):
    print(frame)
    print('\n')
print(res_inner)
#输出
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
import pandas as pd
# Both tables carry an "age" column, so a merge on "k" would otherwise
# produce two columns with clashing names.
boys = pd.DataFrame(data={'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame(data={'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

# suffixes renames the overlapping "age" columns per side; the inner join
# keeps only keys found in both tables.
res = boys.merge(
    girls,
    on='k',
    how='inner',
    suffixes=('_boy', '_girl'),
)
print(res)
#输出
k age_boy age_girl
0 K0 1 4
1 K0 1 5