import pandas as pd
import numpy as np
# merge
print("***************根据指定的单个列进行合并*******************")
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
# key A B
# 0 K0 A0 B0
# 1 K1 A1 B1
# 2 K2 A2 B2
# 3 K3 A3 B3
print(right)
# key C D
# 0 K0 C0 D0
# 1 K1 C1 D1
# 2 K2 C2 D2
# 3 K3 C3 D3
print("===========merge 参数 on ,指定按照哪一列合并========")
res = pd.merge(left, right, on='key')
print(res)
# key A B C D
# 0 K0 A0 B0 C0 D0
# 1 K1 A1 B1 C1 D1
# 2 K2 A2 B2 C2 D2
# 3 K3 A3 B3 C3 D3
print("***************根据指定的两个列进行合并*******************")
left1 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right1 = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left1)
# key1 key2 A B
# 0 K0 K0 A0 B0
# 1 K0 K1 A1 B1
# 2 K1 K0 A2 B2
# 3 K2 K1 A3 B3
print(right1)
# key1 key2 C D
# 0 K0 K0 C0 D0
# 1 K1 K0 C1 D1
# 2 K1 K0 C2 D2
# 3 K2 K0 C3 D3
print("===========merge 参数 on ,指定按照哪两列合并===========")
# 指定合并的两个列有顺序???
# 默认 inner join ,即 how='inner'
# how = {'left', 'right', 'inner', 'outer'}
res = pd.merge(left1, right1, on=['key1','key2'], how='inner')
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K1 K0 A2 B2 C1 D1
# 2 K1 K0 A2 B2 C2 D2
print("*****************merge 参数 indicator **************")
df1 = pd.DataFrame({'col1': [0, 1],'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2],'col_right': [2, 2, 2]})
print(df1)
# col1 col_left
# 0 0 a
# 1 1 b
print(df2)
# col1 col_right
# 0 1 2
# 1 2 2
# 2 2 2
print("===============是否显示结果数据来源=================")
# indicator 默认False,True时会增加一列来表示该行数据来自哪一个df
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
# col1 col_left col_right _merge
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
print("===============显示来源时,指定来源列的名字================")
# indicator 默认False,True时会增加一列来表示该行数据来自哪一个df
res = pd.merge(df1, df2, on='col1', how='outer', indicator='source')
print(res)
# col1 col_left col_right source
# 0 0 a NaN left_only
# 1 1 b 2.0 both
# 2 2 NaN 2.0 right_only
# 3 2 NaN 2.0 right_only
print("*****************merge 根据行名或列名合并*****************")
left2 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right2 = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left2)
# A B
# K0 A0 B0
# K1 A1 B1
# K2 A2 B2
print(right2)
# C D
# K0 C0 D0
# K2 C2 D2
# K3 C3 D3
print("===============是否显示结果数据来源=================")
# left_index 默认NaN, True 时 指的是 根据行名 合并
# right_index 默认NaN , True时 指的是 根据列名 合并
res = pd.merge(left2, right2, left_index=True, right_index=True, how='outer')
print(res)
# A B C D
# K0 A0 B0 C0 D0
# K1 A1 B1 NaN NaN
# K2 A2 B2 C2 D2
# K3 NaN NaN C3 D3
print("*****************merge suffixes 解决两个df 重复列问题*****************")
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
# k age
# 0 K0 1
# 1 K1 2
# 2 K2 3
print(girls)
# k age
# 0 K0 4
# 1 K0 5
# 2 K3 6
print("===========suffixes修改合并后列名===========")
# 在重复列名后 追加字符串,避免df中常出现相同的列名
res = pd.merge(boys, girls, on='k', suffixes=['_boys', '_girls'], how='inner')
print(res)
# k age_boys age_girls
# 0 K0 1 4
# 1 K0 1 5