pandas中合并多个DataFrame的多种方式:
how=连接方式
on=合并列名
不设置只考虑key列中共同有的值
df1 = pd.DataFrame({'id':[1,2,3,4,5,6],'city':['wuhan','newyork','shanghai','paris','losangeles','london'],'country':['china','usa','china','france','usa','england'],'visits':[2,1,2,1,2,1]})
df2 = pd.DataFrame({'id':[1,2,3,4,5],'country':['china','france','usa','germany','japan'],'visits':[2,2,2,2,2]})
print(df1)
print(df2)
print('内链接')
df3 = pd.merge(df1,df2,on='country')
print(df3)
考虑df1中key列中所有的值,df2的key列没有对应值话,NaN填充
print('左链接')
df3 = pd.merge(df1,df2,on='country',how='left')
print(df3)
考虑df2中key列中所有的值,df1的key列没有对应值话,NaN填充
print('右链接')
df3 = pd.merge(df1,df2,on='country',how='right')
print(df3)
若需要根据多个列合并,设置on=(‘列1’,‘列2’,…)
print('多个列名为链接键')
df4 = pd.merge(df1,df2,on=('country','visits'))
print(df4)
如果合并前后存在其他相同列名的列,可以设置df1,df2中重复名的后缀:
suffixes=(‘左后缀’,‘右后缀’))
如果不设置,后缀默认为_x,_y
print('合并后设置新列的列名后缀')
df4 = pd.merge(df1,df2,on='country',suffixes=('_city','_country'))
print(df4)
有情况下,左右两个表需要合并的列名并不一定相同,比如df1中country合并df2中countryname
print('左右表不同列名合并')
df22.columns=['id','countryname','visits']
df5 = pd.merge(df1,df22,left_on=['country'],right_on=['countryname'],suffixes=('_city','_country'))
print(df5)
删除刚才出现的多余的列(countryname)
print('删除列')
df5.drop(columns=['countryname'],inplace=True)
print(df5)
不设置how,默认左连接,即考虑df1的所有行,df2中若行数不够则Nan补足
df1 = pd.DataFrame({'id':[1,2,3,4,5,6],'city':['wuhan','newyork','shanghai','paris','losangeles','london'],'country':['china','usa','china','france','usa','england'],'visits':[2,1,2,1,2,1]})
df2 = pd.DataFrame({'idy':[1,2,3,4,6],'cityy':['wuhan','newyork','shanghai','paris','losangeles'],'countryy':['china','usa','china','france','usa']})
print(df1)
print(df2)
print('默认how=left')
df3 = df1.join(df2)
print(df3)
设置how=‘right’,考虑df2中所有行,df1中若行数不够则Nan补足
print('右链接')
df3 = df1.join(df2,how='right')
print(df3)
how=‘inner’,以df1,df2中行数最少的为准
print('内链接')
df3 = df1.join(df2,how='inner')
print(df3)
how=‘outer’,以df1,df2中行数最多的为准
print('外链接')
df3 = df1.join(df2,how='outer')
print(df3)
设置axis=1,则将所有的列拼接在一起,列数=df1列数+df2列数
from pandas import concat
df1 = pd.DataFrame({'id':[1,2,3,4,5,6],'city':['wuhan','newyork','shanghai','paris','losangeles','london'],'country':['china','usa','china','france','usa','england'],'visits':[2,1,2,1,2,1]})
df2 = pd.DataFrame({'id':[1,2,3,4,6],'city':['wuhan','newyork','shanghai','paris','losangeles'],'country':['china','usa','china','france','usa'],'visits':[2,1,2,1,2]})
print(df1)
print(df2)
print('列连接')
df3 = concat([df1,df2],join="inner",axis=1)
print(df3)
不设置axis,则axis=0,要求存在相同的列名,按相同列名,把行拼接上,行数=df1行数+df2行数
print('行连接')
df3 = concat([df1,df2])
print(df3)
行连接时候,可以指定索引
print('行连接并制定行索引')
df3 = concat([df1,df2],keys=['a','b'])
print(df3)
行连接之后会出现相同的行信息,drop_duplicates删除重复行即可
print("去重")
df3 = concat([df1,df2],ignore_index=True).drop_duplicates()
print(df3)