pandas合并数据

一、concat
(1)axis和ignore_index

import pandas as pd
import numpy as np
# Two 2x3 float frames with identical columns: one all 0.0, one all 1.0.
df1 = pd.DataFrame(np.zeros((2, 3)), columns=list('ABC'))
df2 = pd.DataFrame(np.ones((2, 3)), columns=list('ABC'))

# axis=0 stacks the frames vertically; every row keeps the index label it
# had in its source frame, so the labels 0 and 1 each appear twice.
res = pd.concat(
    [df1, df2],
    axis=0,
)
print(res)

# ignore_index=True discards the source labels and renumbers rows 0..n-1.
print('重置index:')
res1 = pd.concat(
    [df1, df2],
    axis=0,
    ignore_index=True,
)
print(res1)

输出:

     A    B    C
0  0.0  0.0  0.0
1  0.0  0.0  0.0
0  1.0  1.0  1.0
1  1.0  1.0  1.0
重置index:
     A    B    C
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0

(2)join合并:
join='outer'为默认值:按照column合并,相同的column上下合并,不同的column单独成列,缺失的值填充为NaN;join='inner'时,只保留相同的column,不相同的column被丢弃

# Two frames that share only the columns 'b' and 'c'.
df1 = pd.DataFrame(np.ones((2, 3)) * 1, columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.ones((2, 3)) * 2, columns=['b', 'c', 'd'])

# Vertical outer concat (join='outer' is the default): the result has the
# union of all columns, and cells a frame did not provide become NaN.
# ignore_index takes a real bool; the original passed the misspelled string
# 'Ture', which only worked because any non-empty string is truthy.
res = pd.concat([df1, df2], axis=0, join='outer', ignore_index=True)
print('纵向外合并:')
print(res)

# Vertical inner concat: only the columns present in every frame are kept.
res = pd.concat([df1, df2], axis=0, join='inner', ignore_index=True)
print('纵向内合并:')
print(res)

# Horizontal concat restricted to df1's row labels.
# The `join_axes` keyword was removed from pd.concat in pandas 1.0; the
# replacement is to reindex the concatenated result on the wanted index.
df1 = pd.DataFrame(np.ones((2, 3)) * 1, columns=['a', 'b', 'c'], index=[1, 2])
df2 = pd.DataFrame(np.ones((2, 3)) * 2, columns=['a', 'b', 'c'], index=[2, 3])
res = pd.concat([df1, df2], axis=1).reindex(df1.index)  # keep only df1's index
print('join_axes合并')
print(res)

输出:

纵向外合并:
     a    b    c    d
0  1.0  1.0  1.0  NaN
1  1.0  1.0  1.0  NaN
2  NaN  2.0  2.0  2.0
3  NaN  2.0  2.0  2.0
纵向内合并:
     b    c
0  1.0  1.0
1  1.0  1.0
2  2.0  2.0
3  2.0  2.0
join_axes合并
     a    b    c    a    b    c
1  1.0  1.0  1.0  NaN  NaN  NaN
2  1.0  1.0  1.0  2.0  2.0  2.0

(3)append合并数据
append合并:append只有纵向合并,没有横向合并。注意:DataFrame.append自pandas 1.4起已被弃用,并在pandas 2.0中被移除,新版本请改用pd.concat实现相同的纵向合并。

# Row-wise stacking of df1 and df2 (both created by the preceding snippet)
# and a third frame with the same columns.
df3 = pd.DataFrame(np.ones((2, 3)) * 3, columns=['a', 'b', 'c'], index=[3, 4])

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat with its default axis=0 produces the same row-wise stack and
# keeps each frame's original index labels, as append did.
print('append合并数据:')
res = pd.concat([df1, df2])
print(res)

# Stacking several frames at once: pass them all in one list.
print('append合并多个df:')
res = pd.concat([df1, df2, df3])
print(res)

输出:

append合并数据:
     a    b    c
1  1.0  1.0  1.0
2  1.0  1.0  1.0
2  2.0  2.0  2.0
3  2.0  2.0  2.0
append合并多个df:
     a    b    c
1  1.0  1.0  1.0
2  1.0  1.0  1.0
2  2.0  2.0  2.0
3  2.0  2.0  2.0
3  3.0  3.0  3.0
4  3.0  3.0  3.0

二、merge合并

import pandas as pd
# --- Merge on a single key column ---
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2'],
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
})
right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2'],
    'C': ['C0', 'C1', 'C2'],
    'd': ['d0', 'd1', 'd2'],
})
res = left.merge(right, on='key')
print('一组key合并')
print(res)

# --- Merge on two key columns ---
left = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k1', 'k0', 'k1', 'k0'],
    'a': ['a1', 'a2', 'a3', 'a4'],
    'b': ['b1', 'b2', 'b3', 'b4'],
})
right = pd.DataFrame({
    'key1': ['k0', 'k1', 'k0', 'k2'],
    'key2': ['k0', 'k0', 'k1', 'k0'],
    'c': ['c0', 'c1', 'c2', 'c3'],
    'd': ['d0', 'd1', 'd2', 'd3'],
})

# how='inner': keep only the (key1, key2) pairs present in both frames.
res1 = left.merge(right, how='inner', on=['key1', 'key2'])
print('inner:')
print(res1)

# how='outer': keep every pair from either frame; missing cells become NaN.
res2 = left.merge(right, how='outer', on=['key1', 'key2'])
print('outer:')
print(res2)

# how='left': keep every row of `left`; unmatched right-hand cells are NaN.
res3 = left.merge(right, how='left', on=['key1', 'key2'])
print('left:')
print(res3)

# how='right': keep every row of `right`; unmatched left-hand cells are NaN.
res4 = left.merge(right, how='right', on=['key1', 'key2'])
print('right')
print(res4)

输出:

一组key合并
    A   B key   C   d
0  A0  B0  k0  C0  d0
1  A1  B1  k1  C1  d1
2  A2  B2  k2  C2  d2
inner:
    a   b key1 key2   c   d
0  a1  b1   k0   k1  c2  d2
1  a2  b2   k0   k0  c0  d0
2  a4  b4   k2   k0  c3  d3
outer:
     a    b key1 key2    c    d
0   a1   b1   k0   k1   c2   d2
1   a2   b2   k0   k0   c0   d0
2   a3   b3   k1   k1  NaN  NaN
3   a4   b4   k2   k0   c3   d3
4  NaN  NaN   k1   k0   c1   d1
left:
    a   b key1 key2    c    d
0  a1  b1   k0   k1   c2   d2
1  a2  b2   k0   k0   c0   d0
2  a3  b3   k1   k1  NaN  NaN
3  a4  b4   k2   k0   c3   d3
right
     a    b key1 key2   c   d
0   a1   b1   k0   k1  c2  d2
1   a2   b2   k0   k0  c0  d0
2   a4   b4   k2   k0  c3  d3
3  NaN  NaN   k1   k0  c1  d1

indicator:

# indicator=True adds a `_merge` column recording where each row came from
# (the output below shows the values `both` and `right_only`).
res4 = left.merge(
    right,
    how='right',
    on=['key1', 'key2'],
    indicator=True,
)
print(res4)

输出:

     a    b key1 key2   c   d      _merge
0   a1   b1   k0   k1  c2  d2        both
1   a2   b2   k0   k0  c0  d0        both
2   a4   b4   k2   k0  c3  d3        both
3  NaN  NaN   k1   k0  c1  d1  right_only

overlapping重叠问题

#解决重叠问题
import pandas as pd
boys = pd.DataFrame({
    'k': ['K0', 'K1', 'K2'],
    'age': [1, 2, 3],
})
girls = pd.DataFrame({
    'k': ['K0', 'K0', 'K3'],
    'age': [4, 5, 6],
})

# Both frames have a non-key column called 'age'. `suffixes` renames the
# overlapping column on each side so both survive the merge as
# 'age_boy' and 'age_girl'.
res = boys.merge(girls, how='outer', on='k', suffixes=['_boy', '_girl'])
print(res)

输出:

   age_boy   k  age_girl
0      1.0  K0       4.0
1      1.0  K0       5.0
2      2.0  K1       NaN
3      3.0  K2       NaN
4      NaN  K3       6.0

你可能感兴趣的:(python)