# python pandas 合并dataframe_python - pandas dataFrame merge 数据合并

import numpy as np

from pandas import Series, DataFrame

import pandas as pd

# Display settings for the printed examples: pandas shows up to 100 rows
# before truncating; NumPy prints floats with 4 digits of precision and
# summarizes arrays with more than 500 elements.
pd.set_option('display.max_rows', 100)
np.set_printoptions(threshold=500, precision=4)

### DataFrame merging

# Left table: 'key' holds repeated values ('b' and 'a' three times each,
# 'c' once); 'data1' numbers the rows 0-6.
df1 = DataFrame(dict(key=list('bbacaab'), data1=range(7)))

# Right table: one row per key; 'd' never occurs in df1 and 'c' never
# occurs here.
df2 = DataFrame(dict(key=list('abd'), data2=range(3)))

#print( df1 )

'''

data1 key

0 0 b

1 1 b

2 2 a

3 3 c

4 4 a

5 5 a

6 6 b

'''

#print( df2 )

'''

data2 key

0 0 a

1 1 b

2 2 d

'''

'''

官网:http://pandas.pydata.org/pandas-docs/stable/merging.html

merge

pandas的merge方法提供了一种类似于SQL的内存链接操作,官网文档提到它的性能会比其他开源语言的数据操作(例如R)要高效。

和SQL语句的对比可以看这里

merge的参数

on:列名,join用来对齐的那一列的名字,用到这个参数的时候一定要保证左表和右表用来对齐的那一列都有相同的列名。

left_on:左表对齐的列,可以是列名,也可以是和dataframe同样长度的arrays。

right_on:右表对齐的列,可以是列名,也可以是和dataframe同样长度的arrays。

left_index / right_index: 如果是True的话,以index作为对齐的key

how:数据融合的方法,可取'inner'(默认,取键的交集)、'left'、'right'、'outer'(取键的并集)。

sort:根据dataframe合并的keys按字典顺序排序。当前版本pandas的默认值为False(早期版本默认为True);置为False可以提高性能。

merge的默认合并方法:

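merge既支持基于列的合并,也支持 index-on-index 和 index-on-column(s) 的合并。未指定on/left_on/right_on且left_index、right_index均为False时,merge默认以两表共有的列名(列名交集)作为连接键,并且默认how='inner';基于index的合并需要显式设置left_index/right_index,或者使用默认按index对齐的DataFrame.join。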

'''

# No key is given, so pandas joins on the column name shared by both
# frames ('key') using the default how='inner'.
merge_1 = df1.merge(df2)

#print( merge_1 )

'''

data1 key data2

0 0 b 1

1 1 b 1

2 6 b 1

3 2 a 0

4 4 a 0

5 5 a 0

'''

# Same inner join, with the alignment column named explicitly through `on`.
merge_2 = df1.merge(df2, on='key')

#print( merge_2 )

'''

data1 key data2

0 0 b 1

1 1 b 1

2 6 b 1

3 2 a 0

4 4 a 0

5 5 a 0

'''

#2
# Same data as df1/df2, but the key column is named differently on each
# side ('lkey' vs 'rkey').
df3 = DataFrame(dict(lkey=list('bbacaab'), data1=range(7)))
df4 = DataFrame(dict(rkey=list('abd'), data2=range(3)))

# left_on / right_on pair up the differently named key columns; both key
# columns are kept in the result.
merge_3 = df3.merge(df4, left_on='lkey', right_on='rkey')

#print( merge_3 )

'''

data1 lkey data2 rkey

0 0 b 1 b

1 1 b 1 b

2 6 b 1 b

3 2 a 0 a

4 4 a 0 a

5 5 a 0 a

'''

# Outer join: keeps the union of keys from both tables; columns from the
# side that lacks a key are filled with NaN.
merge_4 = df1.merge(df2, how='outer')

#print( merge_4 )

'''

data1 key data2

0 0.0 b 1.0

1 1.0 b 1.0

2 6.0 b 1.0

3 2.0 a 0.0

4 4.0 a 0.0

5 5.0 a 0.0

6 3.0 c NaN

7 NaN d 2.0

'''

#3
# Redefine both tables so that keys repeat on BOTH sides: every matching
# left row is then paired with every matching right row.
df1 = DataFrame(dict(key=list('bbacab'), data1=range(6)))
df2 = DataFrame(dict(key=list('ababd'), data2=range(5)))

#print( df1 )

'''

data1 key

0 0 b

1 1 b

2 2 a

3 3 c

4 4 a

5 5 b

'''

#print( df2 )

'''

data2 key

0 0 a

1 1 b

2 2 a

3 3 b

4 4 d

'''

# how='left' keeps every row of the left table; a left key with no match
# on the right ('c') gets NaN in the right-hand columns.
merge_5 = df1.merge(df2, how='left', on='key')

#print( merge_5 )

'''

data1 key data2

0 0 b 1.0

1 0 b 3.0

2 1 b 1.0

3 1 b 3.0

4 2 a 0.0

5 2 a 2.0

6 3 c NaN

7 4 a 0.0

8 4 a 2.0

9 5 b 1.0

10 5 b 3.0

'''

# Inner join: only keys present in both tables survive.
merge_6 = df1.merge(df2, how='inner')

#print( merge_6 )

'''

data1 key data2

0 0 b 1

1 0 b 3

2 1 b 1

3 1 b 3

4 5 b 1

5 5 b 3

6 2 a 0

7 2 a 2

8 4 a 0

9 4 a 2

'''

#4
# Left table with a two-column key (key1, key2) and one value column,
# written row by row.
left = DataFrame(
    [('foo', 'one', 1),
     ('foo', 'two', 2),
     ('bar', 'one', 3)],
    columns=['key1', 'key2', 'lval'],
)

#print( left )

'''

key1 key2 lval

0 foo one 1

1 foo two 2

2 bar one 3

'''

# Right table with the same two key columns; ('foo', 'one') occurs twice,
# written row by row.
right = DataFrame(
    [('foo', 'one', 4),
     ('foo', 'one', 5),
     ('bar', 'one', 6),
     ('bar', 'two', 7)],
    columns=['key1', 'key2', 'rval'],
)

#print( right )

'''

key1 key2 rval

0 foo one 4

1 foo one 5

2 bar one 6

3 bar two 7

'''

# Outer join on the (key1, key2) pair: a row matches only when both key
# columns agree.
merge_7 = left.merge(right, how='outer', on=['key1', 'key2'])
print(merge_7)

'''

key1 key2 lval rval

0 foo one 1.0 4.0

1 foo one 1.0 5.0

2 foo two 2.0 NaN

3 bar one 3.0 6.0

4 bar two NaN 7.0

'''

# Joining on key1 only: the non-key column 'key2' exists on both sides,
# so it appears twice with the default suffixes (key2_x, key2_y).
merge_8 = left.merge(right, on='key1')
print(merge_8)

'''

key1 key2_x lval key2_y rval

0 foo one 1 one 4

1 foo one 1 one 5

2 foo two 2 one 4

3 foo two 2 one 5

4 bar one 3 one 6

5 bar one 3 two 7

'''

# Same join, with custom suffixes for the overlapping 'key2' column
# (key2_left, key2_right).
merge_9 = left.merge(right, suffixes=('_left', '_right'), on='key1')
print(merge_9)

'''

key1 key2_left lval key2_right rval

0 foo one 1 one 4

1 foo one 1 one 5

2 foo two 2 one 4

3 foo two 2 one 5

4 bar one 3 one 6

5 bar one 3 two 7

'''

### Merging on index

#1
# Left table carries the join key as an ordinary column.
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})

# Right table carries the join key as its row index ('a', 'b'); it has no
# entry for 'c'.
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

# Bare expressions only display something in an interactive session; in a
# script their results are thrown away. The merge results are therefore
# bound to names, like merge_1..merge_9, so they stay available.

# Inner join of the 'key' column against right1's index: rows with key 'c'
# are dropped because right1 has no such index label.
merge_10 = pd.merge(left1, right1, left_on='key', right_index=True)

# Outer join: the 'c' row is kept, with NaN in 'group_val'.
merge_11 = pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

#2
# Left table: a two-part key spread over two ordinary columns (key1, key2).
lefth = DataFrame(dict(
    key1=['Ohio'] * 3 + ['Nevada'] * 2,
    key2=[2000, 2001, 2002, 2001, 2002],
    data=np.arange(5, dtype=float),
))

# Right table: the same kind of key stored as a two-level row index; note
# that ('Ohio', 2000) occurs twice.
righth = DataFrame(
    np.arange(12).reshape(6, 2),
    columns=['event1', 'event2'],
    index=[['Nevada'] * 2 + ['Ohio'] * 4,
           [2001, 2000, 2000, 2000, 2001, 2002]],
)

print(lefth)

'''

data key1 key2

0 0.0 Ohio 2000

1 1.0 Ohio 2001

2 2.0 Ohio 2002

3 3.0 Nevada 2001

4 4.0 Nevada 2002

'''

# Show the right-hand table with its two-level index.
print(righth)

'''

event1 event2

Nevada 2001 0 1

2000 2 3

Ohio 2000 4 5

2000 6 7

2001 8 9

2002 10 11

'''

# The merge results are bound to names (like merge_1..merge_9) rather than
# left as bare expressions, whose values a script would silently discard.

# Join lefth's two key columns against righth's two-level index; with a
# multi-level index the key columns are passed as a list. Default
# how='inner' keeps only key pairs present on both sides.
merge_12 = pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

# Outer variant: key pairs found on only one side are kept, with NaN in
# the columns coming from the other side.
merge_13 = pd.merge(lefth, righth, left_on=['key1', 'key2'],
                    right_index=True, how='outer')

# Two frames keyed only by their row index; the indexes overlap on 'c'
# and 'e'.
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])

right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])

# Index-on-index outer merge: the result covers the union of both indexes,
# with NaN where one side has no row for a label. The result is bound to a
# name (like merge_1..merge_9) rather than left as a bare expression,
# whose value a script would silently discard.
merge_14 = pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

# The join results are bound to names rather than left as bare
# expressions, whose values a script would silently discard.

#3
# DataFrame.join aligns on the row index by default; how='outer' keeps
# the union of both indexes.
join_1 = left2.join(right2, how='outer')

# `on` joins a column of the calling frame ('key') against the index of
# the other frame; the default how='left' keeps every row of left1.
join_2 = left1.join(right1, on='key')

#4
# A third index-keyed frame for joining several frames in one call.
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])

# Passing a list joins all the frames onto left2 by index in one call.
join_3 = left2.join([right2, another])

# 你可能感兴趣的:(python,pandas,合并dataframe)