关注CSDN博客:程志伟的博客
Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]
Type "copyright", "credits" or "license" for more information.
IPython 7.6.1 -- An enhanced Interactive Python.
import pandas as pd
import numpy as np
import datetime
#### 1.1 The merge function ####
# Build the customer table: one row per customer, identified by CustomerID.
customers = pd.DataFrame(
    data={
        'CustomerID': [10, 11],
        'Name': ['Mike', 'Marcia'],
        'Address': ['Address for Mike', 'Address for Marcia'],
    },
)
customers
Out[2]:
CustomerID Name Address
0 10 Mike Address for Mike
1 11 Marcia Address for Marcia
# Build the order table: customer 10 placed two orders, customer 11 one.
orders = pd.DataFrame(
    data={
        'CustomerID': [10, 11, 10],
        'OrderDate': [datetime.date(2016, 12, day) for day in (1, 1, 2)],
    },
)
orders
Out[3]:
CustomerID OrderDate
0 10 2016-12-01
1 11 2016-12-01
2 10 2016-12-02
# With no `on` argument, merge joins on the columns the two frames have in
# common -- here only CustomerID -- so each customer row is paired with
# every order carrying the same CustomerID.
customers.merge(orders)
Out[4]:
CustomerID Name Address OrderDate
0 10 Mike Address for Mike 2016-12-01
1 10 Mike Address for Mike 2016-12-02
2 11 Marcia Address for Marcia 2016-12-01
# Two frames that share the key1/key2 columns but carry different value
# columns; their row indexes are offset on purpose (0-2 versus 1-3).
left_data = dict(key1=['a', 'b', 'c'], key2=['x', 'y', 'z'], lval1=[0, 1, 2])
right_data = dict(key1=['a', 'b', 'c'], key2=['x', 'a', 'z'], rval1=[6, 7, 8])
left = pd.DataFrame(data=left_data, index=list(range(0, 3)))
right = pd.DataFrame(data=right_data, index=list(range(1, 4)))
left
Out[5]:
key1 key2 lval1
0 a x 0
1 b y 1
2 c z 2
right
Out[6]:
key1 key2 rval1
1 a x 6
2 b a 7
3 c z 8
# merge defaults to an inner join on all shared columns (key1 and key2 here),
# so only rows whose (key1, key2) pair appears in both frames are kept.
left.merge(right)
Out[7]:
key1 key2 lval1 rval1
0 a x 0 6
1 c z 2 8
# Join on key1 only; the key2 column that exists in both frames is kept
# twice, disambiguated with the default _x / _y suffixes.
left.merge(right,on='key1')
Out[8]:
key1 key2_x lval1 key2_y rval1
0 a x 0 x 6
1 b y 1 a 7
2 c z 2 z 8
# Join on both key1 and key2, spelled out explicitly with `on`.
left.merge(right, on=['key1', 'key2'])
Out[9]:
key1 key2 lval1 rval1
0 a x 0 6
1 c z 2 8
# Merge on the row index instead of on columns; only index labels present
# in both frames (1 and 2) survive, and every overlapping column is suffixed.
pd.merge(left, right, left_index=True, right_index=True)
Out[10]:
key1_x key2_x lval1 key1_y key2_y rval1
1 b y 1 a x 6
2 c z 2 b a 7
#### 1.2 More on merge ####
# inner: intersection of the keys of the two DataFrames
# outer: union of the keys of the two DataFrames
# left:  use only the keys of the left DataFrame
# right: use only the keys of the right DataFrame
# `how` selects the join type; 'outer' is a full outer join
left.merge(right,how='outer')
Out[11]:
key1 key2 lval1 rval1
0 a x 0.0 6.0
1 b y 1.0 NaN
2 c z 2.0 8.0
3 b a NaN 7.0
# Left join: keep every row of `left`; rval1 is NaN where `right` has no match.
left.merge(right,how='left')
Out[12]:
key1 key2 lval1 rval1
0 a x 0 6.0
1 b y 1 NaN
2 c z 2 8.0
# Right join: keep every row of `right`; lval1 is NaN where `left` has no match.
left.merge(right,how='right')
Out[13]:
key1 key2 lval1 rval1
0 a x 0.0 6
1 c z 2.0 8
2 b a NaN 7
#### 1.3 join aligns on index labels ####
# Columns that exist in both frames must be given suffixes to stay distinct
# (lsuffix='_left', rsuffix='_right'). With no `how`, join keeps every index
# label of `left` and fills the right-hand columns with NaN where unmatched.
left.join(right, lsuffix='_left', rsuffix='_right')
Out[14]:
key1_left key2_left lval1 key1_right key2_right rval1
0 a x 0 NaN NaN NaN
1 b y 1 a x 6.0
2 c z 2 b a 7.0
# Outer join on the index: the result covers the union of index labels (0-3).
left.join(right,how='outer', lsuffix='_left', rsuffix='_right')
Out[15]:
key1_left key2_left lval1 key1_right key2_right rval1
0 a x 0.0 NaN NaN NaN
1 b y 1.0 a x 6.0
2 c z 2.0 b a 7.0
3 NaN NaN NaN c z 8.0
# Inner join on the index: only labels present in both frames (1 and 2) remain.
left.join(right, lsuffix='_left', rsuffix='_right', how='inner')
Out[16]:
key1_left key2_left lval1 key1_right key2_right rval1
1 b y 1 a x 6
2 c z 2 b a 7
#### 1.4 concat ####
# Two 3x3 integer frames with identical column labels, holding 0-8 and 9-17.
df1 = pd.DataFrame(data=np.arange(0, 9).reshape((3, -1)), columns=['a', 'b', 'c'])
df2 = pd.DataFrame(data=np.arange(9, 18).reshape((3, -1)), columns=['a', 'b', 'c'])
df1
Out[17]:
a b c
0 0 1 2
1 3 4 5
2 6 7 8
df2
Out[18]:
a b c
0 9 10 11
1 12 13 14
2 15 16 17
# Stack df2 underneath df1; each frame keeps its own row labels, so 0-2 repeat.
pd.concat([df1, df2])
Out[19]:
a b c
0 0 1 2
1 3 4 5
2 6 7 8
0 9 10 11
1 12 13 14
2 15 16 17
# Rebuild the two frames so their column labels only partly overlap
# (df1 has a/b/c, df2 has a/c/d).
df1 = pd.DataFrame(data=np.arange(0, 9).reshape((3, -1)), columns=['a', 'b', 'c'])
df2 = pd.DataFrame(data=np.arange(9, 18).reshape((3, -1)), columns=['a', 'c', 'd'])
df1
Out[20]:
a b c
0 0 1 2
1 3 4 5
2 6 7 8
df2
Out[21]:
a c d
0 9 10 11
1 12 13 14
2 15 16 17
# concat defaults to an outer join on the columns: the result has the union
# a/b/c/d, with NaN where a frame lacks a column.
pd.concat([df1, df2])
Out[22]:
a b c d
0 0 1.0 2 NaN
1 3 4.0 5 NaN
2 6 7.0 8 NaN
0 9 NaN 10 11.0
1 12 NaN 13 14.0
2 15 NaN 16 17.0
# keys= adds an outer index level that labels which input frame each row came from.
c = pd.concat([df1, df2], keys=['df1', 'df2'])
c
Out[23]:
a b c d
df1 0 0 1.0 2 NaN
1 3 4.0 5 NaN
2 6 7.0 8 NaN
df2 0 9 NaN 10 11.0
1 12 NaN 13 14.0
2 15 NaN 16 17.0
# Select the rows that came from df1 through the outer index level.
c.loc['df1']
Out[24]:
a b c d
0 0 1.0 2 NaN
1 3 4.0 5 NaN
2 6 7.0 8 NaN
# Then pick the row labelled 0 inside that group; it comes back as a Series.
c.loc['df1'].loc[0]
Out[25]:
a 0.0
b 1.0
c 2.0
d NaN
Name: 0, dtype: float64
# axis=1 concatenates side by side (column-wise); duplicate column labels are kept.
pd.concat([df1, df2], axis=1)
Out[26]:
a b c a c d
0 0 1 2 9 10 11
1 3 4 5 12 13 14
2 6 7 8 15 16 17