pandas_day03

import numpy as np
import pandas as pd
# Load only the six listed columns from the CSV file.
wanted_columns = ['School', 'Grade', 'Name', 'Gender', 'Weight', 'Transfer']
df = pd.read_csv('learn_pandas.csv', usecols=wanted_columns)
# Use the Name column as the row index.
df_demo = df.set_index(keys='Name')
df_demo
School Grade Gender Weight Transfer
Name
Gaopeng Yang Shanghai Jiao Tong University Freshman Female 46.0 N
Changqiang You Peking University Freshman Male 70.0 N
Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N
Xiaojuan Sun Fudan University Sophomore Female 41.0 N
Gaojuan You Fudan University Sophomore Male 74.0 N
... ... ... ... ... ...
Xiaojuan Sun Fudan University Junior Female 46.0 N
Li Zhao Tsinghua University Senior Female 50.0 N
Chengqiang Chu Shanghai Jiao Tong University Senior Female 45.0 N
Chengmei Shen Shanghai Jiao Tong University Senior Male 71.0 N
Chunpeng Lv Tsinghua University Sophomore Male 51.0 N

200 rows × 5 columns

# Work on a copy and relabel its rows with a descending integer index
# (n, n-1, ..., 1, where n is the number of rows).
df_loc_slice_demo = df_demo.copy()
row_count = len(df_demo)
df_loc_slice_demo.index = range(row_count, 0, -1)
df_loc_slice_demo.index
RangeIndex(start=200, stop=0, step=-1)
# Label-based slicing follows the order of the index: on this descending
# index, 5:3 returns the rows labelled 5, 4 and 3 (both endpoints included).
df_loc_slice_demo.loc[5:3]
School Grade Gender Weight Transfer
5 Fudan University Junior Female 46.0 N
4 Tsinghua University Senior Female 50.0 N
3 Shanghai Jiao Tong University Senior Female 45.0 N
np.random.seed(0)
# Row index: every (school, gender) pair, built as a Cartesian product.
school_codes = list('ABCD')
gender_values = df.Gender.unique()
multi_index = pd.MultiIndex.from_product(
    [school_codes, gender_values],
    names=['School', 'Gender'],
)
multi_index
MultiIndex([('A', 'Female'),
            ('A',   'Male'),
            ('B', 'Female'),
            ('B',   'Male'),
            ('C', 'Female'),
            ('C',   'Male'),
            ('D', 'Female'),
            ('D',   'Male')],
           names=['School', 'Gender'])
# Column index: every (indicator, grade) pair, built as a Cartesian product.
indicator_names = ['Height', 'Weight']
grade_values = df.Grade.unique()
multi_column = pd.MultiIndex.from_product(
    [indicator_names, grade_values],
    names=['Indicator', 'Grade'],
)
multi_column
MultiIndex([('Height',  'Freshman'),
            ('Height',    'Senior'),
            ('Height', 'Sophomore'),
            ('Height',    'Junior'),
            ('Weight',  'Freshman'),
            ('Weight',    'Senior'),
            ('Weight', 'Sophomore'),
            ('Weight',    'Junior')],
           names=['Indicator', 'Grade'])
# Draw the height block first and the weight block second so the random
# stream is consumed in the same order, then place them side by side.
height_values = np.random.rand(8, 4) * 5 + 163
weight_values = np.random.randn(8, 4) * 5 + 65
df_multi = pd.DataFrame(
    np.c_[height_values, weight_values],
    index=multi_index,
    columns=multi_column,
).round(1)
df_multi
Indicator Height Weight
Grade Freshman Senior Sophomore Junior Freshman Senior Sophomore Junior
School Gender
A Female 165.7 166.6 166.0 165.7 76.3 57.7 65.2 64.1
Male 165.1 166.2 165.2 167.5 72.7 72.3 65.8 66.9
B Female 167.8 164.9 167.0 165.6 60.6 55.1 63.3 65.8
Male 165.8 167.6 163.4 163.4 71.2 71.0 63.1 63.5
C Female 163.1 167.2 166.9 167.4 59.8 57.9 56.5 74.8
Male 167.9 167.0 165.3 166.9 62.5 62.8 58.7 68.9
D Female 163.6 166.2 163.7 167.7 56.9 63.9 60.5 66.9
Male 165.6 165.1 164.3 166.9 62.4 59.1 64.9 67.1

3.3 索引的常用方法
3.3.1 索引层的交换和删除

import numpy as np
import pandas as pd
np.random.seed(0)
L1 = ['A', 'B']
L2 = ['a', 'b']
L3 = ['alpha', 'beta']
# from_product builds the Cartesian product of the three label lists,
# giving a three-level row index.
mul_index1 = pd.MultiIndex.from_product(
    iterables=[L1, L2, L3],
    names=['Upper', 'Lower', 'Extra'],
)
mul_index1
MultiIndex([('A', 'a', 'alpha'),
            ('A', 'a',  'beta'),
            ('A', 'b', 'alpha'),
            ('A', 'b',  'beta'),
            ('B', 'a', 'alpha'),
            ('B', 'a',  'beta'),
            ('B', 'b', 'alpha'),
            ('B', 'b',  'beta')],
           names=['Upper', 'Lower', 'Extra'])
L4 = ['C', 'D']
L5 = ['c', 'd']
L6 = ['cat', 'dog']
# Three-level column index built as the Cartesian product of the label lists.
mul_index2 = pd.MultiIndex.from_product(
    iterables=[L4, L5, L6],
    names=['Big', 'Small', 'Other'],
)
mul_index2
MultiIndex([('C', 'c', 'cat'),
            ('C', 'c', 'dog'),
            ('C', 'd', 'cat'),
            ('C', 'd', 'dog'),
            ('D', 'c', 'cat'),
            ('D', 'c', 'dog'),
            ('D', 'd', 'cat'),
            ('D', 'd', 'dog')],
           names=['Big', 'Small', 'Other'])
# 8x8 table of random integers drawn from [-9, 10), with mul_index1 as the
# row index and mul_index2 as the column index.
random_values = np.random.randint(low=-9, high=10, size=(8, 8))
df_ex = pd.DataFrame(random_values, index=mul_index1, columns=mul_index2)
df_ex
Big C D
Small c d c d
Other cat dog cat dog cat dog cat dog
Upper Lower Extra
A a alpha 3 6 -9 -6 -6 -2 0 9
beta -5 -3 3 -8 -3 -2 5 8
b alpha -4 4 -1 0 7 -4 6 6
beta -9 9 -6 8 5 -2 -9 -8
B a alpha 0 -9 1 -6 2 9 -7 -9
beta -9 -5 -4 -3 -1 8 6 -5
b alpha 0 1 -8 -8 -2 0 -6 -3
beta 2 5 9 -9 5 -6 3 1

3.4 索引运算
由于集合的元素是互异的,而索引中可能存在重复元素,因此应先用 unique 去重,再进行集合运算。

# Three rows labelled by a non-unique index named "id1"
# (the label 'a' appears twice).
df_set_1 = pd.DataFrame(
    data=[[0, 1], [1, 2], [3, 4]],
    index=pd.Index(['a', 'b', 'a'], name='id1'),
)
df_set_1
0 1
id1
a 0 1
b 1 2
a 3 4
# Three rows labelled by a non-unique index named "id2"
# (the label 'b' appears twice).
df_set_2 = pd.DataFrame(
    data=[[4, 5], [2, 6], [7, 1]],
    index=pd.Index(['b', 'b', 'c'], name='id2'),
)
df_set_2
0 1
id2
b 4 5
b 2 6
c 7 1
# Drop duplicate labels so each index can be treated as a set.
id1 = df_set_1.index.unique()
id2 = df_set_2.index.unique()
id1, id2
(Index(['a', 'b'], dtype='object', name='id1'),
 Index(['b', 'c'], dtype='object', name='id2'))
id1.intersection(id2) # set intersection: labels present in both id1 and id2
Index(['b'], dtype='object')
id1.union(id2)  # set union: labels present in id1 or id2
Index(['a', 'b', 'c'], dtype='object')
id1.difference(id2) # set difference: labels in id1 that are not in id2
Index(['a'], dtype='object')
id1.symmetric_difference(id2) # symmetric difference: labels in exactly one of id1 and id2
Index(['a', 'c'], dtype='object')

若两张表需要做集合运算的列并没有被设置索引,一种办法是先转成索引,运算后再恢复,另一种方法是利用isin函数,例如在重置索引的第一张表中选出 id 列交集的所在行:

# Move the id1 index back into an ordinary column, leaving a default integer index.
df_set_in_col_1 = df_set_1.reset_index()
df_set_in_col_1
id1 0 1
0 a 0 1
1 b 1 2
2 a 3 4
# Move the id2 index back into an ordinary column, leaving a default integer index.
df_set_in_col_2 = df_set_2.reset_index()
df_set_in_col_2
id2 0 1
0 b 4 5
1 b 2 6
2 c 7 1
# Keep the rows of the first table whose id1 value also occurs in the
# second table's id2 column.
in_both = df_set_in_col_1.id1.isin(df_set_in_col_2.id2)
df_set_in_col_1[in_both]
id1 0 1
1 b 1 2

3.5.1 公司员工数据集

  1. 分别只使用 query 和 loc 选出年龄不超过四十岁且工作部门为 Dairy 或 Bakery 的男性。
  2. 选出员工 ID 号为奇数所在行的第 1、第 3 和倒数第 2 列。
  3. 按照以下步骤进行索引操作:
    • 把后三列设为索引后交换内外两层
    • 恢复中间一层
    • 修改外层索引名为 Gender
    • 用下划线合并两层行索引
    • 把行索引拆分为原状态
    • 修改索引名为原表名称
    • 恢复默认索引并将列保持为原表的相对位置
# Load the company employee dataset and preview its first three rows.
df = pd.read_csv('company.csv')
df.head(3)
EmployeeID birthdate_key age city_name department job_title gender
0 1318 1/3/1954 61 Vancouver Executive CEO M
1 1319 1/3/1957 58 Vancouver Executive VP Stores F
2 1320 1/2/1955 60 Vancouver Executive Legal Counsel F
# Both department values belong to the same column, so compare the column
# against a list; the result contains Dairy rows as well as Bakery rows.
df.query("(age<=40)&(department == ['Dairy', 'Bakery'])&(gender == 'M')")
EmployeeID birthdate_key age city_name department job_title gender
3611 5791 1/14/1975 40 Kelowna Dairy Dairy Person M
3613 5793 1/22/1975 40 Richmond Bakery Baker M
3615 5795 1/30/1975 40 Nanaimo Dairy Dairy Person M
3617 5797 2/3/1975 40 Nanaimo Dairy Dairy Person M
3618 5798 2/4/1975 40 Surrey Dairy Dairy Person M
... ... ... ... ... ... ... ...
6108 8307 10/20/1994 21 Burnaby Dairy Dairy Person M
6113 8312 11/12/1994 21 Burnaby Dairy Dairy Person M
6137 8336 12/31/1994 21 Vancouver Dairy Dairy Person M
6270 6312 5/14/1979 36 Grand Forks Dairy Dairy Person M
6271 6540 2/14/1981 34 Victoria Bakery Baker M

441 rows × 7 columns

# Build each condition as its own boolean mask, then combine them with &.
# (When the comparisons are written inline, each one needs its own
# parentheses because & binds more tightly than <= and ==.)
is_at_most_40 = df.age <= 40
in_target_dept = df.department.isin(['Dairy', 'Bakery'])
is_male = df.gender == 'M'
df.loc[is_at_most_40 & in_target_dept & is_male]
EmployeeID birthdate_key age city_name department job_title gender
3611 5791 1/14/1975 40 Kelowna Dairy Dairy Person M
3613 5793 1/22/1975 40 Richmond Bakery Baker M
3615 5795 1/30/1975 40 Nanaimo Dairy Dairy Person M
3617 5797 2/3/1975 40 Nanaimo Dairy Dairy Person M
3618 5798 2/4/1975 40 Surrey Dairy Dairy Person M
... ... ... ... ... ... ... ...
6108 8307 10/20/1994 21 Burnaby Dairy Dairy Person M
6113 8312 11/12/1994 21 Burnaby Dairy Dairy Person M
6137 8336 12/31/1994 21 Vancouver Dairy Dairy Person M
6270 6312 5/14/1979 36 Grand Forks Dairy Dairy Person M
6271 6540 2/14/1981 34 Victoria Bakery Baker M

441 rows × 7 columns

# Boolean NumPy array marking the rows whose EmployeeID is odd.
(df.EmployeeID%2 == 1).values
array([False,  True, False, ...,  True, False, False])
# iloc does not accept a boolean Series as a row mask, so pass the underlying
# array. Columns 0, 2 and -2 are the 1st, 3rd and second-to-last columns.
odd_id_mask = (df.EmployeeID % 2 == 1).values
df.iloc[odd_id_mask, [0, 2, -2]]
EmployeeID age job_title
1 1319 58 VP Stores
3 1321 56 VP Human Resources
5 1323 53 Exec Assistant, VP Stores
6 1325 51 Exec Assistant, Legal Counsel
8 1329 48 Store Manager
... ... ... ...
6276 7659 26 Cashier
6277 7741 25 Cashier
6278 7801 25 Dairy Person
6280 8181 22 Cashier
6281 8223 21 Cashier

3126 rows × 3 columns

df_new = df.copy()
# Turn the last three columns into a three-level row index, then swap its
# outermost and innermost levels.
last_three_columns = list(df_new.columns[-3:])
df_new = df_new.set_index(last_three_columns).swaplevel(0, 2)
df_new
EmployeeID birthdate_key age city_name
gender job_title department
M CEO Executive 1318 1/3/1954 61 Vancouver
F VP Stores Executive 1319 1/3/1957 58 Vancouver
Legal Counsel Executive 1320 1/2/1955 60 Vancouver
M VP Human Resources Executive 1321 1/2/1959 56 Vancouver
VP Finance Executive 1322 1/9/1958 57 Vancouver
... ... ... ... ... ... ...
F Cashier Customer Service 8036 8/9/1992 23 New Westminister
M Cashier Customer Service 8181 9/26/1993 22 Prince George
Customer Service 8223 2/11/1994 21 Trail
F Cashier Customer Service 8226 2/16/1994 21 Victoria
Customer Service 8264 6/13/1994 21 Vancouver

6284 rows × 4 columns

# Move the middle index level (position 1) back into the columns.
df_new = df_new.reset_index(level=1)
df_new
job_title EmployeeID birthdate_key age city_name
gender department
M Executive CEO 1318 1/3/1954 61 Vancouver
F Executive VP Stores 1319 1/3/1957 58 Vancouver
Executive Legal Counsel 1320 1/2/1955 60 Vancouver
M Executive VP Human Resources 1321 1/2/1959 56 Vancouver
Executive VP Finance 1322 1/9/1958 57 Vancouver
... ... ... ... ... ... ...
F Customer Service Cashier 8036 8/9/1992 23 New Westminister
M Customer Service Cashier 8181 9/26/1993 22 Prince George
Customer Service Cashier 8223 2/11/1994 21 Trail
F Customer Service Cashier 8226 2/16/1994 21 Victoria
Customer Service Cashier 8264 6/13/1994 21 Vancouver

6284 rows × 5 columns

# Rename the outer index level from "gender" to "Gender". rename_axis returns
# a new DataFrame, so the result is assigned back to keep the change.
df_new = df_new.rename_axis(index={'gender': 'Gender'})
df_new
job_title EmployeeID birthdate_key age city_name
Gender department
M Executive CEO 1318 1/3/1954 61 Vancouver
F Executive VP Stores 1319 1/3/1957 58 Vancouver
Executive Legal Counsel 1320 1/2/1955 60 Vancouver
M Executive VP Human Resources 1321 1/2/1959 56 Vancouver
Executive VP Finance 1322 1/9/1958 57 Vancouver
... ... ... ... ... ... ...
F Customer Service Cashier 8036 8/9/1992 23 New Westminister
M Customer Service Cashier 8181 9/26/1993 22 Prince George
Customer Service Cashier 8223 2/11/1994 21 Trail
F Customer Service Cashier 8226 2/16/1994 21 Victoria
Customer Service Cashier 8264 6/13/1994 21 Vancouver

6284 rows × 5 columns

# Collapse the two index levels into a single flat index by joining each
# (level0, level1) tuple with an underscore.
joined_labels = df_new.index.map('_'.join)
df_new.index = joined_labels
df_new
job_title EmployeeID birthdate_key age city_name
M_Executive CEO 1318 1/3/1954 61 Vancouver
F_Executive VP Stores 1319 1/3/1957 58 Vancouver
F_Executive Legal Counsel 1320 1/2/1955 60 Vancouver
M_Executive VP Human Resources 1321 1/2/1959 56 Vancouver
M_Executive VP Finance 1322 1/9/1958 57 Vancouver
... ... ... ... ... ...
F_Customer Service Cashier 8036 8/9/1992 23 New Westminister
M_Customer Service Cashier 8181 9/26/1993 22 Prince George
M_Customer Service Cashier 8223 2/11/1994 21 Trail
F_Customer Service Cashier 8226 2/16/1994 21 Victoria
F_Customer Service Cashier 8264 6/13/1994 21 Vancouver

6284 rows × 5 columns

# Split every joined label at the underscore and rebuild the multi-level
# row index from the resulting tuples.
split_labels = [tuple(label.split('_')) for label in df_new.index]
df_new.index = pd.MultiIndex.from_tuples(split_labels)
df_new
job_title EmployeeID birthdate_key age city_name
M Executive CEO 1318 1/3/1954 61 Vancouver
F Executive VP Stores 1319 1/3/1957 58 Vancouver
Executive Legal Counsel 1320 1/2/1955 60 Vancouver
M Executive VP Human Resources 1321 1/2/1959 56 Vancouver
Executive VP Finance 1322 1/9/1958 57 Vancouver
... ... ... ... ... ... ...
F Customer Service Cashier 8036 8/9/1992 23 New Westminister
M Customer Service Cashier 8181 9/26/1993 22 Prince George
Customer Service Cashier 8223 2/11/1994 21 Trail
F Customer Service Cashier 8226 2/16/1994 21 Victoria
Customer Service Cashier 8264 6/13/1994 21 Vancouver

6284 rows × 5 columns

# 修改索引名为原表名称,需要指定参数axis
df_new.rename_axis(index = ['gender', 'department'])
job_title EmployeeID birthdate_key age city_name
gender department
M Executive CEO 1318 1/3/1954 61 Vancouver
F Executive VP Stores 1319 1/3/1957 58 Vancouver
Executive Legal Counsel 1320 1/2/1955 60 Vancouver
M Executive VP Human Resources 1321 1/2/1959 56 Vancouver
Executive VP Finance 1322 1/9/1958 57 Vancouver
... ... ... ... ... ... ...
F Customer Service Cashier 8036 8/9/1992 23 New Westminister
M Customer Service Cashier 8181 9/26/1993 22 Prince George
Customer Service Cashier 8223 2/11/1994 21 Trail
F Customer Service Cashier 8226 2/16/1994 21 Victoria
Customer Service Cashier 8264 6/13/1994 21 Vancouver

6284 rows × 5 columns

# 恢复默认索引
df_new.reset_index().reindex(df.columns, axis=1)
EmployeeID birthdate_key age city_name department job_title gender
0 1318 1/3/1954 61 Vancouver NaN CEO NaN
1 1319 1/3/1957 58 Vancouver NaN VP Stores NaN
2 1320 1/2/1955 60 Vancouver NaN Legal Counsel NaN
3 1321 1/2/1959 56 Vancouver NaN VP Human Resources NaN
4 1322 1/9/1958 57 Vancouver NaN VP Finance NaN
... ... ... ... ... ... ... ...
6279 8036 8/9/1992 23 New Westminister NaN Cashier NaN
6280 8181 9/26/1993 22 Prince George NaN Cashier NaN
6281 8223 2/11/1994 21 Trail NaN Cashier NaN
6282 8226 2/16/1994 21 Victoria NaN Cashier NaN
6283 8264 6/13/1994 21 Vancouver NaN Cashier NaN

6284 rows × 7 columns

df_new.equals(df)
False
3.5.2 巧克力数据集

  1. 把列索引名中的\n替换为空格。
  2. 巧克力Rating评分为1至5,每0.25分一档,请选出2.75分及以下且可可含量 Cocoa Percent高于中位数的样本。
  3. 将Review Date和Company Location设为索引后,选出Review Date在2012年之后且Company Location不属于France, Canada, Amsterdam, Belgium的样本。
dfc = pd.read_csv('chocolate.csv')
# Replace the line breaks embedded in the column names with spaces.
dfc.columns = [column_name.replace('\n', ' ') for column_name in dfc.columns]
dfc
Company Review Date Cocoa Percent Company Location Rating
0 A. Morin 2016 63% France 3.75
1 A. Morin 2015 70% France 2.75
2 A. Morin 2015 70% France 3.00
3 A. Morin 2015 70% France 3.50
4 A. Morin 2015 70% France 3.50
... ... ... ... ... ...
1790 Zotter 2011 70% Austria 3.75
1791 Zotter 2011 65% Austria 3.00
1792 Zotter 2011 65% Austria 3.50
1793 Zotter 2011 62% Austria 3.25
1794 Zotter 2010 65% Austria 3.00

1795 rows × 5 columns

# Convert percentage strings such as "70%" into fractions (0.70) with
# vectorized string operations instead of a per-element Python lambda.
# rstrip('%') removes only a trailing percent sign, so a value without the
# sign does not lose its last digit.
dfc['Cocoa Percent'] = dfc['Cocoa Percent'].str.rstrip('%').astype(float) / 100
# Rows rated 2.75 or lower whose cocoa fraction is above the column median.
dfc.query('Rating <=2.75 & `Cocoa Percent` > `Cocoa Percent`.median()').head()
Company Review Date Cocoa Percent Company Location Rating
33 Akesson's (Pralus) 2010 0.75 Switzerland 2.75
34 Akesson's (Pralus) 2010 0.75 Switzerland 2.75
36 Alain Ducasse 2014 0.75 France 2.75
38 Alain Ducasse 2013 0.75 France 2.50
39 Alain Ducasse 2013 0.75 France 2.50
# Same selection with explicit boolean masks: rating of 2.75 or lower and
# cocoa fraction above the column median.
low_rating = dfc['Rating'] <= 2.75
cocoa = dfc['Cocoa Percent']
high_cocoa = cocoa > cocoa.median()
dfc[low_rating & high_cocoa]
Company Review Date Cocoa Percent Company Location Rating
33 Akesson's (Pralus) 2010 0.75 Switzerland 2.75
34 Akesson's (Pralus) 2010 0.75 Switzerland 2.75
36 Alain Ducasse 2014 0.75 France 2.75
38 Alain Ducasse 2013 0.75 France 2.50
39 Alain Ducasse 2013 0.75 France 2.50
... ... ... ... ... ...
1736 Wilkie's Organic 2013 0.89 Ireland 2.75
1738 Wilkie's Organic 2013 0.75 Ireland 2.75
1741 Willie's Cacao 2013 1.00 U.K. 2.25
1769 Zart Pralinen 2016 0.85 Austria 2.75
1778 Zotter 2014 0.80 Austria 2.75

239 rows × 5 columns

# IndexSlice gives slice syntax for selecting on multi-level indexes.
idx = pd.IndexSlice
# Use Review Date and Company Location as a two-level row index, sorted by
# the first level.
dfc= dfc.set_index(['Review Date', 'Company Location']).sort_index(level=0)
dfc.head()
Company Cocoa Percent Rating
Review Date Company Location
2006 Belgium Cote d' Or (Kraft) 0.70 1.00
Belgium Dolfin (Belcolade) 0.70 1.50
Belgium Neuhaus (Callebaut) 0.73 2.00
Belgium Neuhaus (Callebaut) 0.75 2.75
Belgium Neuhaus (Callebaut) 0.71 3.00
# Select rows with Review Date from 2012 onwards (the label slice 2012:
# includes 2012 itself) whose Company Location is not France, Canada,
# Amsterdam or Belgium. difference() yields the remaining location labels,
# which are then used as the selector for the second index level.
dfc.loc[idx[2012:, dfc.index.get_level_values(1).difference(['France', 'Canada', 'Amsterdam', 'Belgium'])], :]
Company Cocoa Percent Rating
Review Date Company Location
2012 Australia Bahen & Co. 0.70 3.00
Australia Bahen & Co. 0.70 2.50
Australia Bahen & Co. 0.70 2.50
Australia Cravve 0.75 3.25
Australia Cravve 0.65 3.25
... ... ... ... ...
2017 U.S.A. Spencer 0.70 3.75
U.S.A. Spencer 0.70 3.50
U.S.A. Spencer 0.70 2.75
U.S.A. Xocolla 0.70 2.75
U.S.A. Xocolla 0.70 2.50

972 rows × 3 columns

你可能感兴趣的:(python初学笔记)