# http://pandas.pydata.org/pandas-docs/stable/advanced.html
# MultiIndex / Advanced Indexing
# pandas 0.22.0
import random

import numpy as np
import pandas as pd

# NOTE: `tuples` is only created further down in this file, so calling
# random.shuffle(tuples) at import time raised NameError. Shuffle the list
# only after it has been built.
# Hierarchical indexing (MultiIndex) 分层索引 多重索引
# 创建多重索引对象,如同标准的索引类,他们存放轴axis标签labels
# 创建方式:
# from a list of arrays -- using MultiIndex.from_arrays
# from an array of tuples -- using MultiIndex.from_tuples
# from a crossed set of iterables -- using MultiIndex.from_product
# Two parallel label lists: position i of each list forms one (first, second) pair.
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]
arrays
# `*arrays` unpacks the outer list, so zip receives arrays[0], arrays[1], ...
zip(*arrays)
list(zip(*arrays))
# Passing the two inner lists explicitly yields the same pairs as unpacking.
list(zip(arrays[0], arrays[1]))
tuples = list(zip(*arrays))
tuples
[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
s = pd.Series(np.random.randn(8), index=index)
s
first second
bar one 0.271000
two -1.276230
baz one -1.018103
two -0.620292
foo one 1.008070
two 0.759145
qux one -2.141050
two -0.927688
dtype: float64
# 更简洁的方式, 当每个元素对都来自于可迭代对象时
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] # 创建了与上面相同的index
pd.MultiIndex.from_product(iterables, names=['first', 'second']) # 多重索引可以接受命名,默认为None
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
# 也可以直接在创建df或series时传入矩阵array 的 列表list
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
arrays
[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
dtype='<U3'),
array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
dtype='<U3')]
s = pd.Series(np.random.randn(8), index=arrays)
s
bar one 0.817791
two 0.510420
baz one -0.494160
two -0.529997
foo one 0.641282
two -0.202762
qux one 0.050320
two 2.097300
dtype: float64
df = pd.DataFrame(np.random.randn(8, 4), index=arrays)
df
|
|
0 |
1 |
2 |
3 |
bar |
one |
-2.090989 |
0.001052 |
1.467637 |
0.267938 |
two |
1.224610 |
0.851894 |
0.765531 |
-0.505116 |
baz |
one |
1.444246 |
-0.247795 |
0.267462 |
-0.945641 |
two |
0.836046 |
0.274732 |
0.530525 |
-0.560081 |
foo |
one |
-3.709465 |
-0.157089 |
0.608778 |
-0.003217 |
two |
-0.848818 |
1.478306 |
-0.389401 |
-1.205956 |
qux |
one |
-1.069775 |
1.272440 |
-0.797613 |
-0.194223 |
two |
1.597218 |
0.454815 |
-0.756022 |
0.481038 |
# 可以对不同的轴向(如行索引/行名或列索引/列名)设置
df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index)
df
first |
bar |
baz |
foo |
qux |
second |
one |
two |
one |
two |
one |
two |
one |
two |
A |
-1.608162 |
-0.007312 |
1.048244 |
-0.029907 |
-0.437866 |
-1.853398 |
2.026875 |
0.359521 |
B |
1.207609 |
-0.272366 |
-0.530191 |
-0.689641 |
-0.244362 |
-1.476252 |
0.818493 |
0.353771 |
C |
-0.369463 |
1.862253 |
-0.118297 |
-0.148326 |
1.147616 |
-1.389965 |
0.817716 |
0.787394 |
# 同时对两个方向设置index,注意index的长度与数据在不同方向上的长度
pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6])
|
first |
bar |
baz |
foo |
|
second |
one |
two |
one |
two |
one |
two |
first |
second |
|
|
|
|
|
|
bar |
one |
0.020204 |
-0.549089 |
0.381830 |
0.326558 |
-1.420590 |
-1.551863 |
two |
1.311775 |
2.294908 |
0.203981 |
1.381199 |
-0.743387 |
2.119027 |
baz |
one |
0.640856 |
1.089627 |
-1.463503 |
0.727607 |
-0.959549 |
-0.037316 |
two |
-0.906859 |
-0.720702 |
0.862614 |
0.082066 |
0.209276 |
-0.391039 |
foo |
one |
-0.328704 |
-1.015117 |
0.279826 |
0.141166 |
-0.053601 |
-1.171920 |
two |
0.342074 |
-0.196049 |
-0.387946 |
0.196228 |
-1.264932 |
0.144251 |
pd.Series(np.random.randn(8), index=tuples) # 多重索引,相当于元组index
(bar, one) -0.267177
(bar, two) -0.239632
(baz, one) 1.212249
(baz, two) 0.289517
(foo, one) 1.311922
(foo, two) -0.797733
(qux, one) -1.395485
(qux, two) -0.451327
dtype: float64
# How the index is displayed can be controlled with the display.multi_sparse option, set through pd.set_option()
pd.set_option('display.multi_sparse', False)
df
pd.set_option('display.multi_sparse', True)
df
first |
bar |
bar |
baz |
baz |
foo |
foo |
qux |
qux |
second |
one |
two |
one |
two |
one |
two |
one |
two |
A |
-1.608162 |
-0.007312 |
1.048244 |
-0.029907 |
-0.437866 |
-1.853398 |
2.026875 |
0.359521 |
B |
1.207609 |
-0.272366 |
-0.530191 |
-0.689641 |
-0.244362 |
-1.476252 |
0.818493 |
0.353771 |
C |
-0.369463 |
1.862253 |
-0.118297 |
-0.148326 |
1.147616 |
-1.389965 |
0.817716 |
0.787394 |
first |
bar |
baz |
foo |
qux |
second |
one |
two |
one |
two |
one |
two |
one |
two |
A |
-1.608162 |
-0.007312 |
1.048244 |
-0.029907 |
-0.437866 |
-1.853398 |
2.026875 |
0.359521 |
B |
1.207609 |
-0.272366 |
-0.530191 |
-0.689641 |
-0.244362 |
-1.476252 |
0.818493 |
0.353771 |
C |
-0.369463 |
1.862253 |
-0.118297 |
-0.148326 |
1.147616 |
-1.389965 |
0.817716 |
0.787394 |
# Reconstructing the level labels 重建层级标签
# The method get_level_values will return a vector of the labels for each location at a particular level:
# get_level_values 方法返回指定层级的标签向量
index.get_level_values(0) # 使用整数序号
index.get_level_values("second") # 使用name
Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')
Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')
# Basic indexing on axis with MultiIndex # 在轴方向的基础索引
df['bar']
df['bar', 'one']
df['bar']['one'] # 不建议使用,链式
s['qux']
second |
one |
two |
A |
-1.608162 |
-0.007312 |
B |
1.207609 |
-0.272366 |
C |
-0.369463 |
1.862253 |
A -1.608162
B 1.207609
C -0.369463
Name: (bar, one), dtype: float64
A -1.608162
B 1.207609
C -0.369463
Name: one, dtype: float64
one 0.05032
two 2.09730
dtype: float64
# Defined Levels 指定层级
df.columns # 原index
df[['foo','qux']].columns # 切片后的结果,层级levels中的项目没有减少,labels减少了
# 这样做避免了重新计算层级,使切片保持高效
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[2, 2, 3, 3], [0, 1, 0, 1]],
names=['first', 'second'])
# 查看切片实际选择levels
df[['foo','qux']].columns.values
df[['foo','qux']].columns.get_level_values(0) # 指定层级
array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
dtype=object)
Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')
# 用有效的used层级重建多重索引
df[['foo','qux']].columns.remove_unused_levels()
MultiIndex(levels=[['foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
names=['first', 'second'])
# Data alignment and using reindex 数据定位和使用reindex
s
bar one 0.817791
two 0.510420
baz one -0.494160
two -0.529997
foo one 0.641282
two -0.202762
qux one 0.050320
two 2.097300
dtype: float64
# 当两个index不同的对象计算时与一般的index一样
s + s[:-2]
s + s[::2]
bar one 1.635582
two 1.020840
baz one -0.988319
two -1.059993
foo one 1.282563
two -0.405523
qux one NaN
two NaN
dtype: float64
bar one 1.635582
two NaN
baz one -0.988319
two NaN
foo one 1.282563
two NaN
qux one 0.100640
two NaN
dtype: float64
# reindex 可以被另外一个 multiindex 或者 元组的list 或 array 调用
index
index[:3]
s.reindex(index[:3])
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1], [0, 1, 0]],
names=['first', 'second'])
first second
bar one 0.817791
two 0.510420
baz one -0.494160
dtype: float64
foo two -0.202762
bar one 0.817791
qux one 0.050320
baz one -0.494160
dtype: float64
# Advanced indexing with hierarchical index
# 使用层次索引的高级索引方法
df = df.T
df
|
|
A |
B |
C |
first |
second |
|
|
|
bar |
one |
-1.608162 |
1.207609 |
-0.369463 |
two |
-0.007312 |
-0.272366 |
1.862253 |
baz |
one |
1.048244 |
-0.530191 |
-0.118297 |
two |
-0.029907 |
-0.689641 |
-0.148326 |
foo |
one |
-0.437866 |
-0.244362 |
1.147616 |
two |
-1.853398 |
-1.476252 |
-1.389965 |
qux |
one |
2.026875 |
0.818493 |
0.817716 |
two |
0.359521 |
0.353771 |
0.787394 |
# .loc 定位
df.loc["bar"]
df.loc["bar", "two"] # 返回了一个series(不是一行,而是“一列”),其索引是原df的列名
|
A |
B |
C |
second |
|
|
|
one |
-1.608162 |
1.207609 |
-0.369463 |
two |
-0.007312 |
-0.272366 |
1.862253 |
A -0.007312
B -0.272366
C 1.862253
Name: (bar, two), dtype: float64
# loc 中使用切片,切片的值可以是元组
df.loc['baz':'foo']
df.loc[('baz', 'two'):('qux', 'one')]
df.loc[('baz', 'two'):'foo']
|
|
A |
B |
C |
first |
second |
|
|
|
baz |
one |
1.048244 |
-0.530191 |
-0.118297 |
two |
-0.029907 |
-0.689641 |
-0.148326 |
foo |
one |
-0.437866 |
-0.244362 |
1.147616 |
two |
-1.853398 |
-1.476252 |
-1.389965 |
|
|
A |
B |
C |
first |
second |
|
|
|
baz |
two |
-0.029907 |
-0.689641 |
-0.148326 |
foo |
one |
-0.437866 |
-0.244362 |
1.147616 |
two |
-1.853398 |
-1.476252 |
-1.389965 |
qux |
one |
2.026875 |
0.818493 |
0.817716 |
|
|
A |
B |
C |
first |
second |
|
|
|
baz |
two |
-0.029907 |
-0.689641 |
-0.148326 |
foo |
one |
-0.437866 |
-0.244362 |
1.147616 |
two |
-1.853398 |
-1.476252 |
-1.389965 |
# 可以给loc传入元组或标签列表,取得不连续的索引
df.loc[[('bar', 'two'), ('qux', 'one')]]
|
|
A |
B |
C |
first |
second |
|
|
|
bar |
two |
-0.007312 |
-0.272366 |
1.862253 |
qux |
one |
2.026875 |
0.818493 |
0.817716 |
# Using slicers 使用切片
# 可以用多重索引对象进行切片。可以用 切片值、 标签或标签列表、 布尔索引等选择器
# 可以用slice(None) 选择那一级的所有的内容。不用特别指定所有深度的级别,他们默认是slice(None)
# 注意:使用loc应该规定所有的轴方向,包括行index和列columns。
# 推荐方式:df.loc[(slice('A1','A3'),.....), :] 注意 冒号前面的逗号,逗号前表示行方向切片(选择器),逗号后面表示列方向切片(选择器)
# 不推荐: df.loc[(slice('A1','A3'),.....)] 可能产生歧义
def mklbl(prefix, n):
    """Return ``n`` labels made of ``prefix`` followed by 0 .. n-1.

    Example: ``mklbl("a", 3)`` -> ``['a0', 'a1', 'a2']``.

    Args:
        prefix: Value placed at the start of every label.
        n: Number of labels to generate; 0 yields an empty list.

    Returns:
        A list of ``n`` strings.
    """
    return [f"{prefix}{i}" for i in range(n)]
mklbl("a", 3)
['a0', 'a1', 'a2']
miindex = pd.MultiIndex.from_product([mklbl('A',4),
mklbl('B',2),
mklbl('C',4),
mklbl('D',2)])
miindex # 由列表生成4重(4级)索引对象,共生成4*2*4*2=64行
MultiIndex(levels=[['A0', 'A1', 'A2', 'A3'], ['B0', 'B1'], ['C0', 'C1', 'C2', 'C3'], ['D0', 'D1']],
labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])
# Two-level column index built from explicit tuples: four columns in total,
# with the two levels named 'lvl0' and 'lvl1'.
micolumns = pd.MultiIndex.from_tuples(
    [
        ('a', 'foo'),
        ('a', 'bar'),
        ('b', 'foo'),
        ('b', 'bah'),
    ],
    names=['lvl0', 'lvl1'],
)
micolumns
MultiIndex(levels=[['a', 'b'], ['bah', 'bar', 'foo']],
labels=[[0, 0, 1, 1], [2, 1, 2, 0]],
names=['lvl0', 'lvl1'])
row_l = len(miindex)
col_l = len(micolumns)
dfmi = pd.DataFrame(np.arange(row_l * col_l).reshape((row_l, col_l)),
index=miindex,
columns=micolumns).sort_index().sort_index(axis=1)
dfmi
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
bar |
foo |
bah |
foo |
A0 |
B0 |
C0 |
D0 |
1 |
0 |
3 |
2 |
D1 |
5 |
4 |
7 |
6 |
C1 |
D0 |
9 |
8 |
11 |
10 |
D1 |
13 |
12 |
15 |
14 |
C2 |
D0 |
17 |
16 |
19 |
18 |
D1 |
21 |
20 |
23 |
22 |
C3 |
D0 |
25 |
24 |
27 |
26 |
D1 |
29 |
28 |
31 |
30 |
B1 |
C0 |
D0 |
33 |
32 |
35 |
34 |
D1 |
37 |
36 |
39 |
38 |
C1 |
D0 |
41 |
40 |
43 |
42 |
D1 |
45 |
44 |
47 |
46 |
C2 |
D0 |
49 |
48 |
51 |
50 |
D1 |
53 |
52 |
55 |
54 |
C3 |
D0 |
57 |
56 |
59 |
58 |
D1 |
61 |
60 |
63 |
62 |
A1 |
B0 |
C0 |
D0 |
65 |
64 |
67 |
66 |
D1 |
69 |
68 |
71 |
70 |
C1 |
D0 |
73 |
72 |
75 |
74 |
D1 |
77 |
76 |
79 |
78 |
C2 |
D0 |
81 |
80 |
83 |
82 |
D1 |
85 |
84 |
87 |
86 |
C3 |
D0 |
89 |
88 |
91 |
90 |
D1 |
93 |
92 |
95 |
94 |
B1 |
C0 |
D0 |
97 |
96 |
99 |
98 |
D1 |
101 |
100 |
103 |
102 |
C1 |
D0 |
105 |
104 |
107 |
106 |
D1 |
109 |
108 |
111 |
110 |
C2 |
D0 |
113 |
112 |
115 |
114 |
D1 |
117 |
116 |
119 |
118 |
... |
... |
... |
... |
... |
... |
... |
... |
A2 |
B0 |
C1 |
D0 |
137 |
136 |
139 |
138 |
D1 |
141 |
140 |
143 |
142 |
C2 |
D0 |
145 |
144 |
147 |
146 |
D1 |
149 |
148 |
151 |
150 |
C3 |
D0 |
153 |
152 |
155 |
154 |
D1 |
157 |
156 |
159 |
158 |
B1 |
C0 |
D0 |
161 |
160 |
163 |
162 |
D1 |
165 |
164 |
167 |
166 |
C1 |
D0 |
169 |
168 |
171 |
170 |
D1 |
173 |
172 |
175 |
174 |
C2 |
D0 |
177 |
176 |
179 |
178 |
D1 |
181 |
180 |
183 |
182 |
C3 |
D0 |
185 |
184 |
187 |
186 |
D1 |
189 |
188 |
191 |
190 |
A3 |
B0 |
C0 |
D0 |
193 |
192 |
195 |
194 |
D1 |
197 |
196 |
199 |
198 |
C1 |
D0 |
201 |
200 |
203 |
202 |
D1 |
205 |
204 |
207 |
206 |
C2 |
D0 |
209 |
208 |
211 |
210 |
D1 |
213 |
212 |
215 |
214 |
C3 |
D0 |
217 |
216 |
219 |
218 |
D1 |
221 |
220 |
223 |
222 |
B1 |
C0 |
D0 |
225 |
224 |
227 |
226 |
D1 |
229 |
228 |
231 |
230 |
C1 |
D0 |
233 |
232 |
235 |
234 |
D1 |
237 |
236 |
239 |
238 |
C2 |
D0 |
241 |
240 |
243 |
242 |
D1 |
245 |
244 |
247 |
246 |
C3 |
D0 |
249 |
248 |
251 |
250 |
D1 |
253 |
252 |
255 |
254 |
64 rows × 4 columns
# Basic multi-index slicing using slices, lists, and labels.
dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :] # slice('A1','A3') 相当于 ['A1':'A3']
# dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']) :] # 错误 冒号前面必须有逗号
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
bar |
foo |
bah |
foo |
A1 |
B0 |
C1 |
D0 |
73 |
72 |
75 |
74 |
D1 |
77 |
76 |
79 |
78 |
C3 |
D0 |
89 |
88 |
91 |
90 |
D1 |
93 |
92 |
95 |
94 |
B1 |
C1 |
D0 |
105 |
104 |
107 |
106 |
D1 |
109 |
108 |
111 |
110 |
C3 |
D0 |
121 |
120 |
123 |
122 |
D1 |
125 |
124 |
127 |
126 |
A2 |
B0 |
C1 |
D0 |
137 |
136 |
139 |
138 |
D1 |
141 |
140 |
143 |
142 |
C3 |
D0 |
153 |
152 |
155 |
154 |
D1 |
157 |
156 |
159 |
158 |
B1 |
C1 |
D0 |
169 |
168 |
171 |
170 |
D1 |
173 |
172 |
175 |
174 |
C3 |
D0 |
185 |
184 |
187 |
186 |
D1 |
189 |
188 |
191 |
190 |
A3 |
B0 |
C1 |
D0 |
201 |
200 |
203 |
202 |
D1 |
205 |
204 |
207 |
206 |
C3 |
D0 |
217 |
216 |
219 |
218 |
D1 |
221 |
220 |
223 |
222 |
B1 |
C1 |
D0 |
233 |
232 |
235 |
234 |
D1 |
237 |
236 |
239 |
238 |
C3 |
D0 |
249 |
248 |
251 |
250 |
D1 |
253 |
252 |
255 |
254 |
# You can use a pd.IndexSlice to have a more natural syntax using :
# rather than using slice(None)
# 使用pd.IndexSlice 可以用冒号 : 代替 slice(None)
idx = pd.IndexSlice
# idx[...] lets a bare `:` stand for "everything on this level"; a plain tuple key would have to spell that as slice(None).
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
foo |
foo |
A0 |
B0 |
C1 |
D0 |
8 |
10 |
D1 |
12 |
14 |
C3 |
D0 |
24 |
26 |
D1 |
28 |
30 |
B1 |
C1 |
D0 |
40 |
42 |
D1 |
44 |
46 |
C3 |
D0 |
56 |
58 |
D1 |
60 |
62 |
A1 |
B0 |
C1 |
D0 |
72 |
74 |
D1 |
76 |
78 |
C3 |
D0 |
88 |
90 |
D1 |
92 |
94 |
B1 |
C1 |
D0 |
104 |
106 |
D1 |
108 |
110 |
C3 |
D0 |
120 |
122 |
D1 |
124 |
126 |
A2 |
B0 |
C1 |
D0 |
136 |
138 |
D1 |
140 |
142 |
C3 |
D0 |
152 |
154 |
D1 |
156 |
158 |
B1 |
C1 |
D0 |
168 |
170 |
D1 |
172 |
174 |
C3 |
D0 |
184 |
186 |
D1 |
188 |
190 |
A3 |
B0 |
C1 |
D0 |
200 |
202 |
D1 |
204 |
206 |
C3 |
D0 |
216 |
218 |
D1 |
220 |
222 |
B1 |
C1 |
D0 |
232 |
234 |
D1 |
236 |
238 |
C3 |
D0 |
248 |
250 |
D1 |
252 |
254 |
# 一次执行复杂选取
dfmi.loc['A1', (slice(None), 'foo')]
|
|
lvl0 |
a |
b |
|
|
lvl1 |
foo |
foo |
B0 |
C0 |
D0 |
64 |
66 |
D1 |
68 |
70 |
C1 |
D0 |
72 |
74 |
D1 |
76 |
78 |
C2 |
D0 |
80 |
82 |
D1 |
84 |
86 |
C3 |
D0 |
88 |
90 |
D1 |
92 |
94 |
B1 |
C0 |
D0 |
96 |
98 |
D1 |
100 |
102 |
C1 |
D0 |
104 |
106 |
D1 |
108 |
110 |
C2 |
D0 |
112 |
114 |
D1 |
116 |
118 |
C3 |
D0 |
120 |
122 |
D1 |
124 |
126 |
dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
foo |
foo |
A0 |
B0 |
C1 |
D0 |
8 |
10 |
D1 |
12 |
14 |
C3 |
D0 |
24 |
26 |
D1 |
28 |
30 |
B1 |
C1 |
D0 |
40 |
42 |
D1 |
44 |
46 |
C3 |
D0 |
56 |
58 |
D1 |
60 |
62 |
A1 |
B0 |
C1 |
D0 |
72 |
74 |
D1 |
76 |
78 |
C3 |
D0 |
88 |
90 |
D1 |
92 |
94 |
B1 |
C1 |
D0 |
104 |
106 |
D1 |
108 |
110 |
C3 |
D0 |
120 |
122 |
D1 |
124 |
126 |
A2 |
B0 |
C1 |
D0 |
136 |
138 |
D1 |
140 |
142 |
C3 |
D0 |
152 |
154 |
D1 |
156 |
158 |
B1 |
C1 |
D0 |
168 |
170 |
D1 |
172 |
174 |
C3 |
D0 |
184 |
186 |
D1 |
188 |
190 |
A3 |
B0 |
C1 |
D0 |
200 |
202 |
D1 |
204 |
206 |
C3 |
D0 |
216 |
218 |
D1 |
220 |
222 |
B1 |
C1 |
D0 |
232 |
234 |
D1 |
236 |
238 |
C3 |
D0 |
248 |
250 |
D1 |
252 |
254 |
# Using a boolean indexer you can provide selection related to the values.
# 使用布尔索引
mask = dfmi[('a', 'foo')] > 200
mask
dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]
A0 B0 C0 D0 False
D1 False
C1 D0 False
D1 False
C2 D0 False
D1 False
C3 D0 False
D1 False
B1 C0 D0 False
D1 False
C1 D0 False
D1 False
C2 D0 False
D1 False
C3 D0 False
D1 False
A1 B0 C0 D0 False
D1 False
C1 D0 False
D1 False
C2 D0 False
D1 False
C3 D0 False
D1 False
B1 C0 D0 False
D1 False
C1 D0 False
D1 False
C2 D0 False
D1 False
...
A2 B0 C1 D0 False
D1 False
C2 D0 False
D1 False
C3 D0 False
D1 False
B1 C0 D0 False
D1 False
C1 D0 False
D1 False
C2 D0 False
D1 False
C3 D0 False
D1 False
A3 B0 C0 D0 False
D1 False
C1 D0 False
D1 True
C2 D0 True
D1 True
C3 D0 True
D1 True
B1 C0 D0 True
D1 True
C1 D0 True
D1 True
C2 D0 True
D1 True
C3 D0 True
D1 True
Name: (a, foo), Length: 64, dtype: bool
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
foo |
foo |
A3 |
B0 |
C1 |
D1 |
204 |
206 |
C3 |
D0 |
216 |
218 |
D1 |
220 |
222 |
B1 |
C1 |
D0 |
232 |
234 |
D1 |
236 |
238 |
C3 |
D0 |
248 |
250 |
D1 |
252 |
254 |
# 指定轴参数axis,说明传入的切片在一个轴上
dfmi.loc(axis=0)[:, :, ['C1', 'C3']]
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
bar |
foo |
bah |
foo |
A0 |
B0 |
C1 |
D0 |
9 |
8 |
11 |
10 |
D1 |
13 |
12 |
15 |
14 |
C3 |
D0 |
25 |
24 |
27 |
26 |
D1 |
29 |
28 |
31 |
30 |
B1 |
C1 |
D0 |
41 |
40 |
43 |
42 |
D1 |
45 |
44 |
47 |
46 |
C3 |
D0 |
57 |
56 |
59 |
58 |
D1 |
61 |
60 |
63 |
62 |
A1 |
B0 |
C1 |
D0 |
73 |
72 |
75 |
74 |
D1 |
77 |
76 |
79 |
78 |
C3 |
D0 |
89 |
88 |
91 |
90 |
D1 |
93 |
92 |
95 |
94 |
B1 |
C1 |
D0 |
105 |
104 |
107 |
106 |
D1 |
109 |
108 |
111 |
110 |
C3 |
D0 |
121 |
120 |
123 |
122 |
D1 |
125 |
124 |
127 |
126 |
A2 |
B0 |
C1 |
D0 |
137 |
136 |
139 |
138 |
D1 |
141 |
140 |
143 |
142 |
C3 |
D0 |
153 |
152 |
155 |
154 |
D1 |
157 |
156 |
159 |
158 |
B1 |
C1 |
D0 |
169 |
168 |
171 |
170 |
D1 |
173 |
172 |
175 |
174 |
C3 |
D0 |
185 |
184 |
187 |
186 |
D1 |
189 |
188 |
191 |
190 |
A3 |
B0 |
C1 |
D0 |
201 |
200 |
203 |
202 |
D1 |
205 |
204 |
207 |
206 |
C3 |
D0 |
217 |
216 |
219 |
218 |
D1 |
221 |
220 |
223 |
222 |
B1 |
C1 |
D0 |
233 |
232 |
235 |
234 |
D1 |
237 |
236 |
239 |
238 |
C3 |
D0 |
249 |
248 |
251 |
250 |
D1 |
253 |
252 |
255 |
254 |
# 可以使用这种指定轴方向的方式赋值
df2 = dfmi.copy()
df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10
df2
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
bar |
foo |
bah |
foo |
A0 |
B0 |
C0 |
D0 |
1 |
0 |
3 |
2 |
D1 |
5 |
4 |
7 |
6 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
17 |
16 |
19 |
18 |
D1 |
21 |
20 |
23 |
22 |
C3 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
B1 |
C0 |
D0 |
33 |
32 |
35 |
34 |
D1 |
37 |
36 |
39 |
38 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
49 |
48 |
51 |
50 |
D1 |
53 |
52 |
55 |
54 |
C3 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
A1 |
B0 |
C0 |
D0 |
65 |
64 |
67 |
66 |
D1 |
69 |
68 |
71 |
70 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
81 |
80 |
83 |
82 |
D1 |
85 |
84 |
87 |
86 |
C3 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
B1 |
C0 |
D0 |
97 |
96 |
99 |
98 |
D1 |
101 |
100 |
103 |
102 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
113 |
112 |
115 |
114 |
D1 |
117 |
116 |
119 |
118 |
... |
... |
... |
... |
... |
... |
... |
... |
A2 |
B0 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
145 |
144 |
147 |
146 |
D1 |
149 |
148 |
151 |
150 |
C3 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
B1 |
C0 |
D0 |
161 |
160 |
163 |
162 |
D1 |
165 |
164 |
167 |
166 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
177 |
176 |
179 |
178 |
D1 |
181 |
180 |
183 |
182 |
C3 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
A3 |
B0 |
C0 |
D0 |
193 |
192 |
195 |
194 |
D1 |
197 |
196 |
199 |
198 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
209 |
208 |
211 |
210 |
D1 |
213 |
212 |
215 |
214 |
C3 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
B1 |
C0 |
D0 |
225 |
224 |
227 |
226 |
D1 |
229 |
228 |
231 |
230 |
C1 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
C2 |
D0 |
241 |
240 |
243 |
242 |
D1 |
245 |
244 |
247 |
246 |
C3 |
D0 |
-10 |
-10 |
-10 |
-10 |
D1 |
-10 |
-10 |
-10 |
-10 |
64 rows × 4 columns
# You can use a right-hand-side of an alignable object as well.
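# The right-hand side of the assignment may itself be an alignable object (here a DataFrame); it is aligned to the selected rows before being assigned.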
df2 = dfmi.copy()
df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000
df2
|
|
|
lvl0 |
a |
b |
|
|
|
lvl1 |
bar |
foo |
bah |
foo |
A0 |
B0 |
C0 |
D0 |
1 |
0 |
3 |
2 |
D1 |
5 |
4 |
7 |
6 |
C1 |
D0 |
9000 |
8000 |
11000 |
10000 |
D1 |
13000 |
12000 |
15000 |
14000 |
C2 |
D0 |
17 |
16 |
19 |
18 |
D1 |
21 |
20 |
23 |
22 |
C3 |
D0 |
25000 |
24000 |
27000 |
26000 |
D1 |
29000 |
28000 |
31000 |
30000 |
B1 |
C0 |
D0 |
33 |
32 |
35 |
34 |
D1 |
37 |
36 |
39 |
38 |
C1 |
D0 |
41000 |
40000 |
43000 |
42000 |
D1 |
45000 |
44000 |
47000 |
46000 |
C2 |
D0 |
49 |
48 |
51 |
50 |
D1 |
53 |
52 |
55 |
54 |
C3 |
D0 |
57000 |
56000 |
59000 |
58000 |
D1 |
61000 |
60000 |
63000 |
62000 |
A1 |
B0 |
C0 |
D0 |
65 |
64 |
67 |
66 |
D1 |
69 |
68 |
71 |
70 |
C1 |
D0 |
73000 |
72000 |
75000 |
74000 |
D1 |
77000 |
76000 |
79000 |
78000 |
C2 |
D0 |
81 |
80 |
83 |
82 |
D1 |
85 |
84 |
87 |
86 |
C3 |
D0 |
89000 |
88000 |
91000 |
90000 |
D1 |
93000 |
92000 |
95000 |
94000 |
B1 |
C0 |
D0 |
97 |
96 |
99 |
98 |
D1 |
101 |
100 |
103 |
102 |
C1 |
D0 |
105000 |
104000 |
107000 |
106000 |
D1 |
109000 |
108000 |
111000 |
110000 |
C2 |
D0 |
113 |
112 |
115 |
114 |
D1 |
117 |
116 |
119 |
118 |
... |
... |
... |
... |
... |
... |
... |
... |
A2 |
B0 |
C1 |
D0 |
137000 |
136000 |
139000 |
138000 |
D1 |
141000 |
140000 |
143000 |
142000 |
C2 |
D0 |
145 |
144 |
147 |
146 |
D1 |
149 |
148 |
151 |
150 |
C3 |
D0 |
153000 |
152000 |
155000 |
154000 |
D1 |
157000 |
156000 |
159000 |
158000 |
B1 |
C0 |
D0 |
161 |
160 |
163 |
162 |
D1 |
165 |
164 |
167 |
166 |
C1 |
D0 |
169000 |
168000 |
171000 |
170000 |
D1 |
173000 |
172000 |
175000 |
174000 |
C2 |
D0 |
177 |
176 |
179 |
178 |
D1 |
181 |
180 |
183 |
182 |
C3 |
D0 |
185000 |
184000 |
187000 |
186000 |
D1 |
189000 |
188000 |
191000 |
190000 |
A3 |
B0 |
C0 |
D0 |
193 |
192 |
195 |
194 |
D1 |
197 |
196 |
199 |
198 |
C1 |
D0 |
201000 |
200000 |
203000 |
202000 |
D1 |
205000 |
204000 |
207000 |
206000 |
C2 |
D0 |
209 |
208 |
211 |
210 |
D1 |
213 |
212 |
215 |
214 |
C3 |
D0 |
217000 |
216000 |
219000 |
218000 |
D1 |
221000 |
220000 |
223000 |
222000 |
B1 |
C0 |
D0 |
225 |
224 |
227 |
226 |
D1 |
229 |
228 |
231 |
230 |
C1 |
D0 |
233000 |
232000 |
235000 |
234000 |
D1 |
237000 |
236000 |
239000 |
238000 |
C2 |
D0 |
241 |
240 |
243 |
242 |
D1 |
245 |
244 |
247 |
246 |
C3 |
D0 |
249000 |
248000 |
251000 |
250000 |
D1 |
253000 |
252000 |
255000 |
254000 |
64 rows × 4 columns
# Cross-section 断面
# xs 方法 另外提供了一个级别level参数 用来选择多重索引中的部分级别
# xs 当提供轴参数时,也可用于列的选择
df
|
|
A |
B |
C |
first |
second |
|
|
|
bar |
one |
-1.608162 |
1.207609 |
-0.369463 |
two |
-0.007312 |
-0.272366 |
1.862253 |
baz |
one |
1.048244 |
-0.530191 |
-0.118297 |
two |
-0.029907 |
-0.689641 |
-0.148326 |
foo |
one |
-0.437866 |
-0.244362 |
1.147616 |
two |
-1.853398 |
-1.476252 |
-1.389965 |
qux |
one |
2.026875 |
0.818493 |
0.817716 |
two |
0.359521 |
0.353771 |
0.787394 |
df.xs("one", level="second")
df.loc[(slice(None), "one"), :] # 使用切片得到同样的选择
|
A |
B |
C |
first |
|
|
|
bar |
-1.608162 |
1.207609 |
-0.369463 |
baz |
1.048244 |
-0.530191 |
-0.118297 |
foo |
-0.437866 |
-0.244362 |
1.147616 |
qux |
2.026875 |
0.818493 |
0.817716 |
|
|
A |
B |
C |
first |
second |
|
|
|
bar |
one |
-1.608162 |
1.207609 |
-0.369463 |
baz |
one |
1.048244 |
-0.530191 |
-0.118297 |
foo |
one |
-0.437866 |
-0.244362 |
1.147616 |
qux |
one |
2.026875 |
0.818493 |
0.817716 |
df = df.T
df.xs('one', level='second', axis=1) # 在列方向选择
df.loc[:, (slice(None),'one')] # 使用切片方式
first |
bar |
baz |
foo |
qux |
A |
-1.608162 |
1.048244 |
-0.437866 |
2.026875 |
B |
1.207609 |
-0.530191 |
-0.244362 |
0.818493 |
C |
-0.369463 |
-0.118297 |
1.147616 |
0.817716 |
first |
bar |
baz |
foo |
qux |
second |
one |
one |
one |
one |
A |
-1.608162 |
1.048244 |
-0.437866 |
2.026875 |
B |
1.207609 |
-0.530191 |
-0.244362 |
0.818493 |
C |
-0.369463 |
-0.118297 |
1.147616 |
0.817716 |
# xs 方法使用多重关键字keys,关键字元组
df.xs(('one', 'bar'), level=('second', 'first'), axis=1) # 关键字元组可以与级层顺序不一致,与给定level顺序一致
df.loc[:, ("bar", "one")]
# df.loc[:, ("one", "bar")] # 错误,元组元素的顺序与级层顺序一致
first |
bar |
second |
one |
A |
-1.608162 |
B |
1.207609 |
C |
-0.369463 |
A -1.608162
B 1.207609
C -0.369463
Name: (bar, one), dtype: float64
# drop_level=False 参数可以使xs保留选定的层级,而不是舍弃,这样的话,与切片得到的结果完全相同
df.xs('one', level='second', axis=1, drop_level=False) # 默认 drop_level=True
df.loc[:, (slice(None) ,"one")]
first |
bar |
baz |
foo |
qux |
second |
one |
one |
one |
one |
A |
-1.608162 |
1.048244 |
-0.437866 |
2.026875 |
B |
1.207609 |
-0.530191 |
-0.244362 |
0.818493 |
C |
-0.369463 |
-0.118297 |
1.147616 |
0.817716 |
first |
bar |
baz |
foo |
qux |
second |
one |
one |
one |
one |
A |
-1.608162 |
1.048244 |
-0.437866 |
2.026875 |
B |
1.207609 |
-0.530191 |
-0.244362 |
0.818493 |
C |
-0.369463 |
-0.118297 |
1.147616 |
0.817716 |
# Advanced reindexing and alignment 高级索引和定位
# The level argument of the reindex and align methods can be used to broadcast values across a level.
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x','y']],
labels=[[1,1,0,0],[1,0,1,0]]) # 指定了labels,指定了层级间对应关系
midx
MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
labels=[[1, 1, 0, 0], [1, 0, 1, 0]])
df = pd.DataFrame(np.random.randn(4,2), index=midx)
df
|
|
0 |
1 |
one |
y |
-1.388542 |
-1.170054 |
x |
0.240534 |
-0.656707 |
zero |
y |
-0.848351 |
-1.394871 |
x |
-0.212248 |
0.051445 |
# 不同层级索引的广播计算
# Per-group means for a single index level.
# groupby(level=..., sort=False).mean() replaces the `level=` keyword of
# DataFrame.mean, which was deprecated in pandas 1.3 and removed in 2.0.
# sort=False keeps the groups in order of first appearance, which is the
# order mean(level=...) produced.
df2 = df.groupby(level=1, sort=False).mean()
df2
df2 = df.groupby(level=0, sort=False).mean()
df2
|
0 |
1 |
y |
-1.118447 |
-1.282462 |
x |
0.014143 |
-0.302631 |
|
0 |
1 |
one |
-0.574004 |
-0.913381 |
zero |
-0.530300 |
-0.671713 |
# 重构索引
df2.reindex(df.index, level=0)
|
|
0 |
1 |
one |
y |
-0.574004 |
-0.913381 |
x |
-0.574004 |
-0.913381 |
zero |
y |
-0.530300 |
-0.671713 |
x |
-0.530300 |
-0.671713 |
# 定位/对齐
df
df2
# align returns a 2-tuple (left, right); with level=0 the right frame is broadcast over level 0, so both results carry the same two-level index.
df.align(df2, level=0)
# Unpack the tuple to inspect the two aligned frames separately.
df_aligned, df2_aligned = df.align(df2, level=0)
df_aligned
df2_aligned
|
|
0 |
1 |
one |
y |
-1.388542 |
-1.170054 |
x |
0.240534 |
-0.656707 |
zero |
y |
-0.848351 |
-1.394871 |
x |
-0.212248 |
0.051445 |
|
0 |
1 |
one |
-0.574004 |
-0.913381 |
zero |
-0.530300 |
-0.671713 |
( 0 1
one y -1.388542 -1.170054
x 0.240534 -0.656707
zero y -0.848351 -1.394871
x -0.212248 0.051445, 0 1
one y -0.574004 -0.913381
x -0.574004 -0.913381
zero y -0.530300 -0.671713
x -0.530300 -0.671713)
|
|
0 |
1 |
one |
y |
-1.388542 |
-1.170054 |
x |
0.240534 |
-0.656707 |
zero |
y |
-0.848351 |
-1.394871 |
x |
-0.212248 |
0.051445 |
|
|
0 |
1 |
one |
y |
-0.574004 |
-0.913381 |
x |
-0.574004 |
-0.913381 |
zero |
y |
-0.530300 |
-0.671713 |
x |
-0.530300 |
-0.671713 |
# 交换层级 swaplevel()
df.swaplevel(0, 1, axis=0)
|
|
0 |
1 |
y |
one |
-1.388542 |
-1.170054 |
x |
one |
0.240534 |
-0.656707 |
y |
zero |
-0.848351 |
-1.394871 |
x |
zero |
-0.212248 |
0.051445 |
# reorder_levels generalizes swaplevel: it can permute the index levels in a single step
df.reorder_levels([1,0], axis=0) # 看上去结果与swaplevel一样,传入参数不一样
|
|
0 |
1 |
y |
one |
-1.388542 |
-1.170054 |
x |
one |
0.240534 |
-0.656707 |
y |
zero |
-0.848351 |
-1.394871 |
x |
zero |
-0.212248 |
0.051445 |
# 多重索引排序
# Sorting is what makes indexing and slicing efficient. Any index can be sorted with sort_index
# Shuffle the pairs so the resulting MultiIndex is deliberately unsorted;
# otherwise the sort_index calls below would have nothing to reorder.
random.shuffle(tuples)
tuples
s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples))
s
[('baz', 'two'),
('qux', 'two'),
('bar', 'one'),
('foo', 'one'),
('qux', 'one'),
('baz', 'one'),
('foo', 'two'),
('bar', 'two')]
baz two 1.365155
qux two -1.331225
bar one -1.512430
foo one 0.468294
qux one -0.667115
baz one -0.502417
foo two 1.685553
bar two -1.611271
dtype: float64
s.sort_index()
s.sort_index(level=1) # 默认是level=0排序
s.sort_index(level=0)
bar one -1.512430
two -1.611271
baz one -0.502417
two 1.365155
foo one 0.468294
two 1.685553
qux one -0.667115
two -1.331225
dtype: float64
bar one -1.512430
baz one -0.502417
foo one 0.468294
qux one -0.667115
bar two -1.611271
baz two 1.365155
foo two 1.685553
qux two -1.331225
dtype: float64
bar one -1.512430
two -1.611271
baz one -0.502417
two 1.365155
foo one 0.468294
two 1.685553
qux one -0.667115
two -1.331225
dtype: float64
# level参数除了可以用整型序号,还可以使用层级的names
s.index.set_names(['L1', 'L2'], inplace=True)
s
s.sort_index(level="L1")
s.sort_index(level="L2")
L1 L2
baz two 1.365155
qux two -1.331225
bar one -1.512430
foo one 0.468294
qux one -0.667115
baz one -0.502417
foo two 1.685553
bar two -1.611271
dtype: float64
L1 L2
bar one -1.512430
two -1.611271
baz one -0.502417
two 1.365155
foo one 0.468294
two 1.685553
qux one -0.667115
two -1.331225
dtype: float64
L1 L2
bar one -1.512430
baz one -0.502417
foo one 0.468294
qux one -0.667115
bar two -1.611271
baz two 1.365155
foo two 1.685553
qux two -1.331225
dtype: float64
# 可以指定排序的轴方向
df.T
df.T.sort_index(level=1, axis=1)
|
one |
zero |
|
y |
x |
y |
x |
0 |
-1.388542 |
0.240534 |
-0.848351 |
-0.212248 |
1 |
-1.170054 |
-0.656707 |
-1.394871 |
0.051445 |
|
zero |
one |
zero |
one |
|
x |
x |
y |
y |
0 |
-0.212248 |
0.240534 |
-0.848351 |
-1.388542 |
1 |
0.051445 |
-0.656707 |
-1.394871 |
-1.170054 |
# 即使数据没有排序也可以索引,但是这样效率低下。
# 返回值是拷贝
# Small frame whose (jim, joe) pairs are not in sorted order: 'z' comes before 'y'.
dfm = pd.DataFrame(
    {
        'jim': [0, 0, 1, 1],
        'joe': ['x', 'x', 'z', 'y'],
        'jolie': np.random.rand(4),
    }
)
dfm
|
jim |
joe |
jolie |
0 |
0 |
x |
0.844228 |
1 |
0 |
x |
0.317508 |
2 |
1 |
z |
0.413824 |
3 |
1 |
y |
0.074264 |
dfm = dfm.set_index(["jim", "joe"])
dfm
|
|
jolie |
jim |
joe |
|
0 |
x |
0.844228 |
x |
0.317508 |
1 |
z |
0.413824 |
y |
0.074264 |
dfm.loc[(1, "z")] # 会提示PerformanceWarning
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: PerformanceWarning: indexing past lexsort depth may impact performance.
"""Entry point for launching an IPython kernel.
|
|
jolie |
jim |
joe |
|
1 |
z |
0.413824 |
# dfm.loc[(0,'y'):(1, 'z')] # 错误 无法定位
dfm.index.is_lexsorted()
dfm.index.lexsort_depth
False
1
dfm = dfm.sort_index() # 索引排序,默认对所有层级
dfm
|
|
jolie |
jim |
joe |
|
0 |
x |
0.844228 |
x |
0.317508 |
1 |
y |
0.074264 |
z |
0.413824 |
dfm.index.is_lexsorted()
dfm.index.lexsort_depth
dfm.loc[(0,'y'):(1, 'z')]
True
2
|
|
jolie |
jim |
joe |
|
1 |
y |
0.074264 |
z |
0.413824 |
# Take Methods take 方法 (拿、取)
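# Like NumPy ndarrays, pandas Index, Series and DataFrame also provide a take method
# that retrieves elements along a given axis at the given indices (a list or array of integer positions; negative integers are allowed)
# Because take handles a narrower range of inputs, it can be considerably faster than fancy indexing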
index = pd.Index(np.random.randint(0, 1000, 10))
index
Int64Index([523, 532, 386, 998, 832, 71, 965, 274, 389, 59], dtype='int64')
positions = [0, 9, 3]
index[positions]
index.take(positions)
Int64Index([523, 59, 998], dtype='int64')
Int64Index([523, 59, 998], dtype='int64')
ser = pd.Series(np.random.randn(10))
ser
0 0.733196
1 0.975773
2 -0.261602
3 -0.055134
4 0.959253
5 1.189025
6 -0.434102
7 0.653628
8 0.248894
9 -0.203562
dtype: float64
ser.iloc[positions]
ser.take(positions)
0 0.733196
9 -0.203562
3 -0.055134
dtype: float64
0 0.733196
9 -0.203562
3 -0.055134
dtype: float64
# 对DataFrame,indices应该是一个一维 的列表或数组,规定了行或列的位置
frm = pd.DataFrame(np.random.randn(5, 3))
frm
|
0 |
1 |
2 |
0 |
-0.722107 |
-1.758271 |
0.580805 |
1 |
0.555332 |
-0.856173 |
-1.143862 |
2 |
-0.636994 |
1.312340 |
0.046131 |
3 |
-0.154813 |
0.311931 |
0.933192 |
4 |
-1.277001 |
-0.144097 |
-1.871135 |
frm.take([1, 4, 3]) # 默认取行方向
frm.take([0, 2], axis=1)
|
0 |
1 |
2 |
1 |
0.555332 |
-0.856173 |
-1.143862 |
4 |
-1.277001 |
-0.144097 |
-1.871135 |
3 |
-0.154813 |
0.311931 |
0.933192 |
|
0 |
2 |
0 |
-0.722107 |
0.580805 |
1 |
0.555332 |
-1.143862 |
2 |
-0.636994 |
0.046131 |
3 |
-0.154813 |
0.933192 |
4 |
-1.277001 |
-1.871135 |
# 注意:take方法不要用于布尔indices
arr = np.random.randn(10)
arr
array([-0.00772525, 0.95419469, 1.80636718, -2.46742236, -0.025503 ,
0.44203691, 0.48626739, -0.74160374, -0.22453771, 0.8813933 ])
arr.take([False, False, True, True]) # 相当于取了[0,0,1,1]
arr[[0, 1]]
array([-0.00772525, -0.00772525, 0.95419469, 0.95419469])
array([-0.00772525, 0.95419469])
ser = pd.Series(np.random.randn(10))
ser
0 1.782426
1 0.531882
2 -0.339277
3 0.500497
4 -0.333816
5 -1.713753
6 -0.125252
7 -0.857100
8 0.385080
9 1.247962
dtype: float64
ser.take([False, False, True, True]) # 相当于取了[0,0,1,1]
ser.iloc[[0, 1]]
0 1.782426
0 1.782426
1 0.531882
1 0.531882
dtype: float64
0 1.782426
1 0.531882
dtype: float64
# Index Types 索引 index 对象
# 其他一些索引对象
# CategoricalIndex: an index whose values are categorical
# It is useful for supporting indexing with duplicate labels
from pandas.api.types import CategoricalDtype
# Two-column frame: integers 0..5 in A, repeated single-letter labels in B.
df = pd.DataFrame({
    'A': np.arange(6),
    'B': ['a', 'a', 'b', 'b', 'c', 'a'],
})
df  # notebook display
|
A |
B |
0 |
0 |
a |
1 |
1 |
a |
2 |
2 |
b |
3 |
3 |
b |
4 |
4 |
c |
5 |
5 |
a |
# Convert column B to a categorical dtype whose categories are listed explicitly in the order c, a, b.
df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
df
df.dtypes
df.B.cat.categories  # the categories keep the order given above
|
A |
B |
0 |
0 |
a |
1 |
1 |
a |
2 |
2 |
b |
3 |
3 |
b |
4 |
4 |
c |
5 |
5 |
a |
A int32
B category
dtype: object
Index(['c', 'a', 'b'], dtype='object')
# Using the categorical column as the index produces a CategoricalIndex.
df2 = df.set_index('B')
df2
df2.index
|
A |
B |
|
a |
0 |
a |
1 |
b |
2 |
b |
3 |
c |
4 |
a |
5 |
CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')
# When indexing with __getitem__/.iloc/.loc, the key must be one of the categories; otherwise the lookup raises an error (it does not hang)
df2.loc['a']
df2.loc['a'].index # the result is still a CategoricalIndex carrying all of the original categories
df2.sort_index() # sorts in the order given by the categories (c, a, b)
CategoricalIndex(['a', 'a', 'a'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')
|
A |
B |
|
c |
4 |
a |
0 |
a |
1 |
a |
5 |
b |
2 |
b |
3 |
# Group on the (only) index level, i.e. on the categorical labels.
df2.groupby(level=0)
df2.groupby(level=0).sum()
df2.groupby(level=0).sum().index # the grouped result also keeps a CategoricalIndex
CategoricalIndex(['c', 'a', 'b'], categories=['c', 'a', 'b'], ordered=False, name='B', dtype='category')
# NOTE(review): written against pandas 0.22; newer releases are believed to refuse reindexing an axis that has duplicate labels — confirm before rerunning.
df2.reindex(['a','e']) # reindexing with a plain list returns a plain Index
df2.reindex(['a','e']).index
df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) # reindexing with a Categorical returns a CategoricalIndex; categories absent from the original index (such as 'e') are allowed
df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index
|
A |
B |
|
a |
0.0 |
a |
1.0 |
a |
5.0 |
e |
NaN |
Index(['a', 'a', 'a', 'e'], dtype='object', name='B')
|
A |
B |
|
a |
0.0 |
a |
1.0 |
a |
5.0 |
e |
NaN |
CategoricalIndex(['a', 'a', 'a', 'e'], categories=['a', 'b', 'c', 'd', 'e'], ordered=False, name='B', dtype='category')
# 注意:变形和比较操作必须有同样的categories,否则报错
# Int64Index and RangeIndex
# Int64Index 是pandas基础索引。
# RangeIndex is a subclass (not a subset) of Int64Index; it is now the default index for all NDFrame objects.
# Float64Index: passing floats, or a mix of floats and integers, when creating
# an Index yields a float64 index by default.
indexf = pd.Index(data=[1.5, 2, 3, 4.5, 5])
indexf  # notebook display
Float64Index([1.5, 2.0, 3.0, 4.5, 5.0], dtype='float64')
# Integer values 0..4 labelled by the float index built above.
sf = pd.Series(range(5), index=indexf)
sf
1.5 0
2.0 1
3.0 2
4.5 3
5.0 4
dtype: int64
# [] and .loc are label based here; integer keys are converted to floats
sf[1.5:4.5]
sf[1:4]
sf.loc[3] # 3 is matched against the label 3.0, not against position 3
# sf[3.2] # error: a scalar key must be an existing label
1.5 0
2.0 1
3.0 2
4.5 3
dtype: int64
1.5 0
2.0 1
3.0 2
dtype: int64
2
sf.iloc[3] # position based: takes an integer position, floats are not accepted
3
# Example: an irregular table whose index resembles time offsets but is stored as floats.
dfir1 = pd.DataFrame(data=np.random.randn(5, 2),
                     index=250.0 * np.arange(5),
                     columns=['A', 'B'])
dfir1
dfir2 = pd.DataFrame(data=np.random.randn(6, 2),
                     index=250.1 * np.arange(4, 10),
                     columns=['A', 'B'])
dfir2
# Stack the two frames vertically; the float labels are kept unchanged.
dfir = pd.concat([dfir1, dfir2])
dfir
|
A |
B |
0.0 |
1.158461 |
0.595743 |
250.0 |
1.457556 |
0.268541 |
500.0 |
-0.437650 |
-0.299700 |
750.0 |
-1.095812 |
-2.079684 |
1000.0 |
0.242220 |
-0.868812 |
|
A |
B |
1000.4 |
-0.858327 |
-0.364968 |
1250.5 |
-1.445806 |
-2.129608 |
1500.6 |
0.799049 |
1.232102 |
1750.7 |
-1.132538 |
0.283472 |
2000.8 |
-1.157884 |
0.398119 |
2250.9 |
-1.330821 |
-0.563333 |
|
A |
B |
0.0 |
1.158461 |
0.595743 |
250.0 |
1.457556 |
0.268541 |
500.0 |
-0.437650 |
-0.299700 |
750.0 |
-1.095812 |
-2.079684 |
1000.0 |
0.242220 |
-0.868812 |
1000.4 |
-0.858327 |
-0.364968 |
1250.5 |
-1.445806 |
-2.129608 |
1500.6 |
0.799049 |
1.232102 |
1750.7 |
-1.132538 |
0.283472 |
2000.8 |
-1.157884 |
0.398119 |
2250.9 |
-1.330821 |
-0.563333 |
# Select the first second of data (presumably the float index is in milliseconds — confirm); the label slice includes the row labelled 1000.0
dfir[:1000]
|
A |
B |
0.0 |
1.158461 |
0.595743 |
250.0 |
1.457556 |
0.268541 |
500.0 |
-0.437650 |
-0.299700 |
750.0 |
-1.095812 |
-2.079684 |
1000.0 |
0.242220 |
-0.868812 |
# IntervalIndex: an index of intervals (open/closed in the mathematical sense).
# The break points 0..4 produce the four intervals (0, 1], (1, 2], (2, 3], (3, 4].
df = pd.DataFrame(
    {'A': [1, 2, 3, 4]},
    index=pd.IntervalIndex.from_breaks(list(range(5))),
)
df  # notebook display
|
A |
(0, 1] |
1 |
(1, 2] |
2 |
(2, 3] |
3 |
(3, 4] |
4 |
df.loc[2] # a scalar equal to an interval edge selects the interval containing it, here (1, 2]
df.loc[2.5] # a scalar inside an interval selects that interval, here (2, 3]
df.loc[1.5:2.5]
A 2
Name: (1, 2], dtype: int64
A 3
Name: (2, 3], dtype: int64
# Interval and IntervalIndex are used by cut and qcut
# pd.cut and pd.qcut produce Interval values and an IntervalIndex (the categories of their result)
# Bin the values 0..3 into two equal-width intervals.
c = pd.cut(x=range(4), bins=2)
c  # the binned values
c.categories  # the IntervalIndex describing the two bins
[(-0.003, 1.5], (-0.003, 1.5], (1.5, 3.0], (1.5, 3.0]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]
IntervalIndex([(-0.003, 1.5], (1.5, 3.0]]
closed='right',
dtype='interval[float64]')
pd.cut([0, 3, 5, 1], bins=c.categories) # an existing IntervalIndex can be reused as the bins for other data; a value outside every interval (5 here) becomes NaN
[(-0.003, 1.5], (1.5, 3.0], NaN, (-0.003, 1.5]]
Categories (2, interval[float64]): [(-0.003, 1.5] < (1.5, 3.0]]
# Miscellaneous indexing FAQ 杂项 常见问题
# Integer indexing 整数型索引
# An integer index holds labels, so integer keys are treated as labels
# In pandas, labels are generally considered to matter more than integer positions.
# Series on the default integer index: the labels are 0..4.
s = pd.Series(data=range(5))
s
# s[-1]      # raises an exception
# s.loc[-1]  # raises an exception
s.loc[-1:]   # allowed
s.iloc[-1]   # allowed
# The same rules apply to a DataFrame with integer row labels.
df = pd.DataFrame(data=np.random.randn(5, 4))
df
df.loc[-2:]
# df.loc[-2]  # raises an exception
0 0
1 1
2 2
3 3
4 4
dtype: int64
0 0
1 1
2 2
3 3
4 4
dtype: int64
4
|
0 |
1 |
2 |
3 |
0 |
0.021033 |
0.127054 |
-0.864734 |
-1.835828 |
1 |
-0.400611 |
0.594981 |
-1.758866 |
-1.059539 |
2 |
-0.108597 |
0.784000 |
0.306035 |
-0.695933 |
3 |
-0.078048 |
-1.742895 |
-0.159740 |
0.934115 |
4 |
-0.524633 |
0.433224 |
-0.732334 |
0.442827 |
|
0 |
1 |
2 |
3 |
0 |
0.021033 |
0.127054 |
-0.864734 |
-1.835828 |
1 |
-0.400611 |
0.594981 |
-1.758866 |
-1.059539 |
2 |
-0.108597 |
0.784000 |
0.306035 |
-0.695933 |
3 |
-0.078048 |
-1.742895 |
-0.159740 |
0.934115 |
4 |
-0.524633 |
0.433224 |
-0.732334 |
0.442827 |
# Non-monotonic indexes require exact matches 非单调索引要求精确匹配
# 如果series或Dataframe的索引是单调增或单调减的,则基于标签的切片可以超出索引的范围。
# 就像对一般python列表list的索引切片。
# 可以用is_monotonic_increasing和is_monotonic_decreasing测试单调属性
# Frame whose index is weakly increasing (label 3 appears twice).
df = pd.DataFrame(data=list(range(5)), index=[2, 3, 3, 4, 5], columns=['data'])
df
df.index.is_monotonic_increasing  # True for this index
df.index.is_monotonic_decreasing  # False for this index
True
False
df.loc[0:4, :] # labels 0 and 1 do not exist, yet the rows labelled 2, 3 and 4 are returned
df.loc[13:15, :] # entirely out of bounds: returns an empty result
# With a non-monotonic index, both slice bounds must be labels present in the
# index, and each bound label must be unique.
df = pd.DataFrame(data=list(range(6)), index=[2, 3, 1, 4, 3, 5], columns=['data'])
df
df.index.is_monotonic_increasing  # False for this index
|
data |
2 |
0 |
3 |
1 |
1 |
2 |
4 |
3 |
3 |
4 |
5 |
5 |
False
df.loc[2:4, :] # works: both bounds exist in the index and each occurs exactly once
# df.loc[0:4, :] # error: there is no label 0
# df.loc[2:3, :] # error: the bound label 3 is not unique
# Index.is_monotonic_increasing and Index.is_monotonic_decreasing are properties (no call parentheses) that only test weak monotonicity: repeated values are allowed.
# Combine either of them with the Index.is_unique property to test strict monotonicity.
weakly_monotonic = pd.Index(['a', 'b', 'c', 'c'])
weakly_monotonic
weakly_monotonic.is_monotonic_increasing
# Both operands are plain booleans, so the logical `and` is used rather than the bitwise `&`.
weakly_monotonic.is_monotonic_increasing and weakly_monotonic.is_unique
Index(['a', 'b', 'c', 'c'], dtype='object')
True
False
# Endpoints are inclusive 端点(边界)包括在内
# 与标准的python切片(不包括右端点值)相比,pandas中的标签切片包含端点值。
# The main reason is that it is often not possible to easily determine the successor (the next element) of a particular label in an index
# Six standard-normal samples labelled 'a' through 'f'.
s = pd.Series(data=np.random.randn(6), index=['a', 'b', 'c', 'd', 'e', 'f'])
s  # notebook display
a 1.280483
b 1.562738
c 0.904503
d -0.470785
e -0.008048
f -0.413812
dtype: float64
s[2:5] # integer slice: positional, so unlike a label slice the right endpoint is excluded
c 0.904503
d -0.470785
e -0.008048
dtype: float64
# With labels there is no easy way to obtain the label that follows 'e'
# s.loc['c':'e'+1] # error
s.loc['c':'e'] # label slice: both endpoints 'c' and 'e' are included
c 0.904503
d -0.470785
e -0.008048
dtype: float64
# Indexing potentially changes underlying Series dtype
# Indexing may change the dtype of the underlying Series
# The different indexing operation can potentially change the dtype of a Series.
# 不同的索引操作可能会潜在的改变series的类型
# An integer Series; reindexing onto a label it lacks inserts NaN.
series1 = pd.Series(data=[1, 2, 3])
series1.dtype  # integer dtype
series1
res = series1.reindex(index=[0, 4])  # label 4 is missing from series1
res.dtype  # float dtype, because NaN was inserted
res
dtype('int64')
0 1
1 2
2 3
dtype: int64
dtype('float64')
0 1.0
4 NaN
dtype: float64
# A boolean Series; conforming it to series1's longer index inserts NaN.
series2 = pd.Series([True])
series2.dtype # bool
series2
res = series2.reindex_like(series1)
res.dtype # dtype('O'): object dtype (capital letter O, not the digit zero)
res
dtype('bool')
0 True
dtype: bool
dtype('O')
0 True
1 NaN
2 NaN
dtype: object
# 由于默认插入NaN,引起了dtype的改变。
# 这会导致一些问题,当使用如 numpy.logical_and. 的np ufuncs 时
# 2018-02-22