# from http://pandas.pydata.org/pandas-docs/stable/indexing.html
# pandas 0.22.0
import pandas as pd
import numpy as np
"""
尽量利用pandas提供的专用索引方式,而不是python通用的切片方式。
三种主要的索引方式:indexers
.loc label based 基于标签, 可以是标签名,可以是布尔值,可以是一元函数
.iloc integer psition based 基于整数位置(from 0 to length-1 of the axis),和python切片类似
[]
"""
"""
Object Type Selection Return Value Type
------------------------------------------------------------
Series series[label] scalar value
DataFrame frame[colname] Series corresponding to colname
Panel panel[itemname] DataFrame corresponding to the itemname
"""
"""
Object Type     Indexers
Series          s.loc[indexer]
DataFrame       df.loc[row_indexer, column_indexer]
Panel           p.loc[item_indexer, major_indexer, minor_indexer]
"""
# Here we construct a simple time-series data set to illustrate the indexing functionality
dates = pd.date_range('1/1/2000', periods=8)
dates
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
'2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2000-01-01  0.136628  0.324097  0.856313 -0.145259
2000-01-02  0.113819 -0.718630  0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05  0.835318  1.036607 -0.502919  0.878680
2000-01-06 -2.376205  0.362577 -0.484754 -0.478711
2000-01-07  0.193371  1.330468  0.544160  1.030900
2000-01-08  0.476533 -0.476653 -0.434356  0.744500
panel = pd.Panel({'one' : df, 'two' : df - df.mean()}) # Panel: a 3-D container of DataFrames
panel
panel['one']
panel['two']
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D
                   A         B         C         D
2000-01-01  0.136628  0.324097  0.856313 -0.145259
2000-01-02  0.113819 -0.718630  0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05  0.835318  1.036607 -0.502919  0.878680
2000-01-06 -2.376205  0.362577 -0.484754 -0.478711
2000-01-07  0.193371  1.330468  0.544160  1.030900
2000-01-08  0.476533 -0.476653 -0.434356  0.744500

                   A         B         C         D
2000-01-01  0.350419  0.126827  1.251866 -0.047518
2000-01-02  0.327611 -0.915900  0.411770 -1.473522
2000-01-03 -0.389806 -0.340913 -0.747511 -0.327524
2000-01-04 -0.272408 -0.333933 -1.620467 -0.717772
2000-01-05  1.049110  0.839337 -0.107366  0.976422
2000-01-06 -2.162413  0.165307 -0.089201 -0.380969
2000-01-07  0.407163  1.133198  0.939713  1.128641
2000-01-08  0.690325 -0.673923 -0.038803  0.842241
# Thus, as per above, we have the most basic indexing using []:
s = df['A']
s
dates[5] # note: positions count from 0, so this is the sixth date
s[dates[5]]
2000-01-01 0.136628
2000-01-02 0.113819
2000-01-03 -0.603598
2000-01-04 -0.486200
2000-01-05 0.835318
2000-01-06 -2.376205
2000-01-07 0.193371
2000-01-08 0.476533
Freq: D, Name: A, dtype: float64
Timestamp('2000-01-06 00:00:00', freq='D')
-2.376204948581219
# Pass a list of column names to [], e.g. ['A', 'B'], to select several columns
columns_l = ['A', 'B']
df[columns_l]
df[[ 'A', "B" ]] # 相当于上面,注意两重[]
df
                   A         B
2000-01-01  0.324097  0.136628
2000-01-02 -0.718630  0.113819
2000-01-03 -0.143643 -0.603598
2000-01-04 -0.136663 -0.486200
2000-01-05  1.036607  0.835318
2000-01-06  0.362577 -2.376205
2000-01-07  1.330468  0.193371
2000-01-08 -0.476653  0.476533

                   A         B
2000-01-01  0.324097  0.136628
2000-01-02 -0.718630  0.113819
2000-01-03 -0.143643 -0.603598
2000-01-04 -0.136663 -0.486200
2000-01-05  1.036607  0.835318
2000-01-06  0.362577 -2.376205
2000-01-07  1.330468  0.193371
2000-01-08 -0.476653  0.476533

                   A         B         C         D
2000-01-01  0.324097  0.136628  0.856313 -0.145259
2000-01-02 -0.718630  0.113819  0.016217 -1.571263
2000-01-03 -0.143643 -0.603598 -1.143063 -0.425266
2000-01-04 -0.136663 -0.486200 -2.016020 -0.815514
2000-01-05  1.036607  0.835318 -0.502919  0.878680
2000-01-06  0.362577 -2.376205 -0.484754 -0.478711
2000-01-07  1.330468  0.193371  0.544160  1.030900
2000-01-08 -0.476653  0.476533 -0.434356  0.744500
df[['A', 'B']] = df[['B', 'A']] # swap the values of the two columns
df
df.loc[:,['B', 'A']]
# The following does NOT swap the two columns:
# .loc aligns the right-hand side on column labels before assigning values,
# so column 'A' is matched with 'A' and 'B' with 'B', and df is left unchanged.
# (see the sketch after the tables below)
df.loc[:,['B', 'A']] = df[['A', 'B']]
df
# df.loc[:,['B', 'A']] = df[:, ['A', 'B']] # invalid: plain [] on a DataFrame does not take a (row, column) tuple
# df
# The correct way: assign the underlying ndarray so no label alignment takes place
df.loc[:,['B', 'A']] = df[['A', 'B']].values # take the raw values of the columns
df
                   A         B         C         D
2000-01-01  0.136628  0.324097  0.856313 -0.145259
2000-01-02  0.113819 -0.718630  0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05  0.835318  1.036607 -0.502919  0.878680
2000-01-06 -2.376205  0.362577 -0.484754 -0.478711
2000-01-07  0.193371  1.330468  0.544160  1.030900
2000-01-08  0.476533 -0.476653 -0.434356  0.744500

                   B         A
2000-01-01  0.324097  0.136628
2000-01-02 -0.718630  0.113819
2000-01-03 -0.143643 -0.603598
2000-01-04 -0.136663 -0.486200
2000-01-05  1.036607  0.835318
2000-01-06  0.362577 -2.376205
2000-01-07  1.330468  0.193371
2000-01-08 -0.476653  0.476533

                   A         B         C         D
2000-01-01  0.136628  0.324097  0.856313 -0.145259
2000-01-02  0.113819 -0.718630  0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05  0.835318  1.036607 -0.502919  0.878680
2000-01-06 -2.376205  0.362577 -0.484754 -0.478711
2000-01-07  0.193371  1.330468  0.544160  1.030900
2000-01-08  0.476533 -0.476653 -0.434356  0.744500

                   A         B         C         D
2000-01-01  0.324097  0.136628  0.856313 -0.145259
2000-01-02 -0.718630  0.113819  0.016217 -1.571263
2000-01-03 -0.143643 -0.603598 -1.143063 -0.425266
2000-01-04 -0.136663 -0.486200 -2.016020 -0.815514
2000-01-05  1.036607  0.835318 -0.502919  0.878680
2000-01-06  0.362577 -2.376205 -0.484754 -0.478711
2000-01-07  1.330468  0.193371  0.544160  1.030900
2000-01-08 -0.476653  0.476533 -0.434356  0.744500
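# A minimal sketch (assuming the pd/np imports above; the small 'demo' frame is made up)
# of why .values is needed: .loc aligns the right-hand side on column labels before
# assigning, so assigning df[['A','B']] to columns ['B','A'] is a no-op, while the raw
# ndarray is assigned positionally and actually swaps the values.
demo = pd.DataFrame({'A': [1, 2], 'B': [10, 20]})
demo.loc[:, ['B', 'A']] = demo[['A', 'B']]          # aligned by label -> unchanged
print(demo)
demo.loc[:, ['B', 'A']] = demo[['A', 'B']].values   # positional ndarray -> swapped
print(demo)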
# Attribute Access
# You may access an index on a Series, a column on a DataFrame,
# and an item on a Panel directly as an attribute:
sa = pd.Series([1,2,3],index=list('abc'))
sa
dfa = df.copy()
dfa
a 1
b 2
c 3
dtype: int64
                   A         B         C         D
2000-01-01  0.324097  0.136628  0.856313 -0.145259
2000-01-02 -0.718630  0.113819  0.016217 -1.571263
2000-01-03 -0.143643 -0.603598 -1.143063 -0.425266
2000-01-04 -0.136663 -0.486200 -2.016020 -0.815514
2000-01-05  1.036607  0.835318 -0.502919  0.878680
2000-01-06  0.362577 -2.376205 -0.484754 -0.478711
2000-01-07  1.330468  0.193371  0.544160  1.030900
2000-01-08 -0.476653  0.476533 -0.434356  0.744500
# Assigning to an item/column as if it were an attribute also works, but note:
# 1. the name must be a valid Python identifier (letters, digits, underscores)
# 2. it must not collide with an existing method name, e.g. min
# 3. it must not collide with a pandas "reserved" attribute, e.g. index, axis, items, labels
# In any of those cases, fall back to ["name"] access (see the sketch at the end of this subsection).
sa.a = 5
sa
dfa.index
dfa.A = list(range(len(dfa.index)))
dfa
a 5
b 2
c 3
dtype: int64
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
'2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
dtype='datetime64[ns]', freq='D')
            A         B         C         D
2000-01-01  0  0.136628  0.856313 -0.145259
2000-01-02  1  0.113819  0.016217 -1.571263
2000-01-03  2 -0.603598 -1.143063 -0.425266
2000-01-04  3 -0.486200 -2.016020 -0.815514
2000-01-05  4  0.835318 -0.502919  0.878680
2000-01-06  5 -2.376205 -0.484754 -0.478711
2000-01-07  6  0.193371  0.544160  1.030900
2000-01-08  7  0.476533 -0.434356  0.744500
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]}) # the dict keys become the column names
x
dict(x=9, y=99)
x.iloc[1]
x.iloc[1] = dict(x=9, y=99)
x
{'x': 9, 'y': 99}
x 2
y 4
Name: 1, dtype: int64
df = pd.DataFrame({'one': [1., 2., 3.]})
df
df.two = [4, 5, 6] # does NOT create a column: attribute assignment cannot create a new column
df.two # it only attaches a plain Python attribute, which is still readable
df
df['two'] = [4, 5, 6] # [] assignment does create the new column
df
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
This is separate from the ipykernel package so we can avoid doing imports until
[4, 5, 6]
   one  two
0  1.0    4
1  2.0    5
2  3.0    6
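# A minimal sketch (the column names 'min' and 'index' are hypothetical) of the naming
# rules above: a column whose name collides with a method or reserved attribute must be
# accessed with [], because attribute lookup finds the method/attribute first.
clash = pd.DataFrame({'min': [1, 2, 3], 'index': [4, 5, 6]})
print(clash['min'])     # the column
print(clash.min)        # the DataFrame.min method, not the column
print(clash['index'])   # the column; clash.index is the row index object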
# Slicing ranges
# The most robust and consistent way of slicing ranges is .iloc; below we look at
# what the [] operator does when used with a slice.
# On a Series, [] slices the values:
s
s[:5]
s[::2]
s[::-1]
2000-01-01 0.324097
2000-01-02 -0.718630
2000-01-03 -0.143643
2000-01-04 -0.136663
2000-01-05 1.036607
2000-01-06 0.362577
2000-01-07 1.330468
2000-01-08 -0.476653
Freq: D, Name: A, dtype: float64
2000-01-01 0.324097
2000-01-02 -0.718630
2000-01-03 -0.143643
2000-01-04 -0.136663
2000-01-05 1.036607
Freq: D, Name: A, dtype: float64
2000-01-01 0.324097
2000-01-03 -0.143643
2000-01-05 1.036607
2000-01-07 1.330468
Freq: 2D, Name: A, dtype: float64
2000-01-08 -0.476653
2000-01-07 1.330468
2000-01-06 0.362577
2000-01-05 1.036607
2000-01-04 -0.136663
2000-01-03 -0.143643
2000-01-02 -0.718630
2000-01-01 0.324097
Freq: -1D, Name: A, dtype: float64
# assignment through a slice works too
s2 = s.copy()
s2
s2[:5] = 0
s2
2000-01-01 0.324097
2000-01-02 -0.718630
2000-01-03 -0.143643
2000-01-04 -0.136663
2000-01-05 1.036607
2000-01-06 0.362577
2000-01-07 1.330468
2000-01-08 -0.476653
Freq: D, Name: A, dtype: float64
2000-01-01 0.000000
2000-01-02 0.000000
2000-01-03 0.000000
2000-01-04 0.000000
2000-01-05 0.000000
2000-01-06 0.362577
2000-01-07 1.330468
2000-01-08 -0.476653
Freq: D, Name: A, dtype: float64
# On a DataFrame, the [] slice operator selects rows:
df
df[:3]
df[::-1]
   one  two
0  1.0    4
1  2.0    5
2  3.0    6

   one  two
0  1.0    4
1  2.0    5
2  3.0    6

   one  two
2  3.0    6
1  2.0    5
0  1.0    4
# Returning a view versus a copy
# Before label-based selection, a warning: some access patterns lead to chained
# assignment, which should be avoided.
# When setting values in a pandas object, care must be taken to avoid what is
# called chained indexing. Here is an example.
[list('abcd'),list('efgh'),list('ijkl'),list('mnop')]
[['one','two'],['first','second']]
dfmi = pd.DataFrame([list('abcd'),list('efgh'),list('ijkl'),list('mnop')], # four rows of values
                    columns=pd.MultiIndex.from_product(
                        [['one','two'],['first','second']])) # MultiIndex columns: the first list is the outer level
dfmi
[['a', 'b', 'c', 'd'],
['e', 'f', 'g', 'h'],
['i', 'j', 'k', 'l'],
['m', 'n', 'o', 'p']]
[['one', 'two'], ['first', 'second']]
    one          two       
  first second first second
0     a      b     c      d
1     e      f     g      h
2     i      j     k      l
3     m      n     o      p
# Compare these two access methods:
dfmi['one']['second'] # chained
dfmi.loc[:,('one','second')]
0 b
1 f
2 j
3 n
Name: second, dtype: object
0 b
1 f
2 j
3 n
Name: (one, second), dtype: object
# Comparing the two access methods:
# Method 1: chained [] calls
dfmi['one'] # the first [] builds an intermediate DataFrame
dfmi['one']['second'] # pandas treats the two [] as separate events: two separate __getitem__ calls
  first second
0     a      b
1     e      f
2     i      j
3     m      n
# Method 2: .loc with a nested tuple slice
df_s = (slice(None), ('one', 'second'))
dfmi.loc[df_s] # a single __getitem__ call
0 b
1 f
2 j
3 n
Name: (one, second), dtype: object
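# A minimal sketch (reusing the dfmi frame above) of why chained indexing matters for
# assignment: the first [] may return a copy, so the write can be lost and pandas emits
# a SettingWithCopyWarning.  Use a single .loc call instead.
dfmi_copy = dfmi.copy()
# dfmi_copy['one']['second'] = 0          # chained: may modify a temporary copy
dfmi_copy.loc[:, ('one', 'second')] = 0   # one __setitem__ call, always applied
print(dfmi_copy)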
# Selection By Label
# .loc requires the indexer to match the dtype of the index; for example, with a
# DatetimeIndex an integer slice such as .loc[2:3] raises a TypeError.
dfl = pd.DataFrame(np.random.randn(5,4), columns=list('ABCD'),
index=pd.date_range('20130101',periods=5))
dfl
# dfl.loc[2:3] # wrong: integer slice on a DatetimeIndex raises TypeError
dfl.loc['20130102':'20130104'] # strings that can be parsed as datetimes work
dfl.loc['20130202':'20130204'] # no error: a slice outside the index returns an empty DataFrame
# dfl.loc['20130202'] # a single missing label, however, raises a KeyError
dfl.loc['20130104':'20130114'] # a partially covered slice returns only the rows that exist
                   A         B         C         D
2013-01-01 -0.887358  1.068362  0.551961 -0.378400
2013-01-02  1.239840 -0.986149 -0.880655  2.112731
2013-01-03 -0.785526  1.583703 -0.871005 -0.659880
2013-01-04 -1.267462  2.500886 -0.980569  1.308624
2013-01-05 -0.842107 -0.921086  1.020196 -0.055930

                   A         B         C         D
2013-01-02  1.239840 -0.986149 -0.880655  2.112731
2013-01-03 -0.785526  1.583703 -0.871005 -0.659880
2013-01-04 -1.267462  2.500886 -0.980569  1.308624

                   A         B         C         D
2013-01-04 -1.267462  2.500886 -0.980569  1.308624
2013-01-05 -0.842107 -0.921086  1.020196 -0.055930
# .loc also accepts integers, but they are interpreted as labels, not positions.
# .loc is the primary label-based access method; the allowed inputs inside [] are:
# 1. a single label, e.g. 5 or 'a'
# 2. a list of labels, e.g. ['a', 'b', 'c']
# 3. a slice object of labels, e.g. 'a':'f' -- NOTE: unlike Python slicing,
#    BOTH the start and the stop are included
# 4. a boolean array
# 5. a callable, e.g. [lambda df: df.A > 0, :]
# Series
s1 = pd.Series(np.random.randn(6),index=list('abcdef'))
s1
s1.loc['c':]
s1.loc['b']
s1.loc['c':] = 0
s1
a 0.796911
b -1.341250
c 0.008152
d -0.745881
e 0.674385
f 1.108411
dtype: float64
c 0.008152
d -0.745881
e 0.674385
f 1.108411
dtype: float64
-1.3412499335785426
a 0.796911
b -1.341250
c 0.000000
d 0.000000
e 0.000000
f 0.000000
dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4), # 6x4 array of random values
                   index=list('abcdef'), # row labels
                   columns=list('ABCD')) # column labels
df1
          A         B         C         D
a -1.912141 -0.835589 -0.188341 -1.024797
b -0.977498 -2.050214  0.355172 -0.291794
c -0.183401 -0.376330 -0.188848 -2.116438
d -1.008359  0.230593 -0.099235 -0.426229
e -0.027338  2.125459  0.066200 -0.247813
f -1.132103  1.945235  1.891179  1.549750
df1.loc[['a', 'b', 'd'], :] # row labels first, then column labels
df1.loc['d':, 'A':'C']
# Getting a cross section with a single label returns a Series.
# The following three expressions are equivalent:
df1.loc['a']
df1.loc['a',:]
df1.xs('a')
type(df1.loc['a'])
          A         B         C         D
a -1.912141 -0.835589 -0.188341 -1.024797
b -0.977498 -2.050214  0.355172 -0.291794
d -1.008359  0.230593 -0.099235 -0.426229

          A         B         C
d -1.008359  0.230593 -0.099235
e -0.027338  2.125459  0.066200
f -1.132103  1.945235  1.891179
A -1.912141
B -0.835589
C -0.188341
D -1.024797
Name: a, dtype: float64
A -1.912141
B -0.835589
C -0.188341
D -1.024797
Name: a, dtype: float64
A -1.912141
B -0.835589
C -0.188341
D -1.024797
Name: a, dtype: float64
pandas.core.series.Series
# selection with a boolean array
df1.loc['a'] > 0
df1.loc[:, df1.loc['a'] > 0]
A False
B False
C False
D False
Name: a, dtype: bool
# Getting a single scalar value:
# this is also equivalent to ``df1.at['a','A']``
df1.loc['a', 'A']
df1.at['a','A']
-1.9121410098540752
-1.9121410098540752
# Slicing with labels
s = pd.Series(list('abcde'), index=[0,3,2,5,4])
s
s.loc[3:5] # from label 3 through label 5 in index order (not the labels 3, 4, 5); the endpoint is included
0 a
3 b
2 c
5 d
4 e
dtype: object
3 b
2 c
5 d
dtype: object
# s.loc[3:6] # raises: 6 is not in the index and the index is not sorted
#            # (KeyError: 6 / "index must be monotonic increasing or decreasing")
# After sorting the index, out-of-range slice bounds are allowed:
s.sort_index() # returns a sorted copy; s itself is unchanged
s
s.sort_index().loc[1:6] # bounds may lie outside the index
s.sort_index().loc[6:8] # even if nothing falls inside the range
# s.sort_index().loc[8] # but a single missing label still raises; only slices are forgiving
0 a
2 c
3 b
4 e
5 d
dtype: object
0 a
3 b
2 c
5 d
4 e
dtype: object
2 c
3 b
4 e
5 d
dtype: object
Series([], dtype: object)
# Selection By Position
# .iloc is purely integer-position based, like Python/NumPy: 0-based, endpoint excluded.
# Allowed inputs for .iloc:
# 1. an integer
# 2. a list of integers
# 3. a slice of integers
# 4. a boolean array
# 5. a callable
# Series
s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2)))
s1
s1.iloc[:3]
s1.iloc[3]
s1.iloc[:3] = 0
s1
0 -0.312716
2 1.425936
4 1.716575
6 2.099666
8 0.262365
dtype: float64
0 -0.312716
2 1.425936
4 1.716575
dtype: float64
2.099665679869975
0 0.000000
2 0.000000
4 0.000000
6 2.099666
8 0.262365
dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4),
index=list(range(0,12,2)),
columns=list(range(0,8,2)))
df1
           0         2         4         6
0   1.635993 -0.512450  1.786760 -0.002533
2   0.173188  0.275977 -0.044987 -1.077772
4   1.985020  1.604020  0.127853 -1.003384
6   0.250428 -0.102090  1.566787 -1.708521
8  -2.111103 -1.232141  0.863753 -0.545229
10 -1.762999  1.009840  0.274013  0.786940
df1.iloc[:3]
df1.iloc[1:5, 2:4]
df1.iloc[[1, 3, 5], [1, 3]]
df1.iloc[1:3, :]
df1.iloc[:, 1:3]
          0         2         4         6
0  1.635993 -0.512450  1.786760 -0.002533
2  0.173188  0.275977 -0.044987 -1.077772
4  1.985020  1.604020  0.127853 -1.003384

          4         6
2 -0.044987 -1.077772
4  0.127853 -1.003384
6  1.566787 -1.708521
8  0.863753 -0.545229

           2         6
2   0.275977 -1.077772
6  -0.102090 -1.708521
10  1.009840  0.786940

          0         2         4         6
2  0.173188  0.275977 -0.044987 -1.077772
4  1.985020  1.604020  0.127853 -1.003384

           2         4
0  -0.512450  1.786760
2   0.275977 -0.044987
4   1.604020  0.127853
6  -0.102090  1.566787
8  -1.232141  0.863753
10  1.009840  0.274013
# this is also equivalent to ``df1.iat[1,1]``
# getting a single scalar value
df1.iloc[1, 1]
df1.iat[1, 1]
0.2759774784621013
0.2759774784621013
# For getting a cross section using an integer position (equiv to df.xs(1))
# a cross section is returned as a Series
df1.iloc[1]
df1.iloc[:,1]
0 0.173188
2 0.275977
4 -0.044987
6 -1.077772
Name: 2, dtype: float64
0 -0.512450
2 0.275977
4 1.604020
6 -0.102090
8 -1.232141
10 1.009840
Name: 2, dtype: float64
# Out of range slice indexes are handled gracefully just as in Python/NumPy.
# Note: only slices (with a colon) are forgiving; a single out-of-bounds integer,
# or a list containing one, still raises an error.
x = list('abcdef')
x[4:10]
x[8:10]
s = pd.Series(x)
s.iloc[4:10]
s.iloc[8:10] # a slice entirely past the end returns an empty Series
['e', 'f']
[]
4 e
5 f
dtype: object
Series([], dtype: object)
dfl = pd.DataFrame(np.random.randn(5,2), columns=list('AB'))
dfl
dfl.iloc[:, 2:3]
dfl.iloc[:, 1:3]
dfl.iloc[4:6]
          A         B
0 -0.069941  1.124145
1 -0.025781  0.940736
2 -0.117417  0.503736
3  0.882286  0.302845
4 -0.136374  0.276822

          B
0  1.124145
1  0.940736
2  0.503736
3  0.302845
4  0.276822
# Selection By Callable
# .loc, .iloc and [] all accept a callable that receives the object and returns a valid indexer
df1 = pd.DataFrame(np.random.randn(6,4),
index=list("abcdef"),
columns=list("ABCD"))
df1
df1.A
          A         B         C         D
a -1.947578  0.874286  1.139484 -3.004564
b  0.565255  0.028440  0.685688  0.973264
c -1.275992  0.732339 -0.324490  1.116887
d  0.433325  0.002567 -1.310127  0.844756
e  0.341412 -0.606646  0.034623  0.772968
f  1.518936 -0.590351  0.604839 -1.461750
a -1.947578
b 0.565255
c -1.275992
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
df1.loc[lambda df: df.A > 0, :]
df1.loc[:, lambda df: ['A', 'B']]
df1.iloc[:, lambda df: [0, 1]]
df1[lambda df: df.columns[0]]
          A         B         C         D
b  0.565255  0.028440  0.685688  0.973264
d  0.433325  0.002567 -1.310127  0.844756
e  0.341412 -0.606646  0.034623  0.772968
f  1.518936 -0.590351  0.604839 -1.461750

          A         B
a -1.947578  0.874286
b  0.565255  0.028440
c -1.275992  0.732339
d  0.433325  0.002567
e  0.341412 -0.606646
f  1.518936 -0.590351

          A         B
a -1.947578  0.874286
b  0.565255  0.028440
c -1.275992  0.732339
d  0.433325  0.002567
e  0.341412 -0.606646
f  1.518936 -0.590351
a -1.947578
b 0.565255
c -1.275992
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
df1.A
df1.A.loc[lambda s: s > 0]
df1.A.loc[df1.A > 0]
a -1.947578
b 0.565255
c -1.275992
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
b 0.565255
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
b 0.565255
d 0.433325
e 0.341412
f 1.518936
Name: A, dtype: float64
# With callables, selection can be built into a method chain without intermediate temporary variables (method chaining, not chained []):
bb = pd.read_csv('data/baseball.csv', index_col='id')
bb.groupby(['year', 'team']).sum().loc[lambda df: df.r > 100]
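# The CSV above may not be available; here is a minimal, self-contained sketch of the
# same chaining idea (the frame and the 100-run threshold are made up for illustration):
bb_demo = pd.DataFrame({'year': [2007, 2007, 2007, 2007],
                        'team': ['NYA', 'NYA', 'BOS', 'BOS'],
                        'r':    [60, 55, 70, 30]})
(bb_demo.groupby(['year', 'team'])
        .sum()
        .loc[lambda df: df.r > 100])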
# IX Indexer is Deprecated
# .ix is deprecated in favor of the stricter .iloc and .loc indexers.
dfd = pd.DataFrame({'A': [1, 2, 3],'B': [4, 5, 6]},index=list('abc'))
dfd
dfd.ix[[0, 2], 'A']
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
a 1
c 3
Name: A, dtype: int64
dfd.index[[0, 2]] # the index labels at positions 0 and 2
dfd.loc[dfd.index[[0, 2]], 'A']
dfd.columns.get_loc('A') # the integer position of column 'A'
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]
Index(['a', 'c'], dtype='object')
a 1
c 3
Name: A, dtype: int64
0
a 1
c 3
Name: A, dtype: int64
# Indexing with list with missing labels is Deprecated
# Using .loc or [] with a list that contains one or more missing labels is deprecated
# in favor of .reindex.
s = pd.Series([1, 2, 3])
s
0 1
1 2
2 3
dtype: int64
s.loc[[1, 2]] # all keys in the list exist, so nothing special happens
1 2
2 3
dtype: int64
s.loc[[1, 2, 3]] # the missing label is filled with NaN (and a FutureWarning is raised)
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
"""Entry point for launching an IPython kernel.
1 2.0
2 3.0
3 NaN
dtype: float64
# Reindexing
s.reindex([1, 2, 3])
1 2.0
2 3.0
3 NaN
dtype: float64
# To select only the labels that actually exist:
labels = [1, 2, 3]
s.index.intersection(labels) # intersection of the index and the wanted labels
s.loc[s.index.intersection(labels)]
Int64Index([1, 2], dtype='int64')
1 2
2 3
dtype: int64
# reindex requires the index to be free of duplicate labels
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s
labels = ['c', 'd']
# s.reindex(labels) # raises: cannot reindex from a duplicate axis
a 0
a 1
b 2
c 3
dtype: int32
# Workaround: take the intersection first, then reindex --
# this works as long as the intersection itself has no duplicates.
s.index.intersection(labels)
s.loc[s.index.intersection(labels)]
s.loc[s.index.intersection(labels)].reindex(labels)
Index(['c'], dtype='object')
c 3
dtype: int32
c 3.0
d NaN
dtype: float64
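# Another way around the duplicate-label restriction (a sketch, not from the original doc):
# drop the duplicated index entries first, then reindex.
print(s[~s.index.duplicated()].reindex(labels))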
# Selecting Random Samples
# sample() draws random rows (the default axis) from a Series or DataFrame.
s = pd.Series([0,1,2,3,4,5])
s
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
s.sample() # one row by default
s.sample(n=3) # a fixed number of rows
s.sample(frac=0.5) # or a fraction of the rows
s.sample(frac=0.8)
s
1 1
dtype: int64
3 3
4 4
5 5
dtype: int64
2 2
3 3
4 4
dtype: int64
2 2
1 1
3 3
0 0
5 5
dtype: int64
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
s.sample(n=6, replace=False) # the default: sampling without replacement
s.sample(n=6, replace=True) # with replacement; s itself is never modified
s
5 5
4 4
1 1
0 0
2 2
3 3
dtype: int64
0 0
4 4
0 0
2 2
0 0
0 0
dtype: int64
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
# By default each row is equally likely to be drawn; weights change that.
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)
example_weights2 = [0.5, 0, 0, 0, 0, 0] # weights are renormalised automatically
s.sample(n=1, weights=example_weights2)
2 2
3 3
5 5
dtype: int64
0 0
dtype: int64
# A DataFrame column can be used as the weights:
df2 = pd.DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
df2.sample(n = 3, weights = 'weight_column')
   col1  weight_column
1     8            0.4
2     7            0.1
0     9            0.5
# Sampling columns instead of rows:
df3 = pd.DataFrame({'col1':[1,2,3], 'col2':[2,3,4], 'col3':[3,4,5]})
df3
df3.sample(n=2, axis=1) # axis=1 samples columns (a random combination of columns)
df3.sample(n=2, axis=0) # axis=0 (the default) samples rows
   col1  col2  col3
0     1     2     3
1     2     3     4
2     3     4     5

   col3  col2
0     3     2
1     4     3
2     5     4

   col1  col2  col3
2     3     4     5
0     1     2     3
# A seed (or a numpy RandomState object) can be passed as random_state;
# with a fixed seed the sample is reproducible.
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=200)
df3.sample(n=2, random_state=200)
   col1  col2  col3
2     3     4     5
1     2     3     4

   col1  col2  col3
2     3     4     5
1     2     3     4

   col1  col2  col3
0     1     2     3
1     2     3     4

   col1  col2  col3
0     1     2     3
1     2     3     4
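# A minimal sketch of passing a NumPy RandomState object (rather than an int seed)
# as random_state, so several samples share one reproducible generator stream:
rs = np.random.RandomState(42)
print(df3.sample(n=2, random_state=rs))
print(df3.sample(n=2, random_state=rs))   # continues the same generator stream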
# Setting With Enlargement
# Assigning to a key that does not exist via .loc or [] enlarges the Series/DataFrame.
# Series
se = pd.Series([1,2,3])
se
se[5] = 5 # appends a new entry with label 5
se
0 1
1 2
2 3
dtype: int64
0 1
1 2
2 3
5 5
dtype: int64
# DataFrame
dfi = pd.DataFrame(np.arange(6).reshape(3,2),columns=['A','B'])
dfi
dfi.loc[:,'C'] = dfi.loc[:,'A'] # enlarging: adds a new column C
dfi
   A  B  C
0  0  1  0
1  2  3  2
2  4  5  4
dfi.loc[3] = 5 # enlarging: appends a new row with label 3
dfi
   A  B  C
0  0  1  0
1  2  3  2
2  4  5  4
3  5  5  5
# Fast scalar value getting and setting
# [] can do many different things, so it has some overhead while it works out what
# you are asking for. The fastest scalar accessors are .at and .iat, available on
# all of the data structures:
# .at  is label based, like .loc
# .iat is integer based, like .iloc
s = pd.Series([0,1,2,3,4,5])
s
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int64
                   A         B         C         D
2000-01-01  0.845046 -0.119758  1.354224 -0.134199
2000-01-02 -1.935579 -1.469185 -2.581439  0.355347
2000-01-03 -0.038740 -1.524056  1.376257  1.572331
2000-01-04 -0.846971  0.189231 -0.287885 -0.561706
2000-01-05 -0.127290 -0.043918  0.103347 -1.055387
2000-01-06  0.406437  1.917624  0.810463  0.367583
2000-01-07  0.438904 -0.230190  0.593607 -0.438856
2000-01-08 -1.955122  1.531260  0.889124 -0.014259
# get value
s.iat[5]
df.at[dates[5], 'A']
df.iat[3, 0]
5
0.40643702489386246
-0.8469710793801154
# set value
df.at[dates[5], 'E'] = 7 # a missing column 'E' is created; the unset rows become NaN
df
|
A |
B |
C |
D |
E |
2000-01-01 |
0.845046 |
-0.119758 |
1.354224 |
-0.134199 |
NaN |
2000-01-02 |
-1.935579 |
-1.469185 |
-2.581439 |
0.355347 |
NaN |
2000-01-03 |
-0.038740 |
-1.524056 |
1.376257 |
1.572331 |
NaN |
2000-01-04 |
-0.846971 |
0.189231 |
-0.287885 |
-0.561706 |
NaN |
2000-01-05 |
-0.127290 |
-0.043918 |
0.103347 |
-1.055387 |
NaN |
2000-01-06 |
0.406437 |
1.917624 |
0.810463 |
0.367583 |
7.0 |
2000-01-07 |
0.438904 |
-0.230190 |
0.593607 |
-0.438856 |
NaN |
2000-01-08 |
-1.955122 |
1.531260 |
0.889124 |
-0.014259 |
NaN |
df.iat[3, 0] = 7
df
|
A |
B |
C |
D |
E |
2000-01-01 |
0.845046 |
-0.119758 |
1.354224 |
-0.134199 |
NaN |
2000-01-02 |
-1.935579 |
-1.469185 |
-2.581439 |
0.355347 |
NaN |
2000-01-03 |
-0.038740 |
-1.524056 |
1.376257 |
1.572331 |
NaN |
2000-01-04 |
7.000000 |
0.189231 |
-0.287885 |
-0.561706 |
NaN |
2000-01-05 |
-0.127290 |
-0.043918 |
0.103347 |
-1.055387 |
NaN |
2000-01-06 |
0.406437 |
1.917624 |
0.810463 |
0.367583 |
7.0 |
2000-01-07 |
0.438904 |
-0.230190 |
0.593607 |
-0.438856 |
NaN |
2000-01-08 |
-1.955122 |
1.531260 |
0.889124 |
-0.014259 |
NaN |
df.at[dates[-1]+1, 0] = 7 # enlarges both the rows and the columns
df
|
A |
B |
C |
D |
E |
0 |
2000-01-01 |
0.845046 |
-0.119758 |
1.354224 |
-0.134199 |
NaN |
NaN |
2000-01-02 |
-1.935579 |
-1.469185 |
-2.581439 |
0.355347 |
NaN |
NaN |
2000-01-03 |
-0.038740 |
-1.524056 |
1.376257 |
1.572331 |
NaN |
NaN |
2000-01-04 |
7.000000 |
0.189231 |
-0.287885 |
-0.561706 |
NaN |
NaN |
2000-01-05 |
-0.127290 |
-0.043918 |
0.103347 |
-1.055387 |
NaN |
NaN |
2000-01-06 |
0.406437 |
1.917624 |
0.810463 |
0.367583 |
7.0 |
NaN |
2000-01-07 |
0.438904 |
-0.230190 |
0.593607 |
-0.438856 |
NaN |
NaN |
2000-01-08 |
-1.955122 |
1.531260 |
0.889124 |
-0.014259 |
NaN |
NaN |
2000-01-09 |
NaN |
NaN |
NaN |
NaN |
NaN |
7.0 |
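# A rough sketch (timings are machine-dependent) comparing the scalar access paths
# described above; .at skips the general indexer dispatch that .loc goes through.
import timeit
print(timeit.timeit(lambda: df.loc[dates[5], 'A'], number=10000))
print(timeit.timeit(lambda: df.at[dates[5], 'A'], number=10000))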
# Boolean indexing
# Filter data with boolean vectors. The operators are | (or), & (and) and ~ (not),
# and the individual conditions must be wrapped in parentheses.
s = pd.Series(range(-3, 4)) # a range can be passed directly, no list() needed
s
0 -3
1 -2
2 -1
3 0
4 1
5 2
6 3
dtype: int64
s[s >= 0] # the boolean Series itself is the indexer
s[~(s < 0)]
s[(s < -1) | (s > 0.5)]
3 0
4 1
5 2
6 3
dtype: int64
3 0
4 1
5 2
6 3
dtype: int64
0 -3
1 -2
4 1
5 2
6 3
dtype: int64
df[df['A'] > 0]
|
A |
B |
C |
D |
E |
0 |
2000-01-01 |
0.845046 |
-0.119758 |
1.354224 |
-0.134199 |
NaN |
NaN |
2000-01-04 |
7.000000 |
0.189231 |
-0.287885 |
-0.561706 |
NaN |
NaN |
2000-01-06 |
0.406437 |
1.917624 |
0.810463 |
0.367583 |
7.0 |
NaN |
2000-01-07 |
0.438904 |
-0.230190 |
0.593607 |
-0.438856 |
NaN |
NaN |
# List comprehensions and the map method of Series can also be used to produce
# more complex criteria:
df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
'c' : np.random.randn(7)})
df2
|
a |
b |
c |
0 |
one |
x |
1.706402 |
1 |
one |
y |
0.491977 |
2 |
two |
y |
1.357986 |
3 |
three |
x |
-1.023513 |
4 |
two |
y |
-0.653028 |
5 |
one |
x |
0.041052 |
6 |
six |
x |
1.021882 |
# build the criterion
criterion = df2['a'].map(lambda x: x.startswith('t')) # selects 'two' and 'three'
criterion
0 False
1 False
2 True
3 True
4 True
5 False
6 False
Name: a, dtype: bool
df2[criterion] # select rows of the whole frame (all columns) using the criterion built from column a
|
a |
b |
c |
2 |
two |
y |
1.357986 |
3 |
three |
x |
-1.023513 |
4 |
two |
y |
-0.653028 |
# equivalent, but slower: a list comprehension instead of map
df2[[x.startswith('t') for x in df2['a']]]
|
a |
b |
c |
2 |
two |
y |
1.357986 |
3 |
three |
x |
-1.023513 |
4 |
two |
y |
-0.653028 |
# compound criteria
df2[criterion & (df2['b'] == 'x')] # rows where both column a and column b match
|
a |
b |
c |
3 |
three |
x |
-1.023513 |
# Boolean vectors can be combined with label selection:
df2.loc[criterion & (df2['b'] == 'x'),'b':'c'] # the same rows, but only columns b through c
# Indexing with isin
# Series: isin() tests whether each value is contained in the passed list
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s
4 0
3 1
2 2
1 3
0 4
dtype: int64
s.isin([2, 4, 6]) # boolean Series: is each value in the list?
s[s.isin([2, 4, 6])] # which can then be used to select
4 False
3 False
2 True
1 False
0 True
dtype: bool
2 2
0 4
dtype: int64
# Index objects have isin() as well:
s[s.index.isin([2, 4, 6])]
s.reindex([2, 4, 6]) # compare: reindex fills the missing label with NaN, and int64 becomes float64
4 0
2 2
dtype: int64
2 2.0
4 0.0
6 NaN
dtype: float64
# For a MultiIndex, a single index level can be targeted with the level argument:
s_mi = pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
0 a 0
b 1
c 2
1 a 3
b 4
c 5
dtype: int32
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)] # match against the second index level
0 c 2
1 a 3
dtype: int32
0 a 0
c 2
1 a 3
c 5
dtype: int32
# DataFrame
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],'ids2': ['a', 'n', 'c', 'n']})
df
  ids ids2  vals
0   a    a     1
1   b    n     2
2   f    c     3
3   n    n     4
values = ['a', 'b', 1, 3]
df.isin(values) # match the values against every column
     ids   ids2   vals
0   True   True   True
1   True  False  False
2  False  False   True
3  False  False  False
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
row_mask = df.isin(values) # a dict matches different values per column
row_mask
# row_mask.all(1) reduces across the columns: True only where every column matched
row_mask = row_mask.all(1)
row_mask
df[row_mask] # keep the rows where all columns matched
     ids   ids2   vals
0   True   True   True
1   True  False  False
2  False   True   True
3  False  False  False
0 True
1 False
2 False
3 False
dtype: bool
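# A small variation on the mask above (not from the original doc): keep the rows
# where *any* column matches its allowed values, by reducing with .any(1) instead of .all(1).
any_mask = df.isin(values).any(1)
print(df[any_mask])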
# The where() Method and Masking
# where() guarantees that the selection output has the same shape as the original data.
# Series:
s
s[s > 0] # boolean [] returns only the matching entries
s.where(s > 0) # where() returns everything; non-matching entries become NaN
4 0
3 1
2 2
1 3
0 4
dtype: int64
3 1
2 2
1 3
0 4
dtype: int64
4 NaN
3 1.0
2 2.0
1 3.0
0 4.0
dtype: float64
# On a DataFrame, boolean [] selection already preserves the original shape (non-matching cells become NaN)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
df[df < 0]
df.where(df < 0) # equivalent to the line above
df.where(df < 0, -df) # the second argument replaces entries where the condition is False;
                      # a copy is returned, df itself is unchanged
df
|
A |
B |
C |
D |
2000-01-01 |
NaN |
-0.500453 |
NaN |
NaN |
2000-01-02 |
-1.048619 |
NaN |
-0.295425 |
NaN |
2000-01-03 |
-0.214918 |
-1.989228 |
NaN |
NaN |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
NaN |
2000-01-05 |
NaN |
NaN |
NaN |
-0.089382 |
2000-01-06 |
-1.041194 |
NaN |
-1.084442 |
NaN |
2000-01-07 |
-1.040706 |
-2.336161 |
NaN |
NaN |
2000-01-08 |
NaN |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-01 |
NaN |
-0.500453 |
NaN |
NaN |
2000-01-02 |
-1.048619 |
NaN |
-0.295425 |
NaN |
2000-01-03 |
-0.214918 |
-1.989228 |
NaN |
NaN |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
NaN |
2000-01-05 |
NaN |
NaN |
NaN |
-0.089382 |
2000-01-06 |
-1.041194 |
NaN |
-1.084442 |
NaN |
2000-01-07 |
-1.040706 |
-2.336161 |
NaN |
NaN |
2000-01-08 |
NaN |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-01 |
-0.383691 |
-0.500453 |
-0.101632 |
-0.848213 |
2000-01-02 |
-1.048619 |
-0.856605 |
-0.295425 |
-1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
-0.278514 |
-1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
-1.457230 |
2000-01-05 |
-0.656919 |
-1.081810 |
-1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
-0.533706 |
-1.084442 |
-1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
-0.565496 |
-0.269414 |
2000-01-08 |
-0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
# You may wish to set values based on some boolean criteria. This can be done intuitively like so:
# Setting values based on a boolean criterion:
s2 = s.copy()
s2
s2[s2 < 3] = 0
s2
df2 = df.copy()
df2
df2[df2 < 0] = 0
df2
4 0
3 1
2 2
1 3
0 4
dtype: int64
4 0
3 0
2 0
1 3
0 4
dtype: int64
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
0.000000 |
0.101632 |
0.848213 |
2000-01-02 |
0.000000 |
0.856605 |
0.000000 |
1.060710 |
2000-01-03 |
0.000000 |
0.000000 |
0.278514 |
1.088771 |
2000-01-04 |
0.000000 |
0.000000 |
0.000000 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
0.000000 |
2000-01-06 |
0.000000 |
0.533706 |
0.000000 |
1.824709 |
2000-01-07 |
0.000000 |
0.000000 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
0.000000 |
0.000000 |
0.000000 |
# By default where() returns a modified copy; with inplace=True it modifies the data in place instead.
df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
1.048619 |
0.856605 |
0.295425 |
1.060710 |
2000-01-03 |
0.214918 |
1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
2.326385 |
0.225754 |
1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
0.089382 |
2000-01-06 |
1.041194 |
0.533706 |
1.084442 |
1.824709 |
2000-01-07 |
1.040706 |
2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
0.075381 |
0.951126 |
0.347865 |
# Note: the argument order differs from numpy's where.
# Roughly, df1.where(m, df2) is equivalent to np.where(m, df1, df2):
df.where(df < 0, -df) == np.where(df < 0, df, -df)
               A     B     C     D
2000-01-01  True  True  True  True
2000-01-02  True  True  True  True
2000-01-03  True  True  True  True
2000-01-04  True  True  True  True
2000-01-05  True  True  True  True
2000-01-06  True  True  True  True
2000-01-07  True  True  True  True
2000-01-08  True  True  True  True
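# A minimal sketch of the equivalence just stated: np.where returns a plain ndarray,
# so the labels have to be re-attached by hand to get back a DataFrame.
m = df < 0
rebuilt = pd.DataFrame(np.where(m, df, -df), index=df.index, columns=df.columns)
print(rebuilt.equals(df.where(m, -df)))   # True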
# Alignment
# where() aligns the boolean condition with the frame, so the condition may cover only
# part of it. This is analogous to partial setting via .loc (but on the contents rather
# than the axis labels).
df2 = df.copy()
df2
df2[1:4] # a row slice
df2.where(df2[1:4] > 0, 3) # every cell where the aligned condition is not True becomes 3 (including all rows outside the slice)
df2[ df2[1:4] > 0 ] = 3 # setting form: only the matching cells inside the slice are set to 3
df2
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
|
A |
B |
C |
D |
2000-01-01 |
3.0 |
3.000000 |
3.000000 |
3.000000 |
2000-01-02 |
3.0 |
0.856605 |
3.000000 |
1.060710 |
2000-01-03 |
3.0 |
3.000000 |
0.278514 |
1.088771 |
2000-01-04 |
3.0 |
3.000000 |
3.000000 |
1.457230 |
2000-01-05 |
3.0 |
3.000000 |
3.000000 |
3.000000 |
2000-01-06 |
3.0 |
3.000000 |
3.000000 |
3.000000 |
2000-01-07 |
3.0 |
3.000000 |
3.000000 |
3.000000 |
2000-01-08 |
3.0 |
3.000000 |
3.000000 |
3.000000 |
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
3.000000 |
-0.295425 |
3.000000 |
2000-01-03 |
-0.214918 |
-1.989228 |
3.000000 |
3.000000 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
3.000000 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
# where() also accepts axis and level parameters to align the other input.
df2 = df.copy()
df2
df2.where(df2>0,df2['A'],axis='index') # cells <= 0 are replaced by that row's value from column A
df2
df2.apply(lambda x, y: x.where(x>0,y), y=df['A']) # equivalent but slower; also returns a copy, df2 is unchanged
df2
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
0.383691 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-1.048619 |
1.060710 |
2000-01-03 |
-0.214918 |
-0.214918 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-2.326385 |
-2.326385 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
0.656919 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.041194 |
1.824709 |
2000-01-07 |
-1.040706 |
-1.040706 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
0.166739 |
0.166739 |
0.166739 |
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
0.383691 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-1.048619 |
1.060710 |
2000-01-03 |
-0.214918 |
-0.214918 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-2.326385 |
-2.326385 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
0.656919 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.041194 |
1.824709 |
2000-01-07 |
-1.040706 |
-1.040706 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
0.166739 |
0.166739 |
0.166739 |
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
# where() also accepts callables: each receives the Series/DataFrame as its single argument and must return a valid condition / replacement
df3 = pd.DataFrame({'A': [1, 2, 3],'B': [4, 5, 6],'C': [7, 8, 9]})
df3
df3.where(lambda x: x > 4, lambda x: x + 10) # entries <= 4 are replaced by entry + 10
|
A |
B |
C |
0 |
1 |
4 |
7 |
1 |
2 |
5 |
8 |
2 |
3 |
6 |
9 |
|
A |
B |
C |
0 |
11 |
14 |
7 |
1 |
12 |
5 |
8 |
2 |
13 |
6 |
9 |
# mask() is the inverse boolean operation of where()
s
s.mask(s >= 0) # keep the entries < 0; entries matching the condition become NaN
df
df.mask(df >= 0)
4 0
3 1
2 2
1 3
0 4
dtype: int64
4 NaN
3 NaN
2 NaN
1 NaN
0 NaN
dtype: float64
|
A |
B |
C |
D |
2000-01-01 |
0.383691 |
-0.500453 |
0.101632 |
0.848213 |
2000-01-02 |
-1.048619 |
0.856605 |
-0.295425 |
1.060710 |
2000-01-03 |
-0.214918 |
-1.989228 |
0.278514 |
1.088771 |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
1.457230 |
2000-01-05 |
0.656919 |
1.081810 |
1.148303 |
-0.089382 |
2000-01-06 |
-1.041194 |
0.533706 |
-1.084442 |
1.824709 |
2000-01-07 |
-1.040706 |
-2.336161 |
0.565496 |
0.269414 |
2000-01-08 |
0.166739 |
-0.075381 |
-0.951126 |
-0.347865 |
|
A |
B |
C |
D |
2000-01-01 |
NaN |
-0.500453 |
NaN |
NaN |
2000-01-02 |
-1.048619 |
NaN |
-0.295425 |
NaN |
2000-01-03 |
-0.214918 |
-1.989228 |
NaN |
NaN |
2000-01-04 |
-2.326385 |
-0.225754 |
-1.331598 |
NaN |
2000-01-05 |
NaN |
NaN |
NaN |
-0.089382 |
2000-01-06 |
-1.041194 |
NaN |
-1.084442 |
NaN |
2000-01-07 |
-1.040706 |
-2.336161 |
NaN |
NaN |
2000-01-08 |
NaN |
-0.075381 |
-0.951126 |
-0.347865 |
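# Like where(), mask() accepts a replacement for the masked entries
# (a minimal sketch; 0 here is an arbitrary fill value):
print(df.mask(df >= 0, 0))   # masked entries become 0 instead of NaN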
# The query() Method (Experimental)
# Select rows of a DataFrame with a boolean expression given as a string,
# e.g. the rows whose b value lies between the a and c values.
# (recreate the example frame shown below: n rows of random values in columns a, b, c)
n = 10
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
|
a |
b |
c |
0 |
0.283094 |
0.051807 |
0.126487 |
1 |
0.020097 |
0.373023 |
0.147193 |
2 |
0.091921 |
0.830956 |
0.143214 |
3 |
0.340304 |
0.527246 |
0.709769 |
4 |
0.651722 |
0.344524 |
0.151233 |
5 |
0.396685 |
0.524376 |
0.540237 |
6 |
0.502751 |
0.627708 |
0.708038 |
7 |
0.472338 |
0.269770 |
0.586165 |
8 |
0.937522 |
0.239560 |
0.861873 |
9 |
0.661879 |
0.465536 |
0.271580 |
# pure python
df[(df.a < df.b) & (df.b < df.c)]
|
a |
b |
c |
3 |
0.340304 |
0.527246 |
0.709769 |
5 |
0.396685 |
0.524376 |
0.540237 |
6 |
0.502751 |
0.627708 |
0.708038 |
# query() takes the expression as a string:
df.query('(a < b) & (b < c)') # on a frame this small it is actually slower than the pure-Python version
|
a |
b |
c |
3 |
0.340304 |
0.527246 |
0.709769 |
5 |
0.396685 |
0.524376 |
0.540237 |
6 |
0.502751 |
0.627708 |
0.708038 |
# query() can also refer to a named index as if it were a column;
# if the index name overlaps with a column name, the column name takes precedence.
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
# df
df.index.name = "a"
df
|
b |
c |
a |
|
|
0 |
0 |
4 |
1 |
1 |
2 |
2 |
3 |
0 |
3 |
0 |
3 |
4 |
4 |
3 |
5 |
0 |
0 |
6 |
0 |
4 |
7 |
1 |
4 |
8 |
3 |
0 |
9 |
1 |
4 |
df.query('a <= b and b <= c')
# The index can also be referred to as 'index' instead of by its name, which avoids clashes with a column of the same name:
df.query('index <= b <= c')
# MultiIndex query() Syntax
# Levels of a MultiIndex can be used in query() like columns:
n = 10
colors = np.random.choice(['red', 'green'], size=n)
colors
foods = np.random.choice(['eggs', 'ham'], size=n)
foods
array(['green', 'green', 'green', 'green', 'green', 'green', 'green',
       'red', 'green', 'green'], dtype='<U5')
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
index
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df
MultiIndex(levels=[['green', 'red'], ['eggs', 'ham']],
labels=[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 1, 1]],
names=['color', 'food'])
|
|
0 |
1 |
color |
food |
|
|
green |
ham |
-0.295761 |
1.399719 |
eggs |
1.807185 |
0.498136 |
eggs |
-0.119640 |
2.279162 |
eggs |
-0.238709 |
-0.650418 |
eggs |
2.235827 |
1.066954 |
ham |
1.156794 |
1.694717 |
eggs |
-0.037158 |
-0.529213 |
red |
eggs |
0.046799 |
0.763592 |
green |
ham |
0.488240 |
-0.455112 |
ham |
-0.169486 |
-0.646891 |
df.query('color == "red"')
                   0         1
color food                    
red   eggs  0.046799  0.763592
# If the MultiIndex has no names, the special names ilevel_0, ilevel_1, ... refer to its levels:
df.index.names = [None, None]
df
df.query('ilevel_0 == "red"') # The convention is ilevel_0, which means “index level 0” for the 0th level of the index.
|
|
0 |
1 |
green |
ham |
-0.295761 |
1.399719 |
eggs |
1.807185 |
0.498136 |
eggs |
-0.119640 |
2.279162 |
eggs |
-0.238709 |
-0.650418 |
eggs |
2.235827 |
1.066954 |
ham |
1.156794 |
1.694717 |
eggs |
-0.037158 |
-0.529213 |
red |
eggs |
0.046799 |
0.763592 |
green |
ham |
0.488240 |
-0.455112 |
ham |
-0.169486 |
-0.646891 |
|
|
0 |
1 |
red |
eggs |
0.046799 |
0.763592 |
# query() Use Cases
# A use case for query() is when you have a collection of DataFrame objects that
# share a subset of column names (or index levels/names): you can pass the same
# query to all of the frames without having to specify which frame you are querying.
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
df
df2
|
a |
b |
c |
0 |
0.096087 |
0.799309 |
0.112070 |
1 |
0.083101 |
0.505151 |
0.830588 |
2 |
0.501472 |
0.692792 |
0.663570 |
3 |
0.872821 |
0.442541 |
0.904903 |
4 |
0.513651 |
0.375617 |
0.786898 |
5 |
0.184368 |
0.285290 |
0.055147 |
6 |
0.284951 |
0.243556 |
0.732871 |
7 |
0.431525 |
0.676385 |
0.458296 |
8 |
0.743057 |
0.326079 |
0.434655 |
9 |
0.610921 |
0.717639 |
0.580765 |
|
a |
b |
c |
0 |
0.804932 |
0.533058 |
0.713190 |
1 |
0.305893 |
0.460048 |
0.879321 |
2 |
0.344438 |
0.970870 |
0.685098 |
3 |
0.348010 |
0.839449 |
0.799309 |
4 |
0.844713 |
0.267562 |
0.771202 |
5 |
0.403534 |
0.088786 |
0.950782 |
6 |
0.181616 |
0.567118 |
0.989711 |
7 |
0.363736 |
0.852080 |
0.140771 |
8 |
0.866127 |
0.285365 |
0.025491 |
9 |
0.329751 |
0.121716 |
0.782729 |
10 |
0.029253 |
0.419409 |
0.051255 |
11 |
0.057406 |
0.106595 |
0.559687 |
expr = '0.0 <= a <= c <= 0.5'
mp = map(lambda frame: frame.query(expr), [df, df2]) # the same expression applied to each frame with these columns
for i in mp:
    print(i)
a b c
0 0.096087 0.799309 0.112070
7 0.431525 0.676385 0.458296
a b c
10 0.029253 0.419409 0.051255
# query() Python versus pandas Syntax Comparison
# Full numpy-like syntax
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
|
a |
b |
c |
0 |
8 |
4 |
5 |
1 |
7 |
4 |
7 |
2 |
9 |
3 |
2 |
3 |
4 |
8 |
7 |
4 |
4 |
5 |
8 |
5 |
7 |
8 |
8 |
6 |
1 |
7 |
6 |
7 |
5 |
9 |
1 |
8 |
3 |
3 |
8 |
9 |
7 |
1 |
5 |
df.query('(a < b) & (b < c)')
df[(df.a < df.b) & (df.b < df.c)]
# A few equivalent spellings:
df.query('a < b & b < c') # without parentheses
df.query('a < b and b < c') # using the keyword 'and'
df.query('a < b < c') # chained comparison, the most readable form
# The in and not in operators
# get all rows where columns "a" and "b" have overlapping values,
# i.e. rows whose value of a appears somewhere in column b
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
'c': np.random.randint(5, size=12),
'd': np.random.randint(9, size=12)})
df
|
a |
b |
c |
d |
0 |
a |
a |
0 |
2 |
1 |
a |
a |
0 |
6 |
2 |
b |
a |
3 |
3 |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
6 |
d |
b |
3 |
8 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
9 |
e |
c |
0 |
1 |
10 |
f |
c |
0 |
1 |
11 |
f |
c |
3 |
1 |
df.query('a in b') # first run here took ~113 ms, subsequent runs were much faster (~25 ms)
|
a |
b |
c |
d |
0 |
a |
a |
0 |
2 |
1 |
a |
a |
0 |
6 |
2 |
b |
a |
3 |
3 |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
df[df.a.isin(df.b)] # How you'd do it in pure Python; still faster than query() here (~35 ms)
|
a |
b |
c |
d |
0 |
a |
a |
0 |
2 |
1 |
a |
a |
0 |
6 |
2 |
b |
a |
3 |
3 |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
df[~df.a.isin(df.b)]
|
a |
b |
c |
d |
6 |
d |
b |
3 |
8 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
9 |
e |
c |
0 |
1 |
10 |
f |
c |
0 |
1 |
11 |
f |
c |
3 |
1 |
df.query('a not in b')
|
a |
b |
c |
d |
6 |
d |
b |
3 |
8 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
9 |
e |
c |
0 |
1 |
10 |
f |
c |
0 |
1 |
11 |
f |
c |
3 |
1 |
# rows where cols a and b have overlapping values and col c's values are less than col d's
df.query('a in b and c < d')
df[df.a.isin(df.b) & (df.c < df.d)]
|
a |
b |
c |
d |
0 |
a |
a |
0 |
2 |
1 |
a |
a |
0 |
6 |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
|
a |
b |
c |
d |
0 |
a |
a |
0 |
2 |
1 |
a |
a |
0 |
6 |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
# Note that in and not in are evaluated in Python, since numexpr has no equivalent of this operation.
# However, only the in/not in expression itself is evaluated in vanilla Python.
# For example, in the expression
df.query('a in b + c + d')
# (b + c + d) is evaluated by numexpr and then the in operation is evaluated in plain Python. In general, any operations that can be evaluated using numexpr will be.
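# A minimal sketch of the engine choice mentioned above: query() accepts engine='python'
# to force the plain-Python evaluator (the default is 'numexpr' when numexpr is installed).
df.query('a in b', engine='python')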
# Special use of the == operator with list objects
# Comparing a list of values to a column using ==/!= works similarly to in/not in:
df
|
a |
b |
c |
d |
0 |
a |
a |
0 |
2 |
1 |
a |
a |
0 |
6 |
2 |
b |
a |
3 |
3 |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
6 |
d |
b |
3 |
8 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
9 |
e |
c |
0 |
1 |
10 |
f |
c |
0 |
1 |
11 |
f |
c |
3 |
1 |
df.query('b == ["a", "b", "c"]') # rows whose b value is any of the listed values
|
a |
b |
c |
d |
0 |
a |
a |
0 |
2 |
1 |
a |
a |
0 |
6 |
2 |
b |
a |
3 |
3 |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
6 |
d |
b |
3 |
8 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
9 |
e |
c |
0 |
1 |
10 |
f |
c |
0 |
1 |
11 |
f |
c |
3 |
1 |
df.query('c == [1, 2]')
df.query('[1, 2] in c')
df[df.c.isin([1, 2])]
|
a |
b |
c |
d |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
|
a |
b |
c |
d |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
|
a |
b |
c |
d |
3 |
b |
a |
1 |
2 |
4 |
c |
b |
1 |
2 |
5 |
c |
b |
1 |
2 |
7 |
d |
b |
1 |
0 |
8 |
e |
c |
2 |
7 |
# Boolean Operators
# A boolean column can be negated with ~ or with the keyword not:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
df['bools'] = np.random.rand(len(df)) > 0.5
df
|
a |
b |
c |
0 |
0.528671 |
0.265870 |
0.932892 |
1 |
0.520069 |
0.047895 |
0.478818 |
2 |
0.648595 |
0.180744 |
0.838445 |
3 |
0.145614 |
0.906190 |
0.762163 |
4 |
0.612637 |
0.027232 |
0.778020 |
5 |
0.640565 |
0.535538 |
0.018280 |
6 |
0.633664 |
0.124654 |
0.185709 |
7 |
0.635088 |
0.377300 |
0.914968 |
8 |
0.702684 |
0.504459 |
0.107014 |
9 |
0.635757 |
0.261144 |
0.665611 |
|
a |
b |
c |
bools |
0 |
0.528671 |
0.265870 |
0.932892 |
False |
1 |
0.520069 |
0.047895 |
0.478818 |
True |
2 |
0.648595 |
0.180744 |
0.838445 |
True |
3 |
0.145614 |
0.906190 |
0.762163 |
False |
4 |
0.612637 |
0.027232 |
0.778020 |
True |
5 |
0.640565 |
0.535538 |
0.018280 |
True |
6 |
0.633664 |
0.124654 |
0.185709 |
True |
7 |
0.635088 |
0.377300 |
0.914968 |
True |
8 |
0.702684 |
0.504459 |
0.107014 |
False |
9 |
0.635757 |
0.261144 |
0.665611 |
False |
df.query('~bools')
df.query('not bools')
|
a |
b |
c |
bools |
0 |
0.528671 |
0.265870 |
0.932892 |
False |
3 |
0.145614 |
0.906190 |
0.762163 |
False |
8 |
0.702684 |
0.504459 |
0.107014 |
False |
9 |
0.635757 |
0.261144 |
0.665611 |
False |
|
a |
b |
c |
bools |
0 |
0.528671 |
0.265870 |
0.932892 |
False |
3 |
0.145614 |
0.906190 |
0.762163 |
False |
8 |
0.702684 |
0.504459 |
0.107014 |
False |
9 |
0.635757 |
0.261144 |
0.665611 |
False |
# Boolean expressions can be combined arbitrarily:
shorter = df.query('a < b < c and (not bools) or bools > 2') # short query syntax
longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] # equivalent in pure Python
shorter
longer
# Performance of query()
# DataFrame.query() using numexpr is slightly faster than Python for large frames.
# Note: you will only see the performance benefit of the numexpr engine with
# DataFrame.query() if your frame has more than approximately 200,000 rows.
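# A rough, machine-dependent sketch of that comparison (500,000 rows is an arbitrary
# size above the ~200,000-row threshold mentioned in the docs):
import timeit
big = pd.DataFrame(np.random.rand(500000, 3), columns=list('abc'))
print(timeit.timeit(lambda: big[(big.a < big.b) & (big.b < big.c)], number=10))
print(timeit.timeit(lambda: big.query('a < b and b < c'), number=10))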
# Duplicate Data
# Two methods identify and remove duplicate rows; both take the column(s) to consider
# as their argument:
# duplicated      returns a boolean vector marking the duplicated rows
# drop_duplicates removes the duplicated rows
# By default the first occurrence is kept; the keep parameter controls this:
# keep='first' (default): mark / drop duplicates except for the first occurrence
# keep='last': mark / drop duplicates except for the last occurrence
# keep=False: mark / drop all duplicates
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
'c': np.random.randn(7)})
df2
       a  b         c
0    one  x  2.206877
1    one  y -1.852209
2    two  x -0.706555
3    two  y -1.007860
4    two  x -0.185356
5  three  x -0.687592
6   four  x -2.052029
df2.duplicated('a')
df2.duplicated("a", keep="last")
df2.duplicated('a', keep=False)
0 False
1 True
2 False
3 True
4 True
5 False
6 False
dtype: bool
0 True
1 False
2 True
3 True
4 False
5 False
6 False
dtype: bool
0 True
1 True
2 True
3 True
4 True
5 False
6 False
dtype: bool
df2.drop_duplicates('a')
df2.drop_duplicates('a', keep="last")
df2.drop_duplicates('a', keep=False)
|
a |
b |
c |
0 |
one |
x |
2.206877 |
2 |
two |
x |
-0.706555 |
5 |
three |
x |
-0.687592 |
6 |
four |
x |
-2.052029 |
|
a |
b |
c |
1 |
one |
y |
-1.852209 |
4 |
two |
x |
-0.185356 |
5 |
three |
x |
-0.687592 |
6 |
four |
x |
-2.052029 |
|
a |
b |
c |
5 |
three |
x |
-0.687592 |
6 |
four |
x |
-2.052029 |
# A list of columns can be passed:
df2.duplicated(['a', 'b']) # columns a and b are considered together when marking duplicates
df2.drop_duplicates(['a', 'b'])
0 False
1 False
2 False
3 False
4 True
5 False
6 False
dtype: bool
|
a |
b |
c |
0 |
one |
x |
2.206877 |
1 |
one |
y |
-1.852209 |
2 |
two |
x |
-0.706555 |
3 |
two |
y |
-1.007860 |
5 |
three |
x |
-0.687592 |
6 |
four |
x |
-2.052029 |
# Deduplicating on the index: Index.duplicated
df3 = pd.DataFrame({'a': np.arange(6),'b': np.random.randn(6)},index=['a', 'a', 'b', 'c', 'b', 'a'])
df3
   a         b
a  0  1.446937
a  1 -1.740284
b  2 -0.604590
c  3  0.096239
b  4  0.823314
a  5  1.990803
df3.index.duplicated()
df3[~df3.index.duplicated()]
df3[~df3.index.duplicated(keep='last')]
df3[~df3.index.duplicated(keep=False)]
array([False, True, False, False, True, True])
   a         b
a  0  1.446937
b  2 -0.604590
c  3  0.096239

   a         b
c  3  0.096239
b  4  0.823314
a  5  1.990803
# Dictionary-like get() method
# Series, DataFrame and Panel all have a get method which can return a default value.
s = pd.Series([1,2,3], index=['a','b','c'])
s.get("a") # equivalent to s["a"]
s.get("x", default=-1) # a missing label returns the default instead of raising
1
-1
# The lookup() Method
# extract one value per row, taking a (possibly different) column for each row
dflookup = pd.DataFrame(np.random.rand(20,4), columns = ['A','B','C','D'])
dflookup
|
A |
B |
C |
D |
0 |
0.942679 |
0.727316 |
0.658345 |
0.465770 |
1 |
0.437278 |
0.225158 |
0.436522 |
0.164805 |
2 |
0.270947 |
0.280223 |
0.309800 |
0.015967 |
3 |
0.559836 |
0.630962 |
0.673678 |
0.712503 |
4 |
0.535372 |
0.989887 |
0.661567 |
0.361962 |
5 |
0.726322 |
0.601192 |
0.547858 |
0.477509 |
6 |
0.829411 |
0.583613 |
0.871647 |
0.460966 |
7 |
0.365722 |
0.563660 |
0.164954 |
0.134314 |
8 |
0.817334 |
0.523003 |
0.684492 |
0.369386 |
9 |
0.710906 |
0.693633 |
0.618877 |
0.888263 |
10 |
0.026953 |
0.185217 |
0.746235 |
0.568846 |
11 |
0.192765 |
0.927200 |
0.431736 |
0.084300 |
12 |
0.988460 |
0.784320 |
0.180145 |
0.039405 |
13 |
0.233349 |
0.148678 |
0.888210 |
0.004917 |
14 |
0.105130 |
0.146724 |
0.261370 |
0.936558 |
15 |
0.701224 |
0.791860 |
0.420083 |
0.391538 |
16 |
0.290186 |
0.307993 |
0.139429 |
0.618879 |
17 |
0.182132 |
0.174420 |
0.845501 |
0.647986 |
18 |
0.732009 |
0.919950 |
0.197361 |
0.582814 |
19 |
0.568096 |
0.539125 |
0.269016 |
0.537584 |
list(range(0,10,2))
dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D'])
[0, 2, 4, 6, 8]
array([0.72731646, 0.30979986, 0.53537223, 0.58361289, 0.36938598])
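# The same result, spelled out with label-based scalar access (a sketch of what
# lookup() does; lookup itself is the vectorised form and returns an ndarray):
rows = list(range(0, 10, 2))
cols = ['B', 'C', 'A', 'B', 'D']
print([dflookup.at[r, c] for r, c in zip(rows, cols)])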
# Index objects
# An Index can be created directly from a list or any other sequence:
index = pd.Index(['e', 'd', 'a', 'b'])
index
# and it can be given a name:
index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index.name
index
Index(['e', 'd', 'a', 'b'], dtype='object')
'something'
Index(['e', 'd', 'a', 'b'], dtype='object', name='something')
index = pd.Index(list(range(5)), name='rows')
index
columns = pd.Index(['A', 'B', 'C'], name='cols')
columns
Int64Index([0, 1, 2, 3, 4], dtype='int64', name='rows')
Index(['A', 'B', 'C'], dtype='object', name='cols')
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns) # 使用索引给列命名,列名是一个索引对象
df
cols |
A |
B |
C |
rows |
|
|
|
0 |
1.868165 |
0.704614 |
-2.049472 |
1 |
-0.878810 |
0.706876 |
-0.741121 |
2 |
-2.649523 |
-0.952211 |
0.806387 |
3 |
0.369280 |
-0.052788 |
-0.995775 |
4 |
0.218246 |
-0.034493 |
-0.198815 |
# Setting metadata
# Indexes are "mostly immutable", but their metadata -- the name, and for a MultiIndex
# the levels and labels -- can be set or changed with rename, set_names, set_levels
# and set_labels. By default these return a copy; in-place modification is also possible.
ind = pd.Index([1, 2, 3])
ind
Int64Index([1, 2, 3], dtype='int64')
ind.name = "ind"
ind
ind.rename("apple")
ind
ind.name = "apple"
ind
ind.set_names(["bob"], inplace=True)
ind
Int64Index([1, 2, 3], dtype='int64', name='ind')
Int64Index([1, 2, 3], dtype='int64', name='apple')
Int64Index([1, 2, 3], dtype='int64', name='ind')
Int64Index([1, 2, 3], dtype='int64', name='apple')
Int64Index([1, 2, 3], dtype='int64', name='bob')
# set_names, set_levels, and set_labels also take an optional level argument
# (aside: in DataFrame.query, index level 0 can be referred to as ilevel_0, e.g. df.query('ilevel_0 == "red"'))
index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index
MultiIndex(levels=[[0, 1, 2], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])
index.levels[1]
index.levels[0]
index.set_levels(["a", "b"], level=1)
Index(['one', 'two'], dtype='object', name='second')
Int64Index([0, 1, 2], dtype='int64', name='first')
MultiIndex(levels=[[0, 1, 2], ['a', 'b']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])
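# set_names with a level argument renames just one level of the MultiIndex
# (a minimal sketch, reusing the index created above; the new names are arbitrary):
index.set_names(['2nd'], level=[1])    # rename only the second level
index.set_names(['1st', '2nd'])        # rename both levels at once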
# Set operations on Index objects
# Note: the Index resulting from a set operation is sorted in ascending order.
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
a | b # union
a & b # intersection
# a - b # not supported; use the difference() method instead
a.difference(b) # difference (not symmetric)
a.symmetric_difference(b) # symmetric difference, i.e. a.difference(b).union(b.difference(a))
a ^ b # symmetric difference (operator form)
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
Index(['c'], dtype='object')
Index(['a', 'b'], dtype='object')
Index(['a', 'b', 'd', 'e'], dtype='object')
Index(['a', 'b', 'd', 'e'], dtype='object')
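# The operators above also have named-method equivalents (a sketch, same a and b):
a.union(b)                  # same result as a | b
a.intersection(b)           # same result as a & b
a.symmetric_difference(b)   # same result as a ^ b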
# Missing values
# Note: an Index can hold missing values, but this should be avoided: some operations exclude missing values by default, which can give unexpected results.
idx1 = pd.Index([1, np.nan, 3, 4])
idx1
Float64Index([1.0, nan, 3.0, 4.0], dtype='float64')
idx1.fillna(2) # fill the missing label with the value 2
Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64')
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')]) # datetime-like missing values are pd.NaT, not np.nan
idx2
DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'], dtype='datetime64[ns]', freq=None)
idx2.fillna(pd.Timestamp('2011-01-02'))
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq=None)
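# A quick way to check an index for missing values before relying on it
# (a sketch, reusing idx1; isnull() and hasnans are standard Index attributes):
idx1.isnull()   # array([False,  True, False, False])
idx1.hasnans    # True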
# Set / Reset Index
# DataFrame
data = pd.DataFrame({"a":["bar", "bar", "foo", "foo"],
"b":["one","two","one","two"],
"c":["z","y","x", "w"],
"d":range(1,5)}) # range 不用list
data
     a    b  c  d
0  bar  one  z  1
1  bar  two  y  2
2  foo  one  x  3
3  foo  two  w  4
indexed1 = data.set_index("c") # use an existing column as the index
indexed1
     a    b  d
c
z  bar  one  1
y  bar  two  2
x  foo  one  3
w  foo  two  4
indexed2 = data.set_index(['a', 'b']) # use several columns as a MultiIndex
indexed2
         c  d
a   b
bar one  z  1
    two  y  2
foo one  x  3
    two  w  4
frame = data.set_index('c', drop=False) # drop=False keeps the column; by default set_index removes it from the data
frame
frame = frame.set_index(['a', 'b'], append=True) # append=True keeps the existing index and adds new levels, giving a MultiIndex
frame
     a    b  c  d
c
z  bar  one  z  1
y  bar  two  y  2
x  foo  one  x  3
w  foo  two  w  4
           c  d
c a   b
z bar one  z  1
y bar two  y  2
x foo one  x  3
w foo two  w  4
data.set_index('c', drop=False) # returns a new frame with 'c' as the index, keeping the column
data
data.set_index(['a', 'b'], inplace=True) # modify data in place instead of returning a copy
data
     a    b  c  d
c
z  bar  one  z  1
y  bar  two  y  2
x  foo  one  x  3
w  foo  two  w  4
     a    b  c  d
0  bar  one  z  1
1  bar  two  y  2
2  foo  one  x  3
3  foo  two  w  4
         c  d
a   b
bar one  z  1
    two  y  2
foo one  x  3
    two  w  4
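# Round trip: reset_index followed by set_index on the same columns recovers the
# indexed frame (a minimal sketch, using data as it stands here):
data.reset_index().set_index(['a', 'b']).equals(data)   # True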
# Reset the index
# Note: reset_index used to be called delevel, which is now deprecated.
data
data.index.name # no output here -- data has a MultiIndex, so use .names instead
data.index.names # the names of the index levels
data.reset_index() # move the index levels back into ordinary columns, named after the levels
         c  d
a   b
bar one  z  1
    two  y  2
foo one  x  3
    two  w  4
FrozenList(['a', 'b'])
     a    b  c  d
0  bar  one  z  1
1  bar  two  y  2
2  foo  one  x  3
3  foo  two  w  4
frame
frame.reset_index(level=1) # for a MultiIndex you can reset a single level instead of all of them
           c  d
c a   b
z bar one  z  1
y bar two  y  2
x foo one  x  3
w foo two  w  4
         a  c  d
c b
z one  bar  z  1
y two  bar  y  2
x one  foo  x  3
w two  foo  w  4
# reset_index also takes a drop parameter: if True, the index level is simply discarded instead of being moved into a data column
frame
frame.reset_index(level=2, drop=True)
           c  d
c a   b
z bar one  z  1
y bar two  y  2
x foo one  x  3
w foo two  w  4
       c  d
c a
z bar  z  1
y bar  y  2
x foo  x  3
w foo  w  4
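# reset_index(drop=True) is also handy on an ordinary frame, to throw away the old
# index and renumber the rows from 0 (a sketch; tmp is a throwaway frame):
tmp = pd.DataFrame({'x': [10, 20, 30]}, index=['r1', 'r2', 'r3'])
tmp.reset_index(drop=True)   # index becomes 0, 1, 2 and no 'index' column is added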
# Adding an ad hoc index
# If you create an index yourself, you can just assign it to the index field
# build the Index object first, then assign it to the DataFrame's .index
index = pd.MultiIndex.from_product([range(2), ['one', 'two']], names=['first', 'second'])
index
data
data.index = index # Note: the new index must have the same length as the DataFrame
data
MultiIndex(levels=[[0, 1], ['one', 'two']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['first', 'second'])
     a    b  c  d
0  bar  one  z  1
1  bar  two  y  2
2  foo  one  x  3
3  foo  two  w  4
                a    b  c  d
first second
0     one     bar  one  z  1
      two     bar  two  y  2
1     one     foo  one  x  3
      two     foo  two  w  4
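# The assigned index must have the same length as the frame, otherwise pandas
# raises a ValueError (a minimal sketch; data has 4 rows here):
try:
    data.index = pd.Index(range(3))
except ValueError as e:
    print(e)   # length mismatch between the 4-row frame and the 3-label index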
# Returning a view versus a copy
# Avoid chained indexing, i.e. several [] selections strung together (see the earlier discussion).
# Why does assignment fail when using chained indexing?
# It comes down to how the Python interpreter evaluates the following two assignments.
# First: a single, non-chained assignment
dfmi.loc[:,('one','second')] = value
# which Python evaluates as a single set operation:
dfmi.loc.__setitem__((slice(None), ('one', 'second')), value) # one __setitem__, directly on dfmi
# Of course, dfmi.loc.__getitem__(idx) may be a view or a copy of dfmi.
# Second: a chained assignment
dfmi['one']['second'] = value # pandas warns with SettingWithCopy here: the chained form is ambiguous
# which Python evaluates as a get followed by a set:
dfmi.__getitem__('one').__setitem__('second', value) # the get may return a copy, so the set may never reach dfmi
# SettingWithCopy can also show up when there is no obvious chained indexing, for example:
def do_something(df):
foo = df[['bar', 'baz']] # Is foo a view? A copy? Nobody knows!
# ... many lines here ...
foo['quux'] = value # We don't know whether this will modify df or not! # an "implicit" chained assignment
return foo
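# A concrete contrast (a sketch; dfmi2 is a made-up frame with two-level columns,
# mirroring the dfmi of the pseudo-code above):
mcols = pd.MultiIndex.from_product([['one', 'two'], ['first', 'second']])
dfmi2 = pd.DataFrame(np.arange(8).reshape(2, 4), columns=mcols)
dfmi2.loc[:, ('one', 'second')] = 99   # one __setitem__: always modifies dfmi2
# dfmi2['one']['second'] = 99          # get-then-set: may only modify a temporary copy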
# Evaluation order matters
# The SettingWithCopyWarning produced by chained assignment can be silenced, left as a warning, or escalated to an exception via the mode.chained_assignment option.
dfb = pd.DataFrame({'a' : ['one', 'one', 'two','three', 'two', 'one', 'six'],'c' : np.arange(7)})
dfb
       a  c
0    one  0
1    one  1
2    two  2
3  three  3
4    two  4
5    one  5
6    six  6
# This will show the SettingWithCopyWarning
# but the frame values will be set
dfb['c'][dfb.a.str.startswith('o')] = 42
dfb
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
       a   c
0    one  42
1    one  42
2    two   2
3  three   3
4    two   4
5    one  42
6    six   6
# This, however, operates on a copy and will not work (the docs' companion example
# reverses the order of the two selections): dfb[dfb.a.str.startswith('o')]['c'] = 42
pd.set_option('mode.chained_assignment','warn') # back to the default level: chained assignment only warns
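# The unambiguous way to write the assignment above is a single .loc call
# (a sketch, same dfb as above):
dfb.loc[dfb.a.str.startswith('o'), 'c'] = 42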
# A chained assignment can also crop up when setting values in a mixed-dtype frame.
# Note: these setting rules apply to all of .loc/.iloc
# This is the correct access method:
dfc = pd.DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]})
dfc
dfc.loc[0,'A'] = 11
dfc
     A  B
0  aaa  1
1  bbb  2
2  ccc  3
     A  B
0   11  1
1  bbb  2
2  ccc  3
dfc = dfc.copy()
dfc
dfc['A'][0] = 111 # This can work at times, but it is not guaranteed, so it should be avoided
dfc
     A  B
0   11  1
1  bbb  2
2  ccc  3
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
     A  B
0  111  1
1  bbb  2
2  ccc  3
pd.set_option('mode.chained_assignment', 'raise') # a chained assignment now raises an exception instead of just warning
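# Under 'raise', the same chained assignment raises instead of warning (a sketch;
# in this pandas version the exception class is SettingWithCopyError, living in pandas.core.common):
try:
    dfc['A'][0] = 222
except Exception as err:
    print(type(err).__name__, err)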
# http://pandas.pydata.org/pandas-docs/stable/indexing.html -- end of the notes
# 2018-02-19