pandas索引.pandas.v0.22

# from http://pandas.pydata.org/pandas-docs/stable/indexing.html
# pandas 0.22.0 
import pandas as pd
import numpy as np
"""
尽量利用pandas提供的专用索引方式,而不是python通用的切片方式。
三种主要的索引方式:indexers
.loc label based 基于标签, 可以是标签名,可以是布尔值,可以是一元函数
.iloc integer psition based 基于整数位置(from 0 to length-1 of the axis),和python切片类似
[]

"""

"""
Object Type         Selection           Return Value Type
------------------------------------------------------------
Series              series[label]       scalar value
DataFrame           frame[colname]      Series corresponding to colname
Panel               panel[itemname]     DataFrame corresponding to the itemname
"""
'\n尽量利用pandas提供的专用索引方式,而不是python通用的切片方式。\n三种主要的索引方式:indexers\n.loc label based 基于标签, 可以是标签名,可以是布尔值,可以是一元函数\n.iloc integer psition based 基于整数位置(from 0 to length-1 of the axis),和python切片类似\n[]\n\n'






'\nObject Type \t\tIndexers\nSeries \t\t\t\ts.loc[indexer]\nDataFrame \t\t\tdf.loc[row_indexer,column_indexer]\nPanel \t\t\t\tp.loc[item_indexer,major_indexer,minor_indexer]\n'
# Here we construct a simple time series data set to use for illustrating the indexing functionality 
# 构造时间序列,举例说明索引的功能

dates = pd.date_range('1/1/2000', periods=8)
dates
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')
A B C D
2000-01-01 0.136628 0.324097 0.856313 -0.145259
2000-01-02 0.113819 -0.718630 0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05 0.835318 1.036607 -0.502919 0.878680
2000-01-06 -2.376205 0.362577 -0.484754 -0.478711
2000-01-07 0.193371 1.330468 0.544160 1.030900
2000-01-08 0.476533 -0.476653 -0.434356 0.744500
panel = pd.Panel({'one' : df, 'two' : df - df.mean()})  # 多维表格
panel
panel['one']
panel['two']

Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D
A B C D
2000-01-01 0.136628 0.324097 0.856313 -0.145259
2000-01-02 0.113819 -0.718630 0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05 0.835318 1.036607 -0.502919 0.878680
2000-01-06 -2.376205 0.362577 -0.484754 -0.478711
2000-01-07 0.193371 1.330468 0.544160 1.030900
2000-01-08 0.476533 -0.476653 -0.434356 0.744500
A B C D
2000-01-01 0.350419 0.126827 1.251866 -0.047518
2000-01-02 0.327611 -0.915900 0.411770 -1.473522
2000-01-03 -0.389806 -0.340913 -0.747511 -0.327524
2000-01-04 -0.272408 -0.333933 -1.620467 -0.717772
2000-01-05 1.049110 0.839337 -0.107366 0.976422
2000-01-06 -2.162413 0.165307 -0.089201 -0.380969
2000-01-07 0.407163 1.133198 0.939713 1.128641
2000-01-08 0.690325 -0.673923 -0.038803 0.842241
# Thus, as per above, we have the most basic indexing using []:
# 最基本的索引方法,使用[]

s = df['A']
s
dates[5]  # 注意切片从0计数
s[dates[5]]
2000-01-01    0.136628
2000-01-02    0.113819
2000-01-03   -0.603598
2000-01-04   -0.486200
2000-01-05    0.835318
2000-01-06   -2.376205
2000-01-07    0.193371
2000-01-08    0.476533
Freq: D, Name: A, dtype: float64






Timestamp('2000-01-06 00:00:00', freq='D')






-2.376204948581219
# 在[]中传入列名的列表,如[ 'A', "B" ]
columns_l = ['A', 'B']
df[columns_l]  
df[[ 'A', "B" ]]  # 相当于上面,注意两重[]
df
A B
2000-01-01 0.324097 0.136628
2000-01-02 -0.718630 0.113819
2000-01-03 -0.143643 -0.603598
2000-01-04 -0.136663 -0.486200
2000-01-05 1.036607 0.835318
2000-01-06 0.362577 -2.376205
2000-01-07 1.330468 0.193371
2000-01-08 -0.476653 0.476533
A B
2000-01-01 0.324097 0.136628
2000-01-02 -0.718630 0.113819
2000-01-03 -0.143643 -0.603598
2000-01-04 -0.136663 -0.486200
2000-01-05 1.036607 0.835318
2000-01-06 0.362577 -2.376205
2000-01-07 1.330468 0.193371
2000-01-08 -0.476653 0.476533
A B C D
2000-01-01 0.324097 0.136628 0.856313 -0.145259
2000-01-02 -0.718630 0.113819 0.016217 -1.571263
2000-01-03 -0.143643 -0.603598 -1.143063 -0.425266
2000-01-04 -0.136663 -0.486200 -2.016020 -0.815514
2000-01-05 1.036607 0.835318 -0.502919 0.878680
2000-01-06 0.362577 -2.376205 -0.484754 -0.478711
2000-01-07 1.330468 0.193371 0.544160 1.030900
2000-01-08 -0.476653 0.476533 -0.434356 0.744500
df[['A', 'B']] = df[['B', 'A']]  # 交换两列的值
df
df.loc[:,['B', 'A']]

# 下式不能交换两列的值
# This will not modify df because the column alignment is before value assignment. ? 不理解?
# 列赋值在值赋值之前?
df.loc[:,['B', 'A']] = df[['A', 'B']]  
df

# df.loc[:,['B', 'A']] = df[:, ['A', 'B']]   # 错误?
# df

# 正确的方式:
df.loc[:,['B', 'A']] = df[['A', 'B']].values  # 取列的值
df
A B C D
2000-01-01 0.136628 0.324097 0.856313 -0.145259
2000-01-02 0.113819 -0.718630 0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05 0.835318 1.036607 -0.502919 0.878680
2000-01-06 -2.376205 0.362577 -0.484754 -0.478711
2000-01-07 0.193371 1.330468 0.544160 1.030900
2000-01-08 0.476533 -0.476653 -0.434356 0.744500
B A
2000-01-01 0.324097 0.136628
2000-01-02 -0.718630 0.113819
2000-01-03 -0.143643 -0.603598
2000-01-04 -0.136663 -0.486200
2000-01-05 1.036607 0.835318
2000-01-06 0.362577 -2.376205
2000-01-07 1.330468 0.193371
2000-01-08 -0.476653 0.476533
A B C D
2000-01-01 0.136628 0.324097 0.856313 -0.145259
2000-01-02 0.113819 -0.718630 0.016217 -1.571263
2000-01-03 -0.603598 -0.143643 -1.143063 -0.425266
2000-01-04 -0.486200 -0.136663 -2.016020 -0.815514
2000-01-05 0.835318 1.036607 -0.502919 0.878680
2000-01-06 -2.376205 0.362577 -0.484754 -0.478711
2000-01-07 0.193371 1.330468 0.544160 1.030900
2000-01-08 0.476533 -0.476653 -0.434356 0.744500
A B C D
2000-01-01 0.324097 0.136628 0.856313 -0.145259
2000-01-02 -0.718630 0.113819 0.016217 -1.571263
2000-01-03 -0.143643 -0.603598 -1.143063 -0.425266
2000-01-04 -0.136663 -0.486200 -2.016020 -0.815514
2000-01-05 1.036607 0.835318 -0.502919 0.878680
2000-01-06 0.362577 -2.376205 -0.484754 -0.478711
2000-01-07 1.330468 0.193371 0.544160 1.030900
2000-01-08 -0.476653 0.476533 -0.434356 0.744500
# Attribute Access¶
# 属性访问,属性存取
# You may access an index on a Series, column on a DataFrame, 
# and an item on a Panel directly as an attribute:

sa = pd.Series([1,2,3],index=list('abc'))
sa

dfa = df.copy()
dfa
a    1
b    2
c    3
dtype: int64
A B C D
2000-01-01 0.324097 0.136628 0.856313 -0.145259
2000-01-02 -0.718630 0.113819 0.016217 -1.571263
2000-01-03 -0.143643 -0.603598 -1.143063 -0.425266
2000-01-04 -0.136663 -0.486200 -2.016020 -0.815514
2000-01-05 1.036607 0.835318 -0.502919 0.878680
2000-01-06 0.362577 -2.376205 -0.484754 -0.478711
2000-01-07 1.330468 0.193371 0.544160 1.030900
2000-01-08 -0.476653 0.476533 -0.434356 0.744500
# 对item column等赋值
# 就像属性一样的存取,但需要注意:
# 1. 名称应符合python命名规则,由字母、数字和下划线组成
# 2. 不能和已有的方法名称重名,例如 min
# 3. 不能与pandas内部“关键字”重名,例如 index axis items labels
# 以上情况,可以使用 [ "" ] 引用

sa.a = 5
sa

dfa.index
dfa.A = list(range(len(dfa.index)))
dfa
a    5
b    2
c    3
dtype: int64






DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'],
              dtype='datetime64[ns]', freq='D')
A B C D
2000-01-01 0 0.136628 0.856313 -0.145259
2000-01-02 1 0.113819 0.016217 -1.571263
2000-01-03 2 -0.603598 -1.143063 -0.425266
2000-01-04 3 -0.486200 -2.016020 -0.815514
2000-01-05 4 0.835318 -0.502919 0.878680
2000-01-06 5 -2.376205 -0.484754 -0.478711
2000-01-07 6 0.193371 0.544160 1.030900
2000-01-08 7 0.476533 -0.434356 0.744500
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})  # 字典key值为列名
x
x y
0 1 3
1 2 4
2 3 5
dict(x=9, y=99)
x.iloc[1]
x.iloc[1] = dict(x=9, y=99)
x
{'x': 9, 'y': 99}






x    2
y    4
Name: 1, dtype: int64
x y
0 1 3
1 9 99
2 3 5
df = pd.DataFrame({'one': [1., 2., 3.]})
df
df.two = [4, 5, 6]  # 错误,不能增加一列,利用属性的方式不能对没有的列赋值
df.two  # 但是增加了一项属性,而且可以取得这项属性
df
df['two'] = [4, 5, 6]  # 可以增加一列
df
one
0 1.0
1 2.0
2 3.0
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  This is separate from the ipykernel package so we can avoid doing imports until





[4, 5, 6]
one
0 1.0
1 2.0
2 3.0
one two
0 1.0 4
1 2.0 5
2 3.0 6
# Slicing ranges
# 切片范围
# iloc 方法是最稳健和兼容的
# 下面介绍 方括号 [] 操作符 作为切片

# 对series操作
# 取值
s
s[:5]
s[::2]
s[::-1]
2000-01-01    0.324097
2000-01-02   -0.718630
2000-01-03   -0.143643
2000-01-04   -0.136663
2000-01-05    1.036607
2000-01-06    0.362577
2000-01-07    1.330468
2000-01-08   -0.476653
Freq: D, Name: A, dtype: float64






2000-01-01    0.324097
2000-01-02   -0.718630
2000-01-03   -0.143643
2000-01-04   -0.136663
2000-01-05    1.036607
Freq: D, Name: A, dtype: float64






2000-01-01    0.324097
2000-01-03   -0.143643
2000-01-05    1.036607
2000-01-07    1.330468
Freq: 2D, Name: A, dtype: float64






2000-01-08   -0.476653
2000-01-07    1.330468
2000-01-06    0.362577
2000-01-05    1.036607
2000-01-04   -0.136663
2000-01-03   -0.143643
2000-01-02   -0.718630
2000-01-01    0.324097
Freq: -1D, Name: A, dtype: float64
# 赋值
s2 = s.copy()
s2
s2[:5] = 0
s2
2000-01-01    0.324097
2000-01-02   -0.718630
2000-01-03   -0.143643
2000-01-04   -0.136663
2000-01-05    1.036607
2000-01-06    0.362577
2000-01-07    1.330468
2000-01-08   -0.476653
Freq: D, Name: A, dtype: float64






2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06    0.362577
2000-01-07    1.330468
2000-01-08   -0.476653
Freq: D, Name: A, dtype: float64
# 对DataFrame操作
# [] 操作 选择的是行
df
df[:3]
df[::-1]
one two
0 1.0 4
1 2.0 5
2 3.0 6
one two
0 1.0 4
1 2.0 5
2 3.0 6
one two
2 3.0 6
1 2.0 5
0 1.0 4
# Selection By Label
# 通过标签选择

# 这种方式可能依靠上下文关系,有时候会调用链式赋值,这应该避免。参考:
# Returning a view versus a copy 返回视图 对比 拷贝

# When setting values in a pandas object, 
# care must be taken to avoid what is called chained indexing. 
# Here is an example.

[list('abcd'),list('efgh'),list('ijkl'),list('mnop')]
[['one','two'],['first','second']]

dfmi = pd.DataFrame([list('abcd'),list('efgh'),list('ijkl'),list('mnop')],  # 定义四列数值
        columns=pd.MultiIndex.from_product(
         [['one','two'],['first','second']])) # 定义多重索引,第一个list是第一层,以下类推

dfmi
[['a', 'b', 'c', 'd'],
 ['e', 'f', 'g', 'h'],
 ['i', 'j', 'k', 'l'],
 ['m', 'n', 'o', 'p']]






[['one', 'two'], ['first', 'second']]
one two
first second first second
0 a b c d
1 e f g h
2 i j k l
3 m n o p
#  Compare these two access methods:
dfmi['one']['second']  # chained
dfmi.loc[:,('one','second')]
0    b
1    f
2    j
3    n
Name: second, dtype: object






0    b
1    f
2    j
3    n
Name: (one, second), dtype: object
# 两种方式分析

# 第一种方式:链式 使用两个 []
dfmi['one']  # 第一个[] 先生成了一个DataFrame
dfmi['one']['second']  # pandas 把两个[] 作为分开的事件,他们执行分开的两步调用 __getitem__
first second
0 a b
1 e f
2 i j
3 m n
# 第二种方式:loc
# 通过嵌套的元组切片
df_s = (slice(None), ('one', 'second'))
dfmi.loc[df_s]  # 只调用了一次 __getitem__
0    b
1    f
2    j
3    n
Name: (one, second), dtype: object
# Selection By Label
# 通过标签选择

# loc要注意索引的数据类型,必须与索引的数据类型一致才可以,
# 例如 datetimeIndex 中,使用loc[2:3] ,即整数型的slice 将会出现TypeError


dfl = pd.DataFrame(np.random.randn(5,4), columns=list('ABCD'), 
                   index=pd.date_range('20130101',periods=5))

dfl
# dfl.loc[2:3]  # 错误的loc
dfl.loc['20130102':'20130104']  # 使用可转换为datetime的字符串
dfl.loc['20130202':'20130204']  # 不报错,返回为空DataFrame
# dfl.loc['20130202']  # 报错,错误信息是index无此值
dfl.loc['20130104':'20130114']  # 只返回存在的数据
A B C D
2013-01-01 -0.887358 1.068362 0.551961 -0.378400
2013-01-02 1.239840 -0.986149 -0.880655 2.112731
2013-01-03 -0.785526 1.583703 -0.871005 -0.659880
2013-01-04 -1.267462 2.500886 -0.980569 1.308624
2013-01-05 -0.842107 -0.921086 1.020196 -0.055930
A B C D
2013-01-02 1.239840 -0.986149 -0.880655 2.112731
2013-01-03 -0.785526 1.583703 -0.871005 -0.659880
2013-01-04 -1.267462 2.500886 -0.980569 1.308624
A B C D
2013-01-04 -1.267462 2.500886 -0.980569 1.308624
2013-01-05 -0.842107 -0.921086 1.020196 -0.055930
# loc可以使用整数,但此时的整数不代表位置,而是label
# loc是基本的取值方法
# 给loc的输入,即在[]中的值,可以是:
# 1. 单独的label  e.g. 5 or 'a
# 2. labels的list   ['a', 'b', 'c']
# 3. slice对象  'a':'f'  !! 注意:与python的切片不同,pandas的切片包括开始和结尾,而python不包括结尾
# 4. 布尔值
# 5. 调用函数  [lambda df: df.A > 0, :]

# Series
s1 = pd.Series(np.random.randn(6),index=list('abcdef'))
s1
s1.loc['c':]
s1.loc['b']
s1.loc['c':] = 0
s1
a    0.796911
b   -1.341250
c    0.008152
d   -0.745881
e    0.674385
f    1.108411
dtype: float64






c    0.008152
d   -0.745881
e    0.674385
f    1.108411
dtype: float64






-1.3412499335785426






a    0.796911
b   -1.341250
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4),  # 6X4 阵列
                   index=list('abcdef'),  # 索引
                   columns=list('ABCD'))  # 行号
df1
A B C D
a -1.912141 -0.835589 -0.188341 -1.024797
b -0.977498 -2.050214 0.355172 -0.291794
c -0.183401 -0.376330 -0.188848 -2.116438
d -1.008359 0.230593 -0.099235 -0.426229
e -0.027338 2.125459 0.066200 -0.247813
f -1.132103 1.945235 1.891179 1.549750
df1.loc[['a', 'b', 'd'], :]  # 先是行label选择,再是列label选择
df1.loc['d':, 'A':'C']

# 取得 a cross section 截面,用单个的label,返回Series
# 以下三式等同
df1.loc['a']
df1.loc['a',:]
df1.xs('a')
type(df1.loc['a'])
A B C D
a -1.912141 -0.835589 -0.188341 -1.024797
b -0.977498 -2.050214 0.355172 -0.291794
d -1.008359 0.230593 -0.099235 -0.426229
A B C
d -1.008359 0.230593 -0.099235
e -0.027338 2.125459 0.066200
f -1.132103 1.945235 1.891179
A   -1.912141
B   -0.835589
C   -0.188341
D   -1.024797
Name: a, dtype: float64






A   -1.912141
B   -0.835589
C   -0.188341
D   -1.024797
Name: a, dtype: float64






A   -1.912141
B   -0.835589
C   -0.188341
D   -1.024797
Name: a, dtype: float64






pandas.core.series.Series
# 通过布尔值数组取值
df1.loc['a'] > 0
df1.loc[:, df1.loc['a'] > 0]
A    False
B    False
C    False
D    False
Name: a, dtype: bool
a
b
c
d
e
f
# 获取一个值
# this is also equivalent to ``df1.at['a','A']``
df1.loc['a', 'A']
df1.at['a','A']
-1.9121410098540752






-1.9121410098540752
# Slicing with labels¶
# 用labels切片
s = pd.Series(list('abcde'), index=[0,3,2,5,4])
s
s.loc[3:5]  # 包含 5, 注意不是 3 4 5 ,而是从 标签3 到 标签5
0    a
3    b
2    c
5    d
4    e
dtype: object






3    b
2    c
5    d
dtype: object
# s.loc[3:6]  # 2个错误,ValueError: index must be monotonic increasing or decreasing  KeyError: 6
# 如果排序的话,可以超出范围
s.sort_index()  # 不改变原值
s
s.sort_index().loc[1:6]  # 可以超出范围
s.sort_index().loc[6:8]  # 即使一个都没有
# s.sort_index().loc[8]  # 但不能是单值,必须是切片
0    a
2    c
3    b
4    e
5    d
dtype: object






0    a
3    b
2    c
5    d
4    e
dtype: object






2    c
3    b
4    e
5    d
dtype: object






Series([], dtype: object)
# Selection By Position
# 通过位置选择,仅通过基于索引的整数,与python和numpy类似,从0开始,且不包括最后一个
# iloc的输入:
# 1. 整数
# 2. 整数的list
# 3. 整数的切片
# 4. 布尔值数组
# 5. 调用函数

# Series
s1 = pd.Series(np.random.randn(5), index=list(range(0,10,2)))
s1
s1.iloc[:3]
s1.iloc[3]
s1.iloc[:3] = 0
s1
0   -0.312716
2    1.425936
4    1.716575
6    2.099666
8    0.262365
dtype: float64






0   -0.312716
2    1.425936
4    1.716575
dtype: float64






2.099665679869975






0    0.000000
2    0.000000
4    0.000000
6    2.099666
8    0.262365
dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6,4),
                  index=list(range(0,12,2)),
                  columns=list(range(0,8,2)))
df1
0 2 4 6
0 1.635993 -0.512450 1.786760 -0.002533
2 0.173188 0.275977 -0.044987 -1.077772
4 1.985020 1.604020 0.127853 -1.003384
6 0.250428 -0.102090 1.566787 -1.708521
8 -2.111103 -1.232141 0.863753 -0.545229
10 -1.762999 1.009840 0.274013 0.786940
df1.iloc[:3]
df1.iloc[1:5, 2:4]
df1.iloc[[1, 3, 5], [1, 3]]
df1.iloc[1:3, :]
df1.iloc[:, 1:3]
0 2 4 6
0 1.635993 -0.512450 1.786760 -0.002533
2 0.173188 0.275977 -0.044987 -1.077772
4 1.985020 1.604020 0.127853 -1.003384
4 6
2 -0.044987 -1.077772
4 0.127853 -1.003384
6 1.566787 -1.708521
8 0.863753 -0.545229
2 6
2 0.275977 -1.077772
6 -0.102090 -1.708521
10 1.009840 0.786940
0 2 4 6
2 0.173188 0.275977 -0.044987 -1.077772
4 1.985020 1.604020 0.127853 -1.003384
2 4
0 -0.512450 1.786760
2 0.275977 -0.044987
4 1.604020 0.127853
6 -0.102090 1.566787
8 -1.232141 0.863753
10 1.009840 0.274013
# this is also equivalent to ``df1.iat[1,1]``
# 取单个的值
df1.iloc[1, 1]
df1.iat[1, 1]
0.2759774784621013






0.2759774784621013
# For getting a cross section using an integer position (equiv to df.xs(1))
# 取截面,得到Series
df1.iloc[1]
df1.iloc[:,1]
0    0.173188
2    0.275977
4   -0.044987
6   -1.077772
Name: 2, dtype: float64






0    -0.512450
2     0.275977
4     1.604020
6    -0.102090
8    -1.232141
10    1.009840
Name: 2, dtype: float64
# Out of range slice indexes are handled gracefully just as in Python/Numpy.
# 超过索引的切片处理,与python和numpy一样
# 注意:不能是单独索引,或列表中,有超过界限的值,只可以是slice,即带冒号的切片才不会提示错误

x = list('abcdef')
x[4:10]
x[8:10]
s = pd.Series(x)
s.iloc[4:10]
s.iloc[8:10]  # 超过界限bound返回空
['e', 'f']






[]






4    e
5    f
dtype: object






Series([], dtype: object)
dfl = pd.DataFrame(np.random.randn(5,2), columns=list('AB'))
dfl
dfl.iloc[:, 2:3]
dfl.iloc[:, 1:3]
dfl.iloc[4:6]
A B
0 -0.069941 1.124145
1 -0.025781 0.940736
2 -0.117417 0.503736
3 0.882286 0.302845
4 -0.136374 0.276822
0
1
2
3
4
B
0 1.124145
1 0.940736
2 0.503736
3 0.302845
4 0.276822
A B
4 -0.136374 0.276822
# Selection By Callable
# 通过调用函数进行选择

df1 = pd.DataFrame(np.random.randn(6,4),
                  index=list("abcdef"),
                  columns=list("ABCD"))
df1
df1.A
A B C D
a -1.947578 0.874286 1.139484 -3.004564
b 0.565255 0.028440 0.685688 0.973264
c -1.275992 0.732339 -0.324490 1.116887
d 0.433325 0.002567 -1.310127 0.844756
e 0.341412 -0.606646 0.034623 0.772968
f 1.518936 -0.590351 0.604839 -1.461750
a   -1.947578
b    0.565255
c   -1.275992
d    0.433325
e    0.341412
f    1.518936
Name: A, dtype: float64
df1.loc[lambda df: df.A > 0, :]
df1.loc[:, lambda df: ['A', 'B']]
df1.iloc[:, lambda df: [0, 1]]
df1[lambda df: df.columns[0]]
A B C D
b 0.565255 0.028440 0.685688 0.973264
d 0.433325 0.002567 -1.310127 0.844756
e 0.341412 -0.606646 0.034623 0.772968
f 1.518936 -0.590351 0.604839 -1.461750
A B
a -1.947578 0.874286
b 0.565255 0.028440
c -1.275992 0.732339
d 0.433325 0.002567
e 0.341412 -0.606646
f 1.518936 -0.590351
A B
a -1.947578 0.874286
b 0.565255 0.028440
c -1.275992 0.732339
d 0.433325 0.002567
e 0.341412 -0.606646
f 1.518936 -0.590351
a   -1.947578
b    0.565255
c   -1.275992
d    0.433325
e    0.341412
f    1.518936
Name: A, dtype: float64
df1.A
df1.A.loc[lambda s: s > 0]
df1.A.loc[df1.A > 0]
a   -1.947578
b    0.565255
c   -1.275992
d    0.433325
e    0.341412
f    1.518936
Name: A, dtype: float64






b    0.565255
d    0.433325
e    0.341412
f    1.518936
Name: A, dtype: float64






b    0.565255
d    0.433325
e    0.341412
f    1.518936
Name: A, dtype: float64
# 使用这些方法或索引,可以使用链式的选择方法,而不用中间的临时变量。链式方法,不是链式[]
bb = pd.read_csv('data/baseball.csv', index_col='id')
bb.groupby(['year', 'team']).sum().loc[lambda df: df.r > 100]
# IX Indexer is Deprecated
# 不推荐使用ix
# in favor of the more strict .iloc and .loc indexers.
# 使用.iloc和.loc代替.ix

dfd = pd.DataFrame({'A': [1, 2, 3],'B': [4, 5, 6]},index=list('abc'))
dfd
A B
a 1 4
b 2 5
c 3 6
dfd.ix[[0, 2], 'A']
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.





a    1
c    3
Name: A, dtype: int64
dfd.index[[0, 2]]  # 取得索引的名称
dfd.loc[dfd.index[[0, 2]], 'A']
dfd.columns.get_loc('A')  # 取得列的索引值
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]
Index(['a', 'c'], dtype='object')






a    1
c    3
Name: A, dtype: int64






0






a    1
c    3
Name: A, dtype: int64
# Indexing with list with missing labels is Deprecated
# 不推荐用有缺失标签的list进行索引
# using .loc or [] with a list with one or more missing labels, is deprecated, in favor of .reindex.
# 推荐使用.reindex

s = pd.Series([1, 2, 3])
s
0    1
1    2
2    3
dtype: int64
s.loc[[1, 2]]  # list中的keys都存在,则没有变化
1    2
2    3
dtype: int64
s.loc[[1, 2, 3]]  # 当有缺失时,赋值为NaN 
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.





1    2.0
2    3.0
3    NaN
dtype: float64
# Reindexing

s.reindex([1, 2, 3])
1    2.0
2    3.0
3    NaN
dtype: float64
# 如果仅选择有效的keys
labels = [1, 2, 3]
s.index.intersection(labels)  # index和labes的交集
s.loc[s.index.intersection(labels)]
Int64Index([1, 2], dtype='int64')






1    2
2    3
dtype: int64
# reindex 索引中不能有重复的项
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s
labels = ['c', 'd']
# s.reindex(labels)  # 不能reindex
a    0
a    1
b    2
c    3
dtype: int32
# 可以先把交集切出来,再进行reindex
# 但是交集不能有重复的index
s.index.intersection(labels)
s.loc[s.index.intersection(labels)]
s.loc[s.index.intersection(labels)].reindex(labels)
Index(['c'], dtype='object')






c    3
dtype: int32






c    3.0
d    NaN
dtype: float64
# Selecting Random Samples
# 随机取样选择
# 默认行取样

s = pd.Series([0,1,2,3,4,5])
s
0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64
s.sample()  # 默认取1行
s.sample(n=3)  # 取3行
s.sample(frac=0.5)  # 小数,行数百分数
s.sample(frac=0.8)  # 小数
s
1    1
dtype: int64






3    3
4    4
5    5
dtype: int64






2    2
3    3
4    4
dtype: int64






2    2
1    1
3    3
0    0
5    5
dtype: int64






0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64
s.sample(n=6, replace=False)  # 默认
s.sample(n=6, replace=True)  # replace,不改变本身
s
5    5
4    4
1    1
0    0
2    2
3    3
dtype: int64






0    0
4    4
0    0
2    2
0    0
0    0
dtype: int64






0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64
# 默认每行都有同样的概率被抽样到,也可以指定每行的概率比重
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)

example_weights2 = [0.5, 0, 0, 0, 0, 0]  # 权重将会自动的归一化
s.sample(n=1, weights=example_weights2)
2    2
3    3
5    5
dtype: int64






0    0
dtype: int64
# 可以指定DataFrame的某列作为权重
df2 = pd.DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]})
df2.sample(n = 3, weights = 'weight_column')
col1 weight_column
1 8 0.4
2 7 0.1
0 9 0.5
# 对列进行抽样
df3 = pd.DataFrame({'col1':[1,2,3], 'col2':[2,3,4], 'col3':[3,4,5]})
df3
df3.sample(n=2, axis=1)  # 指定axis=1,对列抽样,抽取的是列的组合
df3.sample(n=2, axis=0)
col1 col2 col3
0 1 2 3
1 2 3 4
2 3 4 5
col3 col2
0 3 2
1 4 3
2 5 4
col1 col2 col3
2 3 4 5
0 1 2 3
# 可以指定random seed 或者 numpy 的 randomState 对象,作为sample 随机数生成器的种子
# 一旦seed确定,随机数不变
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=200)
df3.sample(n=2, random_state=200)
col1 col2 col3
2 3 4 5
1 2 3 4
col1 col2 col3
2 3 4 5
1 2 3 4
col1 col2 col3
0 1 2 3
1 2 3 4
col1 col2 col3
0 1 2 3
1 2 3 4
# Setting With Enlargement
# loc或[] 操作 可以通过赋值不存在的键扩大Series或Dataframe

# Series
se = pd.Series([1,2,3])
se
se[5] = 5  # append
se
0    1
1    2
2    3
dtype: int64






0    1
1    2
2    3
5    5
dtype: int64
# DataFrame

dfi = pd.DataFrame(np.arange(6).reshape(3,2),columns=['A','B'])
dfi
A B
0 0 1
1 2 3
2 4 5
dfi.loc[:,'C'] = dfi.loc[:,'A']  # enlarge 增加列
dfi
A B C
0 0 1 0
1 2 3 2
2 4 5 4
dfi.loc[3] = 5  # append 增加行
dfi
A B C
0 0 1 0
1 2 3 2
2 4 5 4
3 5 5 5
# Fast scalar value getting and setting
# 快速取得或设置标量的值
# [] 可以进行很多操作,所以它为了知道你要进行那种操作,会有一点计算开销
# 最快的标量访问方式是使用 at 和 iat 方法,他们可以在所有的数据结构上使用
# at 类似于 loc 基于 label
# iat 类似于 iloc 基于 整数index
s = pd.Series([0,1,2,3,4,5])
s
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64
A B C D
2000-01-01 0.845046 -0.119758 1.354224 -0.134199
2000-01-02 -1.935579 -1.469185 -2.581439 0.355347
2000-01-03 -0.038740 -1.524056 1.376257 1.572331
2000-01-04 -0.846971 0.189231 -0.287885 -0.561706
2000-01-05 -0.127290 -0.043918 0.103347 -1.055387
2000-01-06 0.406437 1.917624 0.810463 0.367583
2000-01-07 0.438904 -0.230190 0.593607 -0.438856
2000-01-08 -1.955122 1.531260 0.889124 -0.014259
# get value
s.iat[5]
df.at[dates[5], 'A']
df.iat[3, 0]
5






0.40643702489386246






-0.8469710793801154
# set value
df.at[dates[5], 'E'] = 7  # 没有列就增加一列,没有值的默认赋值为nan
df
A B C D E
2000-01-01 0.845046 -0.119758 1.354224 -0.134199 NaN
2000-01-02 -1.935579 -1.469185 -2.581439 0.355347 NaN
2000-01-03 -0.038740 -1.524056 1.376257 1.572331 NaN
2000-01-04 -0.846971 0.189231 -0.287885 -0.561706 NaN
2000-01-05 -0.127290 -0.043918 0.103347 -1.055387 NaN
2000-01-06 0.406437 1.917624 0.810463 0.367583 7.0
2000-01-07 0.438904 -0.230190 0.593607 -0.438856 NaN
2000-01-08 -1.955122 1.531260 0.889124 -0.014259 NaN
df.iat[3, 0] = 7
df
A B C D E
2000-01-01 0.845046 -0.119758 1.354224 -0.134199 NaN
2000-01-02 -1.935579 -1.469185 -2.581439 0.355347 NaN
2000-01-03 -0.038740 -1.524056 1.376257 1.572331 NaN
2000-01-04 7.000000 0.189231 -0.287885 -0.561706 NaN
2000-01-05 -0.127290 -0.043918 0.103347 -1.055387 NaN
2000-01-06 0.406437 1.917624 0.810463 0.367583 7.0
2000-01-07 0.438904 -0.230190 0.593607 -0.438856 NaN
2000-01-08 -1.955122 1.531260 0.889124 -0.014259 NaN
df.at[dates[-1]+1, 0] = 7  # 行和列都扩展了
df
A B C D E 0
2000-01-01 0.845046 -0.119758 1.354224 -0.134199 NaN NaN
2000-01-02 -1.935579 -1.469185 -2.581439 0.355347 NaN NaN
2000-01-03 -0.038740 -1.524056 1.376257 1.572331 NaN NaN
2000-01-04 7.000000 0.189231 -0.287885 -0.561706 NaN NaN
2000-01-05 -0.127290 -0.043918 0.103347 -1.055387 NaN NaN
2000-01-06 0.406437 1.917624 0.810463 0.367583 7.0 NaN
2000-01-07 0.438904 -0.230190 0.593607 -0.438856 NaN NaN
2000-01-08 -1.955122 1.531260 0.889124 -0.014259 NaN NaN
2000-01-09 NaN NaN NaN NaN NaN 7.0
# Boolean indexing
# 使用布尔向量过滤数据
# 操作符包括:| for or, & for and, and ~ for not ,必须用括号进行分组
s = pd.Series(range(-3, 4))  # 不用把range先list,直接可以series
s
0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64
s[s >= 0]  # 直接用series,不用取其值
s[~(s < 0)]
s[(s < -1) | (s > 0.5)]
3    0
4    1
5    2
6    3
dtype: int64






3    0
4    1
5    2
6    3
dtype: int64






0   -3
1   -2
4    1
5    2
6    3
dtype: int64
df[df['A'] > 0]
A B C D E 0
2000-01-01 0.845046 -0.119758 1.354224 -0.134199 NaN NaN
2000-01-04 7.000000 0.189231 -0.287885 -0.561706 NaN NaN
2000-01-06 0.406437 1.917624 0.810463 0.367583 7.0 NaN
2000-01-07 0.438904 -0.230190 0.593607 -0.438856 NaN NaN
# List comprehensions and map method of Series can also be used to produce more complex criteria:
# 列表生成式和map方法 也可以用来生成 更复杂的条件判断

df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                     'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                     'c' : np.random.randn(7)})
df2
a b c
0 one x 1.706402
1 one y 0.491977
2 two y 1.357986
3 three x -1.023513
4 two y -0.653028
5 one x 0.041052
6 six x 1.021882
# 判别式
criterion = df2['a'].map(lambda x: x.startswith('t'))  # 选择 two 和 three
criterion
0    False
1    False
2     True
3     True
4     True
5    False
6    False
Name: a, dtype: bool
df2[criterion]  # 根据 a 列的判别式 选择数据表的一部分,包含其他列
a b c
2 two y 1.357986
3 three x -1.023513
4 two y -0.653028
# 等价的方式,但是速度慢一些 
df2[[x.startswith('t') for x in df2['a']]]  # 不适用map,而是用列表生成式
a b c
2 two y 1.357986
3 three x -1.023513
4 two y -0.653028
# 复合判断
df2[criterion & (df2['b'] == 'x')]  # a 列 和 b 列 均符合某类要求
a b c
3 three x -1.023513
# 布尔向量选择 可以与索引选择一并使用
df2.loc[criterion & (df2['b'] == 'x'),'b':'c']  # 只选择 b 和 c 两列,不选择c列
b c
3 x -1.023513
# Indexing with isin
# 用isin索引
# series
# 对列数据进行判断
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s
4    0
3    1
2    2
1    3
0    4
dtype: int64
s.isin([2, 4, 6])  # 列数据中是否存在列表中的值,返回布尔值
s[s.isin([2, 4, 6])]  # 可以利用返回的布尔值进行选择
4    False
3    False
2     True
1    False
0     True
dtype: bool






2    2
0    4
dtype: int64
# 也可以对index obj 进行筛选
s[s.index.isin([2, 4, 6])]
s.reindex([2, 4, 6])  # reindex不同,列表中没有的值返回了nan,且原来的int64返回了float64数据类型
4    0
2    2
dtype: int64






2    2.0
4    0.0
6    NaN
dtype: float64
# 对应多重索引,可以单独选择索引级别
s_mi = pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int32
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]  # 指定索引级别,在第二级索引中选择
0  c    2
1  a    3
dtype: int32






0  a    0
   c    2
1  a    3
   c    5
dtype: int32
# DataFrame
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],'ids2': ['a', 'n', 'c', 'n']})
df
ids ids2 vals
0 a a 1
1 b n 2
2 f c 3
3 n n 4
values = ['a', 'b', 1, 3]
df.isin(values)  # 匹配所有的值
ids ids2 vals
0 True True True
1 True False False
2 False False True
3 False False False
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
row_mask = df.isin(values)  # 对不用的列,分别匹配某些值
row_mask
# ?row_mask.all  # Return whether all elements are True over requested axis
row_mask = row_mask.all(1)
row_mask
df[row_mask]  # 选择全是True的行
ids ids2 vals
0 True True True
1 True False False
2 False True True
3 False False False
0     True
1    False
2    False
3    False
dtype: bool
ids ids2 vals
0 a a 1
# The where() Method and Masking
# To guarantee that selection output has the same shape as the original data
# 保证选集输出与原数据有同样的shape形态

# series
s
s[s > 0]  # 只返回满足的项
s.where(s > 0)  # 全部返回,不满足的项,赋值nan
4    0
3    1
2    2
1    3
0    4
dtype: int64






3    1
2    2
1    3
0    4
dtype: int64






4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64
# DataFrame 使用布尔值选择时,返回值保留原数据结构
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
df[df < 0]
df.where(df < 0)  # 等价于上式
df.where(df < 0, -df)  # where 可以传入另一个参数,用于替换条件为 False 的项,返回copy数据拷贝,不修改原值
df
A B C D
2000-01-01 NaN -0.500453 NaN NaN
2000-01-02 -1.048619 NaN -0.295425 NaN
2000-01-03 -0.214918 -1.989228 NaN NaN
2000-01-04 -2.326385 -0.225754 -1.331598 NaN
2000-01-05 NaN NaN NaN -0.089382
2000-01-06 -1.041194 NaN -1.084442 NaN
2000-01-07 -1.040706 -2.336161 NaN NaN
2000-01-08 NaN -0.075381 -0.951126 -0.347865
A B C D
2000-01-01 NaN -0.500453 NaN NaN
2000-01-02 -1.048619 NaN -0.295425 NaN
2000-01-03 -0.214918 -1.989228 NaN NaN
2000-01-04 -2.326385 -0.225754 -1.331598 NaN
2000-01-05 NaN NaN NaN -0.089382
2000-01-06 -1.041194 NaN -1.084442 NaN
2000-01-07 -1.040706 -2.336161 NaN NaN
2000-01-08 NaN -0.075381 -0.951126 -0.347865
A B C D
2000-01-01 -0.383691 -0.500453 -0.101632 -0.848213
2000-01-02 -1.048619 -0.856605 -0.295425 -1.060710
2000-01-03 -0.214918 -1.989228 -0.278514 -1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 -1.457230
2000-01-05 -0.656919 -1.081810 -1.148303 -0.089382
2000-01-06 -1.041194 -0.533706 -1.084442 -1.824709
2000-01-07 -1.040706 -2.336161 -0.565496 -0.269414
2000-01-08 -0.166739 -0.075381 -0.951126 -0.347865
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
# You may wish to set values based on some boolean criteria. This can be done intuitively like so:
# 设置基于布尔值的项值

s2 = s.copy()
s2
s2[s2 < 3] = 0
s2
df2 = df.copy()
df2
df2[df2 < 0] = 0
df2
4    0
3    1
2    2
1    3
0    4
dtype: int64






4    0
3    0
2    0
1    3
0    4
dtype: int64
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
A B C D
2000-01-01 0.383691 0.000000 0.101632 0.848213
2000-01-02 0.000000 0.856605 0.000000 1.060710
2000-01-03 0.000000 0.000000 0.278514 1.088771
2000-01-04 0.000000 0.000000 0.000000 1.457230
2000-01-05 0.656919 1.081810 1.148303 0.000000
2000-01-06 0.000000 0.533706 0.000000 1.824709
2000-01-07 0.000000 0.000000 0.565496 0.269414
2000-01-08 0.166739 0.000000 0.000000 0.000000
# where 默认返回修改后的数据拷贝,原值不变;可以设置inplace参数,直接修改原值,而不是创建拷贝

df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig
A B C D
2000-01-01 0.383691 0.500453 0.101632 0.848213
2000-01-02 1.048619 0.856605 0.295425 1.060710
2000-01-03 0.214918 1.989228 0.278514 1.088771
2000-01-04 2.326385 0.225754 1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 0.089382
2000-01-06 1.041194 0.533706 1.084442 1.824709
2000-01-07 1.040706 2.336161 0.565496 0.269414
2000-01-08 0.166739 0.075381 0.951126 0.347865
# 注意 :pandas 和 numpy 的where方法不一样
# 一般的,df1.where(m, df2)  相当于 np.where(m, df1, df2)

df.where(df < 0, -df) == np.where(df < 0, df, -df)
A B C D
2000-01-01 True True True True
2000-01-02 True True True True
2000-01-03 True True True True
2000-01-04 True True True True
2000-01-05 True True True True
2000-01-06 True True True True
2000-01-07 True True True True
2000-01-08 True True True True
# alignment 定位,对齐
# where 可以选择局部(部分区域)的布尔条件  
# This is analogous to partial setting via .loc (but on the contents rather than the axis labels)

df2 = df.copy()
df2
df2[1:4]   # 行选择
df2.where(df2[1:4] > 0, 3)  # 对不符合项的值进行赋值!
df2[ df2[1:4] > 0 ] = 3  # 只定位部分区域,对符合项的值进行赋值
df2
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
A B C D
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
A B C D
2000-01-01 3.0 3.000000 3.000000 3.000000
2000-01-02 3.0 0.856605 3.000000 1.060710
2000-01-03 3.0 3.000000 0.278514 1.088771
2000-01-04 3.0 3.000000 3.000000 1.457230
2000-01-05 3.0 3.000000 3.000000 3.000000
2000-01-06 3.0 3.000000 3.000000 3.000000
2000-01-07 3.0 3.000000 3.000000 3.000000
2000-01-08 3.0 3.000000 3.000000 3.000000
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 3.000000 -0.295425 3.000000
2000-01-03 -0.214918 -1.989228 3.000000 3.000000
2000-01-04 -2.326385 -0.225754 -1.331598 3.000000
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
# Where can also accept axis and level parameters to align the input when performing the where.
# where 可以接受 轴参数axis 和 级别参数level 
df2 = df.copy()
df2
df2.where(df2>0,df2['A'],axis='index')  # 小于等于0的值,赋值为A列的值
df2
df2.apply(lambda x, y: x.where(x>0,y), y=df['A'])  # 相当于上式,但此式较慢,同样不改变原值,而是生成一个拷贝copy
df2
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
A B C D
2000-01-01 0.383691 0.383691 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -1.048619 1.060710
2000-01-03 -0.214918 -0.214918 0.278514 1.088771
2000-01-04 -2.326385 -2.326385 -2.326385 1.457230
2000-01-05 0.656919 1.081810 1.148303 0.656919
2000-01-06 -1.041194 0.533706 -1.041194 1.824709
2000-01-07 -1.040706 -1.040706 0.565496 0.269414
2000-01-08 0.166739 0.166739 0.166739 0.166739
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
A B C D
2000-01-01 0.383691 0.383691 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -1.048619 1.060710
2000-01-03 -0.214918 -0.214918 0.278514 1.088771
2000-01-04 -2.326385 -2.326385 -2.326385 1.457230
2000-01-05 0.656919 1.081810 1.148303 0.656919
2000-01-06 -1.041194 0.533706 -1.041194 1.824709
2000-01-07 -1.040706 -1.040706 0.565496 0.269414
2000-01-08 0.166739 0.166739 0.166739 0.166739
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
# where 可以接受一个函数调用,此函数只能有一个参数,且返回有效的布尔条件
df3 = pd.DataFrame({'A': [1, 2, 3],'B': [4, 5, 6],'C': [7, 8, 9]})
df3
df3.where(lambda x: x > 4, lambda x: x + 10)  # x<=4 的值 赋值为  x+10
A B C
0 1 4 7
1 2 5 8
2 3 6 9
A B C
0 11 14 7
1 12 5 8
2 13 6 9
# mask 遮罩 mask is the inverse boolean operation of where. 反向的布尔值操作
s
s.mask(s >= 0)  # 选择<0 的值, 不符合项 置为nan
df
df.mask(df >= 0)
4    0
3    1
2    2
1    3
0    4
dtype: int64






4   NaN
3   NaN
2   NaN
1   NaN
0   NaN
dtype: float64
A B C D
2000-01-01 0.383691 -0.500453 0.101632 0.848213
2000-01-02 -1.048619 0.856605 -0.295425 1.060710
2000-01-03 -0.214918 -1.989228 0.278514 1.088771
2000-01-04 -2.326385 -0.225754 -1.331598 1.457230
2000-01-05 0.656919 1.081810 1.148303 -0.089382
2000-01-06 -1.041194 0.533706 -1.084442 1.824709
2000-01-07 -1.040706 -2.336161 0.565496 0.269414
2000-01-08 0.166739 -0.075381 -0.951126 -0.347865
A B C D
2000-01-01 NaN -0.500453 NaN NaN
2000-01-02 -1.048619 NaN -0.295425 NaN
2000-01-03 -0.214918 -1.989228 NaN NaN
2000-01-04 -2.326385 -0.225754 -1.331598 NaN
2000-01-05 NaN NaN NaN -0.089382
2000-01-06 -1.041194 NaN -1.084442 NaN
2000-01-07 -1.040706 -2.336161 NaN NaN
2000-01-08 NaN -0.075381 -0.951126 -0.347865
# The query() Method (Experimental) 对 DataFrame对象 使用表达式进行选择

# 例如 :选择b列中 在a列和c列两值中间的 行,a
a b c
0 0.283094 0.051807 0.126487
1 0.020097 0.373023 0.147193
2 0.091921 0.830956 0.143214
3 0.340304 0.527246 0.709769
4 0.651722 0.344524 0.151233
5 0.396685 0.524376 0.540237
6 0.502751 0.627708 0.708038
7 0.472338 0.269770 0.586165
8 0.937522 0.239560 0.861873
9 0.661879 0.465536 0.271580
# pure python
df[(df.a < df.b) & (df.b < df.c)]
a b c
3 0.340304 0.527246 0.709769
5 0.396685 0.524376 0.540237
6 0.502751 0.627708 0.708038
# query,传入的是str表达式
df.query('(a < b) & (b < c)')  # 比纯py慢?!
a b c
3 0.340304 0.527246 0.709769
5 0.396685 0.524376 0.540237
6 0.502751 0.627708 0.708038
# 对于命名的index索引是低效的,比利用列名称
# 而且如果索引的名称和列名同名,列名优先
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
# df
df.index.name = "a"
df
b c
a
0 0 4
1 1 2
2 3 0
3 0 3
4 4 3
5 0 0
6 0 4
7 1 4
8 3 0
9 1 4
df.query('a <= b and b <= c')
b c
a
0 0 4
1 1 2
# 可以不使用索引的名称,而是直接用index,这样同时可以避免与列名重名
df.query('index <= b <= c')
b c
a
0 0 4
1 1 2
# MultiIndex query() Syntax
# 对于多重索引

n = 10
colors = np.random.choice(['red', 'green'], size=n)
colors
foods = np.random.choice(['eggs', 'ham'], size=n)
foods
array(['green', 'green', 'green', 'green', 'green', 'green', 'green',
       'red', 'green', 'green'], dtype='
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
index
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df
MultiIndex(levels=[['green', 'red'], ['eggs', 'ham']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 1, 1]],
           names=['color', 'food'])
0 1
color food
green ham -0.295761 1.399719
eggs 1.807185 0.498136
eggs -0.119640 2.279162
eggs -0.238709 -0.650418
eggs 2.235827 1.066954
ham 1.156794 1.694717
eggs -0.037158 -0.529213
red eggs 0.046799 0.763592
green ham 0.488240 -0.455112
ham -0.169486 -0.646891
df.query('color == "red"')
0 1
color food
red eggs 0.046799 0.763592
# 如果多重索引没有命名 可以使用特殊的名字
df.index.names = [None, None]
df
df.query('ilevel_0 == "red"') # The convention is ilevel_0, which means “index level 0” for the 0th level of the index.
0 1
green ham -0.295761 1.399719
eggs 1.807185 0.498136
eggs -0.119640 2.279162
eggs -0.238709 -0.650418
eggs 2.235827 1.066954
ham 1.156794 1.694717
eggs -0.037158 -0.529213
red eggs 0.046799 0.763592
green ham 0.488240 -0.455112
ham -0.169486 -0.646891
0 1
red eggs 0.046799 0.763592
# query() Use Cases 使用示例
# A use case for query() is when you have a collection of DataFrame objects 
# that have a subset of column names (or index levels/names) in common.   ## 有公共列名称子集的df
# You can pass the same query to both frames 
# without having to specify which frame you’re interested in querying.  ## 传入同样的query
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
df
df2
a b c
0 0.096087 0.799309 0.112070
1 0.083101 0.505151 0.830588
2 0.501472 0.692792 0.663570
3 0.872821 0.442541 0.904903
4 0.513651 0.375617 0.786898
5 0.184368 0.285290 0.055147
6 0.284951 0.243556 0.732871
7 0.431525 0.676385 0.458296
8 0.743057 0.326079 0.434655
9 0.610921 0.717639 0.580765
a b c
0 0.804932 0.533058 0.713190
1 0.305893 0.460048 0.879321
2 0.344438 0.970870 0.685098
3 0.348010 0.839449 0.799309
4 0.844713 0.267562 0.771202
5 0.403534 0.088786 0.950782
6 0.181616 0.567118 0.989711
7 0.363736 0.852080 0.140771
8 0.866127 0.285365 0.025491
9 0.329751 0.121716 0.782729
10 0.029253 0.419409 0.051255
11 0.057406 0.106595 0.559687
expr = '0.0 <= a <= c <= 0.5'
mp = map(lambda frame: frame.query(expr), [df, df2])  # 同一个表达式,作用在有同样列名的多个df上
for i in mp:
    print(i)
          a         b         c
0  0.096087  0.799309  0.112070
7  0.431525  0.676385  0.458296
           a         b         c
10  0.029253  0.419409  0.051255
# query() Python versus pandas Syntax Comparison


# Full numpy-like syntax
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
a b c
0 8 4 5
1 7 4 7
2 9 3 2
3 4 8 7
4 4 5 8
5 7 8 8
6 1 7 6
7 5 9 1
8 3 3 8
9 7 1 5
 df.query('(a < b) & (b < c)')
 df[(df.a < df.b) & (df.b < df.c)]   
a b c
4 4 5 8
a b c
4 4 5 8
# 几种其他的不同的写法
df.query('a < b & b < c')  # 去掉括号
df.query('a < b and b < c')  # 使用英文and
df.query('a < b < c')  # 连写,优雅的表达
a b c
4 4 5 8
a b c
4 4 5 8
a b c
4 4 5 8
# The in and not in operators

# get all rows where columns "a" and "b" have overlapping values 
# 得到a列和b列有重叠值的行,a列中的值在 in b列的值中
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                       'c': np.random.randint(5, size=12),
                       'd': np.random.randint(9, size=12)})
df
a b c d
0 a a 0 2
1 a a 0 6
2 b a 3 3
3 b a 1 2
4 c b 1 2
5 c b 1 2
6 d b 3 8
7 d b 1 0
8 e c 2 7
9 e c 0 1
10 f c 0 1
11 f c 3 1
df.query('a in b')  # 第一次运行113ms 第二次明显加快 25ms
a b c d
0 a a 0 2
1 a a 0 6
2 b a 3 3
3 b a 1 2
4 c b 1 2
5 c b 1 2
df[df.a.isin(df.b)]   # How you'd do it in pure Python  仍然比query快 35ms
a b c d
0 a a 0 2
1 a a 0 6
2 b a 3 3
3 b a 1 2
4 c b 1 2
5 c b 1 2
df[~df.a.isin(df.b)]
a b c d
6 d b 3 8
7 d b 1 0
8 e c 2 7
9 e c 0 1
10 f c 0 1
11 f c 3 1
df.query('a not in b')
a b c d
6 d b 3 8
7 d b 1 0
8 e c 2 7
9 e c 0 1
10 f c 0 1
11 f c 3 1
# rows where cols a and b have overlapping values and col c's values are less than col d's
df.query('a in b and c < d')
df[df.a.isin(df.b) & (df.c < df.d)]
a b c d
0 a a 0 2
1 a a 0 6
3 b a 1 2
4 c b 1 2
5 c b 1 2
a b c d
0 a a 0 2
1 a a 0 6
3 b a 1 2
4 c b 1 2
5 c b 1 2
# 注意:?????
# Note that in and not in are evaluated in Python, since numexpr has no equivalent of this operation. 
# However, only the in/not in expression itself is evaluated in vanilla Python. 
# For example, in the expression
df.query('a in b + c + d')
# (b + c + d) is evaluated by numexpr and then the in operation is evaluated in plain Python. In general, any operations that can be evaluated using numexpr will be.
# Special use of the == operator with list objects
# 特别的用法,== 用于list
# Comparing a list of values to a column using ==/!= works similarly to in/not in
# 用==/!=比较列表的值与列的值,类似于in/not in
df
a b c d
0 a a 0 2
1 a a 0 6
2 b a 3 3
3 b a 1 2
4 c b 1 2
5 c b 1 2
6 d b 3 8
7 d b 1 0
8 e c 2 7
9 e c 0 1
10 f c 0 1
11 f c 3 1
df.query('b == ["a", "b", "c"]')  # b列中有列表中值的
a b c d
0 a a 0 2
1 a a 0 6
2 b a 3 3
3 b a 1 2
4 c b 1 2
5 c b 1 2
6 d b 3 8
7 d b 1 0
8 e c 2 7
9 e c 0 1
10 f c 0 1
11 f c 3 1
df.query('c == [1, 2]')
df.query('[1, 2] in c')
df[df.c.isin([1, 2])]
a b c d
3 b a 1 2
4 c b 1 2
5 c b 1 2
7 d b 1 0
8 e c 2 7
a b c d
3 b a 1 2
4 c b 1 2
5 c b 1 2
7 d b 1 0
8 e c 2 7
a b c d
3 b a 1 2
4 c b 1 2
5 c b 1 2
7 d b 1 0
8 e c 2 7
# Boolean Operators 布尔操作符
# ~ 或 not
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
df['bools'] = np.random.rand(len(df)) > 0.5
df
a b c
0 0.528671 0.265870 0.932892
1 0.520069 0.047895 0.478818
2 0.648595 0.180744 0.838445
3 0.145614 0.906190 0.762163
4 0.612637 0.027232 0.778020
5 0.640565 0.535538 0.018280
6 0.633664 0.124654 0.185709
7 0.635088 0.377300 0.914968
8 0.702684 0.504459 0.107014
9 0.635757 0.261144 0.665611
a b c bools
0 0.528671 0.265870 0.932892 False
1 0.520069 0.047895 0.478818 True
2 0.648595 0.180744 0.838445 True
3 0.145614 0.906190 0.762163 False
4 0.612637 0.027232 0.778020 True
5 0.640565 0.535538 0.018280 True
6 0.633664 0.124654 0.185709 True
7 0.635088 0.377300 0.914968 True
8 0.702684 0.504459 0.107014 False
9 0.635757 0.261144 0.665611 False
df.query('~bools')
df.query('not bools')
a b c bools
0 0.528671 0.265870 0.932892 False
3 0.145614 0.906190 0.762163 False
8 0.702684 0.504459 0.107014 False
9 0.635757 0.261144 0.665611 False
a b c bools
0 0.528671 0.265870 0.932892 False
3 0.145614 0.906190 0.762163 False
8 0.702684 0.504459 0.107014 False
9 0.635757 0.261144 0.665611 False
# 任意组合布尔表达式
shorter = df.query('a < b < c and (not bools) or bools > 2')  # short query syntax
longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)]  # equivalent in pure Python
shorter
longer
# Performance of query()
# DataFrame.query() using numexpr is slightly faster than Python for large frames
# Note : You will only see the performance benefits of using the numexpr engine with DataFrame.query() 
# if your frame has more than approximately 200,000 rows

# query() 效能在大数据时高于数字表达式
# Duplicate Data
# 识别和剔除重复数据,两个方法:duplicated and drop_duplicates,他们的参数都是列
# duplicated 返回布尔向量,标示重复的行
# drop_duplicates 删除重复列
# 默认保留第一个找到的值,但可以通过keep参数指定保留的值
# keep='first' (default): mark / drop duplicates except for the first occurrence. 只保留第一个
# keep='last': mark / drop duplicates except for the last occurrence. 只保留最后一个重复值
# keep=False: mark / drop all duplicates.  剔除所有重复值
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})
df2
a b c
0 one x 2.206877
1 one y -1.852209
2 two x -0.706555
3 two y -1.007860
4 two x -0.185356
5 three x -0.687592
6 four x -2.052029
df2.duplicated('a')
df2.duplicated("a", keep="last")
df2.duplicated('a', keep=False)
0    False
1     True
2    False
3     True
4     True
5    False
6    False
dtype: bool






0     True
1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool






0     True
1     True
2     True
3     True
4     True
5    False
6    False
dtype: bool
df2.drop_duplicates('a')
df2.drop_duplicates('a', keep="last")
df2.drop_duplicates('a', keep=False)
a b c
0 one x 2.206877
2 two x -0.706555
5 three x -0.687592
6 four x -2.052029
a b c
1 one y -1.852209
4 two x -0.185356
5 three x -0.687592
6 four x -2.052029
a b c
5 three x -0.687592
6 four x -2.052029
# 传入一个列表作为参数
df2.duplicated(['a', 'b'])  # a b 两列看做一个整体,标示重复值
df2.drop_duplicates(['a', 'b'])
0    False
1    False
2    False
3    False
4     True
5    False
6    False
dtype: bool
a b c
0 one x 2.206877
1 one y -1.852209
2 two x -0.706555
3 two y -1.007860
5 three x -0.687592
6 four x -2.052029
# index索引去重  index.duplicated
df3 = pd.DataFrame({'a': np.arange(6),'b': np.random.randn(6)},index=['a', 'a', 'b', 'c', 'b', 'a'])
df3
a b
a 0 1.446937
a 1 -1.740284
b 2 -0.604590
c 3 0.096239
b 4 0.823314
a 5 1.990803
df3.index.duplicated()
df3[~df3.index.duplicated()]
df3[~df3.index.duplicated(keep='last')]
df3[~df3.index.duplicated(keep=False)]
array([False,  True, False, False,  True,  True])
a b
a 0 1.446937
b 2 -0.604590
c 3 0.096239
a b
c 3 0.096239
b 4 0.823314
a 5 1.990803
a b
c 3 0.096239
# Dictionary-like get() method get方法
# Each of Series, DataFrame, and Panel have a get method which can return a default value.
s = pd.Series([1,2,3], index=['a','b','c'])
s.get("a")  # 相当于s["a"]
s.get("x", default=-1)  # 可以对不存在的index赋值
1






-1
# The lookup() Method
# 按一定的顺序取得行/列的值
dflookup = pd.DataFrame(np.random.rand(20,4), columns = ['A','B','C','D'])
dflookup
A B C D
0 0.942679 0.727316 0.658345 0.465770
1 0.437278 0.225158 0.436522 0.164805
2 0.270947 0.280223 0.309800 0.015967
3 0.559836 0.630962 0.673678 0.712503
4 0.535372 0.989887 0.661567 0.361962
5 0.726322 0.601192 0.547858 0.477509
6 0.829411 0.583613 0.871647 0.460966
7 0.365722 0.563660 0.164954 0.134314
8 0.817334 0.523003 0.684492 0.369386
9 0.710906 0.693633 0.618877 0.888263
10 0.026953 0.185217 0.746235 0.568846
11 0.192765 0.927200 0.431736 0.084300
12 0.988460 0.784320 0.180145 0.039405
13 0.233349 0.148678 0.888210 0.004917
14 0.105130 0.146724 0.261370 0.936558
15 0.701224 0.791860 0.420083 0.391538
16 0.290186 0.307993 0.139429 0.618879
17 0.182132 0.174420 0.845501 0.647986
18 0.732009 0.919950 0.197361 0.582814
19 0.568096 0.539125 0.269016 0.537584
list(range(0,10,2))
dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D'])
[0, 2, 4, 6, 8]






array([0.72731646, 0.30979986, 0.53537223, 0.58361289, 0.36938598])
# index objets 索引对象

# 索引可以通过list或其他序列对象直接创建
index = pd.Index(['e', 'd', 'a', 'b'])
index

# 可以命名
index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index.name
index
Index(['e', 'd', 'a', 'b'], dtype='object')






'something'






Index(['e', 'd', 'a', 'b'], dtype='object', name='something')
index = pd.Index(list(range(5)), name='rows')
index
columns = pd.Index(['A', 'B', 'C'], name='cols')
columns
Int64Index([0, 1, 2, 3, 4], dtype='int64', name='rows')






Index(['A', 'B', 'C'], dtype='object', name='cols')
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)  # 使用索引给列命名,列名是一个索引对象
df
cols A B C
rows
0 1.868165 0.704614 -2.049472
1 -0.878810 0.706876 -0.741121
2 -2.649523 -0.952211 0.806387
3 0.369280 -0.052788 -0.995775
4 0.218246 -0.034493 -0.198815
# Setting metadata 设置元数据
# 索引“多半”是“不可变”,但是可以设置和改变其元数据,比如索引的名称,或者,多重索引的级别和标签
# 可以使用 rename, set_names, set_levels, and set_labels 设置这些属性。
# 默认返回一个拷贝(不修改原值),也可以就地修改in place
ind = pd.Index([1, 2, 3])
ind
Int64Index([1, 2, 3], dtype='int64')
ind.name = "ind"
ind
ind.rename("apple")
ind
ind.name = "apple"
ind
ind.set_names(["bob"], inplace=True)
ind
Int64Index([1, 2, 3], dtype='int64', name='ind')






Int64Index([1, 2, 3], dtype='int64', name='apple')






Int64Index([1, 2, 3], dtype='int64', name='ind')






Int64Index([1, 2, 3], dtype='int64', name='apple')






Int64Index([1, 2, 3], dtype='int64', name='bob')
# set_names, set_levels, and set_labels also take an optional level` argument
# df.query('ilevel_0 == "red"') 在query中使用 ilevel_0 ……
index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index
MultiIndex(levels=[[0, 1, 2], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])
index.levels[1]
index.levels[0]
index.set_levels(["a", "b"], level=1)
Index(['one', 'two'], dtype='object', name='second')






Int64Index([0, 1, 2], dtype='int64', name='first')






MultiIndex(levels=[[0, 1, 2], ['a', 'b']],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])
# Set operations on Index objects 索引对象中的集合操作 
# Note: The resulting index from a set operation will be sorted in ascending order. 
# 注意: 索引的集合操作返回的是升序排序后的结果
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
a | b  # 并集
a & b  # 交集
# a - b  # 不支持
a.difference(b)  # 差集,非对称
a.symmetric_difference(b)  # 对称差集,相当于idx1.difference(idx2).union(idx2.difference(idx1))
a ^ b  # 对称差集
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')






Index(['c'], dtype='object')






Index(['a', 'b'], dtype='object')






Index(['a', 'b', 'd', 'e'], dtype='object')






Index(['a', 'b', 'd', 'e'], dtype='object')
# Missing values 缺失值
# 注意:索引能够容纳缺失值,但是应该避免这样的情况。因为可能出现不可预料的结果,例如有些操作默认排除缺失值。

idx1 = pd.Index([1, np.nan, 3, 4])
idx1
Float64Index([1.0, nan, 3.0, 4.0], dtype='float64')
idx1.fillna(2)  # 缺失值赋值为2,nan的index赋值为2
Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64')
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')])  # pd的缺失值是NaT,与np不同
idx2
DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'], dtype='datetime64[ns]', freq=None)
idx2.fillna(pd.Timestamp('2011-01-02'))
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq=None)
# Set / Reset Index 设置/重设索引


# DataFrame
data = pd.DataFrame({"a":["bar", "bar", "foo", "foo"],
                    "b":["one","two","one","two"],
                    "c":["z","y","x", "w"],
                    "d":range(1,5)})  # range 不用list
data
a b c d
0 bar one z 1
1 bar two y 2
2 foo one x 3
3 foo two w 4
indexed1 = data.set_index("c")  # 把df的某列设置为index索引
indexed1
a b d
c
z bar one 1
y bar two 2
x foo one 3
w foo two 4
indexed2 = data.set_index(['a', 'b'])  # 把df的多列设置为多重索引
indexed2
c d
a b
bar one z 1
two y 2
foo one x 3
two w 4
frame = data.set_index('c', drop=False)  # drop参数,可以设置索引时不删除列,默认为删除列
frame
frame = frame.set_index(['a', 'b'], append=True)  # append参数,在原有索引的基础上,增加索引,变成复合索引
frame
a b c d
c
z bar one z 1
y bar two y 2
x foo one x 3
w foo two w 4
c d
c a b
z bar one z 1
y bar two y 2
x foo one x 3
w foo two w 4
data.set_index('c', drop=False)  # 把某列设为索引,但不删除
data
data.set_index(['a', 'b'], inplace=True)  # 就地修改,而不是返回拷贝
data
a b c d
c
z bar one z 1
y bar two y 2
x foo one x 3
w foo two w 4
a b c d
0 bar one z 1
1 bar two y 2
2 foo one x 3
3 foo two w 4
c d
a b
bar one z 1
two y 2
foo one x 3
two w 4
# Reset the index 重置索引
# Note: The reset_index method used to be called delevel which is now deprecated. 不推荐使用delevel

data
data.index.name  # 无返回值?
data.index.names  # 索引列的名称
data.reset_index()  # 重置索引,原索引变为普通列,原索引名称变为列名
c d
a b
bar one z 1
two y 2
foo one x 3
two w 4
FrozenList(['a', 'b'])
a b c d
0 bar one z 1
1 bar two y 2
2 foo one x 3
3 foo two w 4
frame
frame.reset_index(level=1)  # 对于多重索引,可以指定重置哪一级索引,而不是全部重置
c d
c a b
z bar one z 1
y bar two y 2
x foo one x 3
w foo two w 4
a c d
c b
z one bar z 1
y two bar y 2
x one foo x 3
w two foo w 4
# reset_index方法可以使用drop参数,若其为true,则仅是把索引剔除,而不转换为df数据列
frame
frame.reset_index(level=2, drop=True) 
c d
c a b
z bar one z 1
y bar two y 2
x foo one x 3
w foo two w 4
c d
c a
z bar z 1
y bar y 2
x foo x 3
w foo w 4
# Adding an ad hoc index
# If you create an index yourself, you can just assign it to the index field
# 创建index对象,并赋值给df
index = pd.MultiIndex.from_product([range(2), ['one', 'two']], names=['first', 'second'])
index
data
data.index = index  # 注意:index的长度要与df向适应
data
MultiIndex(levels=[[0, 1], ['one', 'two']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
           names=['first', 'second'])
a b c d
0 bar one z 1
1 bar two y 2
2 foo one x 3
3 foo two w 4
a b c d
first second
0 one bar one z 1
two bar two y 2
1 one foo one x 3
two foo two w 4
# Returning a view versus a copy 返回视图 对比 返回拷贝
# 避免链式操作,即多个连接的[],具体见前面的说明

# Why does assignment fail when using chained indexing? 为什么链式索引赋值失败
# python解释器,对以下赋值的区别

# 第一种:非链式赋值
dfmi.loc[:,('one','second')] = value
# becomes, python解释为:
dfmi.loc.__setitem__((slice(None), ('one', 'second')), value)  # 直接set
# Of course, dfmi.loc.__getitem__(idx) may be a view or a copy of dfmi.

# 第二种:链式赋值
dfmi['one']['second'] = value  # pd会抛出异常SettingWithCopy,链式操作具有不确定性!
# becomes, python解释为:
dfmi.__getitem__('one').__setitem__('second', value)  # 先get,再set,前面的get返回的是一个copy
# SettingWithCopy有时候会在没有明显的链式操作的情况下出现,例如:
def do_something(df):
   foo = df[['bar', 'baz']]  # Is foo a view? A copy? Nobody knows!
   # ... many lines here ...
   foo['quux'] = value       # We don't know whether this will modify df or not!  # “隐式”的链式操作
   return foo
# Evaluation order matters 赋值命令事情
# 链式赋值操作引起的SettingWithCopyWarning,可以通过设置option mode取消或抑制。

dfb = pd.DataFrame({'a' : ['one', 'one', 'two','three', 'two', 'one', 'six'],'c' : np.arange(7)})
dfb
a c
0 one 0
1 one 1
2 two 2
3 three 3
4 two 4
5 one 5
6 six 6
# This will show the SettingWithCopyWarning
# but the frame values will be set
dfb['c'][dfb.a.str.startswith('o')] = 42
dfb
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
a c
0 one 42
1 one 42
2 two 2
3 three 3
4 two 4
5 one 42
6 six 6
# This however is operating on a copy and will not work.
# 把option的提示级别由warn改为raise ????
pd.set_option('mode.chained_assignment','warn')  # This however is operating on a copy and will not work. # ??
# A chained assignment can also crop up in setting in a mixed dtype frame. 出现在
# Note : These setting rules apply to all of .loc/.iloc 

# This is the correct access method 正确的存取方法
dfc = pd.DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]})
dfc
dfc.loc[0,'A'] = 11
dfc
A B
0 aaa 1
1 bbb 2
2 ccc 3
A B
0 11 1
1 bbb 2
2 ccc 3
dfc = dfc.copy()
dfc
dfc['A'][0] = 111  # This can work at times, but is not guaranteed, and so should be avoided 有时可以工作,但不保证,应避免使用
dfc
A B
0 11 1
1 bbb 2
2 ccc 3
d:\python\36-64\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A B
0 111 1
1 bbb 2
2 ccc 3
pd.set_option('mode.chained_assignment', 'raise')  # 链式赋值将不会执行
# http://pandas.pydata.org/pandas-docs/stable/indexing.html 全文完
# 2018-02-19

转载于:https://www.cnblogs.com/shigubuhua/p/8459134.html

你可能感兴趣的:(pandas索引.pandas.v0.22)