pandas学习记录

导入所需要的package

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

基本操作

In [4]: s = pd.Series([1,3,5,np.nan,6,8],dtype=float)
# Series表示一个一维的表结构,类似一个定长的有序字典
In [5]: s
Out[5]:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [9]: dates = pd.date_range('20130101', periods=6)
#生成一个日期索引
In [10]: dates
Out[10]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
In [11]: df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
#DataFrame表示一个二维的表结构
In [12]: df
Out[12]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-05 -1.545638 -1.858394  0.681397  0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
In [13]: df2 = pd.DataFrame({  'A' : 1,
    ...:                       'B' : pd.Timestamp('20170102'),
    ...:                       'C' : pd.Series([1,2,3,4],index=list(range(4)),dtype='int32'),
    ...:                       'D' : np.array([3] * 4,dtype='int32'),
    ...:                       'E' : pd.Categorical(["test","train","test","train"]),
    ...:                       'F' : 'foo' })
#手动生成,index为默认值
In [14]: df2
Out[14]:
   A          B  C  D      E    F
0  1 2017-01-02  1  3   test  foo
1  1 2017-01-02  2  3  train  foo
2  1 2017-01-02  3  3   test  foo
3  1 2017-01-02  4  3  train  foo

In [15]: df2.dtypes
Out[15]:
A             int64
B    datetime64[ns]
C             int32
D             int32
E          category
F            object
dtype: object
#得到DateFrame对象的前n个或者后n个
In [33]: df
Out[33]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-05 -1.545638 -1.858394  0.681397  0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
#注意,如果没有参数,那么返回除第一个或者最后一个的所有列表
In [32]: df.head()
Out[32]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-05 -1.545638 -1.858394  0.681397  0.827652

In [16]: df.head(2)
Out[16]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013

In [17]: df.tail(3)
Out[17]:
                   A         B         C         D
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-05 -1.545638 -1.858394  0.681397  0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
#如上df
In [37]: df.index
Out[37]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
In [38]: df.columns
Out[38]: Index(['A', 'B', 'C', 'D'], dtype='object')
In [39]: df.values
Out[39]:
array([[ 1.52305048, -0.15352237,  0.73880011,  0.11866569],
       [ 0.38120739,  1.20624856,  0.97237919, -0.57001282],
       [ 1.21861889,  0.68241767, -1.38723555, -0.91492475],
       [-1.07726844, -0.69615296,  2.42468084,  0.43790878],
       [-1.5456376 , -1.8583936 ,  0.68139653,  0.82765189],
       [-2.13237432, -0.72172305, -0.02113291, -0.53177846]])
#对数据快速统计
In [40]: df.describe()
Out[40]:
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.272067 -0.256854  0.568148 -0.105415
std    1.523314  1.096138  1.250827  0.673642
min   -2.132374 -1.858394 -1.387236 -0.914925
25%   -1.428545 -0.715331  0.154499 -0.560454
50%   -0.348031 -0.424838  0.710098 -0.206556
75%    1.009266  0.473433  0.913984  0.358098
max    1.523050  1.206249  2.424681  0.827652
In [41]: df.T#转置操作
Out[41]:
   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A    1.523050    0.381207    1.218619   -1.077268   -1.545638   -2.132374
B   -0.153522    1.206249    0.682418   -0.696153   -1.858394   -0.721723
C    0.738800    0.972379   -1.387236    2.424681    0.681397   -0.021133
D    0.118666   -0.570013   -0.914925    0.437909    0.827652   -0.531778
In [42]: df.sort_index(axis=1, ascending=False)#按索引大小排序,默认为从大到小
#如果axis为0表示按列索引排序,axis为1表示按行索引排序
#ascending为True,表示由小到大排序,否则,由大到小排序
Out[42]:
                   D         C         B         A
2013-01-01  0.118666  0.738800 -0.153522  1.523050
2013-01-02 -0.570013  0.972379  1.206249  0.381207
2013-01-03 -0.914925 -1.387236  0.682418  1.218619
2013-01-04  0.437909  2.424681 -0.696153 -1.077268
2013-01-05  0.827652  0.681397 -1.858394 -1.545638
2013-01-06 -0.531778 -0.021133 -0.721723 -2.132374
In [43]: df.sort_values(by='B')#按值升序排序
Out[43]:
                   A         B         C         D
2013-01-05 -1.545638 -1.858394  0.681397  0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-02  0.381207  1.206249  0.972379 -0.570013

In [44]: df.sort_values(by='B',ascending=False)#按值降序排列
Out[44]:
                   A         B         C         D
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
2013-01-05 -1.545638 -1.858394  0.681397  0.827652

数据选择

In [45]: df.A#得到单独一列,相当于一个Series
Out[45]:
2013-01-01    1.523050
2013-01-02    0.381207
2013-01-03    1.218619
2013-01-04   -1.077268
2013-01-05   -1.545638
2013-01-06   -2.132374
Freq: D, Name: A, dtype: float64

In [46]: df[0:3]#前三列
Out[46]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925

In [47]: df['20130102':'20130104']#也可以这样操作
Out[47]:
                   A         B         C         D
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
#交叉区域
In [48]: dates
Out[48]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [49]: dates[0]
Out[49]: Timestamp('2013-01-01 00:00:00', offset='D')

In [55]: df
Out[55]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-05 -1.545638 -1.858394  0.681397  0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778

In [51]: df.loc[dates[0]]#去某一行
Out[51]:
A    1.523050
B   -0.153522
C    0.738800
D    0.118666
Name: 2013-01-01 00:00:00, dtype: float64

In [54]: df.loc[:,['A','B']]#获取列
Out[54]:
                   A         B
2013-01-01  1.523050 -0.153522
2013-01-02  0.381207  1.206249
2013-01-03  1.218619  0.682418
2013-01-04 -1.077268 -0.696153
2013-01-05 -1.545638 -1.858394
2013-01-06 -2.132374 -0.721723

In [56]: df.loc['20130102':'20130104',['A','B']]#指定行列
Out[56]:
                   A         B
2013-01-02  0.381207  1.206249
2013-01-03  1.218619  0.682418
2013-01-04 -1.077268 -0.696153

In [57]: df.loc['20130102',['A','B']]#对于返回的对象进行维度缩减
Out[57]:
A    0.381207
B    1.206249
Name: 2013-01-02 00:00:00, dtype: float64

In [58]: df.loc[dates[0],'A']#得到一个具体值
Out[58]: 1.5230504820975281
In [59]: df.at[dates[0],'A']#得到一个具体值,与上边的方法相同
Out[59]: 1.5230504820975281

In [60]: df.iloc[3]#通过传递数值进行选择,第四行
Out[60]:
A   -1.077268
B   -0.696153
C    2.424681
D    0.437909
Name: 2013-01-04 00:00:00, dtype: float64

In [61]: df.iloc[3:5,0:2]#用数字进行切片
Out[61]:
                   A         B
2013-01-04 -1.077268 -0.696153
2013-01-05 -1.545638 -1.858394

In [62]: df.iloc[[1,2,4],[0,2]]#返回指定的行列
Out[62]:
                   A         C
2013-01-02  0.381207  0.972379
2013-01-03  1.218619 -1.387236
2013-01-05 -1.545638  0.681397

In [63]: df.iloc[1:3,:]#第二到三行
Out[63]:
                   A         B         C         D
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925

In [64]: df.iloc[:,1:3]#第二到三列
Out[64]:
                   B         C
2013-01-01 -0.153522  0.738800
2013-01-02  1.206249  0.972379
2013-01-03  0.682418 -1.387236
2013-01-04 -0.696153  2.424681
2013-01-05 -1.858394  0.681397
2013-01-06 -0.721723 -0.021133

#第二行,第二列的值
In [65]: df.iloc[1,1]
Out[65]: 1.2062485635529525
In [66]: df.iat[1,1]
Out[66]: 1.2062485635529525
In [68]: df
Out[68]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925
2013-01-04 -1.077268 -0.696153  2.424681  0.437909
2013-01-05 -1.545638 -1.858394  0.681397  0.827652
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778
In [67]: df[df.A > 0]
Out[67]:
                   A         B         C         D
2013-01-01  1.523050 -0.153522  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379 -0.570013
2013-01-03  1.218619  0.682418 -1.387236 -0.914925

In [69]: df[df > 0]
Out[69]:
                   A         B         C         D
2013-01-01  1.523050       NaN  0.738800  0.118666
2013-01-02  0.381207  1.206249  0.972379       NaN
2013-01-03  1.218619  0.682418       NaN       NaN
2013-01-04       NaN       NaN  2.424681  0.437909
2013-01-05       NaN       NaN  0.681397  0.827652
2013-01-06       NaN       NaN       NaN       NaN

df[df.iloc[:,0]>0]#得到第一列大于0的行
In [70]: df2 = df.copy()

In [71]: df2['E'] = ['one', 'one','two','three','four','three']

In [72]: df2
Out[72]:
                   A         B         C         D      E
2013-01-01  1.523050 -0.153522  0.738800  0.118666    one
2013-01-02  0.381207  1.206249  0.972379 -0.570013    one
2013-01-03  1.218619  0.682418 -1.387236 -0.914925    two
2013-01-04 -1.077268 -0.696153  2.424681  0.437909  three
2013-01-05 -1.545638 -1.858394  0.681397  0.827652   four
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778  three

In [73]: df2[df2['E'].isin(['two','one'])]#找到特定行索引(E)包含某对象(two,one)的行
Out[73]:
                   A         B         C         D    E
2013-01-01  1.523050 -0.153522  0.738800  0.118666  one
2013-01-02  0.381207  1.206249  0.972379 -0.570013  one
2013-01-03  1.218619  0.682418 -1.387236 -0.914925  two
In [74]: s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

In [75]: s1
Out[75]:
2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64
In [76]: df['F'] = s1

In [77]: df
Out[77]:
                   A         B         C         D    F
2013-01-01  1.523050 -0.153522  0.738800  0.118666  NaN
2013-01-02  0.381207  1.206249  0.972379 -0.570013  1.0
2013-01-03  1.218619  0.682418 -1.387236 -0.914925  2.0
2013-01-04 -1.077268 -0.696153  2.424681  0.437909  3.0
2013-01-05 -1.545638 -1.858394  0.681397  0.827652  4.0
2013-01-06 -2.132374 -0.721723 -0.021133 -0.531778  5.0
In [78]: df.at[dates[0],'A'] = 0

In [79]: df.iat[0,1] = 0

In [80]: df.loc[:,'D'] = np.array([5] * len(df))

In [81]: df
Out[81]:
                   A         B         C  D    F
2013-01-01  0.000000  0.000000  0.738800  5  NaN
2013-01-02  0.381207  1.206249  0.972379  5  1.0
2013-01-03  1.218619  0.682418 -1.387236  5  2.0
2013-01-04 -1.077268 -0.696153  2.424681  5  3.0
2013-01-05 -1.545638 -1.858394  0.681397  5  4.0
2013-01-06 -2.132374 -0.721723 -0.021133  5  5.0
In [82]: df2 = df.copy()

In [83]: df2[df2 > 0] = -df2

In [84]: df2
Out[84]:
                   A         B         C  D    F
2013-01-01  0.000000  0.000000 -0.738800 -5  NaN
2013-01-02 -0.381207 -1.206249 -0.972379 -5 -1.0
2013-01-03 -1.218619 -0.682418 -1.387236 -5 -2.0
2013-01-04 -1.077268 -0.696153 -2.424681 -5 -3.0
2013-01-05 -1.545638 -1.858394 -0.681397 -5 -4.0
2013-01-06 -2.132374 -0.721723 -0.021133 -5 -5.0
In [85]: df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [86]: df1.loc[dates[0]:dates[1],'E'] = 1

In [87]: df1
Out[87]:
                   A         B         C  D    F    E
2013-01-01  0.000000  0.000000  0.738800  5  NaN  1.0
2013-01-02  0.381207  1.206249  0.972379  5  1.0  1.0
2013-01-03  1.218619  0.682418 -1.387236  5  2.0  NaN
2013-01-04 -1.077268 -0.696153  2.424681  5  3.0  NaN
#去掉包含缺失值得行
In [88]: df1.dropna(how='any')#how='all'#如果为all,所有值都为NA时去掉这个label,any为任意NA去掉
Out[88]:
                   A         B         C  D    F    E
2013-01-02  0.381207  1.206249  0.972379  5  1.0  1.0
#使用8来填充
In [90]: df1.fillna(value=8)
Out[90]:
                   A         B         C  D    F    E
2013-01-01  0.000000  0.000000  0.738800  5  8.0  1.0
2013-01-02  0.381207  1.206249  0.972379  5  1.0  1.0
2013-01-03  1.218619  0.682418 -1.387236  5  2.0  8.0
2013-01-04 -1.077268 -0.696153  2.424681  5  3.0  8.0

In [91]: pd.isnull(df1)#如果是NaN,返回Ture
Out[91]:
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True
In [92]: df
Out[92]:
                   A         B         C  D    F
2013-01-01  0.000000  0.000000  0.738800  5  NaN
2013-01-02  0.381207  1.206249  0.972379  5  1.0
2013-01-03  1.218619  0.682418 -1.387236  5  2.0
2013-01-04 -1.077268 -0.696153  2.424681  5  3.0
2013-01-05 -1.545638 -1.858394  0.681397  5  4.0
2013-01-06 -2.132374 -0.721723 -0.021133  5  5.0

In [93]: df.mean()#列均值,忽略NaN
Out[93]:
A   -0.525909
B   -0.231267
C    0.568148
D    5.000000
F    3.000000
dtype: float64

In [94]: df.mean(1)#行均值,忽略NaN
Out[94]:
2013-01-01    1.434700
2013-01-02    1.711967
2013-01-03    1.502760
2013-01-04    1.730252
2013-01-05    1.255473
2013-01-06    1.424954
Freq: D, dtype: float64
In [97]: dates
Out[97]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
In [95]: s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
#对于拥有不同维度,需要对齐的对象进行操作。Pandas会自动的沿着指定的维度进行广播,向后飘两个单位
In [96]: s
Out[96]:
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [98]: df.sub(s, axis='index')#s和df按index相加
Out[98]:
                   A         B         C    D    F
2013-01-01       NaN       NaN       NaN  NaN  NaN
2013-01-02       NaN       NaN       NaN  NaN  NaN
2013-01-03  0.218619 -0.317582 -2.387236  4.0  1.0
2013-01-04 -4.077268 -3.696153 -0.575319  2.0  0.0
2013-01-05 -6.545638 -6.858394 -4.318603  0.0 -1.0
2013-01-06       NaN       NaN       NaN  NaN  NaN
In [99]: df
Out[99]:
                   A         B         C  D    F
2013-01-01  0.000000  0.000000  0.738800  5  NaN
2013-01-02  0.381207  1.206249  0.972379  5  1.0
2013-01-03  1.218619  0.682418 -1.387236  5  2.0
2013-01-04 -1.077268 -0.696153  2.424681  5  3.0
2013-01-05 -1.545638 -1.858394  0.681397  5  4.0
2013-01-06 -2.132374 -0.721723 -0.021133  5  5.0
#将一个函数应用于该对象
In [100]: df.apply(np.cumsum)
Out[100]:
                   A         B         C   D     F
2013-01-01  0.000000  0.000000  0.738800   5   NaN
2013-01-02  0.381207  1.206249  1.711179  10   1.0
2013-01-03  1.599826  1.888666  0.323944  15   3.0
2013-01-04  0.522558  1.192513  2.748625  20   6.0
2013-01-05 -1.023080 -0.665880  3.430021  25  10.0
2013-01-06 -3.155454 -1.387603  3.408888  30  15.0

In [101]: df.apply(lambda x: x.max() - x.min())
Out[101]:
A    3.350993
B    3.064642
C    3.811916
D    0.000000
F    4.000000
dtype: float64
In [102]: s = pd.Series(np.random.randint(0, 7, size=10))
In [103]: s
Out[103]:
0    0
1    2
2    4
3    2
4    2
5    4
6    1
7    2
8    6
9    5
dtype: int32

In [104]: s.value_counts()
Out[104]:
2    4
4    2
6    1
5    1
1    1
0    1
dtype: int64

String Methods*

In [105]: s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
In [106]: s.str.lower()
Out[106]:
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

concat:连接

In [107]: df = pd.DataFrame(np.random.randn(10, 4))

In [108]: df
Out[108]:
          0         1         2         3
0 -0.350383  0.066715 -0.340783  0.676047
1  1.246707  1.074448 -0.306961 -0.870164
2  0.978723  2.676164  0.810125  1.742859
3 -1.881706  1.900046  0.446396 -1.850192
4 -1.869060  0.194756 -3.999157  0.357495
5  0.821017 -0.131944 -0.172258 -0.550951
6  1.065902  0.788957 -0.155128  0.954346
7  0.331280  0.257569  0.005222  2.232761
8 -1.046927  0.723164 -0.311852 -2.101644
9 -0.518778 -1.058908  0.241587  0.112990

In [111]: pieces = [df[:2], df[:3], df[:5]]

In [112]: pd.concat(pieces)
Out[112]:
          0         1         2         3
0 -0.350383  0.066715 -0.340783  0.676047
1  1.246707  1.074448 -0.306961 -0.870164
0 -0.350383  0.066715 -0.340783  0.676047
1  1.246707  1.074448 -0.306961 -0.870164
2  0.978723  2.676164  0.810125  1.742859
0 -0.350383  0.066715 -0.340783  0.676047
1  1.246707  1.074448 -0.306961 -0.870164
2  0.978723  2.676164  0.810125  1.742859
3 -1.881706  1.900046  0.446396 -1.850192
4 -1.869060  0.194756 -3.999157  0.357495

Join

In [113]: left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
In [114]: right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
In [115]: left
Out[115]:
   key  lval
0  foo     1
1  foo     2
In [116]: right
Out[116]:
   key  rval
0  foo     4
1  foo     5

In [117]: pd.merge(left, right, on='key')
Out[117]:
   key  lval  rval
0  foo     1     4
1  foo     1     5
2  foo     2     4
3  foo     2     5

#对比
In [119]: left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})

In [120]: right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

In [121]: left
Out[121]:
   key  lval
0  foo     1
1  bar     2

In [122]: right
Out[122]:
   key  rval
0  foo     4
1  bar     5

In [123]: pd.merge(left, right, on='key')
Out[123]:
   key  lval  rval
0  foo     1     4
1  bar     2     5

append

In [124]: df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])

In [125]: df
Out[125]:
          A         B         C         D
0 -0.313586  0.010213  0.521081 -1.286954
1  0.079310  1.344337 -0.206647  1.471418
2  0.439421 -0.908587  0.382107  0.765690
3 -0.473688 -0.607198 -0.028644  0.541583
4 -0.778374 -0.743020 -1.231464  0.226309
5  0.335211 -1.464544  0.792436 -0.678236
6 -1.690446  0.315661  1.148733 -1.110375
7 -1.965615  0.341851  0.604430 -0.418681

In [126]: s = df.iloc[3]

In [127]: df.append(s, ignore_index=True)
Out[127]:
          A         B         C         D
0 -0.313586  0.010213  0.521081 -1.286954
1  0.079310  1.344337 -0.206647  1.471418
2  0.439421 -0.908587  0.382107  0.765690
3 -0.473688 -0.607198 -0.028644  0.541583
4 -0.778374 -0.743020 -1.231464  0.226309
5  0.335211 -1.464544  0.792436 -0.678236
6 -1.690446  0.315661  1.148733 -1.110375
7 -1.965615  0.341851  0.604430 -0.418681
8 -0.473688 -0.607198 -0.028644  0.541583

Grouping

对于”group by”操作,我们通常是指以下一个或多个操作步骤:

  • (Splitting)按照一些规则将数据分为不同的组;

  • (Applying)对于每组数据分别执行一个函数;

  • (Combining)将结果组合到一个数据结构中;

In [128]: df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
     ...:                               'foo', 'bar', 'foo', 'foo'],
     ...:                        'B' : ['one', 'one', 'two', 'three',
     ...:                               'two', 'two', 'one', 'three'],
     ...:                        'C' : np.random.randn(8),
     ...:                        'D' : np.random.randn(8)})
In [129]: df
Out[129]:
     A      B         C         D
0  foo    one  1.089882 -0.893373
1  bar    one  0.799028  0.237772
2  foo    two -0.937741 -1.471799
3  bar  three  1.195312  0.692406
4  foo    two  1.005337 -0.249233
5  bar    two -1.306802 -0.627874
6  foo    one  1.619168 -0.539545
7  foo  three -0.902081  1.303749

In [130]: df.groupby('A').sum()
Out[130]:
            C         D
A
bar  0.687537  0.302305
foo  1.874566 -1.850200

In [131]: df.groupby(['A','B']).sum()
Out[131]:
                  C         D
A   B
bar one    0.799028  0.237772
    three  1.195312  0.692406
    two   -1.306802 -0.627874
foo one    2.709051 -1.432917
    three -0.902081  1.303749
    two    0.067596 -1.721032

Reshaping

In [132]: tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
     ...:                          'foo', 'foo', 'qux', 'qux'],
     ...:                         ['one', 'two', 'one', 'two',
     ...:                          'one', 'two', 'one', 'two']]))

In [133]: tuples
Out[133]:
[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [134]: index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [135]: df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

In [136]: df
Out[136]:
                     A         B
first second
bar   one     0.535103  0.335592
      two    -0.007379 -0.271691
baz   one    -1.927102 -0.348317
      two    -0.392650  0.178679
foo   one     0.620067  0.365730
      two    -0.675787 -1.806861
qux   one    -0.160767  0.376871
      two    -0.667896 -0.137858

In [137]: df2 = df[:4]

In [138]: df2
Out[138]:
                     A         B
first second
bar   one     0.535103  0.335592
      two    -0.007379 -0.271691
baz   one    -1.927102 -0.348317
      two    -0.392650  0.178679

In [139]: stacked = df2.stack()

In [140]: stacked
Out[140]:
first  second
bar    one     A    0.535103
               B    0.335592
       two     A   -0.007379
               B   -0.271691
baz    one     A   -1.927102
               B   -0.348317
       two     A   -0.392650
               B    0.178679
dtype: float64

In [141]: stacked.unstack()
Out[141]:
                     A         B
first second
bar   one     0.535103  0.335592
      two    -0.007379 -0.271691
baz   one    -1.927102 -0.348317
      two    -0.392650  0.178679

In [142]: stacked.unstack(1)
Out[142]:
second        one       two
first
bar   A  0.535103 -0.007379
      B  0.335592 -0.271691
baz   A -1.927102 -0.392650
      B -0.348317  0.178679

In [143]: stacked.unstack(0)
Out[143]:
first          bar       baz
second
one    A  0.535103 -1.927102
       B  0.335592 -0.348317
two    A -0.007379 -0.392650
       B -0.271691  0.178679

Pivot Tables

In [144]: df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
     ...:                        'B' : ['A', 'B', 'C'] * 4,
     ...:                        'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
     ...:                        'D' : np.random.randn(12),
     ...:                        'E' : np.random.randn(12)})

In [145]: df
Out[145]:
        A  B    C         D         E
0     one  A  foo -0.029439 -1.855289
1     one  B  foo -0.648452  1.242427
2     two  C  foo -0.120858 -0.085237
3   three  A  bar -0.215119  2.244871
4     one  B  bar  1.412866 -0.886582
5     one  C  bar  0.818426  1.053411
6     two  A  foo  0.594699 -0.461450
7   three  B  foo  0.861705  0.687342
8     one  C  foo -1.429140  1.337407
9     one  A  bar -0.872577  0.641517
10    two  B  bar  0.524023 -1.575005
11  three  C  bar  0.394977  0.075784

In [146]: pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[146]:
C             bar       foo
A     B
one   A -0.872577 -0.029439
      B  1.412866 -0.648452
      C  0.818426 -1.429140
three A -0.215119       NaN
      B       NaN  0.861705
      C  0.394977       NaN
two   A       NaN  0.594699
      B  0.524023       NaN
      C       NaN -0.120858

Time Series*

In [158]: rng = pd.date_range('1/1/2017', periods=10, freq='S')
#从指定时刻开始产生时刻
In [159]: rng
Out[159]:
DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 00:00:01',
               '2017-01-01 00:00:02', '2017-01-01 00:00:03',
               '2017-01-01 00:00:04', '2017-01-01 00:00:05',
               '2017-01-01 00:00:06', '2017-01-01 00:00:07',
               '2017-01-01 00:00:08', '2017-01-01 00:00:09'],
              dtype='datetime64[ns]', freq='S')

In [160]: ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
#赋予指定时刻随机值
In [161]: ts
Out[161]:
2017-01-01 00:00:00      3
2017-01-01 00:00:01    235
2017-01-01 00:00:02     97
2017-01-01 00:00:03     10
2017-01-01 00:00:04    411
2017-01-01 00:00:05    153
2017-01-01 00:00:06    188
2017-01-01 00:00:07    195
2017-01-01 00:00:08    418
2017-01-01 00:00:09    226
Freq: S, dtype: int32
ts.resample('5Min').sum()#指定时间间隔求和
Out[163]:
2017-01-01    1936
Freq: 5T, dtype: int32

In [166]: rng = pd.date_range('1/1/2017', periods=100, freq='S')

In [167]: ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

In [168]: ts.resample('1Min').sum()
Out[168]:
2017-01-01 00:00:00    15906
2017-01-01 00:01:00     9008
Freq: T, dtype: int32

In [169]: sum(ts.values[:60])
Out[169]: 15906

In [170]: sum(ts.values[60:])
Out[170]: 9008
In [171]: rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
In [172]: rng
Out[172]:
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [173]: ts = pd.Series(np.random.randn(len(rng)), rng)

In [175]: ts
Out[175]:
2012-03-06    1.421425
2012-03-07   -0.096355
2012-03-08   -0.648838
2012-03-09    0.688946
2012-03-10   -1.051607
Freq: D, dtype: float64

In [176]: ts_utc = ts.tz_localize('UTC')

In [177]: ts_utc
Out[177]:
2012-03-06 00:00:00+00:00    1.421425
2012-03-07 00:00:00+00:00   -0.096355
2012-03-08 00:00:00+00:00   -0.648838
2012-03-09 00:00:00+00:00    0.688946
2012-03-10 00:00:00+00:00   -1.051607
Freq: D, dtype: float64

In [178]: ts_utc.tz_convert('US/Eastern')#时区转化
Out[178]:
2012-03-05 19:00:00-05:00    1.421425
2012-03-06 19:00:00-05:00   -0.096355
2012-03-07 19:00:00-05:00   -0.648838
2012-03-08 19:00:00-05:00    0.688946
2012-03-09 19:00:00-05:00   -1.051607
Freq: D, dtype: float64
In [179]: rng = pd.date_range('1/1/2012', periods=5, freq='M')

In [180]: rng
Out[180]:
DatetimeIndex(['2012-01-31', '2012-02-29', '2012-03-31', '2012-04-30',
               '2012-05-31'],
              dtype='datetime64[ns]', freq='M')

In [181]: ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [182]: ts
Out[182]:
2012-01-31   -0.155942
2012-02-29   -0.767510
2012-03-31   -0.155900
2012-04-30   -0.098059
2012-05-31    0.607614
Freq: M, dtype: float64

In [183]: ps = ts.to_period()

In [184]: ps
Out[184]:
2012-01   -0.155942
2012-02   -0.767510
2012-03   -0.155900
2012-04   -0.098059
2012-05    0.607614
Freq: M, dtype: float64

In [185]: ps.to_timestamp()
Out[185]:
2012-01-01   -0.155942
2012-02-01   -0.767510
2012-03-01   -0.155900
2012-04-01   -0.098059
2012-05-01    0.607614
Freq: MS, dtype: float64
In [186]: prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
In [187]: prng
Out[187]:
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
             '1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
             '1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
             '1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
             '1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
             '1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
             '1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
             '2000Q3', '2000Q4'],
            dtype='int64', freq='Q-NOV')
In [188]: ts = pd.Series(np.random.randn(len(prng)), prng)
In [189]: ts
Out[189]:
1990Q1    0.297797
1990Q2    0.283513
1990Q3   -0.322105
1990Q4   -1.541848
1991Q1    0.539559
1991Q2    0.161805
1991Q3    0.966456
1991Q4    0.228009
1992Q1    0.033140
1992Q2   -1.803392
1992Q3    0.795412
1992Q4    0.670945
1993Q1    1.443656
1993Q2   -1.191861
1993Q3   -0.747472
1993Q4    0.919639
1994Q1    1.414509
1994Q2   -0.087450
1994Q3   -1.556399
1994Q4   -1.028431
1995Q1    0.516376
1995Q2   -0.798248
1995Q3   -0.032770
1995Q4    0.127403
1996Q1    0.118407
1996Q2    0.983731
1996Q3    0.444699
1996Q4    1.305806
1997Q1   -0.445013
1997Q2   -0.130233
1997Q3   -1.544834
1997Q4   -0.737617
1998Q1   -0.481101
1998Q2   -0.694068
1998Q3   -1.945882
1998Q4    0.091352
1999Q1    0.097229
1999Q2   -0.422791
1999Q3    1.093277
1999Q4   -0.551056
2000Q1   -1.201843
2000Q2    0.270037
2000Q3   -0.256666
2000Q4    1.233440
Freq: Q-NOV, dtype: float64
In [190]: ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
In [191]:  ts.head()
Out[191]:
1990-03-01 09:00    0.297797
1990-06-01 09:00    0.283513
1990-09-01 09:00   -0.322105
1990-12-01 09:00   -1.541848
1991-03-01 09:00    0.539559
Freq: H, dtype: float64

Categoricals

Plotting

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
pandas学习记录_第1张图片
ts.plot
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                   columns=['A', 'B', 'C', 'D'])

df = df.cumsum()

plt.figure(); df.plot(); plt.legend(loc='best')
pandas学习记录_第2张图片
best colour

你可能感兴趣的:(pandas学习记录)