十分钟搞定pandas
习惯上导入包的方式:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
一、 创建对象
import pandas as pd
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
import pandas as pd
dates = pd.date_range('20170101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
|
A |
B |
C |
D |
2017-01-01 |
-0.338806 |
-1.343308 |
1.196353 |
1.414468 |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
-0.154819 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
0.233515 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
-0.276262 |
import pandas as pd
df2 = pd.DataFrame({
'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1, index=range(4), dtype='float32'),
'D' : np.array([3]*4, dtype='int32'),
'E' : pd.Categorical(['test1', 'test2', 'test3', 'test4']),
'F' : 'foo'
})
df2
|
A |
B |
C |
D |
E |
F |
0 |
1.0 |
2013-01-02 |
1.0 |
3 |
test1 |
foo |
1 |
1.0 |
2013-01-02 |
1.0 |
3 |
test2 |
foo |
2 |
1.0 |
2013-01-02 |
1.0 |
3 |
test3 |
foo |
3 |
1.0 |
2013-01-02 |
1.0 |
3 |
test4 |
foo |
df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
二、 查看数据
查看DataFrame头部和尾部的行
df.head(1)
|
A |
B |
C |
D |
2017-01-01 |
-0.338806 |
-1.343308 |
1.196353 |
1.414468 |
df.tail(3)
|
A |
B |
C |
D |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
0.233515 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
-0.276262 |
显示索引、列和底层的numpy数据
# Show the DataFrame's index, columns and underlying numpy data.
# Uses Python 3 print() calls (the original used Python 2 print statements).
print("显示索引:")
print(df.index)
print("\n显示列:")
print(df.columns)
print("\n显示底层numpy数据:")
print(df.values)
运行结果如下:
显示索引:
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
'2017-01-05', '2017-01-06'],
dtype='datetime64[ns]', freq='D')
显示列:
Index([u'A', u'B', u'C', u'D'], dtype='object')
显示底层numpy数据:
[[-0.33880624 -1.34330788 1.19635337 1.41446846]
[-0.50682548 1.15251089 0.97292792 -0.15481877]
[ 1.4460934 0.08359888 0.06867169 -0.21777067]
[-0.3127707 -0.63562542 -2.00516672 0.23351479]
[ 0.95592407 -0.13103515 0.47338504 -0.82771723]
[ 0.45537138 -0.07967712 1.03087708 -0.27626179]]
数据的快速统计汇总describe函数
df.describe()
|
A |
B |
C |
D |
count |
6.000000 |
6.000000 |
6.000000 |
6.000000 |
mean |
0.283164 |
-0.158923 |
0.289508 |
0.028569 |
std |
0.800092 |
0.826528 |
1.198930 |
0.759366 |
min |
-0.506825 |
-1.343308 |
-2.005167 |
-0.827717 |
25% |
-0.332297 |
-0.509478 |
0.169850 |
-0.261639 |
50% |
0.071300 |
-0.105356 |
0.723156 |
-0.186295 |
75% |
0.830786 |
0.042780 |
1.016390 |
0.136431 |
max |
1.446093 |
1.152511 |
1.196353 |
1.414468 |
数据的转置
df.T
|
2017-01-01 00:00:00 |
2017-01-02 00:00:00 |
2017-01-03 00:00:00 |
2017-01-04 00:00:00 |
2017-01-05 00:00:00 |
2017-01-06 00:00:00 |
A |
-0.338806 |
-0.506825 |
1.446093 |
-0.312771 |
0.955924 |
0.455371 |
B |
-1.343308 |
1.152511 |
0.083599 |
-0.635625 |
-0.131035 |
-0.079677 |
C |
1.196353 |
0.972928 |
0.068672 |
-2.005167 |
0.473385 |
1.030877 |
D |
1.414468 |
-0.154819 |
-0.217771 |
0.233515 |
-0.827717 |
-0.276262 |
按轴进行排序
df.sort_index(axis=1, ascending=False)
|
D |
C |
B |
A |
2017-01-01 |
1.414468 |
1.196353 |
-1.343308 |
-0.338806 |
2017-01-02 |
-0.154819 |
0.972928 |
1.152511 |
-0.506825 |
2017-01-03 |
-0.217771 |
0.068672 |
0.083599 |
1.446093 |
2017-01-04 |
0.233515 |
-2.005167 |
-0.635625 |
-0.312771 |
2017-01-05 |
-0.827717 |
0.473385 |
-0.131035 |
0.955924 |
2017-01-06 |
-0.276262 |
1.030877 |
-0.079677 |
0.455371 |
按值进行排序
df.sort_values(by='A')
|
A |
B |
C |
D |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
-0.154819 |
2017-01-01 |
-0.338806 |
-1.343308 |
1.196353 |
1.414468 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
0.233515 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
-0.276262 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
三、 选择
标准的python/numpy的选择和设置表达式都能够直接派上用场, 但是作为工程中使用的代码,
推荐使用经过优化的pandas数据访问方式:.at, .iat, .loc和.iloc。(注:旧版的.ix已被弃用并在新版pandas中移除,不要再使用。)
获取
df['A']
df[0:3]
df['20170101':'20170104']
|
A |
B |
C |
D |
2017-01-01 |
-0.338806 |
-1.343308 |
1.196353 |
1.414468 |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
-0.154819 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
0.233515 |
通过标签选择
df.loc[dates[0]]
df.loc[:, ['A', 'B']]
df.loc['20170102':'20170104',['A','B']]
df.loc['20170102', ['A', 'B']]
df.loc[dates[0], 'A']
df.at[dates[0], 'A']
-0.338806236660884
通过位置进行选择
df.iloc[3]
A -0.312771 B -0.635625 C -2.005167 D 0.233515 Name: 2017-01-04 00:00:00, dtype: float64
df.iloc[3:5, 0:2]
|
A |
B |
2017-01-04 |
-0.312771 |
-0.635625 |
2017-01-05 |
0.955924 |
-0.131035 |
df.iloc[[1,2,4], [0,2]]
|
A |
C |
2017-01-02 |
-0.506825 |
0.972928 |
2017-01-03 |
1.446093 |
0.068672 |
2017-01-05 |
0.955924 |
0.473385 |
df.iloc[1:3, :]
|
A |
B |
C |
D |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
-0.154819 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
df.iloc[:, 1:3]
|
B |
C |
2017-01-01 |
-1.343308 |
1.196353 |
2017-01-02 |
1.152511 |
0.972928 |
2017-01-03 |
0.083599 |
0.068672 |
2017-01-04 |
-0.635625 |
-2.005167 |
2017-01-05 |
-0.131035 |
0.473385 |
2017-01-06 |
-0.079677 |
1.030877 |
df.iloc[1,1]
1.1525108922362974
df.iat[1,1]
1.1525108922362974
布尔索引
df[df.A > 0]
|
A |
B |
C |
D |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
-0.276262 |
df[df > 0]
|
A |
B |
C |
D |
2017-01-01 |
NaN |
NaN |
1.196353 |
1.414468 |
2017-01-02 |
NaN |
1.152511 |
0.972928 |
NaN |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
NaN |
2017-01-04 |
NaN |
NaN |
NaN |
0.233515 |
2017-01-05 |
0.955924 |
NaN |
0.473385 |
NaN |
2017-01-06 |
0.455371 |
NaN |
1.030877 |
NaN |
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2[df2['E'].isin(['two', 'four'])]
|
A |
B |
C |
D |
E |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
two |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
four |
设置
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20170102', periods=6))
df['F']=s1
df
|
A |
B |
C |
D |
F |
2017-01-01 |
-0.338806 |
-1.343308 |
1.196353 |
1.414468 |
NaN |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
-0.154819 |
1.0 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
2.0 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
0.233515 |
3.0 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
4.0 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
-0.276262 |
5.0 |
df.at[dates[0], 'A'] = 0
df
|
A |
B |
C |
D |
F |
2017-01-01 |
0.000000 |
-1.343308 |
1.196353 |
1.414468 |
NaN |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
-0.154819 |
1.0 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
2.0 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
0.233515 |
3.0 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
4.0 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
-0.276262 |
5.0 |
df.iat[0,1] = 0
df
|
A |
B |
C |
D |
F |
2017-01-01 |
0.000000 |
0.000000 |
1.196353 |
1.414468 |
NaN |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
-0.154819 |
1.0 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
-0.217771 |
2.0 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
0.233515 |
3.0 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
-0.827717 |
4.0 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
-0.276262 |
5.0 |
df.loc[:, 'D'] = np.array([5] * len(df))
df
|
A |
B |
C |
D |
F |
2017-01-01 |
0.000000 |
0.000000 |
1.196353 |
5 |
NaN |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
5 |
1.0 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
5 |
2.0 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
5 |
3.0 |
2017-01-05 |
0.955924 |
-0.131035 |
0.473385 |
5 |
4.0 |
2017-01-06 |
0.455371 |
-0.079677 |
1.030877 |
5 |
5.0 |
df2 = df.copy()
df2[df2<0] = -df2
df2
|
A |
B |
C |
D |
F |
2017-01-01 |
0.000000 |
0.000000 |
1.196353 |
5 |
NaN |
2017-01-02 |
0.506825 |
1.152511 |
0.972928 |
5 |
1.0 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
5 |
2.0 |
2017-01-04 |
0.312771 |
0.635625 |
2.005167 |
5 |
3.0 |
2017-01-05 |
0.955924 |
0.131035 |
0.473385 |
5 |
4.0 |
2017-01-06 |
0.455371 |
0.079677 |
1.030877 |
5 |
5.0 |
四、 缺失值处理
在pandas中,使用np.nan来代替缺失值,这些值默认不会包含在计算中。
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1], 'E'] = 1
df1
|
A |
B |
C |
D |
F |
E |
2017-01-01 |
0.000000 |
0.000000 |
1.196353 |
5 |
NaN |
1.0 |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
5 |
1.0 |
1.0 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
5 |
2.0 |
NaN |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
5 |
3.0 |
NaN |
df1.dropna(how='any')
|
A |
B |
C |
D |
F |
E |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
5 |
1.0 |
1.0 |
df1.fillna(value=3)
|
A |
B |
C |
D |
F |
E |
2017-01-01 |
0.000000 |
0.000000 |
1.196353 |
5 |
3.0 |
1.0 |
2017-01-02 |
-0.506825 |
1.152511 |
0.972928 |
5 |
1.0 |
1.0 |
2017-01-03 |
1.446093 |
0.083599 |
0.068672 |
5 |
2.0 |
3.0 |
2017-01-04 |
-0.312771 |
-0.635625 |
-2.005167 |
5 |
3.0 |
3.0 |
pd.isnull(df1)
|
A |
B |
C |
D |
F |
E |
2017-01-01 |
False |
False |
False |
False |
True |
False |
2017-01-02 |
False |
False |
False |
False |
False |
False |
2017-01-03 |
False |
False |
False |
False |
False |
True |
2017-01-04 |
False |
False |
False |
False |
False |
True |
五、 相关操作
统计
df.mean()
运行结果如下:
A 0.339632
B 0.064962
C 0.289508
D 5.000000
F 3.000000
dtype: float64
df.mean(1)
运行结果如下:
2017-01-01 1.549088
2017-01-02 1.523723
2017-01-03 1.719673
2017-01-04 1.009287
2017-01-05 2.059655
2017-01-06 2.281314
Freq: D, dtype: float64
# Alignment demo: shift(2) moves the values down two positions, leaving NaN
# in the first two slots; df.sub(s, axis='index') then subtracts s row-wise,
# aligning on the date index.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
print(s)  # Python 3 print() (the original used the Python 2 print statement)
df.sub(s, axis='index')
运行结果如下:
2017-01-01 NaN
2017-01-02 NaN
2017-01-03 1.0
2017-01-04 3.0
2017-01-05 5.0
2017-01-06 NaN
Freq: D, dtype: float64
|
A |
B |
C |
D |
F |
2017-01-01 |
NaN |
NaN |
NaN |
NaN |
NaN |
2017-01-02 |
NaN |
NaN |
NaN |
NaN |
NaN |
2017-01-03 |
0.446093 |
-0.916401 |
-0.931328 |
4.0 |
1.0 |
2017-01-04 |
-3.312771 |
-3.635625 |
-5.005167 |
2.0 |
0.0 |
2017-01-05 |
-4.044076 |
-5.131035 |
-4.526615 |
0.0 |
-1.0 |
2017-01-06 |
NaN |
NaN |
NaN |
NaN |
NaN |
应用Apply
df.apply(np.cumsum)
df.apply(lambda x: x.max()-x.min())
A 1.952919 B 1.788136 C 3.201520 D 0.000000 F 4.000000 dtype: float64
直方图
s = pd.Series(np.random.randint(0,7, size=10))
s.value_counts()
运行结果如下:
5 3
4 3
1 3
2 1
dtype: int64
字符串方法
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,
'CABA', 'dog', 'cat'])
s.str.lower()
运行结果如下:
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
六、 合并
pandas提供了大量方法对Series和DataFrame对象进行各种逻辑关系的合并操作(注:早期版本中的Panel对象已被弃用移除);
6-1、 Concat方法
df = pd.DataFrame(np.random.randn(10,4))
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)
|
0 |
1 |
2 |
3 |
0 |
-0.361157 |
-0.331600 |
0.426531 |
-1.063448 |
1 |
0.082863 |
1.293261 |
0.729009 |
1.119674 |
2 |
-0.077758 |
0.024595 |
2.142237 |
-0.857855 |
3 |
2.477844 |
0.118069 |
1.390542 |
-0.645197 |
4 |
-1.295224 |
-0.410728 |
1.256108 |
-1.402479 |
5 |
1.135568 |
1.054733 |
-1.793129 |
-0.620808 |
6 |
-1.036741 |
1.461721 |
0.167975 |
-0.645020 |
7 |
-0.709849 |
-0.527562 |
1.129764 |
-0.610033 |
8 |
0.443899 |
1.176945 |
-0.558200 |
-0.122540 |
9 |
0.244292 |
1.108330 |
0.417284 |
-0.455202 |
6-2、 join方法
left = pd.DataFrame({
'key': ['foo', 'foo'],
'lval': [1,2],
})
right = pd.DataFrame({
'key' : ['foo', 'foo'],
'rval' : [4, 5],
})
pd.merge(left, right, on='key')
|
key |
lval |
rval |
0 |
foo |
1 |
4 |
1 |
foo |
1 |
5 |
2 |
foo |
2 |
4 |
3 |
foo |
2 |
5 |
6-3、 append方法
# Build a fresh 8x4 DataFrame, take its 4th row, and append it as a new row.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
s = df.iloc[3]
# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat is the
# supported replacement and produces the same result (a 9-row frame).
pd.concat([df, s.to_frame().T], ignore_index=True)
df.T
|
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
A |
-0.152964 |
1.239888 |
-0.803823 |
-0.604354 |
0.343325 |
-3.325538 |
0.997950 |
1.694225 |
B |
-0.709887 |
0.319957 |
0.307502 |
0.583405 |
0.170432 |
0.157247 |
0.087903 |
3.263398 |
C |
-2.390029 |
1.291862 |
0.472967 |
-0.945514 |
-1.349493 |
0.145808 |
0.407573 |
0.458573 |
D |
-0.117396 |
2.008384 |
-1.184914 |
0.012678 |
1.286618 |
0.399334 |
1.305697 |
0.027644 |
七、 分组
group by操作:
- splitting: 按照规则将数据分为不同的组;
- Applying: 对于每组数据分别执行一个函数;
- Combining: 将结果组合到一个数据结构中;
df = pd.DataFrame({
'A':['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'bar'],
'B':['one','one', 'two', 'three',
'two','two', 'one', 'three'],
'C': np.random.randn(8),
'D':np.random.randn(8)
})
df
|
A |
B |
C |
D |
0 |
foo |
one |
0.602525 |
-0.729163 |
1 |
bar |
one |
-0.651076 |
0.104687 |
2 |
foo |
two |
2.289475 |
0.279368 |
3 |
bar |
three |
0.348643 |
0.647659 |
4 |
foo |
two |
-0.806890 |
0.035457 |
5 |
bar |
two |
0.452640 |
1.805835 |
6 |
foo |
one |
-1.363260 |
-0.247901 |
7 |
bar |
three |
-0.680933 |
0.367421 |
df.groupby('A').sum()
|
C |
D |
A |
|
|
bar |
-0.530726 |
2.925602 |
foo |
0.721851 |
-0.662239 |
df.groupby(['A', 'B']).sum()
|
|
C |
D |
A |
B |
|
|
bar |
one |
-0.651076 |
0.104687 |
three |
-0.332290 |
1.015080 |
two |
0.452640 |
1.805835 |
foo |
one |
-0.760735 |
-0.977064 |
two |
1.482585 |
0.314825 |
八、 改变形状
Stack
myt = [
['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
]
tuples = list(zip(*myt))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8,2), index=index, columns=['A', 'B'])
df2 = df[:4]
stacked = df2.stack()
stacked
运行结果如下:
first second
bar one A 0.031757
B 0.426813
two A 0.276267
B 0.560446
baz one A 0.552757
B 1.805644
two A -1.092327
B -0.539023
dtype: float64
stacked.unstack()
|
|
A |
B |
first |
second |
|
|
bar |
one |
0.031757 |
0.426813 |
two |
0.276267 |
0.560446 |
baz |
one |
0.552757 |
1.805644 |
two |
-1.092327 |
-0.539023 |
stacked.unstack(1)
|
second |
one |
two |
first |
|
|
|
bar |
A |
0.031757 |
0.276267 |
B |
0.426813 |
0.560446 |
baz |
A |
0.552757 |
-1.092327 |
B |
1.805644 |
-0.539023 |
stacked.unstack(0)
|
first |
bar |
baz |
second |
|
|
|
one |
A |
0.031757 |
0.552757 |
B |
0.426813 |
1.805644 |
two |
A |
0.276267 |
-1.092327 |
B |
0.560446 |
-0.539023 |
数据透视表
# Sample frame for the pivot-table demo: 12 rows with two key columns (A, B),
# one column to pivot on (C) and two numeric value columns (D, E).
df = pd.DataFrame({
    'A' : ['one', 'one', 'two', 'three']*3,
    'B' : ['A', 'B', 'C']*4,
    'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D' : np.random.randn(12),
    'E' : np.random.randn(12)
})
print(df)  # Python 3 print() (the original used the Python 2 print statement)
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
运行结果如下:
A B C D E
0 one A foo -0.382358 0.975949
1 one B foo -1.588232 1.168863
2 two C foo 1.286834 0.836279
3 three A bar 1.375826 -1.049638
4 one B bar 0.179641 0.714552
5 one C bar -0.912443 0.474539
6 two A foo -0.010076 -0.750666
7 three B foo -0.766371 0.048745
8 one C foo -1.100770 0.879633
9 one A bar 2.462341 1.761222
10 two B bar 1.239756 0.514073
11 three C bar -0.480783 -0.268171
|
C |
bar |
foo |
A |
B |
|
|
one |
A |
2.462341 |
-0.382358 |
B |
0.179641 |
-1.588232 |
C |
-0.912443 |
-1.100770 |
three |
A |
1.375826 |
NaN |
B |
NaN |
-0.766371 |
C |
-0.480783 |
NaN |
two |
A |
NaN |
-0.010076 |
B |
1.239756 |
NaN |
C |
NaN |
1.286834 |
九、 时间序列
pandas在对频率转换进行重新采样时拥有简单,强大且高效的功能(如将按秒采样的数据转换为按5分钟为单位进行采样的数据),这种操作在金融领域非常常见.
rng = pd.date_range('1/1/2017', periods=100, freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5Min').sum()
运行结果如下:
2017-01-01 24929
Freq: 5T, dtype: int64
9-1、 时区表示
rng = pd.date_range('3/6/2017 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
ts_utc = ts.tz_localize('UTC')
ts_utc
运行结果如下:
2017-03-06 00:00:00+00:00 -0.406935
2017-03-07 00:00:00+00:00 -2.059444
2017-03-08 00:00:00+00:00 0.375662
2017-03-09 00:00:00+00:00 1.167948
2017-03-10 00:00:00+00:00 0.927164
Freq: D, dtype: float64
9-2、 时区转换
ts_utc.tz_convert('Asia/Shanghai')
运行结果如下:
2017-03-06 08:00:00+08:00 -0.406935
2017-03-07 08:00:00+08:00 -2.059444
2017-03-08 08:00:00+08:00 0.375662
2017-03-09 08:00:00+08:00 1.167948
2017-03-10 08:00:00+08:00 0.927164
Freq: D, dtype: float64
9-3、 时间跨度转换
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ps = ts.to_period()
ps.to_timestamp()
运行结果如下:
2012-01-01 -0.213930
2012-02-01 0.974707
2012-03-01 -1.206153
2012-04-01 -0.935241
2012-05-01 0.669691
Freq: MS, dtype: float64
9-4、 时间和时间戳的转换
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 'S') + 9
ts.head()
运行结果如下:
1990-03-01 09:00 0.388727
1990-06-01 09:00 0.389497
1990-09-01 09:00 0.563586
1990-12-01 09:00 1.044088
1991-03-01 09:00 -1.392586
Freq: H, dtype: float64
十、 Categorical
pandas可以在DataFrame中支持Categorical类型的数据。
df = pd.DataFrame({
"id" : range(1,7),
"raw_grade" : ['a', 'b', 'b', 'a', 'a', 'e']
})
10-1、转换数据类型
df["grade"] = df["raw_grade"].astype('category')
df['grade']
运行结果如下:
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
10-2、 对Categroical类型数据重命名
df["grade"].cat.categories = ['very good', 'good', 'very bad']
df
|
id |
raw_grade |
grade |
0 |
1 |
a |
very good |
1 |
2 |
b |
good |
2 |
3 |
b |
good |
3 |
4 |
a |
very good |
4 |
5 |
a |
very good |
5 |
6 |
e |
very bad |
10-3、 对类别排序
df["grade"] = df['grade'].cat.set_categories(['very bad', 'bad', 'medium', 'good', 'very good'])
df['grade']
运行结果如下:
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
10-4、 按照Categorical的顺序排序
df.sort_values(by='grade')
|   | id | raw_grade | grade |
| 5 | 6 | e | very bad |
| 1 | 2 | b | good |
| 2 | 3 | b | good |
| 0 | 1 | a | very good |
| 3 | 4 | a | very good |
| 4 | 5 | a | very good |
10-5、 对Categorical列进行统计排序
df.groupby('grade').size()
运行结果如下:
grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
十一、 画图
# Plot a random walk: cumsum() returns a NEW Series, so the result must be
# assigned back — the original discarded it and would have plotted raw noise.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc='best')
十二、 导入和保存数据
12-1、 CSV数据
df.to_csv('foo.csv')
pd.read_csv('foo.csv')
12-2、 HDF5存储
df.to_hdf('foo.h5', 'df')
pd.read_hdf('foo.h5', 'df')
12-3、 Excel存储
df.to_excel('foo.xlsx', sheet_name='foo1')
pd.read_excel('foo.xlsx', 'foo1', index_col=None, na_values=['NA'])