时间序列

#时间序列
#日期和时间数据类型及工具
from datetime import datetime
import pandas as pd
import numpy as np
now = datetime.now()
now
datetime.datetime(2018, 11, 7, 15, 6, 50, 327155)
now.year,now.month,now.day
(2018, 11, 7)
delta = datetime(2011,1,7) - datetime(2008,6,24,8,15)
delta#时间差
datetime.timedelta(days=926, seconds=56700)
delta.days
926
delta.seconds
56700
from datetime import timedelta#时间加减
start = datetime(2011,1,7)
start + timedelta(12)
datetime.datetime(2011, 1, 19, 0, 0)
start - 2*timedelta(12)
datetime.datetime(2010, 12, 14, 0, 0)
#字符串和datetime的相互转换
stamp = datetime(2011,1,3)
str(stamp)
'2011-01-03 00:00:00'
stamp.strftime('%Y-%m-%d,%H')
'2011-01-03,00'
value ='2011-01-03'
datetime.strptime(value,'%Y-%m-%d')
datetime.datetime(2011, 1, 3, 0, 0)
datestrs = ['07/06/2011','08/06/2011']
[datetime.strptime(x,'%m/%d/%Y')for x in datestrs]
[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]
from dateutil.parser import parse
parse('2011-01-03')
datetime.datetime(2011, 1, 3, 0, 0)
parse('Jan 31, 1997 10:45 PM')
datetime.datetime(1997, 1, 31, 22, 45)
parse('6/12/2011',dayfirst=True)
datetime.datetime(2011, 12, 6, 0, 0)
datestrs = ['2011-01-06 12:00:00','2011-01-01 00:00:00']
pd.to_datetime(datestrs)
DatetimeIndex(['2011-01-06 12:00:00', '2011-01-01 00:00:00'], dtype='datetime64[ns]', freq=None)
idx = pd.to_datetime(datestrs+[None])
idx
DatetimeIndex(['2011-01-06 12:00:00', '2011-01-01 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)
idx[2]
NaT
pd.isnull(idx)
array([False, False,  True])
#时间序列基础
from datetime import datetime
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
             datetime(2011, 1, 7), datetime(2011, 1, 8),
             datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(6),index=dates)
ts
2011-01-02   -0.889247
2011-01-05    0.472529
2011-01-07    1.458923
2011-01-08    0.125811
2011-01-10    0.878417
2011-01-12   -1.347855
dtype: float64
ts.index
DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)
ts[::2]
2011-01-02   -0.889247
2011-01-07    1.458923
2011-01-10    0.878417
dtype: float64
ts + ts[::2]
2011-01-02   -1.778493
2011-01-05         NaN
2011-01-07    2.917845
2011-01-08         NaN
2011-01-10    1.756833
2011-01-12         NaN
dtype: float64
ts.index.dtype
dtype('<M8[ns]')
stamp = ts.index[0]
stamp
Timestamp('2011-01-02 00:00:00')
#索引,选取,子集构造
stamp = ts.index[2]
ts[stamp]
1.4589225544193962
ts['1/10/2011']
0.8784166722932014
ts['20110110']
0.8784166722932014
longer_ts = pd.Series(np.random.randn(1000),
                     index = pd.date_range('1/1/2000',periods=1000))
longer_ts
2000-01-01   -1.016306
2000-01-02   -1.619848
2000-01-03   -1.030577
2000-01-04    0.782235
2000-01-05    0.380402
2000-01-06    1.867834
2000-01-07   -0.966920
2000-01-08    1.116943
2000-01-09    0.286488
2000-01-10    0.609803
2000-01-11    1.426519
2000-01-12    0.424936
2000-01-13   -1.196588
2000-01-14    0.906847
2000-01-15    0.266331
2000-01-16   -0.704134
2000-01-17    0.645805
2000-01-18    0.724744
2000-01-19    1.271031
2000-01-20   -0.622831
2000-01-21   -0.378951
2000-01-22   -0.978048
2000-01-23   -0.204337
2000-01-24    0.069203
2000-01-25    0.245822
2000-01-26    1.696828
2000-01-27   -0.754646
2000-01-28   -0.728572
2000-01-29    0.076263
2000-01-30    2.723447
                ...   
2002-08-28    1.168994
2002-08-29    1.212782
2002-08-30   -0.568776
2002-08-31   -1.392654
2002-09-01   -0.353161
2002-09-02   -0.779683
2002-09-03   -0.740422
2002-09-04    0.582901
2002-09-05    0.543077
2002-09-06   -1.046154
2002-09-07    0.031932
2002-09-08   -1.062249
2002-09-09    0.911971
2002-09-10    0.312633
2002-09-11   -0.487974
2002-09-12   -0.681178
2002-09-13    0.606661
2002-09-14   -0.246001
2002-09-15   -0.375383
2002-09-16    1.651613
2002-09-17   -0.233456
2002-09-18   -1.442711
2002-09-19    0.620685
2002-09-20    0.731800
2002-09-21   -0.151031
2002-09-22    0.752516
2002-09-23   -0.788909
2002-09-24   -0.388959
2002-09-25   -1.466894
2002-09-26   -0.429725
Freq: D, Length: 1000, dtype: float64
longer_ts['2001']
2001-01-01   -0.515375
2001-01-02   -0.438227
2001-01-03    0.678962
2001-01-04    2.457332
2001-01-05    0.007770
2001-01-06    1.532512
2001-01-07    0.237520
2001-01-08    1.425760
2001-01-09    0.684237
2001-01-10   -0.340322
2001-01-11    0.232236
2001-01-12    1.615747
2001-01-13    0.365120
2001-01-14    0.324808
2001-01-15    0.015621
2001-01-16   -0.177306
2001-01-17    0.561159
2001-01-18    0.516523
2001-01-19   -1.058772
2001-01-20   -0.136410
2001-01-21   -0.016760
2001-01-22   -0.882206
2001-01-23   -0.976625
2001-01-24   -0.189113
2001-01-25   -0.984928
2001-01-26    1.502973
2001-01-27    0.058032
2001-01-28   -0.543902
2001-01-29    0.090189
2001-01-30   -0.505144
                ...   
2001-12-02   -0.553024
2001-12-03   -0.405266
2001-12-04   -0.567354
2001-12-05    0.094730
2001-12-06   -0.548641
2001-12-07   -0.592105
2001-12-08    1.558771
2001-12-09   -1.049105
2001-12-10    2.094203
2001-12-11    0.067828
2001-12-12    0.094673
2001-12-13   -0.883690
2001-12-14    0.216863
2001-12-15   -0.011448
2001-12-16   -0.276283
2001-12-17    2.146709
2001-12-18    0.123471
2001-12-19    1.448596
2001-12-20   -0.990181
2001-12-21   -0.723119
2001-12-22    0.506099
2001-12-23   -1.410846
2001-12-24    0.077442
2001-12-25   -0.586892
2001-12-26    0.302183
2001-12-27    0.821904
2001-12-28   -0.669978
2001-12-29   -0.238159
2001-12-30    0.177509
2001-12-31   -1.527928
Freq: D, Length: 365, dtype: float64
longer_ts['2001-05']
2001-05-01   -0.434947
2001-05-02   -1.013492
2001-05-03   -1.370608
2001-05-04   -0.278787
2001-05-05   -0.527465
2001-05-06   -0.794392
2001-05-07   -0.688821
2001-05-08   -0.174579
2001-05-09    0.327301
2001-05-10    0.231338
2001-05-11   -1.600751
2001-05-12   -1.305738
2001-05-13    0.198962
2001-05-14    1.608539
2001-05-15   -1.017836
2001-05-16    1.837142
2001-05-17   -0.213202
2001-05-18   -0.372286
2001-05-19    0.139703
2001-05-20    1.092866
2001-05-21    0.089208
2001-05-22    0.404983
2001-05-23    2.775343
2001-05-24    0.077524
2001-05-25   -0.143234
2001-05-26   -0.559451
2001-05-27    0.753692
2001-05-28    0.373406
2001-05-29    0.566619
2001-05-30    0.539838
2001-05-31    0.044411
Freq: D, dtype: float64
ts[datetime(2011,1,7):]
2011-01-07    1.458923
2011-01-08    0.125811
2011-01-10    0.878417
2011-01-12   -1.347855
dtype: float64
ts['1/6/2011':'1/11/2011']
2011-01-07    1.458923
2011-01-08    0.125811
2011-01-10    0.878417
dtype: float64
ts.truncate(after='1/9/2011')
2011-01-02   -0.889247
2011-01-05    0.472529
2011-01-07    1.458923
2011-01-08    0.125811
dtype: float64
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = pd.DataFrame(np.random.randn(100, 4),
                         index=dates,
                          columns=['Colorado', 'Texas',
                                    'New York', 'Ohio'])
long_df.loc['5-2001']


Colorado Texas New York Ohio
2001-05-02 0.534672 0.471547 0.102840 0.307940
2001-05-09 -1.078103 0.024223 -1.152705 0.382906
2001-05-16 -1.149617 -0.889391 -2.160858 0.730430
2001-05-23 0.970990 -0.440826 -0.329939 -1.671497
2001-05-30 0.122803 0.703133 -0.176191 -0.722155
#带重复索引的时间序列
dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',
                              '1/2/2000', '1/3/2000'])
dup_ts = pd.Series(np.arange(5),index=dates)
dup_ts
2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32
dup_ts['1/3/2000']
4
dup_ts['1/2/2000']
2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32
grouped = dup_ts.groupby(level=0)
grouped.mean()
2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32
grouped.count()
2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64
#日期范围,频率以及移动
ts
2011-01-02   -0.889247
2011-01-05    0.472529
2011-01-07    1.458923
2011-01-08    0.125811
2011-01-10    0.878417
2011-01-12   -1.347855
dtype: float64
resample = ts.resample('D')#D:每天
DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]
#生成日期范围
index = pd.date_range('2012-04-01','2012-06-01')
index
DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
               '2012-05-27', '2012-05-28', '2012-05-29', '2012-05-30',
               '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')
pd.date_range(start='2012-04-01',periods=20)
DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')
pd.date_range(end='2012-06-01',periods=20)
DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')
pd.date_range('2000-01-01','2000-12-01',freq='BM')
DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')
pd.date_range('2012-05-20 12:56:31',periods=5)
DatetimeIndex(['2012-05-20 12:56:31', '2012-05-21 12:56:31',
               '2012-05-22 12:56:31', '2012-05-23 12:56:31',
               '2012-05-24 12:56:31'],
              dtype='datetime64[ns]', freq='D')
pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)
DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')
#频率和日期偏移量
from pandas.tseries.offsets import Hour,Minute
hour =Hour()
hour
<Hour>
four_hours = Hour(4)
four_hours
<4 * Hours>
pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4h')
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')
Hour(2) + Minute(30)
<150 * Minutes>
pd.date_range('2000-01-01', periods=10, freq='1h30min')
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')
#WOM日期
rng = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-3FRI')#每月第三个星期五
list(rng)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]
#移动数据
ts = pd.Series(np.random.randn(4),
                   index=pd.date_range('1/1/2000', periods=4, freq='M'))
ts
2000-01-31    0.177721
2000-02-29   -0.202120
2000-03-31   -0.701722
2000-04-30   -0.938393
Freq: M, dtype: float64
ts.shift(2)
2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.177721
2000-04-30   -0.202120
Freq: M, dtype: float64
#移动时间戳
ts.shift(2, freq='M')
2000-03-31    0.177721
2000-04-30   -0.202120
2000-05-31   -0.701722
2000-06-30   -0.938393
Freq: M, dtype: float64
#通过偏移量对日期进行位移
from pandas.tseries.offsets import Day, MonthEnd
now = datetime(2011, 11, 17)
now + 3 * Day()
Timestamp('2011-11-20 00:00:00')
now + MonthEnd()
Timestamp('2011-11-30 00:00:00')
offset = MonthEnd()
offset.rollforward(now)
Timestamp('2011-11-30 00:00:00')
offset.rollback(now)
Timestamp('2011-10-31 00:00:00')
ts = pd.Series(np.random.randn(20),
                index=pd.date_range('1/15/2000', periods=20, freq='4d'))
ts
2000-01-15    2.441955
2000-01-19    0.062600
2000-01-23   -0.300315
2000-01-27    1.181993
2000-01-31    0.572116
2000-02-04    1.209064
2000-02-08   -0.996346
2000-02-12    0.890940
2000-02-16    0.194561
2000-02-20    0.233563
2000-02-24    0.380644
2000-02-28    0.117173
2000-03-03   -0.088754
2000-03-07   -0.206632
2000-03-11   -1.792495
2000-03-15   -0.443086
2000-03-19   -0.229585
2000-03-23   -1.429532
2000-03-27   -1.059541
2000-03-31   -1.274479
Freq: 4D, dtype: float64
ts.groupby(offset.rollforward).mean()
2000-01-31    0.791670
2000-02-29    0.289943
2000-03-31   -0.815513
dtype: float64
#时区处理
import pytz
pytz.common_timezones[-5:]
['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']
#时区本地化和转换
rng = pd.date_range('3/9/2012 9:30',periods=6,freq='D')
ts = pd.Series(np.random.randn(len(rng)),index=rng)
ts
2012-03-09 09:30:00   -0.351911
2012-03-10 09:30:00    0.693718
2012-03-11 09:30:00    0.634782
2012-03-12 09:30:00    0.025861
2012-03-13 09:30:00   -0.142429
2012-03-14 09:30:00   -0.464828
Freq: D, dtype: float64
print(ts.index.tz)
None
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')
DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-03-09 09:30:00+00:00   -0.351911
2012-03-10 09:30:00+00:00    0.693718
2012-03-11 09:30:00+00:00    0.634782
2012-03-12 09:30:00+00:00    0.025861
2012-03-13 09:30:00+00:00   -0.142429
2012-03-14 09:30:00+00:00   -0.464828
Freq: D, dtype: float64
ts_utc.tz_convert('America/New_York')
2012-03-09 04:30:00-05:00   -0.351911
2012-03-10 04:30:00-05:00    0.693718
2012-03-11 05:30:00-04:00    0.634782
2012-03-12 05:30:00-04:00    0.025861
2012-03-13 05:30:00-04:00   -0.142429
2012-03-14 05:30:00-04:00   -0.464828
Freq: D, dtype: float64
#操作时区意识型Timestamp对象
stamp = pd.Timestamp('2011-03-12 04:00')
stamp_utc = stamp.tz_localize('utc')
stamp_utc.tz_convert('America/New_York')
Timestamp('2011-03-11 23:00:00-0500', tz='America/New_York')
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')
stamp_moscow
Timestamp('2011-03-12 04:00:00+0300', tz='Europe/Moscow')
stamp_utc.value
1299902400000000000
#不同时区之间的运算
rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
2012-03-07 09:30:00   -0.454626
2012-03-08 09:30:00   -0.102485
2012-03-09 09:30:00    0.309211
2012-03-12 09:30:00    0.249732
2012-03-13 09:30:00   -0.254658
2012-03-14 09:30:00   -0.503452
2012-03-15 09:30:00    1.722681
2012-03-16 09:30:00    0.090046
2012-03-19 09:30:00    1.414297
2012-03-20 09:30:00    0.126887
Freq: B, dtype: float64
ts1 = ts[:7].tz_localize('Europe/London')
ts2 = ts1[2:].tz_convert('Europe/Moscow')
result = ts1 + ts2 
result.index
DatetimeIndex(['2012-03-07 09:30:00+00:00', '2012-03-08 09:30:00+00:00',
               '2012-03-09 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='B')
#时期及其算术运算
p = pd.Period(2007,freq='A-DEC')
p
Period('2007', 'A-DEC')
p + 5
Period('2012', 'A-DEC')
p - 2
Period('2005', 'A-DEC')
pd.Period('2014',freq='A-DEC') - p
7
rng = pd.period_range('2000-01-01','2000-06-30',freq='M')
rng
PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')
pd.Series(np.random.randn(6),index=rng)
2000-01    0.143415
2000-02   -0.355323
2000-03   -1.136658
2000-04    0.533440
2000-05    0.504703
2000-06   -0.723782
Freq: M, dtype: float64
values = ['2001Q3','2002Q2','2003Q1']
index = pd.PeriodIndex(values,freq='Q-DEC')
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')
#时期的频率转换
p = pd.Period('2007',freq='A-DEC')
p
Period('2007', 'A-DEC')
p.asfreq('M',how='start')
Period('2007-01', 'M')
p.asfreq('M',how='end')
Period('2007-12', 'M')
#按季度计算的时期频率
p = pd.Period('2012Q4',freq='Q-JAN')
p
Period('2012Q4', 'Q-JAN')
#将Timestamp转化为Period
rng = pd.date_range('2000-01-01',periods=3,freq='M')
ts = pd.Series(np.random.randn(3),index=rng)
ts
2000-01-31   -0.510447
2000-02-29   -0.624095
2000-03-31   -1.085284
Freq: M, dtype: float64
pts=ts.to_period()
pts
2000-01   -0.510447
2000-02   -0.624095
2000-03   -1.085284
Freq: M, dtype: float64
rng = pd.date_range('1/29/2000',periods=6,freq='D')
ts2 = pd.Series(np.random.randn(6),index=rng)
ts2

2000-01-29   -0.869026
2000-01-30    0.206545
2000-01-31   -0.147305
2000-02-01   -0.151367
2000-02-02    0.950917
2000-02-03    1.198447
Freq: D, dtype: float64
ts2.to_period('M')
2000-01   -0.869026
2000-01    0.206545
2000-01   -0.147305
2000-02   -0.151367
2000-02    0.950917
2000-02    1.198447
Freq: M, dtype: float64
pts = ts2.to_period()
pts
2000-01-29   -0.869026
2000-01-30    0.206545
2000-01-31   -0.147305
2000-02-01   -0.151367
2000-02-02    0.950917
2000-02-03    1.198447
Freq: D, dtype: float64
pts.to_timestamp(how='end')
2000-01-29   -0.869026
2000-01-30    0.206545
2000-01-31   -0.147305
2000-02-01   -0.151367
2000-02-02    0.950917
2000-02-03    1.198447
Freq: D, dtype: float64
#通过数组创建PeriodIndex
data = pd.read_csv('examples/macrodata.csv')
data.head(5)
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint
0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00
1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74
2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09
3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06
4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19
data.year
0      1959.0
1      1959.0
2      1959.0
3      1959.0
4      1960.0
5      1960.0
6      1960.0
7      1960.0
8      1961.0
9      1961.0
10     1961.0
11     1961.0
12     1962.0
13     1962.0
14     1962.0
15     1962.0
16     1963.0
17     1963.0
18     1963.0
19     1963.0
20     1964.0
21     1964.0
22     1964.0
23     1964.0
24     1965.0
25     1965.0
26     1965.0
27     1965.0
28     1966.0
29     1966.0
        ...  
173    2002.0
174    2002.0
175    2002.0
176    2003.0
177    2003.0
178    2003.0
179    2003.0
180    2004.0
181    2004.0
182    2004.0
183    2004.0
184    2005.0
185    2005.0
186    2005.0
187    2005.0
188    2006.0
189    2006.0
190    2006.0
191    2006.0
192    2007.0
193    2007.0
194    2007.0
195    2007.0
196    2008.0
197    2008.0
198    2008.0
199    2008.0
200    2009.0
201    2009.0
202    2009.0
Name: year, Length: 203, dtype: float64
data.quarter
0      1.0
1      2.0
2      3.0
3      4.0
4      1.0
5      2.0
6      3.0
7      4.0
8      1.0
9      2.0
10     3.0
11     4.0
12     1.0
13     2.0
14     3.0
15     4.0
16     1.0
17     2.0
18     3.0
19     4.0
20     1.0
21     2.0
22     3.0
23     4.0
24     1.0
25     2.0
26     3.0
27     4.0
28     1.0
29     2.0
      ... 
173    2.0
174    3.0
175    4.0
176    1.0
177    2.0
178    3.0
179    4.0
180    1.0
181    2.0
182    3.0
183    4.0
184    1.0
185    2.0
186    3.0
187    4.0
188    1.0
189    2.0
190    3.0
191    4.0
192    1.0
193    2.0
194    3.0
195    4.0
196    1.0
197    2.0
198    3.0
199    4.0
200    1.0
201    2.0
202    3.0
Name: quarter, Length: 203, dtype: float64
index = pd.PeriodIndex(year=data.year,quarter=data.quarter,freq='Q-DEC')
index
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203, freq='Q-DEC')
data.index = index
data
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint
1959Q1 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.980 139.7 2.82 5.8 177.146 0.00 0.00
1959Q2 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.150 141.7 3.08 5.1 177.830 2.34 0.74
1959Q3 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.350 140.5 3.82 5.3 178.657 2.74 1.09
1959Q4 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.370 140.0 4.33 5.6 179.386 0.27 4.06
1960Q1 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.540 139.6 3.50 5.2 180.007 2.31 1.19
1960Q2 1960.0 2.0 2834.390 1792.9 298.152 460.400 1966.1 29.550 140.2 2.68 5.2 180.671 0.14 2.55
1960Q3 1960.0 3.0 2839.022 1785.8 296.375 474.676 1967.8 29.750 140.9 2.36 5.6 181.528 2.70 -0.34
1960Q4 1960.0 4.0 2802.616 1788.2 259.764 476.434 1966.6 29.840 141.1 2.29 6.3 182.287 1.21 1.08
1961Q1 1961.0 1.0 2819.264 1787.7 266.405 475.854 1984.5 29.810 142.1 2.37 6.8 182.992 -0.40 2.77
1961Q2 1961.0 2.0 2872.005 1814.3 286.246 480.328 2014.4 29.920 142.9 2.29 7.0 183.691 1.47 0.81
1961Q3 1961.0 3.0 2918.419 1823.1 310.227 493.828 2041.9 29.980 144.1 2.32 6.8 184.524 0.80 1.52
1961Q4 1961.0 4.0 2977.830 1859.6 315.463 502.521 2082.0 30.040 145.2 2.60 6.2 185.242 0.80 1.80
1962Q1 1962.0 1.0 3031.241 1879.4 334.271 520.960 2101.7 30.210 146.4 2.73 5.6 185.874 2.26 0.47
1962Q2 1962.0 2.0 3064.709 1902.5 331.039 523.066 2125.2 30.220 146.5 2.78 5.5 186.538 0.13 2.65
1962Q3 1962.0 3.0 3093.047 1917.9 336.962 538.838 2137.0 30.380 146.7 2.78 5.6 187.323 2.11 0.67
1962Q4 1962.0 4.0 3100.563 1945.1 325.650 535.912 2154.6 30.440 148.3 2.87 5.5 188.013 0.79 2.08
1963Q1 1963.0 1.0 3141.087 1958.2 343.721 522.917 2172.5 30.480 149.7 2.90 5.8 188.580 0.53 2.38
1963Q2 1963.0 2.0 3180.447 1976.9 348.730 518.108 2193.1 30.690 151.3 3.03 5.7 189.242 2.75 0.29
1963Q3 1963.0 3.0 3240.332 2003.8 360.102 546.893 2217.9 30.750 152.6 3.38 5.5 190.028 0.78 2.60
1963Q4 1963.0 4.0 3264.967 2020.6 364.534 532.383 2254.6 30.940 153.7 3.52 5.6 190.668 2.46 1.06
1964Q1 1964.0 1.0 3338.246 2060.5 379.523 529.686 2299.6 30.950 154.8 3.51 5.5 191.245 0.13 3.38
1964Q2 1964.0 2.0 3376.587 2096.7 377.778 526.175 2362.1 31.020 156.8 3.47 5.2 191.889 0.90 2.57
1964Q3 1964.0 3.0 3422.469 2135.2 386.754 522.008 2392.7 31.120 159.2 3.53 5.0 192.631 1.29 2.25
1964Q4 1964.0 4.0 3431.957 2141.2 389.910 514.603 2420.4 31.280 160.7 3.76 5.0 193.223 2.05 1.71
1965Q1 1965.0 1.0 3516.251 2188.8 429.145 508.006 2447.4 31.380 162.0 3.93 4.9 193.709 1.28 2.65
1965Q2 1965.0 2.0 3563.960 2213.0 429.119 508.931 2474.5 31.580 163.1 3.84 4.7 194.303 2.54 1.30
1965Q3 1965.0 3.0 3636.285 2251.0 444.444 529.446 2542.6 31.650 166.0 3.93 4.4 194.997 0.89 3.04
1965Q4 1965.0 4.0 3724.014 2314.3 446.493 544.121 2594.1 31.880 169.1 4.35 4.1 195.539 2.90 1.46
1966Q1 1966.0 1.0 3815.423 2348.5 484.244 556.593 2618.4 32.280 171.8 4.62 3.9 195.999 4.99 -0.37
1966Q2 1966.0 2.0 3828.124 2354.5 475.408 571.371 2624.7 32.450 170.3 4.65 3.8 196.560 2.10 2.55
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2002Q2 2002.0 2.0 11538.770 7997.8 1810.779 774.408 8658.9 180.000 1199.5 1.70 5.8 288.028 1.56 0.14
2002Q3 2002.0 3.0 11596.430 8052.0 1814.531 786.673 8629.2 181.200 1204.0 1.61 5.7 288.783 2.66 -1.05
2002Q4 2002.0 4.0 11598.824 8080.6 1813.219 799.967 8649.6 182.600 1226.8 1.20 5.8 289.421 3.08 -1.88
2003Q1 2003.0 1.0 11645.819 8122.3 1813.141 800.196 8681.3 183.200 1248.4 1.14 5.9 290.019 1.31 -0.17
2003Q2 2003.0 2.0 11738.706 8197.8 1823.698 838.775 8812.5 183.700 1287.9 0.96 6.2 290.704 1.09 -0.13
2003Q3 2003.0 3.0 11935.461 8312.1 1889.883 839.598 8935.4 184.900 1297.3 0.94 6.1 291.449 2.60 -1.67
2003Q4 2003.0 4.0 12042.817 8358.0 1959.783 845.722 8986.4 186.300 1306.1 0.90 5.8 292.057 3.02 -2.11
2004Q1 2004.0 1.0 12127.623 8437.6 1970.015 856.570 9025.9 187.400 1332.1 0.94 5.7 292.635 2.35 -1.42
2004Q2 2004.0 2.0 12213.818 8483.2 2055.580 861.440 9115.0 189.100 1340.5 1.21 5.6 293.310 3.61 -2.41
2004Q3 2004.0 3.0 12303.533 8555.8 2082.231 876.385 9175.9 190.800 1361.0 1.63 5.4 294.066 3.58 -1.95
2004Q4 2004.0 4.0 12410.282 8654.2 2125.152 865.596 9303.4 191.800 1366.6 2.20 5.4 294.741 2.09 0.11
2005Q1 2005.0 1.0 12534.113 8719.0 2170.299 869.204 9189.6 193.800 1357.8 2.69 5.3 295.308 4.15 -1.46
2005Q2 2005.0 2.0 12587.535 8802.9 2131.468 870.044 9253.0 194.700 1366.6 3.01 5.1 295.994 1.85 1.16
2005Q3 2005.0 3.0 12683.153 8865.6 2154.949 890.394 9308.0 199.200 1375.0 3.52 5.0 296.770 9.14 -5.62
2005Q4 2005.0 4.0 12748.699 8888.5 2232.193 875.557 9358.7 199.400 1380.6 4.00 4.9 297.435 0.40 3.60
2006Q1 2006.0 1.0 12915.938 8986.6 2264.721 900.511 9533.8 200.700 1380.5 4.51 4.7 298.061 2.60 1.91
2006Q2 2006.0 2.0 12962.462 9035.0 2261.247 892.839 9617.3 202.700 1369.2 4.82 4.7 298.766 3.97 0.85
2006Q3 2006.0 3.0 12965.916 9090.7 2229.636 892.002 9662.5 201.900 1369.4 4.90 4.7 299.593 -1.58 6.48
2006Q4 2006.0 4.0 13060.679 9181.6 2165.966 894.404 9788.8 203.574 1373.6 4.92 4.4 300.320 3.30 1.62
2007Q1 2007.0 1.0 13099.901 9265.1 2132.609 882.766 9830.2 205.920 1379.7 4.95 4.5 300.977 4.58 0.36
2007Q2 2007.0 2.0 13203.977 9291.5 2162.214 898.713 9842.7 207.338 1370.0 4.72 4.5 301.714 2.75 1.97
2007Q3 2007.0 3.0 13321.109 9335.6 2166.491 918.983 9883.9 209.133 1379.2 4.00 4.7 302.509 3.45 0.55
2007Q4 2007.0 4.0 13391.249 9363.6 2123.426 925.110 9886.2 212.495 1377.4 3.01 4.8 303.204 6.38 -3.37
2008Q1 2008.0 1.0 13366.865 9349.6 2082.886 943.372 9826.8 213.997 1384.0 1.56 4.9 303.803 2.82 -1.26
2008Q2 2008.0 2.0 13415.266 9351.0 2026.518 961.280 10059.0 218.610 1409.3 1.74 5.4 304.483 8.53 -6.79
2008Q3 2008.0 3.0 13324.600 9267.7 1990.693 991.551 9838.3 216.889 1474.7 1.17 6.0 305.270 -3.16 4.33
2008Q4 2008.0 4.0 13141.920 9195.3 1857.661 1007.273 9920.4 212.174 1576.5 0.12 6.9 305.952 -8.79 8.91
2009Q1 2009.0 1.0 12925.410 9209.2 1558.494 996.287 9926.4 212.671 1592.8 0.22 8.1 306.547 0.94 -0.71
2009Q2 2009.0 2.0 12901.504 9189.0 1456.678 1023.528 10077.5 214.469 1653.6 0.18 9.2 307.226 3.37 -3.19
2009Q3 2009.0 3.0 12990.341 9256.0 1486.398 1044.088 10040.6 216.385 1673.9 0.12 9.6 308.013 3.56 -3.44

203 rows × 14 columns

#重采样及频率转换
#重采样(resampling)指的是将时间序列从一个频率转换到另一个频率的处理过程
rng = pd.date_range('2000-01-01',periods=100,freq='D')
ts = pd.Series(np.random.randn(len(rng)),index=rng)
ts
2000-01-01    0.014440
2000-01-02    0.676634
2000-01-03    0.828413
2000-01-04    1.809659
2000-01-05    0.346060
2000-01-06   -0.906748
2000-01-07   -0.144737
2000-01-08   -0.376248
2000-01-09   -1.811051
2000-01-10    0.422918
2000-01-11   -0.291923
2000-01-12   -0.947770
2000-01-13    2.794186
2000-01-14   -1.314019
2000-01-15   -0.474217
2000-01-16    0.657007
2000-01-17   -1.604424
2000-01-18   -0.387496
2000-01-19    0.493891
2000-01-20   -0.963368
2000-01-21    1.689601
2000-01-22   -0.318659
2000-01-23    0.960378
2000-01-24   -0.241049
2000-01-25   -0.068125
2000-01-26   -0.755140
2000-01-27    1.123554
2000-01-28    0.775620
2000-01-29    0.141545
2000-01-30    0.200428
                ...   
2000-03-11    0.695305
2000-03-12   -0.418060
2000-03-13   -0.143606
2000-03-14   -1.265208
2000-03-15    0.016866
2000-03-16   -1.072653
2000-03-17    0.491403
2000-03-18   -0.973830
2000-03-19    1.357179
2000-03-20   -0.856155
2000-03-21    0.594472
2000-03-22   -1.938699
2000-03-23    1.890984
2000-03-24   -0.635768
2000-03-25   -0.239094
2000-03-26    0.281482
2000-03-27   -1.499055
2000-03-28   -1.219709
2000-03-29   -0.996651
2000-03-30    0.779328
2000-03-31   -1.139721
2000-04-01    1.624966
2000-04-02    0.988920
2000-04-03    0.940181
2000-04-04   -0.195757
2000-04-05   -0.140197
2000-04-06   -1.513569
2000-04-07   -1.563758
2000-04-08    0.216563
2000-04-09   -0.857682
Freq: D, Length: 100, dtype: float64
ts.resample('M',kind='Period').mean()#注意:kind的有效取值是小写的'period';这里写成'Period'并未生效,所以下面的结果仍是时间戳索引
2000-01-31    0.079785
2000-02-29    0.057120
2000-03-31   -0.337746
2000-04-30   -0.055593
Freq: M, dtype: float64
#降采样
rng = pd.date_range('2000-01-01', periods=12, freq='T')
ts = pd.Series(np.arange(12), index=rng)
ts
2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32
ts.resample('5min',closed='right').sum()
1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int32
ts.resample('5min',closed='right',label='right').sum()
2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int32
ts.resample('5min').ohlc()
open high low close
2000-01-01 00:00:00 0 4 0 4
2000-01-01 00:05:00 5 9 5 9
2000-01-01 00:10:00 10 11 10 11
#通过时期进行重采样
frame = pd.DataFrame(np.random.randn(24, 4),
                         index=pd.period_range('1-2000', '12-2001',
                                              freq='M'),
                        columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame

Colorado Texas New York Ohio
2000-01 0.966322 -1.097520 0.278189 1.351895
2000-02 -0.440050 -1.486752 -0.281330 -0.083094
2000-03 -0.148903 0.074773 0.432271 -0.606501
2000-04 -0.324102 1.150940 -1.007997 0.192719
2000-05 0.018921 -0.463694 -0.915002 -1.167408
2000-06 1.098821 2.595297 -0.289931 0.624960
2000-07 1.173885 -1.247856 -1.546760 1.148165
2000-08 -2.465409 -0.976749 -0.565111 -2.461652
2000-09 -0.337638 -0.860959 0.585042 0.935459
2000-10 -0.422014 -0.233506 -1.239120 1.692123
2000-11 -0.010893 0.428696 -0.574149 -0.127551
2000-12 0.044093 1.085275 0.995065 0.893955
2001-01 -1.007087 -1.721242 0.328790 -0.536349
2001-02 -1.525735 0.391245 0.649280 1.114608
2001-03 0.398916 0.423673 0.442252 0.922619
2001-04 1.172961 0.726157 -0.860389 -0.263490
2001-05 1.293229 -0.104286 -0.973742 1.284994
2001-06 -0.620977 -0.288092 0.509420 -0.775555
2001-07 -0.249990 -0.075546 0.064330 -0.077252
2001-08 0.010249 -0.009822 1.467907 -0.932622
2001-09 -0.849710 -0.144258 0.384650 -1.102285
2001-10 -1.594351 0.561065 0.171580 0.385832
2001-11 -0.164112 1.495407 -0.688782 -1.025762
2001-12 -1.392290 1.441935 -0.185071 0.306422
annual_frame = frame.resample('A-DEC').mean()
annual_frame
Colorado Texas New York Ohio
2000 -0.070581 -0.086005 -0.344069 0.199423
2001 -0.377408 0.224686 0.109186 -0.058237
#移动窗口函数
close_px_all = pd.read_csv('examples/stock_px_2.csv',
                             parse_dates=True, index_col=0)
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
close_px = close_px.resample('B').ffill()
close_px
AAPL MSFT XOM
2003-01-02 7.40 21.11 29.22
2003-01-03 7.45 21.14 29.24
2003-01-06 7.45 21.52 29.96
2003-01-07 7.43 21.93 28.95
2003-01-08 7.28 21.31 28.83
2003-01-09 7.34 21.93 29.44
2003-01-10 7.36 21.97 29.03
2003-01-13 7.32 22.16 28.91
2003-01-14 7.30 22.39 29.17
2003-01-15 7.22 22.11 28.77
2003-01-16 7.31 21.75 28.90
2003-01-17 7.05 20.22 28.60
2003-01-20 7.05 20.22 28.60
2003-01-21 7.01 20.17 27.94
2003-01-22 6.94 20.04 27.58
2003-01-23 7.09 20.54 27.52
2003-01-24 6.90 19.59 26.93
2003-01-27 7.07 19.32 26.21
2003-01-28 7.29 19.18 26.90
2003-01-29 7.47 19.61 27.88
2003-01-30 7.16 18.95 27.37
2003-01-31 7.18 18.65 28.13
2003-02-03 7.33 19.08 28.52
2003-02-04 7.30 18.59 28.52
2003-02-05 7.22 18.45 28.11
2003-02-06 7.22 18.63 27.87
2003-02-07 7.07 18.30 27.66
2003-02-10 7.18 18.62 27.87
2003-02-11 7.18 18.25 27.67
2003-02-12 7.20 18.25 27.12
... ... ... ...
2011-09-05 374.05 25.80 72.14
2011-09-06 379.74 25.51 71.15
2011-09-07 383.93 26.00 73.65
2011-09-08 384.14 26.22 72.82
2011-09-09 377.48 25.74 71.01
2011-09-12 379.94 25.89 71.84
2011-09-13 384.62 26.04 71.65
2011-09-14 389.30 26.50 72.64
2011-09-15 392.96 26.99 74.01
2011-09-16 400.50 27.12 74.55
2011-09-19 411.63 27.21 73.70
2011-09-20 413.45 26.98 74.01
2011-09-21 412.14 25.99 71.97
2011-09-22 401.82 25.06 69.24
2011-09-23 404.30 25.06 69.31
2011-09-26 403.17 25.44 71.72
2011-09-27 399.26 25.67 72.91
2011-09-28 397.01 25.58 72.07
2011-09-29 390.57 25.45 73.88
2011-09-30 381.32 24.89 72.63
2011-10-03 374.60 24.53 71.15
2011-10-04 372.50 25.34 72.83
2011-10-05 378.25 25.89 73.95
2011-10-06 377.37 26.34 73.89
2011-10-07 369.80 26.25 73.56
2011-10-10 388.81 26.94 76.28
2011-10-11 400.29 27.00 76.27
2011-10-12 402.19 26.96 77.16
2011-10-13 408.43 27.18 76.37
2011-10-14 422.00 27.27 78.11

2292 rows × 3 columns

close_px.AAPL.plot()

close_px.AAPL.rolling(250).mean().plot()

appl_std250 = close_px.AAPL.rolling(250, min_periods=10).std()
appl_std250[5:12]
2003-01-09         NaN
2003-01-10         NaN
2003-01-13         NaN
2003-01-14         NaN
2003-01-15    0.077496
2003-01-16    0.074760
2003-01-17    0.112368
Freq: B, Name: AAPL, dtype: float64
appl_std250.plot()


你可能感兴趣的:(利用python进行数据分析)