import pandas as pd
import numpy as np
pd.to_datetime('2020.1.1')
pd.to_datetime('2020 1.1')
pd.to_datetime('2020 1 1')
pd.to_datetime('2020 1-1')
pd.to_datetime('2020-1 1')
pd.to_datetime('2020-1-1')
pd.to_datetime('2020/1/1')
pd.to_datetime('1.1.2020')
pd.to_datetime('1.1 2020')
pd.to_datetime('1 1 2020')
pd.to_datetime('1 1-2020')
pd.to_datetime('1-1 2020')
pd.to_datetime('1-1-2020')
pd.to_datetime('1/1/2020')
pd.to_datetime('20200101')
pd.to_datetime('2020.0101')
All of the string styles above parse to the same timestamp:

Timestamp('2020-01-01 00:00:00')
pd.to_datetime('2020`1`1', format='%Y`%m`%d')  # any separator works once an explicit format is given
Timestamp('2020-01-01 00:00:00')
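Not every string parses, of course; as a hedged aside, to_datetime's errors parameter controls what happens to unparseable input (errors='coerce' yields NaT instead of raising):

pd.to_datetime(['2020-1-1', 'not a date'], errors='coerce')  # the second element becomes NaT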
pd.Series(range(2),index=pd.to_datetime(['2020/1/1','2020/1/2']))
2020-01-01 0
2020-01-02 1
dtype: int64
type(pd.to_datetime(['2020/1/1','2020/1/2']))
pandas.core.indexes.datetimes.DatetimeIndex
data=pd.DataFrame({'year':[2020,2020],'month':[1,1],'day':[1,2]})
pd.to_datetime(data)
0 2020-01-01
1 2020-01-02
dtype: datetime64[ns]
pd.to_datetime('2020/1/1 00:00:00.123456789')
Timestamp('2020-01-01 00:00:00.123456789')
pd.date_range(start='2020/1/1',end='2020/1/10',periods=5)
DatetimeIndex(['2020-01-01 00:00:00', '2020-01-03 06:00:00',
'2020-01-05 12:00:00', '2020-01-07 18:00:00',
'2020-01-10 00:00:00'],
dtype='datetime64[ns]', freq=None)
pd.date_range(start='2020/1/1',end='2020/1/10',freq='D')
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
'2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
'2020-01-09', '2020-01-10'],
dtype='datetime64[ns]', freq='D')
pd.date_range(start='2020/1/1',periods=4,freq='D')
DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04'], dtype='datetime64[ns]', freq='D')
pd.date_range(start='2020/1/1',periods=4,freq='T')
DatetimeIndex(['2020-01-01 00:00:00', '2020-01-01 00:01:00',
'2020-01-01 00:02:00', '2020-01-01 00:03:00'],
dtype='datetime64[ns]', freq='T')
pd.date_range(start='2020/1/1', periods=4, freq='M')  # 'M' anchors each point to month end
DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30'], dtype='datetime64[ns]', freq='M')
pd.date_range(start='2020/1/1',periods=4,freq='BYS')
DatetimeIndex(['2020-01-01', '2021-01-01', '2022-01-03', '2023-01-02'], dtype='datetime64[ns]', freq='BAS-JAN')
bdate_range is a method similar to date_range; its special feature is that, on top of the built-in business-day spacing, you can further restrict the calendar with the weekmask and holidays parameters.
Its freq argument accepts the special options 'C'/'CBM'/'CBMS', which stand for a custom calendar and must be used together with weekmask and holidays.
For example, suppose we want to keep only Monday, Tuesday and Friday among the working days, and additionally exclude some holidays:
weekmask='Mon Tue Fri'
holidays=[pd.Timestamp('2020/1/%s'%i) for i in range(7,13)]
pd.bdate_range(start='2020/1/1', end='2020/1/20', freq='C',
               weekmask=weekmask, holidays=holidays)
DatetimeIndex(['2020-01-03', '2020-01-06', '2020-01-13', '2020-01-14',
'2020-01-17', '2020-01-20'],
dtype='datetime64[ns]', freq='C')
ts=pd.Timestamp('2020/1/1 02:00:00',tz='Europe/Helsinki')
ts+pd.Timedelta(days=2)
Timestamp('2020-01-03 02:00:00+0200', tz='Europe/Helsinki')
ts+pd.DateOffset(days=2)
Timestamp('2020-01-03 02:00:00+0200', tz='Europe/Helsinki')
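The two results agree here, but Timedelta and DateOffset differ across a daylight-saving transition: Timedelta adds an absolute duration, while DateOffset(days=...) keeps the local wall-clock time. A minimal sketch (Helsinki switched to summer time on 2020-03-29):

ts_dst = pd.Timestamp('2020/3/28 15:00:00', tz='Europe/Helsinki')  # the day before the switch
ts_dst + pd.Timedelta(days=1)   # absolute 24h later -> 2020-03-29 16:00+03:00
ts_dst + pd.DateOffset(days=1)  # same wall clock    -> 2020-03-29 15:00+03:00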
pd.Timestamp('2020/1/1')+pd.DateOffset(minutes=20)-pd.DateOffset(weeks=2)
Timestamp('2019-12-18 00:20:00')
pd.Timestamp('2020/1/1')+pd.offsets.Week(2)
Timestamp('2020-01-15 00:00:00')
pd.Timestamp('2020/01/01') + pd.offsets.BQuarterBegin(1)  # first business day of the next quarter-start month (default startingMonth=3)
Timestamp('2020-03-02 00:00:00')
pd.Series(pd.offsets.BYearBegin(3).apply(i) for i in pd.date_range('20200101', periods=3, freq='Y'))  # element-wise apply; the vectorized form below gives the same result
0 2023-01-02
1 2024-01-01
2 2025-01-01
dtype: datetime64[ns]
pd.date_range('2020/1/1',periods=3,freq='Y')+pd.offsets.BYearBegin(3)
DatetimeIndex(['2023-01-02', '2024-01-01', '2025-01-01'], dtype='datetime64[ns]', freq='A-DEC')
pd.Series(pd.offsets.CDay(3, weekmask='Wed Fri').apply(i)
          for i in pd.date_range('20200105', periods=3, freq='D'))
# starting from Jan 5/6/7, the next three custom days (only Wed/Fri count) are Jan 8, 10 and 15, so all three land on Jan 15
0 2020-01-15
1 2020-01-15
2 2020-01-15
dtype: datetime64[ns]
pd.date_range('20200105',periods=3,freq='D')
DatetimeIndex(['2020-01-05', '2020-01-06', '2020-01-07'], dtype='datetime64[ns]', freq='D')
rng=pd.date_range('2020','2021',freq='W')
ts=pd.Series(np.random.randn(len(rng)),index=rng)
ts.head()
2020-01-05 -0.243773
2020-01-12 0.821945
2020-01-19 1.234821
2020-01-26 -0.345294
2020-02-02 0.209237
Freq: W-SUN, dtype: float64
ts['2020/1/26']
-0.3452940253341838
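A DatetimeIndex also supports partial string indexing, which is often handier than an exact key; a small sketch on the same series:

ts.loc['2020-01']                  # every observation in January 2020
ts.loc['2020-01-12':'2020-02-02']  # an inclusive slice between two dates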
data_r = pd.DataFrame(np.random.randn(1000, 3),
                      index=pd.date_range('1/1/2020', freq='S', periods=1000),
                      columns=['A', 'B', 'C'])
data_r
|  | A | B | C |
| --- | --- | --- | --- |
| 2020-01-01 00:00:00 | -0.099414 | -0.192917 | 1.066336 |
| 2020-01-01 00:00:01 | 2.194989 | -0.119435 | -0.337781 |
| 2020-01-01 00:00:02 | -1.278629 | 0.575146 | 0.029983 |
| 2020-01-01 00:00:03 | 1.575472 | 0.060964 | -1.603794 |
| 2020-01-01 00:00:04 | 0.749323 | 1.865636 | 0.146967 |
| ... | ... | ... | ... |
| 2020-01-01 00:16:35 | 0.695445 | -0.070806 | -0.513009 |
| 2020-01-01 00:16:36 | 0.204610 | -1.123102 | -0.036155 |
| 2020-01-01 00:16:37 | 0.083128 | 2.143166 | 1.597349 |
| 2020-01-01 00:16:38 | -0.829676 | -0.948882 | -1.235877 |
| 2020-01-01 00:16:39 | 0.254903 | 0.246231 | 1.211078 |

1000 rows × 3 columns
r = data_r.resample('3min')
r.sum()  # aggregate within each 3-minute bin
|  | A | B | C |
| --- | --- | --- | --- |
| 2020-01-01 00:00:00 | 19.188984 | -4.217824 | 21.772775 |
| 2020-01-01 00:03:00 | -2.906431 | -6.085502 | -8.279348 |
| 2020-01-01 00:06:00 | 2.325963 | -7.106872 | 14.082200 |
| 2020-01-01 00:09:00 | -14.955132 | 2.491868 | -11.081364 |
| 2020-01-01 00:12:00 | -1.064336 | 19.658090 | 15.028689 |
| 2020-01-01 00:15:00 | 6.039573 | -3.221933 | -1.965375 |
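By default each bin is closed on the left and labeled by its left edge; both are adjustable, e.g. (a sketch):

data_r.resample('3min', closed='right', label='right').sum().head()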
data_r2 = pd.DataFrame(np.random.randn(200, 3),
                       index=pd.date_range('2020/1/1', freq='D', periods=200),
                       columns=['A', 'B', 'C'])
r = data_r2.resample('CBMS')  # custom business month start
r.sum()
|  | A | B | C |
| --- | --- | --- | --- |
| 2020-01-01 | -1.040047 | -2.799846 | 1.522516 |
| 2020-02-03 | -0.647010 | 0.828069 | 3.202404 |
| 2020-03-02 | 2.118657 | -10.868392 | -1.208521 |
| 2020-04-01 | 1.772132 | -0.610901 | 6.605194 |
| 2020-05-01 | 9.023414 | 1.690500 | -0.139112 |
| 2020-06-01 | 9.431974 | 1.707741 | -4.933881 |
| 2020-07-01 | -9.096084 | 0.484947 | 10.837215 |
r = data_r.resample('3T')  # '3T' is the same as '3min'
r['A'].mean()
2020-01-01 00:00:00 0.106605
2020-01-01 00:03:00 -0.016147
2020-01-01 00:06:00 0.012922
2020-01-01 00:09:00 -0.083084
2020-01-01 00:12:00 -0.005913
2020-01-01 00:15:00 0.060396
Freq: 3T, Name: A, dtype: float64
r['A'].agg([np.sum,np.mean,np.std])
|  | sum | mean | std |
| --- | --- | --- | --- |
| 2020-01-01 00:00:00 | 19.188984 | 0.106605 | 0.993201 |
| 2020-01-01 00:03:00 | -2.906431 | -0.016147 | 1.014105 |
| 2020-01-01 00:06:00 | 2.325963 | 0.012922 | 0.974205 |
| 2020-01-01 00:09:00 | -14.955132 | -0.083084 | 1.013231 |
| 2020-01-01 00:12:00 | -1.064336 | -0.005913 | 1.054681 |
| 2020-01-01 00:15:00 | 6.039573 | 0.060396 | 0.870126 |
r.agg({'A':np.sum,'B':lambda x:max(x)-min(x)})
|  | A | B |
| --- | --- | --- |
| 2020-01-01 00:00:00 | 19.188984 | 4.719555 |
| 2020-01-01 00:03:00 | -2.906431 | 5.455282 |
| 2020-01-01 00:06:00 | 2.325963 | 5.365256 |
| 2020-01-01 00:09:00 | -14.955132 | 5.278425 |
| 2020-01-01 00:12:00 | -1.064336 | 5.027111 |
| 2020-01-01 00:15:00 | 6.039573 | 4.847176 |
small = pd.Series(range(6),
                  index=pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:30:00',
                                        '2020-01-01 00:31:00', '2020-01-01 01:00:00',
                                        '2020-01-01 03:00:00', '2020-01-01 03:05:00']))
resampled = small.resample('H')
for name, group in resampled:
    print('group:', name)
    print('-' * 27)
    print(group, end='\n\n')
group: 2020-01-01 00:00:00
---------------------------
2020-01-01 00:00:00 0
2020-01-01 00:30:00 1
2020-01-01 00:31:00 2
dtype: int64
group: 2020-01-01 01:00:00
---------------------------
2020-01-01 01:00:00 3
dtype: int64
group: 2020-01-01 02:00:00
---------------------------
Series([], dtype: int64)
group: 2020-01-01 03:00:00
---------------------------
2020-01-01 03:00:00 4
2020-01-01 03:05:00 5
dtype: int64
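Note the empty 02:00 group: resample always emits regular bins even where no data falls. If the gaps need values, they can be filled, for instance (a sketch):

small.resample('H').sum()    # the empty bin simply sums to 0
small.resample('H').ffill()  # or carry the last observation forward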
s=pd.Series(np.random.randn(1000),index=pd.date_range('2020/1/1',periods=1000))
s.head()
2020-01-01 1.329339
2020-01-02 -0.578495
2020-01-03 0.144955
2020-01-04 -0.388870
2020-01-05 -1.020317
Freq: D, dtype: float64
s.rolling(window=50)
Rolling [window=50,center=False,axis=0]
s.rolling(window=50).mean()
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 NaN
2020-01-04 NaN
2020-01-05 NaN
...
2022-09-22 0.100815
2022-09-23 0.135331
2022-09-24 0.136454
2022-09-25 0.175283
2022-09-26 0.169613
Freq: D, Length: 1000, dtype: float64
s.rolling(window=50,min_periods=3).mean().head()
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 0.298600
2020-01-04 0.126733
2020-01-05 -0.102677
Freq: D, dtype: float64
s.rolling(window=50,min_periods=3).std()
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 0.963152
2020-01-04 0.858251
2020-01-05 0.903101
...
2022-09-22 0.872299
2022-09-23 0.907054
2022-09-24 0.906309
2022-09-25 0.889243
2022-09-26 0.893184
Freq: D, Length: 1000, dtype: float64
s.rolling(window=50,min_periods=3).apply(lambda x:x.std()/x.mean()).head()
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 3.225560
2020-01-04 6.772149
2020-01-05 -8.795515
Freq: D, dtype: float64
s.rolling('15D').max().head()
2020-01-01 1.329339
2020-01-02 1.329339
2020-01-03 1.329339
2020-01-04 1.329339
2020-01-05 1.329339
Freq: D, dtype: float64
s.rolling(window=len(s),min_periods=1).sum()
2020-01-01 1.329339
2020-01-02 0.750845
2020-01-03 0.895800
2020-01-04 0.506930
2020-01-05 -0.513387
...
2022-09-22 -45.244308
2022-09-23 -43.385423
2022-09-24 -43.810607
2022-09-25 -43.070710
2022-09-26 -43.646916
Freq: D, Length: 1000, dtype: float64
s.expanding().sum().head()
2020-01-01 1.329339
2020-01-02 0.750845
2020-01-03 0.895800
2020-01-04 0.506930
2020-01-05 -0.513387
Freq: D, dtype: float64
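As the identical heads suggest, expanding().sum() matches the full-window rolling sum above, and both are just the cumulative sum; a quick sanity check (sketch):

(s.expanding().sum() - s.cumsum()).abs().max()  # ~0, up to float rounding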
shift, diff and pct_change all relate elements at different positions to one another.
s.head(10)
2020-01-01 1.329339
2020-01-02 -0.578495
2020-01-03 0.144955
2020-01-04 -0.388870
2020-01-05 -1.020317
2020-01-06 -0.751869
2020-01-07 0.678827
2020-01-08 -0.957797
2020-01-09 0.578865
2020-01-10 -0.775079
Freq: D, dtype: float64
s.shift(2).head(5)
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 1.329339
2020-01-04 -0.578495
2020-01-05 0.144955
Freq: D, dtype: float64
s.shift(3).head(5)
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 NaN
2020-01-04 1.329339
2020-01-05 -0.578495
Freq: D, dtype: float64
s.diff(3).head()
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 NaN
2020-01-04 -1.718209
2020-01-05 -0.441823
Freq: D, dtype: float64
s.diff(2).head()
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 -1.184384
2020-01-04 0.189624
2020-01-05 -1.165273
Freq: D, dtype: float64
s.pct_change(3).head()
2020-01-01 NaN
2020-01-02 NaN
2020-01-03 NaN
2020-01-04 -1.292529
2020-01-05 0.763746
Freq: D, dtype: float64
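pct_change(k) is simply the relative change against a k-step shift, i.e. s / s.shift(k) - 1; a quick verification (sketch):

(s / s.shift(3) - 1).head()  # identical to s.pct_change(3).head()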
How can extra time points be batch-added to a date_range, or the timestamp density be increased over a given interval?
Since a date_range is pinned down by three of its parameters, the main levers are the number of points and the spacing between them, as sketched below.
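A minimal sketch of one way to do this (the dates are illustrative): build a finer grid over the sub-interval of interest and union it with the original index.

idx = pd.date_range('2020/1/1', '2020/1/10', freq='D')
dense = pd.date_range('2020/1/3', '2020/1/5', freq='6H')  # finer grid on a sub-interval
idx.union(dense)  # the original daily points plus 6-hourly points between Jan 3 and Jan 5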
How can the precision of a Timestamp be increased in batch?
import time
import datetime

t = time.time()
print(t)                        # the raw float epoch time
print(int(t))                   # second-level timestamp
print(int(round(t * 1000)))     # millisecond-level timestamp
print(int(round(t * 1000000)))  # microsecond-level timestamp
1593443476.5105758
1593443476
1593443476511
1593443476510576
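Back in pandas, the same epoch values can be turned into Timestamps at the chosen resolution via the unit parameter (a sketch):

pd.to_datetime(int(round(t * 1000)), unit='ms')     # millisecond-precision Timestamp
pd.to_datetime(int(round(t * 1000000)), unit='us')  # microsecond-precision Timestamp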
For time points beyond the supported range, is there really no way to handle them at all?
I searched online and did not find a relevant method, and I do not know how to handle data beyond that range myself.
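One hedged pointer: Timestamp stores nanoseconds in a 64-bit integer, so it only covers roughly 1677 to 2262; the pandas docs suggest representing out-of-bounds spans with periods instead.

pd.Timestamp.min, pd.Timestamp.max                     # the representable bounds
pd.period_range('1215-01-01', '1215-01-05', freq='D')  # a PeriodIndex handles dates far outside them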
Given a set of non-consecutive dates, how can we quickly find the dates that lie between its maximum and minimum but do not appear in the set?
s.index.max()
Timestamp('2022-09-26 00:00:00', freq='D')
s.index.min()
Timestamp('2020-01-01 00:00:00', freq='D')
I could not solve the third one; for the fourth, a sketch follows.
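A minimal sketch for the fourth question (the dates here are illustrative): build the full daily range between the extremes and take the set difference with the given dates.

dates = pd.to_datetime(['2020-1-1', '2020-1-2', '2020-1-5'])
pd.date_range(dates.min(), dates.max()).difference(dates)  # -> 2020-01-03 and 2020-01-04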
Given a time series of milk sales at a supermarket, answer the following questions:
1. On which day of the week does the maximum sales value occur?
data_sale=pd.read_csv(r'D:\jupyter Notebook\天池比赛\pandas学习\joyful-pandas-master\data\time_series_one.csv',parse_dates=['日期'])
data_sale
|  | 日期 | 销售额 |
| --- | --- | --- |
| 0 | 2017-02-17 | 2154 |
| 1 | 2017-02-18 | 2095 |
| 2 | 2017-02-19 | 3459 |
| 3 | 2017-02-20 | 2198 |
| 4 | 2017-02-21 | 2413 |
| ... | ... | ... |
| 995 | 2019-11-09 | 3022 |
| 996 | 2019-11-10 | 2961 |
| 997 | 2019-11-11 | 3984 |
| 998 | 2019-11-12 | 2799 |
| 999 | 2019-11-13 | 2941 |

1000 rows × 2 columns
data_sale.shape
(1000, 2)
data_sale['销售额'].max()
4333
data_sale.loc[data_sale['销售额']==4333]
|  | 日期 | 销售额 |
| --- | --- | --- |
| 926 | 2019/9/1 | 4333 |
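The question asks for the weekday, so one more step is needed; a sketch using idxmax on the date-indexed series:

data_sale.set_index('日期')['销售额'].idxmax().day_name()  # 2019-09-01 was a Sunday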
2. Compute the monthly total sales excluding the Spring Festival, National Day and Labour Day holidays

# first list the holiday dates
holiday = pd.date_range(start='20170501', end='20170503').append(
pd.date_range(start='20171001', end='20171007')).append(
pd.date_range(start='20180215', end='20180221')).append(
pd.date_range(start='20180501', end='20180503')).append(
pd.date_range(start='20181001', end='20181007')).append(
pd.date_range(start='20190204', end='20190224')).append(
pd.date_range(start='20190501', end='20190503')).append(
pd.date_range(start='20191001', end='20191007'))
result_1 = data_sale[~data_sale['日期'].isin(holiday)].set_index('日期').resample('MS').sum()  # drop holiday rows, then total by month start
result_1
| 日期 | 销售额 |
| --- | --- |
| 2017-02-01 | 31740 |
| 2017-03-01 | 80000 |
| 2017-04-01 | 74734 |
| 2017-05-01 | 76237 |
| 2017-06-01 | 80750 |
| 2017-07-01 | 83107 |
| 2017-08-01 | 85715 |
| 2017-09-01 | 79604 |
| 2017-10-01 | 61197 |
| 2017-11-01 | 78877 |
| 2017-12-01 | 84900 |
| 2018-01-01 | 85869 |
| 2018-02-01 | 61838 |
| 2018-03-01 | 88339 |
| 2018-04-01 | 82011 |
| 2018-05-01 | 76932 |
| 2018-06-01 | 85307 |
| 2018-07-01 | 82316 |
| 2018-08-01 | 85917 |
| 2018-09-01 | 82740 |
| 2018-10-01 | 67018 |
| 2018-11-01 | 81333 |
| 2018-12-01 | 91381 |
| 2019-01-01 | 89407 |
| 2019-02-01 | 21219 |
| 2019-03-01 | 87565 |
| 2019-04-01 | 90202 |
| 2019-05-01 | 85665 |
| 2019-06-01 | 90301 |
| 2019-07-01 | 90902 |
| 2019-08-01 | 93664 |
| 2019-09-01 | 89077 |
| 2019-10-01 | 72099 |
| 2019-11-01 | 38423 |
3. Compute the total weekend (Saturday and Sunday) sales by quarter

result_2 = data_sale[data_sale['日期'].dt.dayofweek.isin([5, 6])].set_index('日期').resample('QS').sum()  # dayofweek: Mon=0 ... Sun=6, so 5/6 is the weekend
result_2
| 日期 | 销售额 |
| --- | --- |
| 2017-01-01 | 32894 |
| 2017-04-01 | 66692 |
| 2017-07-01 | 69099 |
| 2017-10-01 | 70384 |
| 2018-01-01 | 74671 |
| 2018-04-01 | 69950 |
| 2018-07-01 | 74245 |
| 2018-10-01 | 74699 |
| 2019-01-01 | 77835 |
| 2019-04-01 | 77042 |
| 2019-07-01 | 76276 |
| 2019-10-01 | 35994 |
1. Time series data matters a great deal for my own work, so this chapter is something I need to master thoroughly.
2. The exercises are honestly a bit too hard; I could not complete them all on my own!