import numpy as np
import pandas as pd
# Date strings that NumPy can parse are generally written as year-month-day.
# NumPy's date/time dtype is np.datetime64; with a date-only string the
# resulting array has day precision (datetime64[D]).
date = np.array('2021-02-03', dtype=np.datetime64)
# Echo the value: `date` is the 0-d datetime64 array built above.
date
array('2021-02-03', dtype='datetime64[D]')
# Integers 0..11; they are added to `date` in the next step.
np.arange(12)
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
# Adding an integer array to the datetime64[D] value broadcasts: as the output
# below shows, each integer acts as an offset in days, giving 12 consecutive
# dates starting at 2021-02-03.
date + np.arange(12)
array(['2021-02-03', '2021-02-04', '2021-02-05', '2021-02-06',
'2021-02-07', '2021-02-08', '2021-02-09', '2021-02-10',
'2021-02-11', '2021-02-12', '2021-02-13', '2021-02-14'],
dtype='datetime64[D]')
# 'T' separates the date part from the time-of-day part of the string.
# The second argument sets the unit (precision) of the value. NumPy unit codes
# are case-sensitive: Y, M, W, D for dates and h, m, s, ms, us, ns, ps, ...
# for times ('M' means month, 'm' means minute).
date = np.datetime64('2020-12-21T13:00', 'ms')
# Echo the millisecond-precision datetime64 scalar.
date
numpy.datetime64('2020-12-21T13:00:00.000')
# String -> Timestamp. With default settings the output below shows that
# '10/11/12' is read as month/day/year (2012-10-11).
pd.to_datetime('10/11/12')
Timestamp('2012-10-11 00:00:00')
# Ambiguous numeric dates can be steered with keyword flags:
#   dayfirst=True  -> the first field is read as the day
#   yearfirst=True -> the first field is read as the year
# The call is kept as the last expression so that the parsed Timestamp is
# what the cell displays.
pd.to_datetime('10/11/12', yearfirst=True)
Timestamp('2010-11-12 00:00:00')
# Free-form text also parses; per the output below, the nonstandard ordinal
# '1th' does not prevent the string from being read as 1 July 2020 13:12:00.
date = pd.to_datetime('1th July 2020 13:12:00')
date
Timestamp('2020-07-01 13:12:00')
# Use an explicit `format` to handle an irregular date string: the leading
# literal '0' in the format matches the stray '0', then %y reads the two-digit
# year '18' (parsed as 2018, see the output below).
pd.to_datetime('018-02-03', format='0%y-%m-%d')
Timestamp('2018-02-03 00:00:00')
# Timestamp -> string, using strftime directives (a reference table of the
# directives follows the output).
date.strftime('%Y-%m-%d %H:%M:%S')
'2020-07-01 13:12:00'
%a
Weekday as locale’s abbreviated name.
Sun, Mon, …, Sat (en_US);
%A
Weekday as locale’s full name.
Sunday, Monday, …, Saturday (en_US);
%w
Weekday as a decimal number, where 0 is Sunday and 6 is Saturday.
0, 1, …, 6
%d
Day of the month as a zero-padded decimal number.
01, 02, …, 31
%B
Month as locale’s full name.
January, February, …, December (en_US);
Januar, Februar, …, Dezember (de_DE)
%m
Month as a zero-padded decimal number.
01, 02, …, 12
%y
Year without century as a zero-padded decimal number.
00, 01, …, 99
%Y
Year with century as a decimal number.
0001, 0002, …, 2013, 2014, …, 9998, 9999
%H
Hour (24-hour clock) as a zero-padded decimal number.
00, 01, …, 23
%p
Locale’s equivalent of either AM or PM.
AM, PM (en_US);
%M
Minute as a zero-padded decimal number.
00, 01, …, 59
%S
Second as a zero-padded decimal number.
00, 01, …, 59
%f
Microsecond as a decimal number, zero-padded on the left.
000000, 000001, …, 999999
%Z
Time zone name (empty string if the object is naive).
(empty), UTC, GMT
%x
Locale’s appropriate date representation.
08/16/88 (None);
08/16/1988 (en_US);
16.08.1988 (de_DE)
%X
Locale’s appropriate time representation.
21:30:00 (en_US);
21:30:00 (de_DE)
%%
A literal '%' character.
# Time information, part 1.
# NOTE: `date` is a pandas Timestamp here, so each field is read directly
# from the object.
# Every value is preceded by a '\n<position>:' label; the label/value pairs are
# flattened into one argument list so print() joins them with its default
# single-space separator.
print(*(
    part
    for position, value in enumerate(
        (
            date.year,
            date.month,
            date.week,  # same value as date.weekofyear
            date.day,
            date.hour,
            date.minute,
            date.second,
            date.microsecond,
            date.nanosecond,
            date.day_name(),
            date.dayofweek,  # Monday = 0, Sunday = 6
            date.dayofyear,
            date.weekofyear,
            date.days_in_month,  # number of days in the month
            date.weekday(),  # same value as date.dayofweek
        ),
        start=1,
    )
    for part in (f'\n{position}:', value)
))
1: 2020
2: 7
3: 27
4: 1
5: 13
6: 12
7: 0
8: 0
9: 0
10: Wednesday
11: 2
12: 183
13: 27
14: 31
15: 2
# Time information, part 2: boolean calendar flags of the Timestamp,
# printed on one line in the order listed.
print(*[
    getattr(date, flag_name)
    for flag_name in (
        'is_leap_year',  # leap year
        'is_month_start',
        'is_month_end',
        'is_quarter_start',  # quarter
        'is_quarter_end',
        'is_year_start',
        'is_year_end',
    )
])
True True False True False False False
# Time information, part 3.
# Wrap the Timestamp in a Series so that Series methods such as apply()
# become available.
date_series = pd.Series(date)
date_series
0 2020-07-01 13:12:00
dtype: datetime64[ns]
# A datetime Series (a sequence of timestamps, or a whole DataFrame column)
# exposes the Timestamp fields used above through the `.dt` accessor.
print(
    # Weekend flag. dayofweek counts Monday = 0 ... Sunday = 6, so the weekend
    # is Saturday (5) and Sunday (6); a lower bound of 4 would also flag Friday.
    date_series.dt.dayofweek.apply(lambda x: 1 if 5 <= x <= 6 else 0),
    # Afternoon flag: 1 when the hour field is 13, 14 or 15 (i.e. any time
    # from 13:00 through 15:59), otherwise 0.
    date_series.dt.hour.apply(lambda x: 1 if 13 <= x <= 15 else 0),
    sep='\n',
)
0 0
dtype: int64
0 1
dtype: int64
# Build a Series indexed by dates: create the DatetimeIndex first, then attach
# it to the values.
date_index = pd.DatetimeIndex(data=['2010-01-04', '2010-01-05'])
data = pd.Series(data=[1, 2], index=date_index)
data
2010-01-04 1
2010-01-05 2
dtype: int64
# Partial-string indexing on the DatetimeIndex: the year '2010' selects every
# row dated in 2010 (both rows, per the output below).
data['2010']
2010-01-04 1
2010-01-05 2
dtype: int64
# Load only the two columns of interest from the tab-separated file. No date
# parsing is requested here; review_date is displayed as m/d/yyyy below.
test_tsv = pd.read_csv(
    './test_tsv.tsv',
    sep='\t',
    usecols=['customer_id', 'review_date'],
)
test_tsv
| | customer_id | review_date |
---|---|---|
0 | 40626522 | 8/31/2015 |
1 | 16290022 | 8/31/2015 |
2 | 10216509 | 8/31/2015 |
3 | 114040 | 8/31/2015 |
4 | 27971579 | 8/31/2015 |
... | ... | ... |
18934 | 21573136 | 5/24/2004 |
18935 | 19606706 | 4/4/2004 |
18936 | 25764155 | 4/4/2004 |
18937 | 28162301 | 12/2/2003 |
18938 | 40173284 | 4/27/2003 |
18939 rows × 2 columns
# Reload the same two columns, this time parsing review_date into datetimes.
# Put review_date first, order the rows chronologically, and replace the
# shuffled index with a fresh 0..n-1 one (the old index is dropped).
test_tsv = (
    pd.read_csv(
        './test_tsv.tsv',
        sep='\t',
        usecols=['customer_id', 'review_date'],
        parse_dates=['review_date'],
    )[['review_date', 'customer_id']]
    .sort_values(by='review_date')
    .reset_index(drop=True)
)
test_tsv
| | review_date | customer_id |
---|---|---|
0 | 2003-04-27 | 40173284 |
1 | 2003-12-02 | 28162301 |
2 | 2004-04-04 | 19606706 |
3 | 2004-04-04 | 25764155 |
4 | 2004-05-24 | 21573136 |
... | ... | ... |
18934 | 2015-08-31 | 2761934 |
18935 | 2015-08-31 | 12996130 |
18936 | 2015-08-31 | 9603909 |
18937 | 2015-08-31 | 15312194 |
18938 | 2015-08-31 | 40626522 |
18939 rows × 2 columns