日期格式数据处理
- Pandas中使用to_datetime()方法将文本格式转换为日期格式
- dataframe数据类型如果为datetime64,可以使用dt方法取出年月日等
- 对于时间差数据,可以使用timedelta函数将其转换为指定时间单位的数值
- 时间差数据,可以使用dt方法访问其常用属性
import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据转换'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})
df.head(5)
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
0 |
786295544 |
41098319944 |
50014866 |
50022520 |
21458:86755362;13023209:3593274;10984217:21985... |
2 |
20140919 |
1 |
532110457 |
17916191097 |
50011993 |
28 |
21458:11399317;1628862:3251296;21475:137325;16... |
1 |
20131011 |
2 |
249013725 |
21896936223 |
50012461 |
50014815 |
21458:30992;1628665:92012;1628665:3233938;1628... |
1 |
20131011 |
3 |
917056007 |
12515996043 |
50018831 |
50014815 |
21458:15841995;21956:3494076;27000458:59723383... |
2 |
20141023 |
4 |
444069173 |
20487688075 |
50013636 |
50008168 |
21458:30992;13658074:3323064;1628665:3233941;1... |
1 |
20141103 |
df.info()
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 7 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.6+ MB
df['buy_date'] = pd.to_datetime(df['day'], format='%Y%m%d', errors='coerce')
df.info()
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 8 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
buy_date 29971 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 1.8+ MB
df.head(5)
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
buy_date |
0 |
786295544 |
41098319944 |
50014866 |
50022520 |
21458:86755362;13023209:3593274;10984217:21985... |
2 |
20140919 |
2014-09-19 |
1 |
532110457 |
17916191097 |
50011993 |
28 |
21458:11399317;1628862:3251296;21475:137325;16... |
1 |
20131011 |
2013-10-11 |
2 |
249013725 |
21896936223 |
50012461 |
50014815 |
21458:30992;1628665:92012;1628665:3233938;1628... |
1 |
20131011 |
2013-10-11 |
3 |
917056007 |
12515996043 |
50018831 |
50014815 |
21458:15841995;21956:3494076;27000458:59723383... |
2 |
20141023 |
2014-10-23 |
4 |
444069173 |
20487688075 |
50013636 |
50008168 |
21458:30992;13658074:3323064;1628665:3233941;1... |
1 |
20141103 |
2014-11-03 |
df['diff_day'] = pd.datetime.now() - df['buy_date']
df.head(5)
|
user_id |
auction_id |
cat_id |
cat1 |
property |
buy_mount |
day |
buy_date |
diff_day |
0 |
786295544 |
41098319944 |
50014866 |
50022520 |
21458:86755362;13023209:3593274;10984217:21985... |
2 |
20140919 |
2014-09-19 |
2034 days 22:32:35.614788 |
1 |
532110457 |
17916191097 |
50011993 |
28 |
21458:11399317;1628862:3251296;21475:137325;16... |
1 |
20131011 |
2013-10-11 |
2377 days 22:32:35.614788 |
2 |
249013725 |
21896936223 |
50012461 |
50014815 |
21458:30992;1628665:92012;1628665:3233938;1628... |
1 |
20131011 |
2013-10-11 |
2377 days 22:32:35.614788 |
3 |
917056007 |
12515996043 |
50018831 |
50014815 |
21458:15841995;21956:3494076;27000458:59723383... |
2 |
20141023 |
2014-10-23 |
2000 days 22:32:35.614788 |
4 |
444069173 |
20487688075 |
50013636 |
50008168 |
21458:30992;13658074:3323064;1628665:3233941;1... |
1 |
20141103 |
2014-11-03 |
1989 days 22:32:35.614788 |
df.dtypes
user_id object
auction_id int64
cat_id int64
cat1 int64
property object
buy_mount int64
day int64
buy_date datetime64[ns]
diff_day timedelta64[ns]
dtype: object
df['时间差'] = df['diff_day']/pd.Timedelta('1 D')
df['时间差'].head(5)
0 2034.939301
1 2377.939301
2 2377.939301
3 2000.939301
4 1989.939301
Name: 时间差, dtype: float64
df['时间差'] = df['diff_day']/pd.Timedelta('1 H')
df['时间差'].head(5)
0 48838.543226
1 57070.543226
2 57070.543226
3 48022.543226
4 47758.543226
Name: 时间差, dtype: float64
df['时间差'] = df['diff_day']/pd.Timedelta('1 M')
df['时间差'].head(5)
0 2.930313e+06
1 3.424233e+06
2 3.424233e+06
3 2.881353e+06
4 2.865513e+06
Name: 时间差, dtype: float64
df['时间差'].head(5).round(decimals=3)
0 2930312.594
1 3424232.594
2 3424232.594
3 2881352.594
4 2865512.594
Name: 时间差, dtype: float64
df['diff_day'].astype('timedelta64[D]').head(5)
0 2034.0
1 2377.0
2 2377.0
3 2000.0
4 1989.0
Name: diff_day, dtype: float64
df['diff_day'].astype('timedelta64[M]').head(5)
0 66.0
1 78.0
2 78.0
3 65.0
4 65.0
Name: diff_day, dtype: float64
df['diff_day'].astype('timedelta64[Y]').head(5)
0 5.0
1 6.0
2 6.0
3 5.0
4 5.0
Name: diff_day, dtype: float64
df['diff_day'].astype('timedelta64[h]').head(5)
0 48838.0
1 57070.0
2 57070.0
3 48022.0
4 47758.0
Name: diff_day, dtype: float64