dateframe 日期数据,字符型转换成日期格式:
pd.to_datetime(dateframe_timedate,format='%Y/%m/%d %H:%M:%S')
常见的报错检查点:
np.where(条件,x,y)#条件正确取值为x,条件错误取值为y。
np.where()可见np.where()用法
TFS_data_origin['Date_of_submission'] = np.where((len(TFS_data_origin['Date_of_submission'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Date_of_submission'])
多个日期比较前,先进行时间格式转换,字符型时间相互比较会有问题。
np.where(条件,x,y)#条件正确取值为x,条件错误取值为y。
TFS_data_origin['Delivered_date'] = np.where((TFS_data_origin['Delivered_date']<TFS_data_origin['Date_of_submission'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Adopted_date'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Delivery_date']) ,\
None, TFS_data_origin['Delivered_date'])
多个日期计算前,先进行时间格式转换。
dataframe格式日期天数计算参见另一篇博文dataframe计算两个日期之间天数 两种思路(推荐思路2),这里采用的是思路2.
FS_data_origin['Req_delivery_cycle'] = (TFS_data_origin['Delivered_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1
group by 结合describe()或者聚合函数sum()等。
dateframe中多种数据类型,可以选择其中的某几列进行统计。
dateframe某一列中多种类型,可以强制转换成数值型再进行统计。
TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle','Req_RD_cycle',\
'Req_plan_cycle']]=TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle',\
'Req_RD_cycle','Req_plan_cycle']].astype('float')
TFS_statistic = TFS_data.groupby("project_name").describe(percentiles=[.5, .8])
'''
2010-01-08
主题:python dataframe时间类型数据预处理以及周期统计分析
Dandan
'''
import pandas as pd
import numpy as np
import time, datetime
from datetime import datetime
#导入数据
#encoding = ‘gbk’ windows下中文的编码
TFS_data_origin = pd.read_csv("data_input/timedata.csv",encoding = 'gbk')
#原始数据预览
print('*'*10+'TFS_data_origin shape'+'*'*10)
TFS_data_origin.describe()
print(TFS_data_origin.shape)
#数据预处理
#TFS数据预处理
#日期填写率统计
TFS_data_origin.isnull().any()
print(TFS_data_origin.isnull().sum())
#1-日期异常值处理
#日期填写内容错误,比如“0000-01-01 00:00:00”,日期长度格式填写错误,比如'2019-01-01 00:00'
TFS_data_origin['Date_of_submission'] = np.where((len(TFS_data_origin['Date_of_submission'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Date_of_submission'])
TFS_data_origin['Adopted_date'] = np.where((TFS_data_origin['Adopted_date']<'2019/01/01 00:00:00')|\
(len(TFS_data_origin['Adopted_date'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Adopted_date'])
TFS_data_origin['Delivery_date'] = np.where((TFS_data_origin['Delivery_date']<'2019/01/01 00:00:00')|\
(len(TFS_data_origin['Delivery_date'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Delivery_date'])
TFS_data_origin['Delivered_date'] = np.where((TFS_data_origin['Delivered_date']<'2019/01/01 00:00:00')|\
(len(TFS_data_origin['Delivered_date'])<19)|\
(TFS_data_origin['Date_of_submission']=='0001/01/01 08:00:00'),\
None, TFS_data_origin['Delivered_date'])
#日期字符串日期化,字符串转日期格式,以便进行日期比较。字符型日期比较会出错。'2020/1/9'>'2020/1/13'
TFS_data_origin['Date_of_submission'] = pd.to_datetime(TFS_data_origin['Date_of_submission'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Adopted_date'] = pd.to_datetime(TFS_data_origin['Adopted_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivery_date'] = pd.to_datetime(TFS_data_origin['Delivery_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivered_date'] = pd.to_datetime(TFS_data_origin['Delivered_date'],format='%Y/%m/%d %H:%M:%S')
#日期之间关系异常,理论上'Date_of_submission'<='Adopted_date'<='Delivery_date'<='Delivered_date'
TFS_data_origin['Delivered_date'] = np.where((TFS_data_origin['Delivered_date']<TFS_data_origin['Date_of_submission'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Adopted_date'])|\
(TFS_data_origin['Delivered_date']<TFS_data_origin['Delivery_date']) ,\
None, TFS_data_origin['Delivered_date'])
TFS_data_origin['Delivery_date'] = np.where((TFS_data_origin['Delivery_date']<TFS_data_origin['Date_of_submission'])|\
(TFS_data_origin['Delivery_date']<TFS_data_origin['Adopted_date']),\
None, TFS_data_origin['Delivery_date'])
TFS_data_origin['Adopted_date'] = np.where((TFS_data_origin['Adopted_date']<TFS_data_origin['Date_of_submission']),\
None, TFS_data_origin['Adopted_date'])
# TFS_data_origin.describe()
print('*'*20)
TFS_data_origin.isnull().any()
print(TFS_data_origin.isnull().sum())
#日期字符串日期化,字符串转日期格式,以便进行日期比较。字符型日期会存在问题。
TFS_data_origin['Date_of_submission'] = pd.to_datetime(TFS_data_origin['Date_of_submission'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Adopted_date'] = pd.to_datetime(TFS_data_origin['Adopted_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivery_date'] = pd.to_datetime(TFS_data_origin['Delivery_date'],format='%Y/%m/%d %H:%M:%S')
TFS_data_origin['Delivered_date'] = pd.to_datetime(TFS_data_origin['Delivered_date'],format='%Y/%m/%d %H:%M:%S')
'''
计算周期,没有考虑状态!
TFS_data_origin['Req_delivery_cycle'] = (TFS_data_origin['Delivered_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1
print(TFS_data_origin['Req_delivery_cycle'].isnull().sum())
print(TFS_data_origin['Req_delivery_cycle'].mean())
TFS_data_origin['Req_RD_cycle'] = (TFS_data_origin['Delivery_date'] - TFS_data_origin['Adopted_date']).dt.days + 1
print(TFS_data_origin['Req_RD_cycle'].isnull().sum())
print(TFS_data_origin['Req_RD_cycle'].mean())
TFS_data_origin['Req_delivery_waiting_cycle'] = (TFS_data_origin['Delivered_date'] - TFS_data_origin['Delivery_date']).dt.days + 1
print(TFS_data_origin['Req_delivery_waiting_cycle'].isnull().sum())
print(TFS_data_origin['Req_delivery_waiting_cycle'].mean())
TFS_data_origin['Req_plan_cycle'] = (TFS_data_origin['Adopted_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1
print(TFS_data_origin['Req_plan_cycle'].isnull().sum())
print(TFS_data_origin['Req_plan_cycle'].mean())
'''
#特定状态下计算周期,其余状态置空。
TFS_data_origin['Req_delivery_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x=='已交付'),\
((TFS_data_origin['Delivered_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1),\
None)
TFS_data_origin['Req_delivery_waiting_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x=='已交付'),\
((TFS_data_origin['Delivered_date'] - TFS_data_origin['Delivery_date']).dt.days + 1),\
None)
TFS_data_origin['Req_RD_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x in ('已交付','可交付')),\
((TFS_data_origin['Delivery_date'] - TFS_data_origin['Adopted_date']).dt.days + 1),\
None)
TFS_data_origin['Req_plan_cycle'] = np.where(TFS_data_origin['standard_state'].map(lambda x:x in ('已采纳','已认领'),\ ((TFS_data_origin['Adopted_date'] - TFS_data_origin['Date_of_submission']).dt.days + 1),\
None)
print(TFS_data_origin['Req_delivery_cycle'], TFS_data_origin['Req_delivery_cycle'])
#选取大于等于1的有效需求交付周期数据
TFS_data = TFS_data_origin[(TFS_data_origin['Req_delivery_cycle']>=1)]
TFS_data.describe()
outputpath0='data_output/TFS_data_new.csv'
TFS_data.to_csv(outputpath0,sep=',',index=True,header=True)
#将周期列转换为数值型数据,方便进行分组统计
TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle','Req_RD_cycle',\
'Req_plan_cycle']]=TFS_data[['Req_delivery_cycle','Req_delivery_waiting_cycle',\
'Req_RD_cycle','Req_plan_cycle']].astype('float')
#数据分组统计
#percentiles=[.5, .8] 增加80%分位数:
TFS_statistic = TFS_data.groupby("project_name").describe(percentiles=[.5, .8])
print('*'*10+'TFS_statistic shape'+'*'*10)
print(TFS_statistic.shape)
print('*'*20)
outputpath1='data_output/TFS_statistics_new.csv'
TFS_statistic.to_csv(outputpath1,sep=',',index=True,header=True)