选取时段
导入数据包
import pandas as pd
import numpy as np
import sklearn as skr
import datetime
import matplotlib as plt
import seaborn as sns
from dateutil import relativedelta
def load_data(file_path):
    """Read a CSV file and attach calendar columns to it.

    The file at ``file_path`` is read with ``pd.read_csv`` and passed to
    ``add_timestamp`` with ``"report_date"`` as the source date column.

    Returns:
        The timestamped DataFrame with a fresh 0..n-1 index.
    """
    raw_records = pd.read_csv(file_path)
    stamped = add_timestamp(raw_records, "report_date")
    stamped = stamped.reset_index(drop=True)
    return stamped
# Add calendar timestamp columns to the data set.
def add_timestamp(data, date):
    """Return a copy of ``data`` with calendar columns derived from a date column.

    Args:
        data: Input DataFrame; it is not modified.
        date: Name of the column holding the dates. Values are parsed with
            ``pd.to_datetime(..., format="%Y%m%d")``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the added/overwritten
        columns ``date`` (parsed datetime), ``day``, ``month``, ``year``,
        ``week`` (ISO week number) and ``weekday`` (Monday == 0).
    """
    data_balance = data.copy()
    data_balance["date"] = pd.to_datetime(data_balance[date], format="%Y%m%d")

    date_parts = data_balance["date"].dt
    data_balance["day"] = date_parts.day
    data_balance["month"] = date_parts.month
    data_balance["year"] = date_parts.year

    # ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` is the supported replacement. It returns a
    # nullable UInt32 column, so cast back to what ``.dt.week`` used to give
    # (int64, or float64 with NaN when some dates are missing) to keep
    # downstream arithmetic unchanged.
    iso_week = date_parts.isocalendar().week
    if iso_week.isna().any():
        data_balance["week"] = iso_week.astype("float64")
    else:
        data_balance["week"] = iso_week.astype("int64")

    data_balance["weekday"] = date_parts.weekday
    return data_balance.reset_index(drop=True)
# Sum the purchase / redemption amounts per date.
def total_amt(data, date):
    """Aggregate purchase and redemption amounts per day.

    Args:
        data: DataFrame with the columns ``date``, ``total_purchase_amt``
            and ``total_redeem_amt``.
        date: Lower bound (inclusive); only rows whose ``date`` is greater
            than or equal to this value are returned.

    Returns:
        A DataFrame with one row per date and the columns ``date``,
        ``total_purchase_amt`` and ``total_redeem_amt`` (sums per date),
        with a fresh 0..n-1 index.
    """
    # Select the value columns with a list: indexing a GroupBy with a bare
    # tuple of names was deprecated in pandas 1.0 and raises in pandas 2.0.
    daily_totals = data.groupby("date", as_index=False)[
        ["total_purchase_amt", "total_redeem_amt"]
    ].sum()
    return daily_totals[daily_totals["date"] >= date].reset_index(drop=True)
# Generate the placeholder (to-be-predicted) rows.
def generate_data(data, start_date, end_date):
    """Append one placeholder row per day in ``[start_date, end_date)``.

    Each appended row carries the day in the FIRST column of ``data`` and NaN
    in every other column. The input frame is not modified.

    Args:
        data: DataFrame whose first column holds the dates.
        start_date: First day to append (anything ``pd.to_datetime`` accepts).
        end_date: Exclusive upper bound of the appended days.

    Returns:
        ``data`` followed by the placeholder rows, with a fresh 0..n-1 index.
        When the range is empty (``end_date <= start_date``) only the rows of
        ``data`` are returned.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Daily steps from start_date, strictly before end_date. The explicit
    # ``<`` filter (rather than looping until equality) guarantees termination
    # when end_date precedes start_date or is not a whole number of days away.
    new_days = pd.date_range(start_date, end_date, freq="D")
    new_days = new_days[new_days < end_date]
    if new_days.empty:
        return data.reset_index(drop=True)

    # Only the date column is supplied; concat fills the remaining columns
    # with NaN, so the frame may have any number of value columns.
    date_column = data.columns[0]
    placeholders = pd.DataFrame({date_column: new_days})
    return pd.concat([data, placeholders], axis=0, ignore_index=True)
# Locations of the input CSV files, relative to the working directory.
user_balance_file_path = r"./Data/user_balance_table.csv"
user_info_file_path = r"./Data/user_profile_table.csv"

# Balance records with calendar columns attached by load_data.
data_balance = load_data(user_balance_file_path)

# Daily totals from 2014-03-01 on, extended with placeholder rows for the
# requested date range, then re-stamped with calendar columns.
total_balance = add_timestamp(
    generate_data(
        total_amt(data_balance, "2014-03-01"),
        "2014-08-04",
        "2014-8-31",
    ),
    "date",
)
# Rule-based time-series prediction: weekday factor x day-of-month base.
def generate_base(data, month_index):
    """Predict daily purchase/redemption amounts for one month of 2014.

    The training window is every row of ``data`` with
    ``2014-03-01 <= date < 2014-<month_index>-01``. From it the function
    derives

    * a weekday factor: mean amount per weekday / overall mean amount;
    * a day-of-month rate: the weekday factors weighted by how often each
      weekday fell on that day of the month, divided by the number of
      distinct months in the window;
    * a base per day of month: mean amount on that day / day-of-month rate.

    The prediction for a target day is ``base[day] * weekday_factor`` of that
    day's weekday in the target month.

    Args:
        data: DataFrame with a datetime column ``date`` and the numeric
            columns ``total_purchase_amt`` and ``total_redeem_amt``.
        month_index: Month (1-12) of 2014 to predict.

    Returns:
        DataFrame sorted by ``date`` with the columns ``date``,
        ``total_purchase_amt`` and ``total_redeem_amt``; one row for every
        day of the target month whose day-of-month occurs in the window.
    """
    amount_cols = ["total_purchase_amt", "total_redeem_amt"]
    factor_cols = {name: name + "_weekdaymean" for name in amount_cols}
    target_start = pd.Timestamp(2014, month_index, 1)

    # Select the fixed training window; copy so the column assignments below
    # do not write into a view of the caller's frame.
    window = data[["date"] + amount_cols]
    in_window = (window["date"] >= "2014-03-01") & (window["date"] < target_start)
    total_balance = window[in_window].copy()

    # Calendar columns needed by the factors.
    total_balance["day"] = total_balance["date"].dt.day
    total_balance["month"] = total_balance["date"].dt.month
    total_balance["weekday"] = total_balance["date"].dt.weekday

    # Weekday factor: mean per weekday divided by the overall mean.
    mean_of_each_weekday = (
        total_balance[["weekday"] + amount_cols]
        .groupby("weekday", as_index=False)
        .mean()
        .rename(columns=factor_cols)
    )
    for name in amount_cols:
        mean_of_each_weekday[factor_cols[name]] /= total_balance[name].mean()

    # How often each weekday occurred on each day of the month (1-31).
    weekday_count = (
        total_balance[["date", "weekday", "day"]]
        .groupby(["day", "weekday"], as_index=False)
        .count()
    )
    weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on="weekday")

    # Weight the weekday factors by that frequency to get the day-of-month rate.
    n_months = total_balance["month"].nunique()
    frequency = weekday_count["date"] / n_months
    for name in amount_cols:
        weekday_count[factor_cols[name]] *= frequency
    day_rate = (
        weekday_count.drop(["weekday", "date"], axis=1)
        .groupby("day", as_index=False)
        .sum()
    )

    # Base: mean amount per day of month with the day-of-month rate removed.
    # Each amount is divided by ITS OWN rate column.
    day_mean = (
        total_balance[["day"] + amount_cols].groupby("day", as_index=False).mean()
    )
    day_pre = pd.merge(day_mean, day_rate, on="day", how="left")
    for name in amount_cols:
        day_pre[name] /= day_pre[factor_cols[name]]

    # Build the target-month dates, keeping only days that exist in that
    # month (handles 28-, 30- and 31-day months alike).
    day_pre = day_pre[day_pre["day"] <= target_start.days_in_month].copy()
    day_pre["date"] = target_start + pd.to_timedelta(day_pre["day"] - 1, unit="D")

    # Final prediction: base times the weekday factor of the target date.
    day_pre["weekday"] = day_pre["date"].dt.weekday
    day_pre = day_pre[["date", "weekday"] + amount_cols]
    day_pre = pd.merge(day_pre, mean_of_each_weekday, on="weekday")
    for name in amount_cols:
        day_pre[name] *= day_pre[factor_cols[name]]

    return day_pre.sort_values("date")[["date"] + amount_cols]