import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import gc
import tqdm
import warnings
warnings.filterwarnings("ignore")
path_train = "../preocess_data/train_data_o.csv"
path_test = "../data/test_data.csv"
data = pd.read_csv(path_train)
data_test = pd.read_csv(path_test)
data["运营日期"] = pd.to_datetime(data["运营日期"] )
data_test["运营日期"] = pd.to_datetime(data_test["日期"])
data.drop(["行ID","日期"],axis=1,inplace=True)
data_test.drop(["行ID","日期"],axis=1,inplace=True)
data["商店ID"].min()
data["商店ID"].max()
365
enc = OneHotEncoder(drop="if_binary")
enc.fit(data["折扣"].values.reshape(-1,1))
enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]])
data["折扣"] = enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
data_test["折扣"] = enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
enc = OneHotEncoder(drop="if_binary")
enc.fit(data[["商店类型","位置","地区"]])
OneHotEncoder(drop='if_binary')
columns_ = []
for index in enc.categories_:
    for name in index:
        columns_.append(name)
columns_
['S1', 'S2', 'S3', 'S4', 'L1', 'L2', 'L3', 'L4', 'L5', 'R1', 'R2', 'R3', 'R4']
enc_train = pd.DataFrame(enc.transform(data[["商店类型","位置","地区"]]).toarray(),columns=columns_)
enc_test = pd.DataFrame(enc.transform(data_test[["商店类型","位置","地区"]]).toarray(),columns=columns_)
data = pd.concat([data,enc_train],axis=1)
data_test = pd.concat([data_test,enc_test],axis=1)
data.drop(["商店类型","位置","地区"],axis=1,inplace=True)
data_test.drop(["商店类型","位置","地区"],axis=1,inplace=True)
def time_derivation(t, col="运营日期"):
    t["year"] = t[col].dt.year
    t["month"] = t[col].dt.month
    t["day"] = t[col].dt.day
    t["quarter"] = t[col].dt.quarter
    # dt.weekofyear was removed in recent pandas versions; isocalendar().week is the replacement
    t["weekofyear"] = t[col].dt.isocalendar().week.astype(int)
    t["dayofweek"] = t[col].dt.dayofweek + 1
    t["weekend"] = (t["dayofweek"] > 5).astype(int)
    return t
data_train = time_derivation(data)
data_test_ = time_derivation(data_test)
data_train.drop("运营日期",axis=1,inplace=True)
data_test_.drop("运营日期",axis=1,inplace=True)
data_train.shape
(188340, 24)
def series_to_supervisied_(data, step_in, step_out, dropnan=True):
    """
    :param data: the observed sequence; in practice a pandas DataFrame (shift and columns are used below)
    :param step_in: number of lagged observations used as input (x)
    :param step_out: number of observations used as output (y)
    :param dropnan: whether to drop rows containing NaN, defaults to True
    :return: DataFrame whose columns are reorganised for supervised learning
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = data
    cols = []
    names = []
    # input sequence [(t-n), (t-n+1), (t-n+2) ... (t-1)]
    for i in range(step_in, 0, -1):
        cols.append(df.shift(i))
        names += [f"{name}-({i})step" for name in df.columns]
    # output sequence [t, (t+1), (t+2) ... (t+n)]
    for i in range(0, step_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [f"{name}+(0)step" for name in df.columns]
        else:
            names += [f"{name}+({i})step" for name in df.columns]
    df_re = pd.concat(cols, axis=1)
    df_re.columns = names
    if dropnan:
        df_re.dropna(inplace=True)
    return df_re
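A quick toy check (not from the original data) of how series_to_supervisied_ lays out the columns: with step_in=2 and step_out=1 every feature appears as its t-2, t-1 and t versions, and the first two rows are dropped because their lags are NaN.
demo = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})
series_to_supervisied_(demo, step_in=2, step_out=1)
#    a-(2)step  b-(2)step  a-(1)step  b-(1)step  a+(0)step  b+(0)step
# 2        1.0       10.0        2.0       20.0          3         30
# 3        2.0       20.0        3.0       30.0          4         40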
train_data = []
test_data = []
for i in tqdm.trange(1, 366):
    data = data_train[data_train.loc[:, "商店ID"] == i].copy()
    df_re = series_to_supervisied_(data, step_in=10, step_out=1, dropnan=True)
    # shuffle=False keeps the split chronological: the last 20% of each store's series becomes the test set
    data_train_, data_test_ = train_test_split(df_re, test_size=0.2, shuffle=False)
    train_data.append(data_train_)
    test_data.append(data_test_)
    del data, df_re, data_train_, data_test_
    gc.collect()
train_data = pd.concat(train_data)
test_data = pd.concat(test_data)
100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [00:13<00:00, 26.78it/s]
train_data
|  | 商店ID-(10)step | 节假日-(10)step | 折扣-(10)step | 销量-(10)step | S1-(10)step | S2-(10)step | S3-(10)step | S4-(10)step | L1-(10)step | L2-(10)step | ... | R2+(0)step | R3+(0)step | R4+(0)step | year+(0)step | month+(0)step | day+(0)step | quarter+(0)step | weekofyear+(0)step | dayofweek+(0)step | weekend+(0)step |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
3806 | 1.0 | 1.0 | 1.0 | 7011.84 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2018 | 1 | 11 | 1 | 2 | 4 | 0 |
4195 | 1.0 | 0.0 | 1.0 | 42369.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2018 | 1 | 12 | 1 | 2 | 5 | 0 |
4652 | 1.0 | 0.0 | 1.0 | 50037.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2018 | 1 | 13 | 1 | 2 | 6 | 1 |
4757 | 1.0 | 0.0 | 1.0 | 44397.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2018 | 1 | 14 | 1 | 2 | 7 | 1 |
5454 | 1.0 | 0.0 | 1.0 | 47604.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 2018 | 1 | 15 | 1 | 3 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
149622 | 365.0 | 0.0 | 1.0 | 32430.00 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 2019 | 2 | 14 | 1 | 7 | 4 | 0 |
149753 | 365.0 | 0.0 | 1.0 | 38766.00 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 2019 | 2 | 15 | 1 | 7 | 5 | 0 |
150337 | 365.0 | 0.0 | 1.0 | 62139.00 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 2019 | 2 | 16 | 1 | 7 | 6 | 1 |
150651 | 365.0 | 0.0 | 0.0 | 21273.00 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 2019 | 2 | 17 | 1 | 7 | 7 | 1 |
150910 | 365.0 | 0.0 | 0.0 | 28833.00 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 2019 | 2 | 18 | 1 | 8 | 1 | 0 |
147460 rows × 264 columns
y_columns = "销量+(0)step"
y_train = train_data[y_columns]
x_train = train_data[list(train_data.columns)[:-24]]
x_test= test_data[list(test_data.columns)[:-24]]
y_test = test_data[y_columns]
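Each supervised row holds 11 time steps (10 lags plus the current step) of the 24 derived features, i.e. 264 columns; the trailing 24 columns are the +(0)step (current-day) values, which include the target 销量, so slicing them off leaves only lagged information in the inputs. An optional sanity check of that assumption:
all(c.endswith("+(0)step") for c in train_data.columns[-24:])   # should be True
x_train.shape   # (147460, 240), i.e. 264 - 24 lagged feature columns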
from sklearn.utils import shuffle
trains = pd.concat([x_train,y_train],axis=1)
trains = shuffle(trains,random_state=1412)
x__train = trains.iloc[:,:-1]
y__train = trains.iloc[:,-1]
from sklearn.ensemble import RandomForestRegressor
rf_clf = RandomForestRegressor()
rf_clf.fit(x__train,y__train)
RandomForestRegressor()
rf_clf.score(x_test,y_test)
0.7297944553988286
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,rf_clf.predict(x_test))**0.5
10316.49939033907
mean_squared_error(y__train,rf_clf.predict(x__train))**0.5
2340.9166743339324
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.sum(np.abs(y_true - y_pred) * 2) / np.sum(np.abs(y_true) + np.abs(y_pred))

def prophet_smape(y_true, y_pred):
    smape_val = symmetric_mean_absolute_percentage_error(y_true, y_pred)
    return 'SMAPE', smape_val, False
prophet_smape(y_test,rf_clf.predict(x_test))
('SMAPE', 0.15452129351735566, False)
prophet_smape(y__train,rf_clf.predict(x__train))
('SMAPE', 0.03526619069708201, False)
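The metric above is the aggregated form of SMAPE, sum(2*|y - y_hat|) / sum(|y| + |y_hat|), so it ranges from 0 (perfect) to 2. A small hand-computed check with made-up numbers:
prophet_smape([100, 200], [110, 180])
# numerator = 2*(10 + 20) = 60, denominator = (100+110) + (200+180) = 590
# ('SMAPE', 0.10169491525423729, False)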
from xgboost import XGBRegressor
xgb_clf = XGBRegressor()
xgb_clf.fit(x__train,y__train)
xgb_clf.score(x_test,y_test)
0.7136438711451321
mean_squared_error(y_test,xgb_clf.predict(x_test))**0.5
10620.341210004963
mean_squared_error(y__train,xgb_clf.predict(x__train))**0.5
5570.337227187151
prophet_smape(y_test,xgb_clf.predict(x_test))
('SMAPE', 0.16851558969888786, False)
prophet_smape(y__train,xgb_clf.predict(x__train))
('SMAPE', 0.09342656379545357, False)
The code below also works; because it is so time-consuming, the run was interrupted partway through.
from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss
# define the objective function
def hpyeropt_objective(params):
    train_data = []
    test_data = []
    for i in range(1, 366):
        data = data_train[data_train.loc[:, "商店ID"] == i].copy()
        df_re = series_to_supervisied_(data, step_in=int(params["time_step"]), step_out=1, dropnan=True)
        data_train_, data_test_ = train_test_split(df_re, test_size=0.2, shuffle=False)
        train_data.append(data_train_)
        test_data.append(data_test_)
        del data, df_re, data_train_, data_test_
        gc.collect()
    train_data = pd.concat(train_data)
    test_data = pd.concat(test_data)
    y_columns = "销量+(0)step"
    y_train = train_data[y_columns]
    x_train = train_data[list(train_data.columns)[:-24]]
    x_test = test_data[list(test_data.columns)[:-24]]
    y_test = test_data[y_columns]
    from sklearn.utils import shuffle
    trains = pd.concat([x_train, y_train], axis=1)
    trains = shuffle(trains, random_state=1412)
    x__train = trains.iloc[:, :-1]
    y__train = trains.iloc[:, -1]
    clf = RandomForestRegressor(n_estimators=int(params["n_estimators"]),
                                max_depth=int(params["max_depth"]),
                                # max_features=int(min(params["max_features"],len(xtrain.columns))),
                                min_impurity_decrease=params["min_impurity_decrease"],
                                random_state=1412,
                                verbose=False,
                                n_jobs=-1
                                ).fit(x__train, y__train)
    scores = clf.score(x_test, y_test)
    # hyperopt minimises the objective, so return the negative R^2
    return -scores
# define the parameter search space
params_grid = {"n_estimators": hp.quniform("n_estimators", 10, 1000, 20),
               "max_depth": hp.quniform("max_depth", 5, 25, 1),
               # "max_features": hp.quniform("max_features", 10, 10000, 1),
               "min_impurity_decrease": hp.quniform("min_impurity_decrease", 0, 5, 1),
               "time_step": hp.quniform("time_step", 10, 200, 5)
               }
# define the optimisation loop
def param_hyperopt(max_evals=100):
    trials = Trials()
    early_stop_fn = no_progress_loss(50)
    params_best = fmin(hpyeropt_objective,
                       space=params_grid,
                       algo=tpe.suggest,
                       max_evals=max_evals,
                       verbose=True,
                       trials=trials,
                       early_stop_fn=early_stop_fn
                       )
    print("\n", "\n", "best params:", params_best, "\n")
    return params_best, trials
params_best,trials = param_hyperopt(max_evals = 300)
30%|██████████▉ | 89/300 [83:23:32<243:07:02, 4147.97s/trial, best loss: -0.7321221326244512]
This approach takes far too long, largely because the time-step length keeps changing from trial to trial. Given the limited compute available, it is not explored further here. Even so, with only the time window taken into account, R² already improves dramatically, which suggests the current line of thinking is correct and the scheme is workable. Further optimisation should come from the feature side, i.e. feature derivation, so a natural next direction is to derive features within the time window.
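As a rough sketch of that direction (hypothetical code, with arbitrary window sizes, assuming each store's frame is still ordered by date): per-store rolling statistics of 销量, shifted by one day so that only past information enters the window, could be added before the series-to-supervised step.
def add_rolling_features(df, col="销量", windows=(7, 14, 30)):
    df = df.copy()
    for w in windows:
        # shift(1) keeps the current day out of its own window, avoiding target leakage
        df[f"{col}_mean_{w}d"] = df[col].shift(1).rolling(w).mean()
        df[f"{col}_std_{w}d"] = df[col].shift(1).rolling(w).std()
    return df

# e.g. data_train = data_train.groupby("商店ID", group_keys=False).apply(add_rolling_features)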