Here we try the second approach, which as I understand it is also a form of feature derivation: constructing features over time windows.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import gc
import tqdm
import warnings
warnings.filterwarnings("ignore")
path_train = "../preocess_data/train_data_o.csv"
path_test = "../data/test_data.csv"
data = pd.read_csv(path_train)
data_test = pd.read_csv(path_test)
data["运营日期"] = pd.to_datetime(data["运营日期"] )
data_test["运营日期"] = pd.to_datetime(data_test["日期"])
data_test
 | 行ID | 商店ID | 商店类型 | 位置 | 地区 | 日期 | 节假日 | 折扣 | 运营日期
---|---|---|---|---|---|---|---|---|---|
0 | T1188341 | 171 | S4 | L2 | R3 | 2019/6/1 | 0 | No | 2019-06-01 |
1 | T1188342 | 172 | S1 | L1 | R1 | 2019/6/1 | 0 | No | 2019-06-01 |
2 | T1188343 | 173 | S4 | L2 | R1 | 2019/6/1 | 0 | No | 2019-06-01 |
3 | T1188344 | 174 | S1 | L1 | R4 | 2019/6/1 | 0 | No | 2019-06-01 |
4 | T1188345 | 170 | S1 | L1 | R2 | 2019/6/1 | 0 | No | 2019-06-01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
22260 | T1210601 | 186 | S2 | L5 | R2 | 2019/7/31 | 0 | No | 2019-07-31 |
22261 | T1210602 | 11 | S4 | L2 | R1 | 2019/7/31 | 0 | No | 2019-07-31 |
22262 | T1210603 | 185 | S1 | L1 | R3 | 2019/7/31 | 0 | Yes | 2019-07-31 |
22263 | T1210604 | 69 | S1 | L1 | R4 | 2019/7/31 | 0 | No | 2019-07-31 |
22264 | T1210605 | 365 | S2 | L1 | R2 | 2019/7/31 | 0 | No | 2019-07-31 |
22265 rows × 9 columns
data_test.loc[310,"商店ID"]
1
data.drop(["行ID","日期"],axis=1,inplace=True)
data_test.drop(["行ID","日期"],axis=1,inplace=True)
data["商店ID"].min()
1
data["商店ID"].max()
365
enc = OneHotEncoder(drop="if_binary")
enc.fit(data["折扣"].values.reshape(-1,1))
enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
array([[0.],
[0.],
[0.],
...,
[1.],
[0.],
[0.]])
data["折扣"] = enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
data_test["折扣"] = enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
enc = OneHotEncoder(drop="if_binary")
enc.fit(data[["商店类型","位置","地区"]])
OneHotEncoder(drop='if_binary')
columns_ = []
for index in enc.categories_:
    for name in index:
        columns_.append(name)
columns_
['S1', 'S2', 'S3', 'S4', 'L1', 'L2', 'L3', 'L4', 'L5', 'R1', 'R2', 'R3', 'R4']
enc_train = pd.DataFrame(enc.transform(data[["商店类型","位置","地区"]]).toarray(),columns=columns_)
enc_test = pd.DataFrame(enc.transform(data_test[["商店类型","位置","地区"]]).toarray(),columns=columns_)
data = pd.concat([data,enc_train],axis=1)
data_test = pd.concat([data_test,enc_test],axis=1)
data.drop(["商店类型","位置","地区"],axis=1,inplace=True)
data_test.drop(["商店类型","位置","地区"],axis=1,inplace=True)
def time_derivation(t, col="运营日期"):
    # derive calendar features from the datetime column
    t["year"] = t[col].dt.year
    t["month"] = t[col].dt.month
    t["day"] = t[col].dt.day
    t["quarter"] = t[col].dt.quarter
    t["weekofyear"] = t[col].dt.weekofyear   # deprecated in newer pandas; t[col].dt.isocalendar().week is the replacement
    t["dayofweek"] = t[col].dt.dayofweek + 1
    t["weekend"] = (t["dayofweek"] > 5).astype(int)
    t["is_month_start"] = (t[col].dt.is_month_start).astype(int)
    t["is_month_end"] = (t[col].dt.is_month_end).astype(int)
    t["is_quarter_start"] = (t[col].dt.is_quarter_start).astype(int)
    # note: this line overwrites is_month_end with the quarter-end flag; it was presumably meant to be t["is_quarter_end"]
    t["is_month_end"] = (t[col].dt.is_quarter_end).astype(int)
    return t
data_train = time_derivation(data)
data_test_ = time_derivation(data_test)
data_train.drop("运营日期",axis=1,inplace=True)
data_test_.drop("运营日期",axis=1,inplace=True)
data_train.shape
(188340, 27)
data_train
 | 商店ID | 节假日 | 折扣 | 销量 | S1 | S2 | S3 | S4 | L1 | L2 | ... | year | month | day | quarter | weekofyear | dayofweek | weekend | is_month_start | is_month_end | is_quarter_start
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1.0 | 7011.84 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
1 | 253 | 1 | 1.0 | 51789.12 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 2018 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
2 | 252 | 1 | 1.0 | 36868.20 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | 2018 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
3 | 251 | 1 | 1.0 | 19715.16 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
4 | 250 | 1 | 1.0 | 45614.52 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
188335 | 149 | 1 | 1.0 | 37272.00 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2019 | 5 | 31 | 2 | 22 | 5 | 0 | 0 | 0 | 0 |
188336 | 153 | 1 | 0.0 | 54572.64 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | 2019 | 5 | 31 | 2 | 22 | 5 | 0 | 0 | 0 | 0 |
188337 | 154 | 1 | 0.0 | 31624.56 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2019 | 5 | 31 | 2 | 22 | 5 | 0 | 0 | 0 | 0 |
188338 | 155 | 1 | 1.0 | 49162.41 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | ... | 2019 | 5 | 31 | 2 | 22 | 5 | 0 | 0 | 0 | 0 |
188339 | 152 | 1 | 0.0 | 37977.00 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 2019 | 5 | 31 | 2 | 22 | 5 | 0 | 0 | 0 | 0 |
188340 rows × 27 columns
data_train[data_train["商店ID"]==1]
 | 商店ID | 节假日 | 折扣 | 销量 | S1 | S2 | S3 | S4 | L1 | L2 | ... | year | month | day | quarter | weekofyear | dayofweek | weekend | is_month_start | is_month_end | is_quarter_start
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1.0 | 7011.84 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 |
607 | 1 | 0 | 1.0 | 42369.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 2 | 1 | 1 | 2 | 0 | 0 | 0 | 0 |
1046 | 1 | 0 | 1.0 | 50037.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 3 | 1 | 1 | 3 | 0 | 0 | 0 | 0 |
1207 | 1 | 0 | 1.0 | 44397.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 4 | 1 | 1 | 4 | 0 | 0 | 0 | 0 |
1752 | 1 | 0 | 1.0 | 47604.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2018 | 1 | 5 | 1 | 1 | 5 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
186569 | 1 | 0 | 1.0 | 33075.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2019 | 5 | 27 | 2 | 22 | 1 | 0 | 0 | 0 | 0 |
187165 | 1 | 0 | 1.0 | 37317.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2019 | 5 | 28 | 2 | 22 | 2 | 0 | 0 | 0 | 0 |
187391 | 1 | 0 | 1.0 | 44652.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2019 | 5 | 29 | 2 | 22 | 3 | 0 | 0 | 0 | 0 |
187962 | 1 | 0 | 1.0 | 42387.00 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2019 | 5 | 30 | 2 | 22 | 4 | 0 | 0 | 0 | 0 |
188113 | 1 | 1 | 1.0 | 39843.78 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2019 | 5 | 31 | 2 | 22 | 5 | 0 | 0 | 0 | 0 |
516 rows × 27 columns
horizon = 1
target = "销量"
lags=list(range(horizon, horizon+7))+ [28,35,42,49]
for lag in lags:
    data_train[f'{target}_lag_{lag}'] = data_train.groupby(['商店ID'], as_index=False)[target].shift(lag)
data_train.head()
 | 商店ID | 节假日 | 折扣 | 销量 | S1 | S2 | S3 | S4 | L1 | L2 | ... | 销量_lag_2 | 销量_lag_3 | 销量_lag_4 | 销量_lag_5 | 销量_lag_6 | 销量_lag_7 | 销量_lag_28 | 销量_lag_35 | 销量_lag_42 | 销量_lag_49
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1.0 | 7011.84 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 253 | 1 | 1.0 | 51789.12 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 252 | 1 | 1.0 | 36868.20 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 251 | 1 | 1.0 | 19715.16 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 250 | 1 | 1.0 | 45614.52 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 38 columns
data_1 = data_train[["商店ID","销量","销量_lag_1","销量_lag_2","销量_lag_3","销量_lag_4"]]
data_1.loc[data_1["商店ID"]==3,:]
 | 商店ID | 销量 | 销量_lag_1 | 销量_lag_2 | 销量_lag_3 | 销量_lag_4
---|---|---|---|---|---|---|
127 | 3 | 57288.00 | NaN | NaN | NaN | NaN |
511 | 3 | 58929.00 | 57288.0 | NaN | NaN | NaN |
759 | 3 | 69603.00 | 58929.0 | 57288.0 | NaN | NaN |
1447 | 3 | 59721.00 | 69603.0 | 58929.0 | 57288.0 | NaN |
1814 | 3 | 71355.00 | 59721.0 | 69603.0 | 58929.0 | 57288.0 |
... | ... | ... | ... | ... | ... | ... |
186819 | 3 | 69930.00 | 66036.0 | 94548.0 | 76899.0 | 76677.0 |
187006 | 3 | 72540.00 | 69930.0 | 66036.0 | 94548.0 | 76899.0 |
187248 | 3 | 76428.00 | 72540.0 | 69930.0 | 66036.0 | 94548.0 |
187767 | 3 | 78135.00 | 76428.0 | 72540.0 | 69930.0 | 66036.0 |
188264 | 3 | 75790.95 | 78135.0 | 76428.0 | 72540.0 | 69930.0 |
516 rows × 6 columns
data_train[["商店ID","销量","销量_lag_1"]]
 | 商店ID | 销量 | 销量_lag_1
---|---|---|---|
0 | 1 | 7011.84 | NaN |
1 | 253 | 51789.12 | NaN |
2 | 252 | 36868.20 | NaN |
3 | 251 | 19715.16 | NaN |
4 | 250 | 45614.52 | NaN |
... | ... | ... | ... |
188335 | 149 | 37272.00 | 37272.0 |
188336 | 153 | 54572.64 | 58056.0 |
188337 | 154 | 31624.56 | 31944.0 |
188338 | 155 | 49162.41 | 49659.0 |
188339 | 152 | 37977.00 | 37977.0 |
188340 rows × 3 columns
lag_columns = []
for lag in lags:
    lag_columns.append(f'{target}_lag_{lag}')
for i in lag_columns:
    data_train[i] = data_train[i].fillna(0)
data_train.columns
Index(['商店ID', '节假日', '折扣', '销量', 'S1', 'S2', 'S3', 'S4', 'L1', 'L2', 'L3',
'L4', 'L5', 'R1', 'R2', 'R3', 'R4', 'year', 'month', 'day', 'quarter',
'weekofyear', 'dayofweek', 'weekend', 'is_month_start', 'is_month_end',
'is_quarter_start', '销量_lag_1', '销量_lag_2', '销量_lag_3', '销量_lag_4',
'销量_lag_5', '销量_lag_6', '销量_lag_7', '销量_lag_28', '销量_lag_35',
'销量_lag_42', '销量_lag_49'],
dtype='object')
From the lag columns we can then compute window statistics of the target over the past 2 days, 3 days, 1 week, 28 days, 35 days, 42 days, 49 days, 56 days and 70 days: the mean, median, maximum, minimum, lower quartile, upper quartile and so on (the code below only uses the windows in `lags`, i.e. 1–7, 28, 35, 42 and 49 days).
for lag in lags:
    data_train[f'{target}_MA_lag_1_{lag}_mean'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).mean().shift(1))
    data_train[f'{target}_MA_lag_1_{lag}_max'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).max().shift(1))
    data_train[f'{target}_MA_lag_1_{lag}_min'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).min().shift(1))
    data_train[f'{target}_MA_lag_1_{lag}_std'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).std().shift(1))
    # data_train[f'{target}_MA_lag_1_{lag}_median'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).median().shift(1))
data_train = data_train.fillna(0)
data_train.columns
Index(['商店ID', '节假日', '折扣', '销量', 'S1', 'S2', 'S3', 'S4', 'L1', 'L2', 'L3',
'L4', 'L5', 'R1', 'R2', 'R3', 'R4', 'year', 'month', 'day', 'quarter',
'weekofyear', 'dayofweek', 'weekend', 'is_month_start', 'is_month_end',
'is_quarter_start', '销量_lag_1', '销量_lag_2', '销量_lag_3', '销量_lag_4',
'销量_lag_5', '销量_lag_6', '销量_lag_7', '销量_lag_28', '销量_lag_35',
'销量_lag_42', '销量_lag_49', '销量_MA_lag_1_1_mean', '销量_MA_lag_1_1_max',
'销量_MA_lag_1_1_min', '销量_MA_lag_1_1_std', '销量_MA_lag_1_2_mean',
'销量_MA_lag_1_2_max', '销量_MA_lag_1_2_min', '销量_MA_lag_1_2_std',
'销量_MA_lag_1_3_mean', '销量_MA_lag_1_3_max', '销量_MA_lag_1_3_min',
'销量_MA_lag_1_3_std', '销量_MA_lag_1_4_mean', '销量_MA_lag_1_4_max',
'销量_MA_lag_1_4_min', '销量_MA_lag_1_4_std', '销量_MA_lag_1_5_mean',
'销量_MA_lag_1_5_max', '销量_MA_lag_1_5_min', '销量_MA_lag_1_5_std',
'销量_MA_lag_1_6_mean', '销量_MA_lag_1_6_max', '销量_MA_lag_1_6_min',
'销量_MA_lag_1_6_std', '销量_MA_lag_1_7_mean', '销量_MA_lag_1_7_max',
'销量_MA_lag_1_7_min', '销量_MA_lag_1_7_std', '销量_MA_lag_1_28_mean',
'销量_MA_lag_1_28_max', '销量_MA_lag_1_28_min', '销量_MA_lag_1_28_std',
'销量_MA_lag_1_35_mean', '销量_MA_lag_1_35_max', '销量_MA_lag_1_35_min',
'销量_MA_lag_1_35_std', '销量_MA_lag_1_42_mean', '销量_MA_lag_1_42_max',
'销量_MA_lag_1_42_min', '销量_MA_lag_1_42_std', '销量_MA_lag_1_49_mean',
'销量_MA_lag_1_49_max', '销量_MA_lag_1_49_min', '销量_MA_lag_1_49_std'],
dtype='object')
In the same way we can derive further features, such as the median, variance, and the lower and upper quartiles.
for lag in lags:
    data_train[f'{target}_MA_lag_1_{lag}_median'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).median().shift(1))
    data_train[f'{target}_MA_lag_1_{lag}_var'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).var().shift(1))
    data_train[f'{target}_MA_lag_1_{lag}_quantile_1'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).quantile(0.25).shift(1))
    data_train[f'{target}_MA_lag_1_{lag}_quantile_3'] = data_train.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).quantile(0.75).shift(1))
data_train = data_train.fillna(0)
data__1 = data_train[["商店ID","销量","销量_MA_lag_1_2_mean","销量_MA_lag_1_3_mean","销量_MA_lag_1_4_mean"]]
data__1.loc[data__1["商店ID"]==4,:].head()
 | 商店ID | 销量 | 销量_MA_lag_1_2_mean | 销量_MA_lag_1_3_mean | 销量_MA_lag_1_4_mean
---|---|---|---|---|---|
96 | 4 | 53615.52 | 0.00 | 0.00 | 0.00 |
546 | 4 | 43194.00 | 0.00 | 0.00 | 0.00 |
947 | 4 | 40998.00 | 48404.76 | 0.00 | 0.00 |
1291 | 4 | 44787.00 | 42096.00 | 45935.84 | 0.00 |
1819 | 4 | 50814.00 | 42892.50 | 42993.00 | 45648.63 |
(50037.00+42369.00+ 7011.84 )/3
33139.28
For each of the stores numbered 1–365, the first 80% of its rows (in time order) are used for training and the remaining 20% for validation.
train_data = []
test_data = []
for i in tqdm.trange(1, 366):
    data = data_train[data_train.loc[:, "商店ID"] == i].copy()
    data_train_, data_test_ = train_test_split(data, test_size=0.2, shuffle=False)
    train_data.append(data_train_)
    test_data.append(data_test_)
    del data, data_train_, data_test_
    gc.collect()
train_data = pd.concat(train_data)
test_data = pd.concat(test_data)
100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [00:29<00:00, 12.56it/s]
cate_features = ['商店ID', '节假日', '折扣','S1', 'S2', 'S3', 'S4', 'L1', 'L2', 'L3','L4', 'L5', 'R1', 'R2', 'R3', 'R4']
train_data_scaler = []
test_data_scaler = []
ytrain_scaler = []
ytest_scaler = []
encs_x = []
encs_y = []
for i in tqdm.trange(1, 366):
    # fit one MinMaxScaler per store on its training slice only, then apply it to that store's validation slice
    data_train_ = train_data[train_data.loc[:, "商店ID"] == i].copy()
    data_test_ = test_data[test_data.loc[:, "商店ID"] == i].copy()
    x_train = data_train_.loc[:, data_train_.columns != "销量"]
    y_train = data_train_.loc[:, data_train_.columns == "销量"]
    x_test = data_test_.loc[:, data_test_.columns != "销量"]
    y_test = data_test_.loc[:, data_test_.columns == "销量"]
    enc_x = MinMaxScaler()
    enc_y = MinMaxScaler()
    enc_x = enc_x.fit(x_train)
    enc_y = enc_y.fit(y_train)
    x_train_scaler = enc_x.transform(x_train)
    y_train_scaler = enc_y.transform(y_train)
    x_test_scaler = enc_x.transform(x_test)
    y_test_scaler = enc_y.transform(y_test)
    x_train_scaler = pd.DataFrame(x_train_scaler)
    x_test_scaler = pd.DataFrame(x_test_scaler)
    y_train_scaler = pd.DataFrame(y_train_scaler)
    y_test_scaler = pd.DataFrame(y_test_scaler)
    train_data_scaler.append(x_train_scaler)
    test_data_scaler.append(x_test_scaler)
    ytrain_scaler.append(y_train_scaler)
    ytest_scaler.append(y_test_scaler)
    encs_x.append(enc_x)
    encs_y.append(enc_y)
    del x_train, y_train, x_test, y_test, enc_x, enc_y, x_train_scaler, y_train_scaler, x_test_scaler, y_test_scaler
    gc.collect()
100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [00:31<00:00, 11.43it/s]
xtrain_data_scaler = pd.concat(train_data_scaler)
ytrain_data_scaler = pd.concat(ytrain_scaler)
xtest_data_scaler = pd.concat(test_data_scaler)
ytest_data_scaler = pd.concat(ytest_scaler)
cols = list(train_data.columns)
cols.remove("销量")
xtrain_data_scaler.columns = cols
xtest_data_scaler.columns = cols
ytrain_data_scaler.columns = ["销量"]
ytest_data_scaler.columns =[ "销量"]
train_data_scaler = pd.concat([xtrain_data_scaler,ytrain_data_scaler],axis=1)
test_data_scaler = pd.concat([xtest_data_scaler,ytest_data_scaler],axis=1)
At this point one could also select features with a filter method or with the idea of recursive feature elimination, e.g. ranking features by the Pearson or Spearman correlation coefficient. The normalisation above was done so that the differing scales of the features do not distort such comparisons.
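As a rough sketch of that filter idea (not something run in this notebook; the 0.1 cutoff is an arbitrary assumption), one could rank features by their absolute Spearman correlation with the target on the scaled training frame:

# Hedged sketch: filter-style feature screening by Spearman correlation with the target.
# train_data_scaler comes from the cells above; the 0.1 threshold is an arbitrary assumption.
corr_with_target = (
    train_data_scaler.corr(method="spearman")["销量"]   # Spearman correlation of every column with 销量
    .drop("销量")                                        # drop the target's self-correlation
    .abs()
    .sort_values(ascending=False)
)
selected_features = corr_with_target[corr_with_target > 0.1].index.tolist()
print(len(selected_features), corr_with_target.head(10))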
MAPE is a very intuitive metric, but when the true sales are zero or very small it blows up and cannot be used, which is why SMAPE exists.
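The variant implemented below aggregates the numerator and denominator over all samples rather than averaging per-sample ratios:

$\mathrm{SMAPE} = \dfrac{\sum_i 2\,\lvert y_i - \hat{y}_i \rvert}{\sum_i \left(\lvert y_i \rvert + \lvert \hat{y}_i \rvert\right)}$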
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.sum(np.abs(y_true - y_pred) * 2) / np.sum(np.abs(y_true) + np.abs(y_pred))

def lgbm_smape(y_true, y_pred):
    # LightGBM custom eval metric: returns (metric name, value, is_higher_better)
    smape_val = symmetric_mean_absolute_percentage_error(y_true, y_pred)
    return 'SMAPE', smape_val, False
data_train.columns
Index(['商店ID', '节假日', '折扣', '销量', 'S1', 'S2', 'S3', 'S4', 'L1', 'L2',
...
'销量_MA_lag_1_35_quantile_1', '销量_MA_lag_1_35_quantile_3',
'销量_MA_lag_1_42_median', '销量_MA_lag_1_42_var',
'销量_MA_lag_1_42_quantile_1', '销量_MA_lag_1_42_quantile_3',
'销量_MA_lag_1_49_median', '销量_MA_lag_1_49_var',
'销量_MA_lag_1_49_quantile_1', '销量_MA_lag_1_49_quantile_3'],
dtype='object', length=126)
import lightgbm as lgb
model_params = {
'boosting_type':'gbdt',
'objective': 'rmse',
'num_leaves': 127,
'learning_rate': 0.05,
'n_estimators': 300,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'verbose': 1,
'max_bin': 100,
'max_depth':9,
'n_jobs': 16,
'seed': 1412,
}
model_lgb = lgb.LGBMRegressor(**model_params)
model_lgb.fit(X=train_data.loc[:,train_data.columns!="销量"],y=train_data["销量"],
eval_set = [(test_data.loc[:,test_data.columns!="销量"],test_data["销量"])],
eval_metric=lgbm_smape,
# categorical_feature=cate_features,
# early_stopping_rounds=15,
verbose=10,
)
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019825 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9952
[LightGBM] [Info] Number of data points in the train set: 150380, number of used features: 123
[LightGBM] [Warning] feature_fraction is set=0.8, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.8
[LightGBM] [Warning] bagging_fraction is set=0.8, subsample=1.0 will be ignored. Current value: bagging_fraction=0.8
[LightGBM] [Info] Start training from score 42684.680538
[10] valid_0's rmse: 14644.6 valid_0's SMAPE: 0.242413
[20] valid_0's rmse: 12052.4 valid_0's SMAPE: 0.192232
[30] valid_0's rmse: 10688.5 valid_0's SMAPE: 0.16508
[40] valid_0's rmse: 10070.3 valid_0's SMAPE: 0.153135
[50] valid_0's rmse: 9758.72 valid_0's SMAPE: 0.147373
[60] valid_0's rmse: 9609.15 valid_0's SMAPE: 0.144601
[70] valid_0's rmse: 9540.43 valid_0's SMAPE: 0.14361
[80] valid_0's rmse: 9473.71 valid_0's SMAPE: 0.142544
[90] valid_0's rmse: 9450.8 valid_0's SMAPE: 0.142559
[100] valid_0's rmse: 9439.76 valid_0's SMAPE: 0.142701
[110] valid_0's rmse: 9446.47 valid_0's SMAPE: 0.143186
[120] valid_0's rmse: 9472.73 valid_0's SMAPE: 0.143815
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[130] valid_0's rmse: 9474.26 valid_0's SMAPE: 0.14384
[140] valid_0's rmse: 9497.09 valid_0's SMAPE: 0.144311
[150] valid_0's rmse: 9511.71 valid_0's SMAPE: 0.144707
[160] valid_0's rmse: 9508.39 valid_0's SMAPE: 0.14467
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[170] valid_0's rmse: 9521.67 valid_0's SMAPE: 0.144947
[180] valid_0's rmse: 9535.4 valid_0's SMAPE: 0.145272
[190] valid_0's rmse: 9551.49 valid_0's SMAPE: 0.145574
[200] valid_0's rmse: 9558.05 valid_0's SMAPE: 0.145939
[210] valid_0's rmse: 9570.66 valid_0's SMAPE: 0.146168
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[220] valid_0's rmse: 9583.49 valid_0's SMAPE: 0.146495
[230] valid_0's rmse: 9601.02 valid_0's SMAPE: 0.146821
[240] valid_0's rmse: 9607.68 valid_0's SMAPE: 0.146962
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[250] valid_0's rmse: 9632.83 valid_0's SMAPE: 0.147384
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[260] valid_0's rmse: 9638.67 valid_0's SMAPE: 0.147522
[270] valid_0's rmse: 9645.29 valid_0's SMAPE: 0.14771
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[280] valid_0's rmse: 9644.43 valid_0's SMAPE: 0.147663
[290] valid_0's rmse: 9647.48 valid_0's SMAPE: 0.147739
[300] valid_0's rmse: 9648.16 valid_0's SMAPE: 0.14777
LGBMRegressor(bagging_fraction=0.8, feature_fraction=0.8, learning_rate=0.05,
max_bin=100, max_depth=9, n_estimators=300, n_jobs=16,
num_leaves=127, objective='rmse', seed=1412, verbose=1)
model_lgb.predict(test_data.loc[:,test_data.columns!="销量"])
array([43714.65580677, 36343.79176822, 28326.45447246, ...,
24522.5461704 , 32373.00750644, 25690.44092983])
from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss
# define the objective function for hyperopt
def hpyeropt_objective(params):
    model_params = {
        'boosting_type': "gbdt",
        'objective': 'rmse',
        'num_leaves': 127,
        'learning_rate': params['learning_rate'],
        'n_estimators': int(params['n_estimators']),
        'feature_fraction': params['feature_fraction'],
        'bagging_fraction': params['bagging_fraction'],
        'verbose': 1,
        'max_bin': 100,
        'max_depth': int(params["max_depth"]),
        'n_jobs': 16,
        'seed': 1412,
    }
    model_lgb = lgb.LGBMRegressor(**model_params)
    model_lgb.fit(X=train_data.loc[:, train_data.columns != "销量"], y=train_data["销量"],
                  eval_set=[(test_data.loc[:, test_data.columns != "销量"], test_data["销量"])],
                  eval_metric=lgbm_smape,
                  # categorical_feature=cate_features,
                  early_stopping_rounds=10,
                  verbose=10,
                  )
    # the validation SMAPE is the loss that hyperopt minimises
    index_, scores, wether = lgbm_smape(test_data["销量"], model_lgb.predict(test_data.loc[:, test_data.columns != "销量"]))
    print(scores, params)
    return scores
# define the search space
params_grid = {"boosting_type": hp.choice("boosting_type", ["gbdt"]),
               "max_depth": hp.choice("max_depth", [3, 5, 6, 7, 9, 12, 15, 17, 25]),
               "learning_rate": hp.quniform("learning_rate", 0.1, 1, 0.1),
               "n_estimators": hp.quniform("n_estimators", 100, 500, 50),
               "feature_fraction": hp.quniform("feature_fraction", 0.5, 1, 0.1),
               "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 1, 0.1),
               }
# define the search loop
def param_hyperopt(max_evals=100):
    trials = Trials()
    early_stop_fn = no_progress_loss(100)
    params_best = fmin(hpyeropt_objective,
                       space=params_grid,
                       algo=tpe.suggest,
                       max_evals=max_evals,
                       verbose=True,
                       trials=trials,
                       early_stop_fn=early_stop_fn
                       )
    print("\n", "\n", "best params:", params_best, "\n")
    return params_best, trials
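The notebook does not show the search being launched; a minimal usage sketch (the evaluation budget of 30 is an arbitrary assumption) could look like this:

# Hedged sketch: launch the Bayesian search defined above and inspect the result.
# fmin returns hp.choice entries as indices and hp.quniform values as floats,
# so they need to be mapped back / cast to int before refitting a final LGBMRegressor.
params_best, trials = param_hyperopt(max_evals=30)
print(params_best)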
data_test["销量"] = 0
horizon = 1
target = "销量"
lags=list(range(horizon, horizon+7))+ [28,35,42,49]
# 销量 in the test frame is only a placeholder (set to 0 above); the lag / rolling columns are created
# here so the test frame has the same schema as the training frame, and are filled recursively below
for lag in lags:
    data_test[f'{target}_lag_{lag}'] = data_test.groupby(['商店ID'], as_index=False)[target].shift(lag)
for lag in lags:
    data_test[f'{target}_MA_lag_1_{lag}_mean'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).mean().shift(1))
    data_test[f'{target}_MA_lag_1_{lag}_max'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).max().shift(1))
    data_test[f'{target}_MA_lag_1_{lag}_min'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).min().shift(1))
    data_test[f'{target}_MA_lag_1_{lag}_std'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).std().shift(1))
data_test = data_test.fillna(0)
for lag in lags:
    data_test[f'{target}_MA_lag_1_{lag}_median'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).median().shift(1))
    data_test[f'{target}_MA_lag_1_{lag}_var'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).var().shift(1))
    data_test[f'{target}_MA_lag_1_{lag}_quantile_1'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).quantile(0.25).shift(1))
    data_test[f'{target}_MA_lag_1_{lag}_quantile_3'] = data_test.groupby(["商店ID"])['销量'].transform(lambda x: x.rolling(window=lag).quantile(0.75).shift(1))
data_test = data_test.fillna(0)
data_pres = []
data_pres_first = []
for i in tqdm.trange(1, 366):
    # the last rows of store i's validation slice provide the lag values for its first test day
    data_pred = test_data[test_data["商店ID"] == i]
    data_pre = data_test[data_test["商店ID"] == i]
    data_pre.loc[data_pre.head(1).index, "销量_lag_1"] = np.array(data_pred.loc[data_pred.tail(1).index, "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_2"] = np.array(data_pred.loc[data_pred.tail(2).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_3"] = np.array(data_pred.loc[data_pred.tail(3).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_4"] = np.array(data_pred.loc[data_pred.tail(4).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_5"] = np.array(data_pred.loc[data_pred.tail(5).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_6"] = np.array(data_pred.loc[data_pred.tail(6).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_7"] = np.array(data_pred.loc[data_pred.tail(7).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_28"] = np.array(data_pred.loc[data_pred.tail(28).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_35"] = np.array(data_pred.loc[data_pred.tail(35).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_42"] = np.array(data_pred.loc[data_pred.tail(42).index[0], "销量"])
    data_pre.loc[data_pre.head(1).index, "销量_lag_49"] = np.array(data_pred.loc[data_pred.tail(49).index[0], "销量"])
    data_pres.append(data_pre)
    data_pres_first.append(data_pre.head(1))
    del data_pred, data_pre
    gc.collect()
data_pres = pd.concat(data_pres)
data_pres_first = pd.concat(data_pres_first)
colus = [f'销量_lag_{lag}' for lag in lags]
# note: `indexs_` is only defined in the per-store loop further below; as written this cell relies on a
# leftover variable and broadcasts one row's statistics to every store's first test row, which looks like an oversight
for lag in lags:
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_mean"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].mean())
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_max"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].max())
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_min"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].min())
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_std"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].std())
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_median"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].median())
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_var"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].var())
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_quantile_1"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].quantile(0.25))
    data_pres_first.loc[:, f"销量_MA_lag_1_{lag}_quantile_3"] = np.array(data_pres_first.loc[indexs_[0], colus[:lag]].quantile(0.75))
# rf_ is assumed to be a regressor fitted in an earlier part of the notebook; it is not defined in this section
data_pres_first.loc[:, "销量"] = rf_.predict(np.nan_to_num(data_pres_first.loc[:, data_pres_first.columns != "销量"]))
data_middle = pd.concat([test_data,data_pres])
data__1 = data_train[["商店ID","销量","销量_lag_1","销量_lag_2","销量_MA_lag_1_2_mean","销量_MA_lag_1_3_mean","销量_MA_lag_1_4_mean"]]
data_pres_result = []
for i in tqdm.trange(1, 366):
    data_pres_ = data_pres[data_pres["商店ID"] == i]
    indexs_ = list(data_pres[data_pres["商店ID"] == i].index)
    for j in range(0, len(indexs_)):
        if j < len(indexs_) - 1:
            # use the sales value of the current day as the lag features of the next day
            data_middle_ = data_middle[data_middle["商店ID"] == i].loc[:indexs_[j], :]
            data_pres_.loc[indexs_[j+1], "销量_lag_1"] = np.array(data_pres_.loc[indexs_[j], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_2"] = np.array(data_middle_.loc[data_middle_.tail(2).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_3"] = np.array(data_middle_.loc[data_middle_.tail(3).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_4"] = np.array(data_middle_.loc[data_middle_.tail(4).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_5"] = np.array(data_middle_.loc[data_middle_.tail(5).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_6"] = np.array(data_middle_.loc[data_middle_.tail(6).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_7"] = np.array(data_middle_.loc[data_middle_.tail(7).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_28"] = np.array(data_middle_.loc[data_middle_.tail(28).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_35"] = np.array(data_middle_.loc[data_middle_.tail(35).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_42"] = np.array(data_middle_.loc[data_middle_.tail(42).index[0], "销量"])
            data_pres_.loc[indexs_[j+1], "销量_lag_49"] = np.array(data_middle_.loc[data_middle_.tail(49).index[0], "销量"])
        for lag in lags:
            # recompute the rolling statistics of day j from the lag columns filled in so far
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_mean"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].mean())
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_max"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].max())
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_min"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].min())
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_std"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].std())
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_median"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].median())
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_var"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].var())
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_quantile_1"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].quantile(0.25))
            data_pres_.loc[indexs_[j], f"销量_MA_lag_1_{lag}_quantile_3"] = np.array(data_pres_.loc[indexs_[j], colus[:lag]].quantile(0.75))
        # rf_ is assumed to be the regressor fitted earlier; predict day j so it can feed the later days
        data_pres_.loc[indexs_[j], "销量"] = rf_.predict(np.nan_to_num(pd.DataFrame(data_pres_.loc[indexs_[j],
                                                         data_pres_.columns != "销量"]).T))
    data_pres_result.append(data_pres_)
    del data_pres_, data_middle_
    gc.collect()
data_pres_result= pd.concat(data_pres_result)
pre_results = pd.merge(data_test[["商店ID"]],data_pres_result[["销量"]],how="inner",left_index=True,right_index=True)
pre_results.to_csv("../preocess_data/data_pres_rf.csv",index=False)
With this approach, once SMAPE is converted into an $R^2$-style score, the prediction accuracy improves substantially over the other baseline variants. In other words, the current direction is to keep refining features on top of the differencing/lag-derived ones. More features can be engineered later: from the data side, statistical features such as skewness or smoothed versions of the series; from the business side, things like each store's share of past sales (a rough sketch follows below). For lack of time, deep-learning methods have not been tried yet; from that angle DeepAR/MQRNN would be the first candidates, with TFT left for later because it needs a fairly large amount of data, and the model design can then be iterated step by step along the same line of thinking. Moreover, if neural networks are considered, the feature extraction performed by the neurons is similar in spirit to manual feature derivation, so both are worth trying in the future.
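For example, the "each store's share of past sales" idea could be sketched as follows; this is only an illustration using this notebook's column names (商店ID, 销量 and the derived year/month/day columns), and the expanding cumulative denominator is just one possible definition:

# Hedged sketch of a business-driven feature: each store's share of all sales observed so far.
# shift(1) on both cumulative sums keeps the feature free of same-day leakage.
df = data_train.sort_values(["year", "month", "day"]).copy()
# cumulative sales of each store up to, but excluding, the current day
df["store_cum"] = df.groupby("商店ID")["销量"].transform(lambda s: s.cumsum().shift(1))
# cumulative sales of all stores up to, but excluding, the current day
daily_total = (df.groupby(["year", "month", "day"], as_index=False)["销量"].sum()
                 .rename(columns={"销量": "daily_total"}))
daily_total["all_cum"] = daily_total["daily_total"].cumsum().shift(1)
df = df.merge(daily_total[["year", "month", "day", "all_cum"]], on=["year", "month", "day"], how="left")
df["sales_share_to_date"] = df["store_cum"] / df["all_cum"]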
Personally, I think interval prediction is more suitable than point prediction in engineering practice: in the spirit of the central limit theorem we estimate the population mean and variance from the sample, so reporting the final forecast as an interval feels both more appropriate and more rigorous.
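One simple way to turn the point forecasts above into an interval, in that spirit, is a normal-approximation band built from the validation residuals (a sketch only, not something run in this notebook; 1.96 corresponds to a 95% band under a normality assumption):

# Hedged sketch: residual-based prediction interval around the LightGBM point forecast.
point = model_lgb.predict(test_data.loc[:, test_data.columns != "销量"])
residuals = test_data["销量"].values - point        # validation residuals
sigma = residuals.std()                              # estimated error spread
lower, upper = point - 1.96 * sigma, point + 1.96 * sigma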
This concludes the whole baseline pipeline. A more rigorous version would add time-series cross-validation to the training procedure, but compute is limited (it is worth trying when compute and time allow). The pipeline above turns the multi-step forecasting problem into single-step forecasting; the evaluation metrics obtained were an RMSE of 9302.08 and an SMAPE of 0.140946.
On this dataset, even though the series is stationary, ARIMA did not perform well, which suggests the sales are not explained by the sales history alone; the Prophet attempts so far were not great either. Even so, as baseline models I consider them an indispensable step: from the time-series perspective, these two models are also the usual baselines in industry.
The feature exploration so far is limited. Generating a large number of candidate features from both the data and the business perspective and then screening them is the next optimisation target, and feature selection is an indispensable part of that process.
As for algorithms, between machine learning and deep learning my preference is to try machine learning first, mainly to preserve as much interpretability as possible. Only when machine learning cannot solve the problem would model ensembling and deep learning be brought in, to cover both short-term and mid-to-long-term forecasting.
Data volume is another optimisation axis: part of the original training set was used for validation, which effectively shrinks the training data, and for daily-frequency forecasting the references I have seen recommend more than two years of history, so this is another direction to explore.
More rigorously, time-based cross-validation could be added, for example an expanding-window (incremental) split built by hand, as sketched below. On the tuning side, because Bayesian optimisation is stochastic, it should be run several times and the parameter values that recur most often taken as the final choice, with further feature optimisation on top of that (not attempted further for lack of time).
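A minimal hand-rolled expanding-window split (fold count and validation length are arbitrary assumptions, and per-store grouping is ignored for brevity) might look like:

# Hedged sketch of an expanding-window ("incremental") time-series cross-validation split.
# Assumes the frame is sorted chronologically; n_folds and val_len are arbitrary choices.
def expanding_window_splits(df, n_folds=4, val_len=30):
    """Yield (train_idx, val_idx) pairs where the training window grows fold by fold."""
    first_val_start = len(df) - n_folds * val_len
    for k in range(n_folds):
        split = first_val_start + k * val_len
        yield df.index[:split], df.index[split:split + val_len]

# usage: evaluate a model on each fold and average the SMAPE scores
# for tr_idx, va_idx in expanding_window_splits(data_train):
#     ...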
Finally, the code itself still needs to be cleaned up and optimised.