销量预测05(数据的初步处理:机器学习的探索2)

导入计算库

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
plt.style.use("fivethirtyeight")
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import gc

import tqdm
import warnings
warnings.filterwarnings("ignore")

导入数据

# Source tables: the preprocessed training set and the test set.
path_train = "../preocess_data/train_data_o.csv"
path_test = "../data/test_data.csv"

data = pd.read_csv(path_train)
data_test = pd.read_csv(path_test)

# Both frames end up with a datetime "运营日期" column; the test frame derives
# it from its "日期" column. The row id and "日期" columns are then discarded.
data["运营日期"] = pd.to_datetime(data["运营日期"])
data_test["运营日期"] = pd.to_datetime(data_test["日期"])
data.drop(columns=["行ID", "日期"], inplace=True)
data_test.drop(columns=["行ID", "日期"], inplace=True)

# Notebook display only: range of store ids present in the training data.
data["商店ID"].min()
data["商店ID"].max()
365

折扣编码

# Fit a one-hot encoder on the training "折扣" column only. With
# drop="if_binary" a two-category feature is encoded as one 0/1 column.
enc = OneHotEncoder(drop="if_binary")
enc.fit(data["折扣"].values.reshape(-1,1))
# Notebook display only: the two transformed arrays below are not stored.
enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]])
# Overwrite the raw "折扣" values with their encoded form in both frames, using
# the encoder fitted on the training data.
# NOTE(review): toarray() returns a 2-D array; this relies on the encoder
# emitting exactly one column for "折扣" — confirm the column is binary.
data["折扣"] = enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
data_test["折扣"]  = enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()

商店类型、商店位置、商店地区编码

# Rebind `enc` to a fresh encoder for the three categorical store attributes
# (the previously fitted discount encoder is no longer reachable by this name).
enc = OneHotEncoder(drop="if_binary")
enc.fit(data[["商店类型","位置","地区"]])

OneHotEncoder(drop='if_binary')
# Flatten the per-feature category arrays into a single list of column names.
# NOTE(review): this matches the encoder's output columns only while none of
# the three features is binary; drop="if_binary" would then emit fewer columns.
columns_ = [name for index in enc.categories_ for name in index]
columns_
['S1', 'S2', 'S3', 'S4', 'L1', 'L2', 'L3', 'L4', 'L5', 'R1', 'R2', 'R3', 'R4']
# One-hot encode the three categorical store attributes of both frames.
enc_train = pd.DataFrame(
    enc.transform(data[["商店类型", "位置", "地区"]]).toarray(),
    columns=columns_,
)
enc_test = pd.DataFrame(
    enc.transform(data_test[["商店类型", "位置", "地区"]]).toarray(),
    columns=columns_,
)

# Append the indicator columns side by side. concat(axis=1) aligns on the index;
# presumably both frames still carry the default RangeIndex from read_csv.
data = pd.concat([data, enc_train], axis=1)
data_test = pd.concat([data_test, enc_test], axis=1)

# The raw categorical columns are redundant once encoded.
data.drop(columns=["商店类型", "位置", "地区"], inplace=True)
data_test.drop(columns=["商店类型", "位置", "地区"], inplace=True)

日期衍生

def time_derivation(t,col="运营日期"):
    """Add calendar features derived from a datetime column, in place.

    :param t: DataFrame containing a datetime column named ``col``. It is
        modified in place (new columns are added) and also returned.
    :param col: name of the datetime column to derive the features from.
    :return: the same DataFrame ``t`` with the extra columns ``year``,
        ``month``, ``day``, ``quarter``, ``weekofyear`` (ISO week number),
        ``dayofweek`` (1 = Monday ... 7 = Sunday) and ``weekend``
        (1 when ``dayofweek`` is 6 or 7, otherwise 0).
    """
    dates = t[col].dt
    t["year"] = dates.year
    t["month"] = dates.month
    t["day"] = dates.day
    t["quarter"] = dates.quarter
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement. It returns a nullable UInt32
    # column, so cast back to a plain numpy dtype (float when NaT is present).
    iso_week = dates.isocalendar().week
    if iso_week.isna().any():
        t["weekofyear"] = iso_week.astype("float64")
    else:
        t["weekofyear"] = iso_week.astype("int64")
    # pandas counts Monday as 0; shift to 1..7 so that 6 and 7 are the weekend.
    t["dayofweek"] = dates.dayofweek + 1
    t["weekend"] = (t["dayofweek"] > 5).astype(int)
    return t

# time_derivation adds the calendar columns in place and returns the frame it
# was given, so data_train / data_test_ alias data / data_test.
data_train = time_derivation(data)
data_test_ = time_derivation(data_test)

# The raw timestamp is dropped once the calendar features exist.
data_train.drop(columns="运营日期", inplace=True)
data_test_.drop(columns="运营日期", inplace=True)
data_train.shape
(188340, 24)

将数据集转化为有监督学习数据集

def series_to_supervisied_(data,step_in,step_out,dropnan = True):
    """Reframe a time-ordered table as a sliding-window supervised-learning table.

    Every output row holds the ``step_in`` previous observations followed by
    the current and the next ``step_out - 1`` observations of all columns.

    :param data: observations in time order, one row per time step. May be a
        DataFrame, a list or a 2-D numpy array (anything ``pd.DataFrame``
        accepts); non-DataFrame input gets integer column labels.
    :param step_in: number of lagged steps (t-step_in ... t-1) used as inputs.
    :param step_out: number of steps (t ... t+step_out-1) used as outputs.
    :param dropnan: when True (default) drop every row that contains a NaN.
        This removes the incomplete windows at both ends, and also any row
        whose window touches a NaN already present in ``data``.
    :return: DataFrame with columns named ``"<col>-(k)step"`` for the lags
        (oldest first) followed by ``"<col>+(k)step"`` for k = 0..step_out-1.
    """
    # Normalise the input so that list / ndarray data works as documented.
    df = pd.DataFrame(data)
    shifted = []
    names = []
    # Input sequence: (t-n), (t-n+1), ..., (t-1)
    for lag in range(step_in, 0, -1):
        shifted.append(df.shift(lag))
        names += [f"{name}-({lag})step" for name in df.columns]
    # Output sequence: t, (t+1), ..., (t+step_out-1)
    for lead in range(step_out):
        shifted.append(df.shift(-lead))
        names += [f"{name}+({lead})step" for name in df.columns]

    df_re = pd.concat(shifted, axis=1)
    df_re.columns = names
    if dropnan:
        df_re.dropna(inplace=True)

    return df_re

对每家店进行探索

对所有门店的数据进行处理:根据过去一个时间窗口内的数据,预测未来一个时刻的销量。

# Build the sliding-window table store by store, so that a window never mixes
# rows of two different stores, and keep the last 20% of each store's windows
# (shuffle=False, i.e. the final rows) as the hold-out part.
#
# The loop temporaries use their own names: reusing and then `del`-ing `data`
# and `data_test_` would destroy the module-level frames of the same name.
# NOTE(review): assumes store ids are exactly 1..365 and that each store's rows
# are already in chronological order — confirm against the input file.
train_parts = []
test_parts = []
for store_id in tqdm.trange(1,366):
    store_rows = data_train[data_train.loc[:,"商店ID"]==store_id].copy()
    windowed = series_to_supervisied_(store_rows,step_in=10,step_out=1,dropnan=True)
    part_train,part_test = train_test_split(windowed,test_size=0.2,shuffle=False)
    train_parts.append(part_train)
    test_parts.append(part_test)
train_data = pd.concat(train_parts)
test_data = pd.concat(test_parts)
100%|████████████████████████████████████████████████████████████████████████████████| 365/365 [00:13<00:00, 26.78it/s]
train_data
商店ID-(10)step 节假日-(10)step 折扣-(10)step 销量-(10)step S1-(10)step S2-(10)step S3-(10)step S4-(10)step L1-(10)step L2-(10)step ... R2+(0)step R3+(0)step R4+(0)step year+(0)step month+(0)step day+(0)step quarter+(0)step weekofyear+(0)step dayofweek+(0)step weekend+(0)step
3806 1.0 1.0 1.0 7011.84 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2018 1 11 1 2 4 0
4195 1.0 0.0 1.0 42369.00 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2018 1 12 1 2 5 0
4652 1.0 0.0 1.0 50037.00 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2018 1 13 1 2 6 1
4757 1.0 0.0 1.0 44397.00 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2018 1 14 1 2 7 1
5454 1.0 0.0 1.0 47604.00 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 2018 1 15 1 3 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149622 365.0 0.0 1.0 32430.00 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 2019 2 14 1 7 4 0
149753 365.0 0.0 1.0 38766.00 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 2019 2 15 1 7 5 0
150337 365.0 0.0 1.0 62139.00 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 2019 2 16 1 7 6 1
150651 365.0 0.0 0.0 21273.00 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 2019 2 17 1 7 7 1
150910 365.0 0.0 0.0 28833.00 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 2019 2 18 1 8 1 0

147460 rows × 264 columns

from sklearn.utils import shuffle

# Target: sales at the current step. Features: everything except the trailing
# 24 columns, which form the "+(0)step" block (one column per original feature).
y_columns = "销量+(0)step"
x_train = train_data.iloc[:, :-24]
y_train = train_data[y_columns]
x_test = test_data.iloc[:, :-24]
y_test = test_data[y_columns]

# Shuffle features and target together, then split them apart again; the
# target is the last column of the combined frame.
trains = shuffle(pd.concat([x_train, y_train], axis=1), random_state=1412)
x__train = trains.iloc[:, :-1]
y__train = trains.iloc[:, -1]

随机森林

from sklearn.ensemble import RandomForestRegressor
# Baseline random-forest regressor with default hyperparameters. No
# random_state is passed, so the fitted model can differ between runs.
rf_clf = RandomForestRegressor()
rf_clf.fit(x__train,y__train)
RandomForestRegressor()
rf_clf.score(x_test,y_test)
0.7297944553988286
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,rf_clf.predict(x_test))**0.5
10316.49939033907
mean_squared_error(y__train,rf_clf.predict(x__train))**0.5
2340.9166743339324
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Return the aggregate symmetric absolute percentage error.

    Computed as ``2 * sum(|y_true - y_pred|) / sum(|y_true| + |y_pred|)``,
    i.e. a ratio of sums over the whole sample rather than a mean of
    per-observation ratios. The result lies in ``[0, 2]``.

    :param y_true: array-like of observed values.
    :param y_pred: array-like of predicted values, same length as ``y_true``.
    :return: the error as a float; ``0.0`` when the denominator is zero
        (both inputs all zero, or empty) instead of the NaN a 0/0 would give.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = np.sum(np.abs(y_true) + np.abs(y_pred))
    # A zero denominator means every observation and prediction is zero,
    # which is a perfect fit.
    if denominator == 0:
        return 0.0
    return np.sum(np.abs(y_true - y_pred) * 2) / denominator

def prophet_smape(y_true, y_pred):
    """Return the SMAPE of a prediction as a labelled triple.

    :param y_true: array-like of observed values.
    :param y_pred: array-like of predicted values.
    :return: tuple ``('SMAPE', value, False)``. The shape resembles a
        ``(name, value, is_higher_better)`` custom-metric result — presumably
        that is the intent; nothing in this file consumes it that way.
    """
    return 'SMAPE', symmetric_mean_absolute_percentage_error(y_true, y_pred), False
prophet_smape(y_test,rf_clf.predict(x_test))
('SMAPE', 0.15452129351735566, False)
 prophet_smape(y__train,rf_clf.predict(x__train))
('SMAPE', 0.03526619069708201, False)

xgboost

from xgboost import XGBRegressor
# Baseline XGBoost regressor with default hyperparameters, fitted on the same
# shuffled training frame as the random forest.
xgb_clf = XGBRegressor()
xgb_clf.fit(x__train,y__train)
xgb_clf.score(x_test,y_test)
0.7136438711451321
mean_squared_error(y_test,xgb_clf.predict(x_test))**0.5
10620.341210004963
mean_squared_error(y__train,xgb_clf.predict(x__train))**0.5
5570.337227187151
 prophet_smape(y_test,xgb_clf.predict(x_test))
('SMAPE', 0.16851558969888786, False)
 prophet_smape(y__train,xgb_clf.predict(x__train))
('SMAPE', 0.09342656379545357, False)

下面尝试用贝叶斯优化进行调参。

这段代码可以正常运行,但由于耗时过长,运行到中途被手动中断了。

from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss


# 定义目标函数:
def hpyeropt_objective(params):
    """Hyperopt objective: negative hold-out R^2 of a random forest.

    Rebuilds the per-store sliding-window data set with the sampled window
    length, fits a ``RandomForestRegressor`` with the sampled hyperparameters
    and scores it on the last 20% of each store's windows.

    :param params: dict sampled from the search space with the keys
        ``"time_step"`` (window length), ``"n_estimators"``, ``"max_depth"``
        (all three cast to int) and ``"min_impurity_decrease"``.
    :return: ``-R^2`` on the hold-out windows, so that minimising the
        objective maximises R^2.

    NOTE(review): the score is computed on the hold-out part itself, so the
    best loss reported by the search is optimistic; a separate validation
    split would be needed for an unbiased estimate.
    """
    from sklearn.utils import shuffle

    step_in = int(params["time_step"])
    # The "+(0)step" block at the end of every window has one column per
    # column of data_train; derive its width instead of hard-coding 24.
    n_features = data_train.shape[1]

    # Window each store separately so no window spans two stores. The
    # per-store frames are kept in the lists below, so deleting the loop
    # temporaries and forcing a gc pass each iteration would free nothing.
    train_parts = []
    test_parts = []
    for store_id in range(1, 366):
        store_rows = data_train[data_train.loc[:, "商店ID"] == store_id].copy()
        windowed = series_to_supervisied_(store_rows, step_in=step_in, step_out=1, dropnan=True)
        part_train, part_test = train_test_split(windowed, test_size=0.2, shuffle=False)
        train_parts.append(part_train)
        test_parts.append(part_test)
    train_data = pd.concat(train_parts)
    test_data = pd.concat(test_parts)

    # Target is the current-step sales; features are all lagged columns.
    y_columns = "销量+(0)step"
    y_train = train_data[y_columns]
    x_train = train_data.iloc[:, :-n_features]
    x_test = test_data.iloc[:, :-n_features]
    y_test = test_data[y_columns]

    # Shuffle features and target together; the target is the last column.
    trains = shuffle(pd.concat([x_train, y_train], axis=1), random_state=1412)
    x__train = trains.iloc[:, :-1]
    y__train = trains.iloc[:, -1]

    clf = RandomForestRegressor(n_estimators=int(params["n_estimators"]),
                                max_depth=int(params["max_depth"]),
                                min_impurity_decrease=params["min_impurity_decrease"],
                                random_state=1412,
                                verbose=False,
                                n_jobs=-1
                                ).fit(x__train, y__train)

    # score() of a regressor is R^2; negate it because fmin minimises.
    scores = clf.score(x_test, y_test)
    return -scores


#定义参数空间
# Search space. Every dimension is a quantised uniform draw
# hp.quniform(label, low, high, q); the objective casts the integer-valued
# ones with int(). "max_features" is intentionally not part of the search.
params_grid = {
    "n_estimators": hp.quniform("n_estimators", 10, 1000, 20),
    "max_depth": hp.quniform("max_depth", 5, 25, 1),
    "min_impurity_decrease": hp.quniform("min_impurity_decrease", 0, 5, 1),
    # Length of the sliding window fed to series_to_supervisied_.
    "time_step": hp.quniform("time_step", 10, 200, 5),
}

#定义迭代
def param_hyperopt(max_evals = 100):
    """Run a TPE search over ``params_grid`` minimising ``hpyeropt_objective``.

    :param max_evals: maximum number of trials to evaluate.
    :return: tuple ``(params_best, trials)`` — the best parameter assignment
        returned by ``fmin`` and the ``Trials`` object recording every trial.
    """
    history = Trials()
    best = fmin(
        hpyeropt_objective,
        space=params_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=history,
        # Give up once 50 trials in a row fail to improve the best loss.
        early_stop_fn=no_progress_loss(50),
    )
    print("\n","\n","best params:",best,"\n")
    return best, history
# Launch the search with a budget of 300 trials; the progress line that
# follows shows it was stopped at trial 89.
params_best,trials =  param_hyperopt(max_evals = 300)
 30%|██████████▉                          | 89/300 [83:23:32<243:07:02, 4147.97s/trial, best loss: -0.7321221326244512]

结论

这种方法的耗时太久了,这和时间步长的不断变化有关。受到电脑算力的限制,这里不做进一步深入的探索。但是目前在只考虑时间窗的前提下,就 $R^2$ 而言,可以发现表现效果得到了极大的提升。也就是说,目前的思路是正确的,这种方案也是可行的。如果要做进一步优化,可以从特征的角度去考虑,即特征衍生。因此未来的优化方向可以首先考虑时间窗内的特征衍生。

你可能感兴趣的:(时间序列预测,python,人工智能)