Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)

之前我们对TPSJAN22进行了简单的数据分析(详见:Kaggle Tabular Playground Series - Jan 2022 学习笔记1(数据分析))。现在我们尝试使用时间序列和线性回归来训练模型。

试题地址:Tabular Playground Series - Jan 2022

本文参考:TPSJAN22-03 Linear Model

import pandas as pd
import numpy as np
import pickle
import math
import matplotlib.pyplot as plt
import dateutil.easter as easter
from matplotlib.ticker import MaxNLocator
from datetime import datetime, date, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression, HuberRegressor, Ridge, Lasso

import matplotlib.dates as mdates

# Raw competition tables; every engineered frame below is derived from them.
original_train_df = pd.read_csv('../datas/train.csv')
original_test_df = pd.read_csv('../datas/test.csv')

# GDP table indexed by year, so a single value can be read as
# gdp_df.loc[year, column].
gdp_df = pd.read_csv('../datas/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv').set_index('year')

# Convert the `date` column of both tables to datetimes.
original_train_df['date'] = pd.to_datetime(original_train_df['date'])
original_test_df['date'] = pd.to_datetime(original_test_df['date'])

original_train_df.head(2)

在这里插入图片描述
TPSJAN22 使用 SMAPE 作为评价指标,但是 Scikit-learn 没有提供这个指标,所以我们先自定义一个计算 SMAPE 的函数。

def smape_loss(y_true, y_pred):
    """Return the element-wise SMAPE between targets and predictions.

    Each element is ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``,
    i.e. a percentage in the range 0..200. Where target and prediction are
    both zero the error is defined as 0 instead of the NaN that 0/0 would
    produce.

    Args:
        y_true: array-like of true values.
        y_pred: array-like of predictions, broadcastable against ``y_true``.

    Returns:
        NumPy array (or NumPy scalar for scalar inputs) holding the
        per-element SMAPE; take ``np.mean`` of it to get the overall score.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_true - y_pred)
    # Use |y_true| so that the denominator can never be negative or cancel
    # out for negative targets.
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial value of 0 (perfect prediction of a zero target).
    ratio = np.zeros_like(denominator)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return ratio * 200

接下来根据我们之前的数据分析猜想对特征进行处理

def engineer(df):
    """Build the feature frame for the linear model from a raw table.

    Args:
        df: frame with a datetime ``date`` column and ``country``, ``store``
            and ``product`` columns.

    Returns:
        A new DataFrame (same index as ``df``) with, in this order: the log
        of the country's GDP for the row's year, six weekday indicators,
        one-hot columns for country / store / product, and two pairs of
        Fourier features (plain and per-product).
    """
    def get_gdp(row):
        # GDP of the row's country in the year of the row's date, read from
        # the module-level `gdp_df` table.
        return gdp_df.loc[row.date.year, 'GDP_' + row.country]

    # GDP plus weekly seasonal indicators. Weekday 0 gets no column and
    # therefore acts as the baseline.
    weekday = df.date.dt.weekday
    base_columns = {'gdp': np.log(df.apply(get_gdp, axis=1))}
    for wd in range(1, 7):
        base_columns[f'wd{wd + 1}'] = weekday == wd
    new_df = pd.DataFrame(base_columns)

    # One-hot encoding; one category of each variable is left out as the
    # baseline.
    new_df['Finland'] = df.country == 'Finland'
    new_df['Norway'] = df.country == 'Norway'
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    new_df['Kaggle Mug'] = df['product'] == 'Kaggle Mug'
    new_df['Kaggle Hat'] = df['product'] == 'Kaggle Hat'

    # Fourier features: two sin/cos pairs (k = 1, 2) over the day of the
    # year, each also multiplied by the mug and hat indicators so these
    # products can follow their own seasonal curve.
    angle = df.date.dt.dayofyear / 365 * 2 * math.pi
    for k in range(1, 3):
        sin_k = np.sin(angle * k)
        cos_k = np.cos(angle * k)
        new_df[f'sin{k}'] = sin_k
        new_df[f'cos{k}'] = cos_k
        new_df[f'mug_sin{k}'] = sin_k * new_df['Kaggle Mug']
        new_df[f'mug_cos{k}'] = cos_k * new_df['Kaggle Mug']
        new_df[f'hat_sin{k}'] = sin_k * new_df['Kaggle Hat']
        new_df[f'hat_cos{k}'] = cos_k * new_df['Kaggle Hat']

    return new_df

来看看处理好的特征

# Engineer both tables. `features` is read from the test frame, so it does
# not contain the `date` and `num_sold` columns added to the training frame.
test_df = engineer(original_test_df)
features = test_df.columns

train_df = engineer(original_train_df)
train_df['date'] = original_train_df['date']
train_df['num_sold'] = original_train_df['num_sold'].astype(np.float32)

# Cast every feature column to float32 in both frames.
train_df[features] = train_df[features].astype(np.float32)
test_df[features] = test_df[features].astype(np.float32)

train_df

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第1张图片

# List the column names of the engineered training frame.
list(train_df)

['gdp', 'wd2', 'wd3', 'wd4', 'wd5', 'wd6', 'wd7', 'Finland', 'Norway', 'KaggleRama', 'Kaggle Mug', 'Kaggle Hat', 'sin1', 'cos1', 'mug_sin1', 'mug_cos1', 'hat_sin1', 'hat_cos1', 'sin2', 'cos2', 'mug_sin2', 'mug_cos2', 'hat_sin2', 'hat_cos2', 'date', 'num_sold']

下面我们开始训练模型

def fit_model(X_tr):
    """Fit a standardized linear regression on the log of ``num_sold``.

    Args:
        X_tr: training frame holding the columns named by the module-level
            ``features`` plus a ``num_sold`` column.

    Returns:
        Tuple ``(preproc, model)``: the fitted ``StandardScaler`` and the
        fitted ``LinearRegression``. Predictions of ``model`` are in log
        space and must be passed through ``np.exp``.
    """
    # Standardize the feature columns.
    preproc = StandardScaler()
    scaled_features = preproc.fit_transform(X_tr[features])

    # scikit-learn has no SMAPE objective. Fitting the least-squares model
    # on log(num_sold) makes it penalize relative rather than absolute
    # errors, which serves as an approximation of SMAPE.
    # Background:
    # https://www.kaggle.com/code/ambrosm/tpsjan22-03-linear-model/notebook#Training-the-simple-model-(without-holidays)
    # https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298473
    log_target = np.log(X_tr.num_sold.values)

    model = LinearRegression()
    model.fit(scaled_features, log_target)

    return preproc, model

preproc, model = fit_model(train_df)

# The model predicts log(num_sold), so exponentiate to get back to sales.
log_pred = model.predict(preproc.transform(train_df[features]))

train_pred_df = original_train_df.copy()
train_pred_df['pred'] = np.exp(log_pred)
train_pred_df

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第2张图片
接下来我们选取Finland来看看每个商品在每个商店的预测值和目标值的损失值

# One scatter plot of daily residuals per (store, product) group of the
# Finland rows, stacked vertically in a single tall figure.
plt.figure(figsize=(18, 100))
for i, (combi, df) in enumerate(train_pred_df[train_pred_df.country == 'Finland'].groupby(['store', 'product'])):
    df = df.set_index('date')
    # Signed symmetric percentage residual: positive where pred > num_sold.
    residuals = (df.pred - df.num_sold) / (df.pred + df.num_sold) * 200
    # The grid reserves 18 rows; only as many as there are groups get used.
    ax = plt.subplot(18, 1, i+1, ymargin=0.5)
    ax.scatter(df.index,
                residuals,
                s=1, color='k')
    
    ax.set_title(combi)

    # Vertical guide lines at every month end and every year end.
    # NOTE(review): their extent comes from the y-limits at call time, which
    # are read before set_ylim() below changes them — confirm this is intended.
    ax.vlines(pd.date_range('2014-12-31', '2019-01-01', freq='M'),
               plt.ylim()[0], plt.ylim()[1], alpha=0.5)
    ax.vlines(pd.date_range('2014-12-31', '2019-01-01', freq='Y'),
               plt.ylim()[0], plt.ylim()[1], alpha=0.5)
    
    # Year labels on major ticks, month numbers on minor ticks.
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.xaxis.set_minor_locator(mdates.MonthLocator())
    ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
    # Clip the y-axis to the observed residual range.
    ax.set_ylim(residuals.min(), residuals.max())
plt.tight_layout(h_pad=3.0)
plt.suptitle('Residuals for four years', y=1.003)
plt.show()

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第3张图片
可以发现,每个商店每个商品的损失值分布很相似,我们可以直接看Finland所有商品每天的损失值。

# Plot all residuals (four-year range, sum of all products)
def plot_all_residuals(residuals):
    """Scatter-plot a date-indexed series of residuals.

    Args:
        residuals: Series whose index holds the dates and whose values are
            the residuals to plot.

    Draws vertical guide lines at every month end and year end between
    2014-12-31 and 2019-01-01 and shows the figure; returns nothing.
    """
    plt.figure(figsize=(20, 6))
    plt.scatter(residuals.index, residuals, s=1, color='k')

    # Month-end lines first, then year-end lines. The y-limits are re-read
    # before each call because drawing the first set may change them.
    for freq in ('M', 'Y'):
        y_low, y_high = plt.ylim()
        guide_dates = pd.date_range('2014-12-31', '2019-01-01', freq=freq)
        plt.vlines(guide_dates, y_low, y_high, alpha=0.5)

    # Year labels on major ticks, month numbers on minor ticks.
    x_axis = plt.gca().xaxis
    x_axis.set_major_locator(mdates.YearLocator())
    x_axis.set_major_formatter(mdates.DateFormatter('%Y'))
    x_axis.set_minor_locator(mdates.MonthLocator())
    x_axis.set_minor_formatter(mdates.DateFormatter('%m'))

    plt.title('Residuals for four years')
    plt.show()
# Daily residuals for Finland: predictions and sales are summed over all
# stores and products of a day before the symmetric percentage is taken.
finland_df = train_pred_df[train_pred_df.country == 'Finland']
by_date = finland_df.groupby(train_pred_df['date'])
pred_total = by_date.pred.sum()
sold_total = by_date.num_sold.sum()
residuals = (pred_total - sold_total) / (pred_total + sold_total) * 200

plot_all_residuals(residuals)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第4张图片
可以发现每年1月和12月有巨大的误差,跟之前特征分析的时候的推测很相似。我们可能需要将年底和年初的日期提取出来作为特征值。放大12月、1月的误差来看看:

# Plot residuals for interesting intervals
def plot_around(residuals, m, d, w):
    """Plot, one line per year, the residuals close to a given calendar day.

    Args:
        residuals: date-indexed Series of residuals.
        m: month of the reference day.
        d: day of month of the reference day.
        w: half-width of the window in days; only dates strictly less than
            ``w`` days away from the reference day are plotted.

    The x-axis is the day offset from the reference day, so the years
    2015-2019 can be compared on top of each other.
    """
    plt.figure(figsize=(20, 6))
    plt.title(f"Residuals around m={m} d={d}")

    half_width = timedelta(w)
    for year in range(2015, 2020):
        center = pd.Timestamp(date(year, m, d))
        in_window = ((residuals.index > center - half_width)
                     & (residuals.index < center + half_width))
        window = residuals[in_window]
        day_offsets = [(stamp - center).days for stamp in window.index]
        plt.plot(day_offsets, window, label=str(year))

    # Only integer tick labels, at most 2*w of them.
    plt.gca().xaxis.set_major_locator(MaxNLocator(2*w, integer=True))
    plt.legend()
    plt.show()
# Zoom in on a window of +/-30 days around January 1st.
plot_around(residuals, 1, 1, 30)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第5张图片

可以发现在12月24号到1月4号之间误差有巨大的波动,所以我们将每年的12月24号到1月4号之间的日期标注出来作为特征,再训练模型看看

# Feature engineering
def engineer(df):
    """Return a new dataframe with the engineered features.

    Args:
        df: frame with a datetime ``date`` column and ``country``, ``store``
            and ``product`` columns.

    Returns:
        DataFrame with log-GDP, weekday indicators, one-hot columns,
        per-product Fourier features and, for Finland only, one indicator
        per day from December 24 to 31 and from January 1 to 4.
    """
    def get_gdp(row):
        # GDP of the row's country in the year of the row's date.
        return gdp_df.loc[row.date.year, 'GDP_' + row.country]

    weekday = df.date.dt.weekday
    month = df.date.dt.month
    day = df.date.dt.day
    is_finland = df.country == 'Finland'

    # GDP and weekday indicators (weekday 0 is the baseline).
    base_columns = {'gdp': np.log(df.apply(get_gdp, axis=1))}
    for wd in range(1, 7):
        base_columns[f'wd{wd + 1}'] = weekday == wd
    new_df = pd.DataFrame(base_columns)

    # One-hot encoding (no need to encode the last categories)
    new_df['Finland'] = is_finland
    new_df['Norway'] = df.country == 'Norway'
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    new_df['Kaggle Mug'] = df['product'] == 'Kaggle Mug'
    new_df['Kaggle Hat'] = df['product'] == 'Kaggle Hat'

    # Seasonal variations (Fourier series): the products have different
    # seasonal patterns, so the waves are only added per product.
    angle = df.date.dt.dayofyear / 365 * 2 * math.pi
    products = (('mug', 'Kaggle Mug'), ('hat', 'Kaggle Hat'))
    for k in range(1, 3):
        waves = (('sin', np.sin(angle * k)), ('cos', np.cos(angle * k)))
        for short, column in products:
            for wave_name, wave in waves:
                new_df[f'{short}_{wave_name}{k}'] = wave * new_df[column]

    # Day-by-day indicators for the end of December and the start of
    # January, Finland only.
    dec_flags = pd.DataFrame({f"dec{d}": (month == 12) & (day == d) & is_finland
                              for d in range(24, 32)})
    jan_flags = pd.DataFrame({f"jan{d}": (month == 1) & (day == d) & is_finland
                              for d in range(1, 5)})

    return pd.concat([new_df, dec_flags, jan_flags], axis=1)

# Rebuild the frames with the new features; `features` comes from the test
# frame so it excludes the `date` and `num_sold` columns of the train frame.
test_df = engineer(original_test_df)
features = test_df.columns

train_df = engineer(original_train_df)
train_df['date'] = original_train_df['date']
train_df['num_sold'] = original_train_df['num_sold'].astype(np.float32)

train_df[features] = train_df[features].astype(np.float32)
test_df[features] = test_df[features].astype(np.float32)

preproc, model = fit_model(train_df)

# Predictions are in log space; exponentiate to get sales.
train_pred_df = original_train_df.copy()
train_pred_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))

# Daily residuals for Finland, based on the per-day means.
finland_df = train_pred_df[train_pred_df.country == 'Finland']
by_date = finland_df.groupby(train_pred_df['date'])
pred_mean = by_date.pred.mean()
sold_mean = by_date.num_sold.mean()
residuals = (pred_mean - sold_mean) / (pred_mean + sold_mean) * 200

plot_all_residuals(residuals)

plot_around(residuals, 1, 1, 30)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第6张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第7张图片
似乎1月4到15号之间还有比较明显的误差,同时12月初似乎也有误差,我们先将1月的特征范围增加到15号

# Feature engineering
def engineer(df):
    """Return a new dataframe with the engineered features.

    Args:
        df: frame with a datetime ``date`` column and ``country``, ``store``
            and ``product`` columns.

    Returns:
        DataFrame with log-GDP, weekday indicators, one-hot columns,
        per-product Fourier features and, for Finland only, one indicator
        per day from December 24 to 31 and from January 1 to 15.
    """
    def get_gdp(row):
        # GDP of the row's country in the year of the row's date.
        return gdp_df.loc[row.date.year, 'GDP_' + row.country]

    weekday = df.date.dt.weekday
    month = df.date.dt.month
    day = df.date.dt.day
    is_finland = df.country == 'Finland'

    # GDP and weekday indicators (weekday 0 is the baseline).
    base_columns = {'gdp': np.log(df.apply(get_gdp, axis=1))}
    for wd in range(1, 7):
        base_columns[f'wd{wd + 1}'] = weekday == wd
    new_df = pd.DataFrame(base_columns)

    # One-hot encoding (no need to encode the last categories)
    new_df['Finland'] = is_finland
    new_df['Norway'] = df.country == 'Norway'
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    new_df['Kaggle Mug'] = df['product'] == 'Kaggle Mug'
    new_df['Kaggle Hat'] = df['product'] == 'Kaggle Hat'

    # Seasonal variations (Fourier series): the products have different
    # seasonal patterns, so the waves are only added per product.
    angle = df.date.dt.dayofyear / 365 * 2 * math.pi
    products = (('mug', 'Kaggle Mug'), ('hat', 'Kaggle Hat'))
    for k in range(1, 3):
        waves = (('sin', np.sin(angle * k)), ('cos', np.cos(angle * k)))
        for short, column in products:
            for wave_name, wave in waves:
                new_df[f'{short}_{wave_name}{k}'] = wave * new_df[column]

    # Day-by-day indicators for the end of December and the first half of
    # January, Finland only.
    dec_flags = pd.DataFrame({f"dec{d}": (month == 12) & (day == d) & is_finland
                              for d in range(24, 32)})
    jan_flags = pd.DataFrame({f"jan{d}": (month == 1) & (day == d) & is_finland
                              for d in range(1, 16)})

    return pd.concat([new_df, dec_flags, jan_flags], axis=1)

# Rebuild the frames with the extended January features; `features` comes
# from the test frame so it excludes `date` and `num_sold`.
test_df = engineer(original_test_df)
features = test_df.columns

train_df = engineer(original_train_df)
train_df['date'] = original_train_df['date']
train_df['num_sold'] = original_train_df['num_sold'].astype(np.float32)

train_df[features] = train_df[features].astype(np.float32)
test_df[features] = test_df[features].astype(np.float32)

preproc, model = fit_model(train_df)

# Predictions are in log space; exponentiate to get sales.
train_pred_df = original_train_df.copy()
train_pred_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))

# Daily residuals for Finland, based on the per-day means.
finland_df = train_pred_df[train_pred_df.country == 'Finland']
by_date = finland_df.groupby(train_pred_df['date'])
pred_mean = by_date.pred.mean()
sold_mean = by_date.num_sold.mean()
residuals = (pred_mean - sold_mean) / (pred_mean + sold_mean) * 200

plot_all_residuals(residuals)

plot_around(residuals, 1, 1, 30)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第8张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第9张图片
现在12月末和1月初的误差已经改善了很多。但是观察上面的散点图,可以发现每年的4、5、6、7、11、12月还有很明显的误差。我们放大这些区间来观察一下。

# Zoom in around the first day of each month that still shows large
# residuals; each entry is (month, half-width of the window in days).
for month, half_width in [(4, 30), (5, 30), (6, 30), (7, 30), (11, 15), (12, 30)]:
    plot_around(residuals, month, 1, half_width)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第10张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第11张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第12张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第13张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第14张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第15张图片
可以发现,每年4月都有相似的波动,只是有的年份发生得早,有的年份发生得晚;6月底、11月初也有类似的情况。每年从5月1日开始有一段持续大约10天的波动,12月初也有类似的情况。回想之前12月末和1月初的情况,我们可以推测这些误差可能与节假日有关。Kaggle网友提供了这三个国家的官方节假日信息,我们可以从中得到一些启发。

Holidays_Finland_Norway_Sweden_2015-2019

我们发现1月有元旦节,日期固定。4月会有复活节,日期不固定,5月有国际劳动节,日期固定等等。通过对节假日信息和上面误差信息比较,我们可以对我们的特征工程做出修改:

# Feature engineering
def engineer(df):
    """Return a new dataframe with the engineered features, holidays included.

    Args:
        df: frame with a datetime ``date`` column and ``country``, ``store``
            and ``product`` columns.

    Returns:
        DataFrame with log-GDP, weekday indicators, one-hot columns,
        per-product Fourier features, and one indicator column per day for
        the holiday periods listed in the body (fixed calendar days and days
        relative to Midsummer, the first Sunday of November and Easter),
        most of them restricted to specific countries.
    """
    def get_gdp(row):
        # GDP of the row's country in the year of the row's date.
        return gdp_df.loc[row.date.year, 'GDP_' + row.country]

    weekday = df.date.dt.weekday
    month = df.date.dt.month
    day = df.date.dt.day
    in_finland = df.country == 'Finland'
    in_norway = df.country == 'Norway'
    in_sweden = df.country == 'Sweden'
    not_norway = df.country != 'Norway'

    def calendar_flags(prefix, month_no, days, where=None):
        # One indicator per day `d` of month `month_no`, named
        # f"{prefix}{d}", optionally restricted to the rows in `where`.
        flags = {}
        for d in days:
            on_day = (month == month_no) & (day == d)
            flags[f"{prefix}{d}"] = on_day if where is None else on_day & where
        return pd.DataFrame(flags)

    def offset_flags(prefix, anchor, offsets, where):
        # One indicator per offset `d` (in days) from the per-row `anchor`
        # date, named f"{prefix}{d}", restricted to the rows in `where`.
        elapsed = df.date - anchor
        return pd.DataFrame({f"{prefix}{d}": (elapsed == np.timedelta64(d, "D")) & where
                             for d in offsets})

    # GDP and weekday indicators (weekday 0 is the baseline).
    base_columns = {'gdp': np.log(df.apply(get_gdp, axis=1))}
    for wd in range(1, 7):
        base_columns[f'wd{wd + 1}'] = weekday == wd
    new_df = pd.DataFrame(base_columns)

    # One-hot encoding (no need to encode the last categories)
    new_df['Finland'] = in_finland
    new_df['Norway'] = in_norway
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    new_df['Kaggle Mug'] = df['product'] == 'Kaggle Mug'
    new_df['Kaggle Hat'] = df['product'] == 'Kaggle Hat'

    # Per-product Fourier features (k = 1, 2) over the day of the year.
    angle = df.date.dt.dayofyear / 365 * 2 * math.pi
    products = (('mug', 'Kaggle Mug'), ('hat', 'Kaggle Hat'))
    for k in range(1, 3):
        waves = (('sin', np.sin(angle * k)), ('cos', np.cos(angle * k)))
        for short, column in products:
            for wave_name, wave in waves:
                new_df[f'{short}_{wave_name}{k}'] = wave * new_df[column]

    # Midsummer Day, per year.
    midsummer_day_date = df.date.dt.year.map({
        2015: pd.Timestamp('2015-06-20'),
        2016: pd.Timestamp('2016-06-25'),
        2017: pd.Timestamp('2017-06-24'),
        2018: pd.Timestamp('2018-06-23'),
        2019: pd.Timestamp('2019-06-22'),
    })

    # First Sunday of November, per year.
    sun_nov_date = df.date.dt.year.map({
        2015: pd.Timestamp('2015-11-1'),
        2016: pd.Timestamp('2016-11-6'),
        2017: pd.Timestamp('2017-11-5'),
        2018: pd.Timestamp('2018-11-4'),
        2019: pd.Timestamp('2019-11-3'),
    })

    # Easter date of each row's year.
    easter_date = df.date.apply(lambda stamp: pd.Timestamp(easter.easter(stamp.year)))
    easter_offsets = [*range(-2, 11), *range(40, 48), *range(50, 59)]

    # The order of this list fixes the column order of the result.
    holiday_frames = [
        # Christmas and New Year, separately per country.
        calendar_flags('dec', 12, range(24, 32), in_finland),
        calendar_flags('jan', 1, range(1, 16), in_finland),
        calendar_flags('n-dec', 12, range(24, 32), in_norway),
        calendar_flags('n-jan', 1, range(1, 10), in_norway),
        calendar_flags('s-dec', 12, range(24, 32), in_sweden),
        calendar_flags('s-jan', 1, range(1, 16), in_sweden),
        # Start of May for every country, second half of May for Norway.
        calendar_flags('may', 5, range(1, 11)),
        calendar_flags('may', 5, range(16, 28), in_norway),
        # June 7-14 for Sweden.
        calendar_flags('june', 6, range(7, 15), in_sweden),
        # Days around Midsummer and the first Sunday of November, for the
        # countries other than Norway.
        offset_flags('midsummer_day', midsummer_day_date, range(-2, 11), not_norway),
        offset_flags('sun_nov', sun_nov_date, range(-1, 10), not_norway),
        # First half of December (Independence Day of Finland, 6th of December)
        calendar_flags('dec', 12, range(6, 14), in_finland),
        # Days relative to Easter, separately per country.
        offset_flags('easter', easter_date, easter_offsets, in_finland),
        offset_flags('n-easter', easter_date, easter_offsets, in_norway),
        offset_flags('s-easter', easter_date, easter_offsets, in_sweden),
    ]

    return pd.concat([new_df, *holiday_frames], axis=1)

# Rebuild the frames with the holiday features; `features` comes from the
# test frame so it excludes `date` and `num_sold`.
test_df = engineer(original_test_df)
features = test_df.columns

train_df = engineer(original_train_df)
train_df['date'] = original_train_df['date']
train_df['num_sold'] = original_train_df['num_sold'].astype(np.float32)

train_df[features] = train_df[features].astype(np.float32)
test_df[features] = test_df[features].astype(np.float32)

preproc, model = fit_model(train_df)

# Predictions are in log space; exponentiate to get sales.
train_pred_df = original_train_df.copy()
train_pred_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))

# Daily residuals, this time for Norway, based on the per-day sums.
norway_df = train_pred_df[train_pred_df.country == 'Norway']
by_date = norway_df.groupby(train_pred_df['date'])
pred_total = by_date.pred.sum()
sold_total = by_date.num_sold.sum()
residuals = (pred_total - sold_total) / (pred_total + sold_total) * 200

plot_all_residuals(residuals)

# Zoom in around the first day of selected months; each entry is
# (month, half-width of the window in days).
for month, half_width in [(1, 15), (4, 15), (5, 30), (6, 30), (7, 30), (11, 15), (12, 30)]:
    plot_around(residuals, month, 1, half_width)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第16张图片

可以发现将每年的节假日加入特征之后,我们的误差得到了很好的改善。接下来来看看评估分数

def fit_model(X_tr, X_va=None, outliers=False):
    """Scale the data, fit the linear model and optionally validate it.

    Args:
        X_tr: training frame with the columns named by the module-level
            ``features`` plus ``num_sold``.
        X_va: optional validation frame with the same columns. When given,
            the out-of-fold predictions are written into the module-level
            ``oof`` series, the fold's SMAPE is printed and appended to the
            module-level ``score_list``, and for fold 0 a scatter plot of
            true vs. predicted values is shown. This branch reads the
            module-level loop variables ``run`` and ``fold``.
        outliers: accepted for compatibility; not used by this function.

    Returns:
        Tuple ``(preproc, model)``: the fitted ``StandardScaler`` and the
        fitted ``LinearRegression`` (which predicts log(num_sold)).
    """
    start_time = datetime.now()

    # Standardize the features and fit on the log of the target.
    preproc = StandardScaler()
    scaled_train = preproc.fit_transform(X_tr[features])
    log_target = np.log(X_tr.num_sold.values)

    model = LinearRegression()
    model.fit(scaled_train, log_target)

    # Nothing to validate: return the fitted objects right away.
    if X_va is None:
        return preproc, model

    # Inference for validation, converted back from log space.
    scaled_valid = preproc.transform(X_va[features])
    y_va = X_va.num_sold.values.reshape(-1, 1)
    y_va_pred = np.exp(model.predict(scaled_valid)).reshape(-1, 1)
    oof.update(pd.Series(y_va_pred.ravel(), index=X_va.index))

    # Evaluation: execution time and SMAPE. No correction factor is applied
    # to the predictions here, so both printed values are the same.
    smape_before_correction = np.mean(smape_loss(y_va, y_va_pred))
    smape = np.mean(smape_loss(y_va, y_va_pred))
    print(f"Fold {run}.{fold} | {str(datetime.now() - start_time)[-12:-7]}"
          f" | SMAPE: {smape:.5f}   (before correction: {smape_before_correction:.5f})")
    score_list.append(smape)

    # Plot y_true vs. y_pred for the first fold only.
    if fold == 0:
        plt.figure(figsize=(10, 10))
        plt.scatter(y_va, y_va_pred, s=1, color='r')
        # Dashed diagonal spanning the current x-range, as a reference for
        # perfect predictions.
        x_low, x_high = plt.xlim()
        plt.plot([x_low, x_high], [x_low, x_high], '--', color='k')
        plt.gca().set_aspect('equal')
        plt.xlabel('y_true')
        plt.ylabel('y_pred')
        plt.title('OOF Predictions')
        plt.show()

    return preproc, model
#%%time
RUNS = 1 # should be 1. increase the number of runs only if you want see how the result depends on the random seed
OUTLIERS = True
TRAIN_VAL_CUT = datetime(2018, 1, 1)
LOSS_CORRECTION = 1

# Make the results reproducible
np.random.seed(202100)

total_start_time = datetime.now()

# Out-of-fold predictions and per-fold scores; both are filled in by
# fit_model() during validation.
oof = pd.Series(0.0, index=train_df.index)
score_list = []

# Group by calendar year so that a year never appears in both the training
# and the validation part of a fold.
years = train_df.date.dt.year
for run in range(RUNS):
    kf = GroupKFold(n_splits=4)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, groups=years)):
        print(f"Fold {run}.{fold}")
        preproc, model = fit_model(train_df.iloc[train_idx], train_df.iloc[val_idx])

average_smape = sum(score_list) / len(score_list)
print(f"Average SMAPE: {average_smape:.5f}")

# Keep the out-of-fold predictions for later use.
with open('oof.pickle', 'wb') as handle:
    pickle.dump(oof, handle)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第17张图片

# Plot all num_sold_true and num_sold_pred (five years) for one country-store-product combination
def plot_five_years_combination(engineer, country='Norway', store='KaggleMart', product='Kaggle Hat'):
    """Plot predicted and true sales of one country/store/product combination.

    Args:
        engineer: feature-engineering function applied to the synthetic
            daily frame covering 2015-01-01 to 2019-12-31.
        country: country to plot.
        store: store to plot.
        product: product to plot.

    Uses the module-level ``model``, ``preproc``, ``features``, ``train_df``
    and ``original_train_df``. Both curves are drawn against their row
    position, not against the date. Shows the figure; returns nothing.
    """
    # One synthetic row per day of the five years for the chosen combination.
    daily_rows = pd.DataFrame({'row_id': 0,
                               'date': pd.date_range('2015-01-01', '2019-12-31', freq='D'),
                               'country': country,
                               'store': store,
                               'product': product}).set_index('date', drop=False)
    demo_df = engineer(daily_rows)
    # The model predicts log(num_sold); exponentiate to get sales.
    log_pred = model.predict(preproc.transform(demo_df[features]))
    demo_df['num_sold'] = np.exp(log_pred)

    # True sales of the same combination, selected via the raw training table.
    is_combination = ((original_train_df.country == country)
                      & (original_train_df.store == store)
                      & (original_train_df['product'] == product))
    train_subset = train_df[is_combination]

    plt.figure(figsize=(20, 6))
    plt.plot(np.arange(len(demo_df)), demo_df.num_sold, label='prediction')
    plt.plot(np.arange(len(train_subset)), train_subset.num_sold, label='true',  color='red')
    plt.legend()
    plt.title('Predictions and true num_sold for five years')
    plt.show()

# Plot predictions against true sales for the default combination
# (Norway / KaggleMart / Kaggle Hat).
plot_five_years_combination(engineer)

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第18张图片

生成提交数据

# Fit the model on the complete training data
train_idx = np.arange(len(train_df))
X_tr = train_df.iloc[train_idx]
preproc, model = fit_model(X_tr, None)

plot_five_years_combination(engineer) # Quick check for debugging

# Inference for test: predictions are in log space, so exponentiate, then
# apply the correction factor.
test_pred_list = [np.exp(model.predict(preproc.transform(test_df[features]))) * LOSS_CORRECTION]

# Create the submission file from the average of the collected predictions
sub = original_test_df[['row_id']].copy()
sub['num_sold'] = sum(test_pred_list) / len(test_pred_list)
sub.to_csv('submission_linear_model.csv', index=False)

# Plot the distribution of the test predictions next to the distribution of
# the training targets, using the same bins for both.
bins = np.linspace(0, 3000, 201)
plt.figure(figsize=(16,3))
plt.hist(train_df['num_sold'], bins=bins, density=True, label='Training')
plt.hist(sub['num_sold'], bins=bins, density=True, rwidth=0.5, label='Test predictions')
plt.xlabel('num_sold')
plt.ylabel('Frequency')
plt.legend()
plt.show()
sub

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第19张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第20张图片
Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第21张图片

因为销售数量是整数,我们再对预测结果进行四舍五入,两个版本都提交,看看哪个分数更好(SMAPE越低越好)。

# Create a rounded submission file: same rows as `sub`, with the predicted
# sales rounded to whole numbers.
sub_rounded = sub.assign(num_sold=sub['num_sold'].round())
sub_rounded.to_csv('submission_linear_model_rounded.csv', index=False)
sub_rounded

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第22张图片

未四舍五入的分数
在这里插入图片描述
四舍五入的分数

Kaggle Tabular Playground Series - Jan 2022 学习笔记2(使用时间序列的线性回归)_第23张图片

相关链接:

Kaggle Tabular Playground Series - Jan 2022 学习笔记1(数据分析)

你可能感兴趣的:(机器学习,python,机器学习,人工智能,算法)