Previously we did a basic exploratory analysis of TPSJAN22 (see: Kaggle Tabular Playground Series - Jan 2022 Study Notes 1 (Data Analysis)). Now let's train a model using time-series features and linear regression.
Competition page: Tabular Playground Series - Jan 2022
This post follows: TPSJAN22-03 Linear Model
import pandas as pd
import numpy as np
import pickle
import math
import matplotlib.pyplot as plt
import dateutil.easter as easter
from matplotlib.ticker import MaxNLocator
from datetime import datetime, date, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression, HuberRegressor, Ridge, Lasso
import matplotlib.dates as mdates
original_train_df = pd.read_csv('../datas/train.csv')
original_test_df = pd.read_csv('../datas/test.csv')
gdp_df = pd.read_csv('../datas/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv')
gdp_df.set_index('year', inplace=True)
for df in [original_train_df, original_test_df]:
    df['date'] = pd.to_datetime(df.date)
original_train_df.head(2)
TPSJAN22 is scored with SMAPE, which scikit-learn does not provide out of the box, so we first define it ourselves.
def smape_loss(y_true, y_pred):
    """SMAPE loss (per-sample; take the mean for the final score)"""
    return np.abs(y_true - y_pred) / (y_true + np.abs(y_pred)) * 200
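A quick sanity check with toy numbers (not from the dataset): smape_loss returns per-sample values, and np.mean aggregates them into the final score.
y_true = np.array([100.0, 200.0])
y_pred = np.array([105.0, 190.0])
print(smape_loss(y_true, y_pred))           # [4.878..., 5.128...]
print(np.mean(smape_loss(y_true, y_pred)))  # ~5.00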
Next we engineer features according to the hypotheses from our earlier data analysis.
def engineer(df):
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]
    # Add (log) GDP; add weekly seasonal indicators (Monday is the baseline)
    new_df = pd.DataFrame({'gdp': np.log(df.apply(get_gdp, axis=1)),
                           'wd2': df.date.dt.weekday == 1,
                           'wd3': df.date.dt.weekday == 2,
                           'wd4': df.date.dt.weekday == 3,
                           'wd5': df.date.dt.weekday == 4,
                           'wd6': df.date.dt.weekday == 5,
                           'wd7': df.date.dt.weekday == 6,
                           })
    # One-hot encode country, store and product
    # (the last category of each is omitted as the baseline)
    for country in ['Finland', 'Norway']:
        new_df[country] = df.country == country
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    for product in ['Kaggle Mug', 'Kaggle Hat']:
        new_df[product] = df['product'] == product
    # Fourier features: two sine/cosine pairs (k = 1, 2), interacted with each product
    dayofyear = df.date.dt.dayofyear
    for k in range(1, 3):
        new_df[f'sin{k}'] = np.sin(dayofyear / 365 * 2 * math.pi * k)
        new_df[f'cos{k}'] = np.cos(dayofyear / 365 * 2 * math.pi * k)
        new_df[f'mug_sin{k}'] = new_df[f'sin{k}'] * new_df['Kaggle Mug']
        new_df[f'mug_cos{k}'] = new_df[f'cos{k}'] * new_df['Kaggle Mug']
        new_df[f'hat_sin{k}'] = new_df[f'sin{k}'] * new_df['Kaggle Hat']
        new_df[f'hat_cos{k}'] = new_df[f'cos{k}'] * new_df['Kaggle Hat']
    return new_df
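To build intuition for the Fourier terms, here is a minimal standalone sketch (toy arrays, not the competition data): the order-k pair oscillates with period 365/k days, so a linear combination of the pairs can trace a smooth seasonal curve.
dayofyear_demo = np.arange(1, 366)
plt.figure(figsize=(12, 3))
for k in (1, 2):
    # the order-k sine completes k full cycles over the year
    plt.plot(dayofyear_demo, np.sin(dayofyear_demo / 365 * 2 * math.pi * k), label=f'sin{k}')
plt.legend()
plt.show()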
Let's inspect the engineered features.
train_df = engineer(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer(original_test_df)
features = test_df.columns
for df in [train_df, test_df]:
    df[features] = df[features].astype(np.float32)
train_df
list(train_df)
['gdp', 'wd2', 'wd3', 'wd4', 'wd5', 'wd6', 'wd7', 'Finland', 'Norway', 'KaggleRama', 'Kaggle Mug', 'Kaggle Hat', 'sin1', 'cos1', 'mug_sin1', 'mug_cos1', 'hat_sin1', 'hat_cos1', 'sin2', 'cos2', 'mug_sin2', 'mug_cos2', 'hat_sin2', 'hat_cos2', 'date', 'num_sold']
Now we can train the model.
def fit_model(X_tr):
    # Preprocess the data
    X_tr_f = X_tr[features]
    preproc = StandardScaler()
    X_tr_f = preproc.fit_transform(X_tr_f)
    y_tr = X_tr.num_sold.values.reshape(-1, 1)
    model = LinearRegression()
    # scikit-learn has no SMAPE loss, but log-transforming the target and
    # minimizing the default least-squares loss approximates SMAPE well.
    # For details see the first paragraph of:
    # https://www.kaggle.com/code/ambrosm/tpsjan22-03-linear-model/notebook#Training-the-simple-model-(without-holidays)
    # and this discussion:
    # https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298473
    model.fit(X_tr_f, np.log(y_tr).ravel())
    return preproc, model
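Why does fitting log(num_sold) with least squares approximate SMAPE? Writing the prediction as p = t * exp(eps), SMAPE = 200 * |exp(eps) - 1| / (exp(eps) + 1) = 200 * tanh(|eps| / 2), which is approximately 100 * |eps| for small eps, so SMAPE is nearly proportional to the absolute log error. A quick numeric check with toy values:
t, p = 100.0, 105.0
smape = 200 * abs(p - t) / (p + t)          # 4.878...
log_err = 100 * abs(np.log(p) - np.log(t))  # 4.879...
print(smape, log_err)  # nearly identical for small relative errors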
preproc, model = fit_model(train_df)
train_pred_df = original_train_df.copy()
# The model predicts the log of the target, so exponentiate the output to recover the actual prediction
train_pred_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))
train_pred_df
Next we pick Finland and look at the SMAPE-style residuals between predictions and targets for each product in each store.
plt.figure(figsize=(18, 100))
for i, (combi, df) in enumerate(train_pred_df[train_pred_df.country == 'Finland'].groupby(['store', 'product'])):
    df = df.set_index('date')
    residuals = (df.pred - df.num_sold) / (df.pred + df.num_sold) * 200
    ax = plt.subplot(18, 1, i+1, ymargin=0.5)
    ax.scatter(df.index, residuals, s=1, color='k')
    ax.set_title(combi)
    ax.vlines(pd.date_range('2014-12-31', '2019-01-01', freq='M'),
              plt.ylim()[0], plt.ylim()[1], alpha=0.5)
    ax.vlines(pd.date_range('2014-12-31', '2019-01-01', freq='Y'),
              plt.ylim()[0], plt.ylim()[1], alpha=0.5)
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.xaxis.set_minor_locator(mdates.MonthLocator())
    ax.xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
    ax.set_ylim(residuals.min(), residuals.max())
plt.tight_layout(h_pad=3.0)
plt.suptitle('Residuals for four years', y=1.003)
plt.show()
The residuals look very similar across all store-product combinations, so we can simply look at Finland's daily residuals aggregated over all products.
# Plot all residuals (four-year range, sum of all products)
def plot_all_residuals(residuals):
    plt.figure(figsize=(20, 6))
    plt.scatter(residuals.index, residuals, s=1, color='k')
    plt.vlines(pd.date_range('2014-12-31', '2019-01-01', freq='M'),
               plt.ylim()[0], plt.ylim()[1], alpha=0.5)
    plt.vlines(pd.date_range('2014-12-31', '2019-01-01', freq='Y'),
               plt.ylim()[0], plt.ylim()[1], alpha=0.5)
    plt.gca().xaxis.set_major_locator(mdates.YearLocator())
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    plt.gca().xaxis.set_minor_locator(mdates.MonthLocator())
    plt.gca().xaxis.set_minor_formatter(mdates.DateFormatter('%m'))
    plt.title('Residuals for four years')
    plt.show()
by_date = train_pred_df[train_pred_df.country == 'Finland'].groupby(train_pred_df['date'])
residuals = (by_date.pred.sum() - by_date.num_sold.sum()) / (by_date.pred.sum() + by_date.num_sold.sum()) * 200
plot_all_residuals(residuals)
There are large errors in January and December of every year, matching our earlier hypothesis from the feature analysis. We probably need to extract the dates around the turn of the year as features. Let's zoom in on the December/January errors:
# Plot residuals for interesting intervals
def plot_around(residuals, m, d, w):
    """Plot residuals in an interval of width 2*w days around month=m, day=d"""
    plt.figure(figsize=(20, 6))
    plt.title(f"Residuals around m={m} d={d}")
    for y in np.arange(2015, 2020):
        d0 = pd.Timestamp(date(y, m, d))
        residual_range = residuals[(residuals.index > d0 - timedelta(w)) &
                                   (residuals.index < d0 + timedelta(w))]
        plt.plot([(r - d0).days for r in residual_range.index], residual_range, label=str(y))
    plt.gca().xaxis.set_major_locator(MaxNLocator(2*w, integer=True))  # only integer labels
    plt.legend()
    plt.show()
plot_around(residuals, 1, 1, 30)
The residuals swing wildly between December 24 and January 4, so we flag every date in that range as a feature and retrain the model.
# Feature engineering
def engineer(df):
    """Return a new dataframe with the engineered features"""
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]
    new_df = pd.DataFrame({'gdp': np.log(df.apply(get_gdp, axis=1)),
                           'wd2': df.date.dt.weekday == 1,
                           'wd3': df.date.dt.weekday == 2,
                           'wd4': df.date.dt.weekday == 3,
                           'wd5': df.date.dt.weekday == 4,
                           'wd6': df.date.dt.weekday == 5,
                           'wd7': df.date.dt.weekday == 6,
                           })
    # One-hot encoding (no need to encode the last categories)
    for country in ['Finland', 'Norway']:
        new_df[country] = df.country == country
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    for product in ['Kaggle Mug', 'Kaggle Hat']:
        new_df[product] = df['product'] == product
    # Seasonal variations (Fourier series)
    # The three products have different seasonal patterns
    dayofyear = df.date.dt.dayofyear
    for k in range(1, 3):
        temp_sin = np.sin(dayofyear / 365 * 2 * math.pi * k)
        temp_cos = np.cos(dayofyear / 365 * 2 * math.pi * k)
        new_df[f'mug_sin{k}'] = temp_sin * new_df['Kaggle Mug']
        new_df[f'mug_cos{k}'] = temp_cos * new_df['Kaggle Mug']
        new_df[f'hat_sin{k}'] = temp_sin * new_df['Kaggle Hat']
        new_df[f'hat_cos{k}'] = temp_cos * new_df['Kaggle Hat']
    # End-of-year indicators (Dec 24 - Jan 4, Finland)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(1, 5)})],
                       axis=1)
    return new_df
train_df = engineer(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer(original_test_df)
features = test_df.columns
for df in [train_df, test_df]:
    df[features] = df[features].astype(np.float32)
preproc, model = fit_model(train_df)
train_pred_df = original_train_df.copy()
train_pred_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))
by_date = train_pred_df[train_pred_df.country == 'Finland'].groupby(train_pred_df['date'])
residuals = (by_date.pred.mean() - by_date.num_sold.mean()) / (by_date.pred.mean() + by_date.num_sold.mean()) * 200
plot_all_residuals(residuals)
plot_around(residuals, 1, 1, 30)
There still seems to be a clear error between January 4 and 15, and early December looks off as well. Let's first extend the January feature range to the 15th.
# Feature engineering
def engineer(df):
    """Return a new dataframe with the engineered features"""
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]
    new_df = pd.DataFrame({'gdp': np.log(df.apply(get_gdp, axis=1)),
                           'wd2': df.date.dt.weekday == 1,
                           'wd3': df.date.dt.weekday == 2,
                           'wd4': df.date.dt.weekday == 3,
                           'wd5': df.date.dt.weekday == 4,
                           'wd6': df.date.dt.weekday == 5,
                           'wd7': df.date.dt.weekday == 6,
                           })
    # One-hot encoding (no need to encode the last categories)
    for country in ['Finland', 'Norway']:
        new_df[country] = df.country == country
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    for product in ['Kaggle Mug', 'Kaggle Hat']:
        new_df[product] = df['product'] == product
    # Seasonal variations (Fourier series)
    # The three products have different seasonal patterns
    dayofyear = df.date.dt.dayofyear
    for k in range(1, 3):
        temp_sin = np.sin(dayofyear / 365 * 2 * math.pi * k)
        temp_cos = np.cos(dayofyear / 365 * 2 * math.pi * k)
        new_df[f'mug_sin{k}'] = temp_sin * new_df['Kaggle Mug']
        new_df[f'mug_cos{k}'] = temp_cos * new_df['Kaggle Mug']
        new_df[f'hat_sin{k}'] = temp_sin * new_df['Kaggle Hat']
        new_df[f'hat_cos{k}'] = temp_cos * new_df['Kaggle Hat']
    # End-of-year indicators (Dec 24 - Jan 15, Finland)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(1, 16)})],
                       axis=1)
    return new_df
train_df = engineer(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer(original_test_df)
features = test_df.columns
for df in [train_df, test_df]:
    df[features] = df[features].astype(np.float32)
preproc, model = fit_model(train_df)
train_pred_df = original_train_df.copy()
train_pred_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))
by_date = train_pred_df[train_pred_df.country == 'Finland'].groupby(train_pred_df['date'])
residuals = (by_date.pred.mean() - by_date.num_sold.mean()) / (by_date.pred.mean() + by_date.num_sold.mean()) * 200
plot_all_residuals(residuals)
plot_around(residuals, 1, 1, 30)
The errors around late December and early January are now handled much better. But the scatter plots still show clear errors in April, May, June, July, November and December of each year. Let's zoom in on those intervals.
plot_around(residuals, 4, 1, 30)
plot_around(residuals, 5, 1, 30)
plot_around(residuals, 6, 1, 30)
plot_around(residuals, 7, 1, 30)
plot_around(residuals, 11, 1, 15)
plot_around(residuals, 12, 1, 30)
April shows a similar bump every year, though it arrives earlier in some years and later in others; late June and early November behave the same way. There is also a bump lasting roughly ten days starting on May 1 of each year, and a similar one in early December. Recalling the late-December/early-January effects, we can conjecture that these error spikes are holiday-related. The official holiday calendars for the three countries, shared by a fellow Kaggler, offer some hints:
Holidays_Finland_Norway_Sweden_2015-2019
January has New Year's Day on a fixed date; April has Easter on a moving date; May has International Workers' Day on a fixed date; and so on. Comparing the holiday calendar against the residuals above, we can revise our feature engineering:
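Easter is a moving holiday. dateutil.easter (imported at the top) computes the Western Easter date for a given year, and the revised engineer below relies on it. For reference, the dates it produces:
for y in range(2015, 2020):
    print(y, easter.easter(y))
# 2015-04-05, 2016-03-27, 2017-04-16, 2018-04-01, 2019-04-21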
# Feature engineering
def engineer(df):
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]
    new_df = pd.DataFrame({'gdp': np.log(df.apply(get_gdp, axis=1)),
                           'wd2': df.date.dt.weekday == 1,
                           'wd3': df.date.dt.weekday == 2,
                           'wd4': df.date.dt.weekday == 3,
                           'wd5': df.date.dt.weekday == 4,
                           'wd6': df.date.dt.weekday == 5,
                           'wd7': df.date.dt.weekday == 6,
                           })
    # One-hot encoding (no need to encode the last categories)
    for country in ['Finland', 'Norway']:
        new_df[country] = df.country == country
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    for product in ['Kaggle Mug', 'Kaggle Hat']:
        new_df[product] = df['product'] == product
    # Seasonal variations (Fourier series)
    dayofyear = df.date.dt.dayofyear
    for k in range(1, 3):
        temp_sin = np.sin(dayofyear / 365 * 2 * math.pi * k)
        temp_cos = np.cos(dayofyear / 365 * 2 * math.pi * k)
        new_df[f'mug_sin{k}'] = temp_sin * new_df['Kaggle Mug']
        new_df[f'mug_cos{k}'] = temp_cos * new_df['Kaggle Mug']
        new_df[f'hat_sin{k}'] = temp_sin * new_df['Kaggle Hat']
        new_df[f'hat_cos{k}'] = temp_cos * new_df['Kaggle Hat']
    # Christmas and New Year (per country)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in range(1, 16)}),
                        pd.DataFrame({f"n-dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"n-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in range(1, 10)}),
                        pd.DataFrame({f"s-dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in range(24, 32)}),
                        pd.DataFrame({f"s-jan{d}":
                                      (df.date.dt.month == 1) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in range(1, 16)})],
                       axis=1)
    # Early May (Labour Day); mid-May for Norway (Constitution Day, May 17)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d)
                                      for d in list(range(1, 11))}),
                        pd.DataFrame({f"may{d}":
                                      (df.date.dt.month == 5) & (df.date.dt.day == d) & (df.country == 'Norway')
                                      for d in list(range(16, 28))})],
                       axis=1)
    # Early June (Sweden)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"june{d}":
                                      (df.date.dt.month == 6) & (df.date.dt.day == d) & (df.country == 'Sweden')
                                      for d in list(range(7, 15))})],
                       axis=1)
    # Midsummer Day
    midsummer_day_date = df.date.dt.year.map({2015: pd.Timestamp('2015-06-20'),
                                              2016: pd.Timestamp('2016-06-25'),
                                              2017: pd.Timestamp('2017-06-24'),
                                              2018: pd.Timestamp('2018-06-23'),
                                              2019: pd.Timestamp('2019-06-22')})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"midsummer_day{d}":
                                      (df.date - midsummer_day_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(-2, 11))})],
                       axis=1)
    # First Sunday of November
    sun_nov_date = df.date.dt.year.map({2015: pd.Timestamp('2015-11-1'),
                                        2016: pd.Timestamp('2016-11-6'),
                                        2017: pd.Timestamp('2017-11-5'),
                                        2018: pd.Timestamp('2018-11-4'),
                                        2019: pd.Timestamp('2019-11-3')})
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"sun_nov{d}":
                                      (df.date - sun_nov_date == np.timedelta64(d, "D")) & (df.country != 'Norway')
                                      for d in list(range(-1, 10))})],
                       axis=1)
    # First half of December (Independence Day of Finland, 6th of December)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"dec{d}":
                                      (df.date.dt.month == 12) & (df.date.dt.day == d) & (df.country == 'Finland')
                                      for d in list(range(6, 14))})],
                       axis=1)
    # Easter (moving date), per country
    easter_date = df.date.apply(lambda date: pd.Timestamp(easter.easter(date.year)))
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"easter{d}":
                                      (df.date - easter_date == np.timedelta64(d, "D")) & (df.country == 'Finland')
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})],
                       axis=1)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"n-easter{d}":
                                      (df.date - easter_date == np.timedelta64(d, "D")) & (df.country == 'Norway')
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})],
                       axis=1)
    new_df = pd.concat([new_df,
                        pd.DataFrame({f"s-easter{d}":
                                      (df.date - easter_date == np.timedelta64(d, "D")) & (df.country == 'Sweden')
                                      for d in list(range(-2, 11)) + list(range(40, 48)) + list(range(50, 59))})],
                       axis=1)
    return new_df
train_df = engineer(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = engineer(original_test_df)
features = test_df.columns
for df in [train_df, test_df]:
    df[features] = df[features].astype(np.float32)
preproc, model = fit_model(train_df)
train_pred_df = original_train_df.copy()
train_pred_df['pred'] = np.exp(model.predict(preproc.transform(train_df[features])))
by_date = train_pred_df[train_pred_df.country == 'Norway'].groupby(train_pred_df['date'])
residuals = (by_date.pred.sum() - by_date.num_sold.sum()) / (by_date.pred.sum() + by_date.num_sold.sum()) * 200
plot_all_residuals(residuals)
plot_around(residuals, 1, 1, 15)
plot_around(residuals, 4, 1, 15)
plot_around(residuals, 5, 1, 30)
plot_around(residuals, 6, 1, 30)
plot_around(residuals, 7, 1, 30)
plot_around(residuals, 11, 1, 15)
plot_around(residuals, 12, 1, 30)
Adding the yearly holidays as features clearly improves the residuals. Next, let's compute a validation score.
def fit_model(X_tr, X_va=None, outliers=False):
    """Scale the data, fit a model, plot the training history and validate the model"""
    start_time = datetime.now()
    # Preprocess the data
    X_tr_f = X_tr[features]
    preproc = StandardScaler()
    X_tr_f = preproc.fit_transform(X_tr_f)
    y_tr = X_tr.num_sold.values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X_tr_f, np.log(y_tr).ravel())
    if X_va is not None:
        # Preprocess the validation data
        X_va_f = X_va[features]
        X_va_f = preproc.transform(X_va_f)
        y_va = X_va.num_sold.values.reshape(-1, 1)
        # Inference for validation
        y_va_pred = np.exp(model.predict(X_va_f)).reshape(-1, 1)
        oof.update(pd.Series(y_va_pred.ravel(), index=X_va.index))
        # Evaluation: execution time and SMAPE
        smape_before_correction = np.mean(smape_loss(y_va, y_va_pred))
        # y_va_pred *= LOSS_CORRECTION
        smape = np.mean(smape_loss(y_va, y_va_pred))
        print(f"Fold {run}.{fold} | {str(datetime.now() - start_time)[-12:-7]}"
              f" | SMAPE: {smape:.5f} (before correction: {smape_before_correction:.5f})")
        score_list.append(smape)
        # Plot y_true vs. y_pred
        if fold == 0:
            plt.figure(figsize=(10, 10))
            plt.scatter(y_va, y_va_pred, s=1, color='r')
            plt.plot([plt.xlim()[0], plt.xlim()[1]], [plt.xlim()[0], plt.xlim()[1]], '--', color='k')
            plt.gca().set_aspect('equal')
            plt.xlabel('y_true')
            plt.ylabel('y_pred')
            plt.title('OOF Predictions')
            plt.show()
    return preproc, model
#%%time
RUNS = 1  # should be 1; increase the number of runs only if you want to see how the result depends on the random seed
OUTLIERS = True
TRAIN_VAL_CUT = datetime(2018, 1, 1)
LOSS_CORRECTION = 1
# Make the results reproducible
np.random.seed(202100)
total_start_time = datetime.now()
oof = pd.Series(0.0, index=train_df.index)
score_list = []
for run in range(RUNS):
    kf = GroupKFold(n_splits=4)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, groups=train_df.date.dt.year)):
        X_tr = train_df.iloc[train_idx]
        X_va = train_df.iloc[val_idx]
        print(f"Fold {run}.{fold}")
        preproc, model = fit_model(X_tr, X_va)
print(f"Average SMAPE: {sum(score_list) / len(score_list):.5f}")
with open('oof.pickle', 'wb') as handle:
    pickle.dump(oof, handle)
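With groups=train_df.date.dt.year and training data covering 2015-2018, GroupKFold(n_splits=4) holds out exactly one full year per fold, so each model is validated on a year it never saw during training. A quick sketch to confirm which year each fold validates on:
kf = GroupKFold(n_splits=4)
for fold, (_, val_idx) in enumerate(kf.split(train_df, groups=train_df.date.dt.year)):
    # each fold's validation set should contain exactly one year
    print(f"Fold {fold}: validation year(s) =", sorted(train_df.date.dt.year.iloc[val_idx].unique()))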
# Plot num_sold (true and predicted, five years) for one country-store-product combination
def plot_five_years_combination(engineer, country='Norway', store='KaggleMart', product='Kaggle Hat'):
    demo_df = pd.DataFrame({'row_id': 0,
                            'date': pd.date_range('2015-01-01', '2019-12-31', freq='D'),
                            'country': country,
                            'store': store,
                            'product': product})
    demo_df.set_index('date', inplace=True, drop=False)
    demo_df = engineer(demo_df)
    demo_df['num_sold'] = np.exp(model.predict(preproc.transform(demo_df[features])))
    plt.figure(figsize=(20, 6))
    plt.plot(np.arange(len(demo_df)), demo_df.num_sold, label='prediction')
    train_subset = train_df[(original_train_df.country == country) & (original_train_df.store == store) & (original_train_df['product'] == product)]
    plt.plot(np.arange(len(train_subset)), train_subset.num_sold, label='true', color='red')
    plt.legend()
    plt.title('Predictions and true num_sold for five years')
    plt.show()
plot_five_years_combination(engineer)
Generate the submission file.
# Fit the model on the complete training data
train_idx = np.arange(len(train_df))
X_tr = train_df.iloc[train_idx]
preproc, model = fit_model(X_tr, None)
plot_five_years_combination(engineer) # Quick check for debugging
# Inference for test
test_pred_list = []
test_pred_list.append(np.exp(model.predict(preproc.transform(test_df[features]))) * LOSS_CORRECTION)
# Create the submission file
sub = original_test_df[['row_id']].copy()
sub['num_sold'] = sum(test_pred_list) / len(test_pred_list)
sub.to_csv('submission_linear_model.csv', index=False)
# Plot the distribution of the test predictions
plt.figure(figsize=(16,3))
plt.hist(train_df['num_sold'], bins=np.linspace(0, 3000, 201),
density=True, label='Training')
plt.hist(sub['num_sold'], bins=np.linspace(0, 3000, 201),
density=True, rwidth=0.5, label='Test predictions')
plt.xlabel('num_sold')
plt.ylabel('Frequency')
plt.legend()
plt.show()
sub
Since sales counts are integers, let's also round the results and submit both files to see which one scores higher.
# Create a rounded submission file
sub_rounded = sub.copy()
sub_rounded['num_sold'] = sub_rounded['num_sold'].round()
sub_rounded.to_csv('submission_linear_model_rounded.csv', index=False)
sub_rounded
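Before submitting both files, a cheap offline proxy (a sketch, assuming the oof series from the cross-validation loop above is still in memory) is to compare the out-of-fold SMAPE with and without rounding:
# Sketch: OOF SMAPE, raw vs. rounded, as a proxy for the two submissions
y_true = train_df.num_sold.values
print("raw:    ", np.mean(smape_loss(y_true, oof.values)))
print("rounded:", np.mean(smape_loss(y_true, oof.values.round())))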