学习资源:https://github.com/thedataincubator/ds30_3
初次试着把代码调通
- 导入模块
import csv
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')
from datetime import datetime as dt
import holtwinters as hw
from math import sqrt
from sklearn import linear_model
import numpy as np
from operator import add
import sklearn
实例2
Forecasting the national average cost of a brand drug using Holt-Winters triple exponential smoothing.
- 导入数据
# Read the CSV and, in the same expression, replace the 'date' column
# (strings in MM/DD/YYYY form) with parsed pandas timestamps.
ndi_df = pd.read_csv('monthly_ndi_official.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%m/%d/%Y')
)
- 查看数据趋势
# Keep only the columns needed for the time-series work, on an independent copy.
ts_df = ndi_df[['date', 'brand']].copy()
ts_df.plot(y='brand')

# Express 'brand' relative to its value in the row labelled 0, then take the
# natural log so that a straight line can be fitted to it.
ts_df['index'] = ts_df['brand'].div(ts_df.loc[0, 'brand'])
ts_df['log_index'] = np.log(ts_df['index'])

# Single-feature regression: the row label is the only predictor, so each
# sample is wrapped in its own one-element list.
X_train = [[row_label] for row_label in ts_df.index]
y_train = ts_df['log_index']
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Slope, intercept and the fit's score on the training data.
m, b = regr.coef_[0], regr.intercept_
print(m, b, regr.score(X_train, y_train))
0.011338121061252584 -0.0016371712335241373 0.9947050500814869
# Evaluate the fitted line once; it is used both for the plot and for the
# residual columns below.
fitted_log_index = regr.predict(X_train)

# Observed log-index (points and connecting line) with the fitted line on top.
plt.scatter(X_train, y_train)
plt.plot(X_train, y_train)
plt.plot(X_train, fitted_log_index, color='blue', linewidth=2)

# Store the fitted values and what the linear fit leaves unexplained.
ts_df['index_log_model'] = fitted_log_index
ts_df['index_log_resid'] = ts_df['log_index'].sub(ts_df['index_log_model'])

# Date-indexed copy (the 'date' column is kept as well) for plotting residuals.
temp_df = ts_df.set_index('date', drop=False)
temp_df.plot(x='date', y='index_log_resid')
resids = temp_df['index_log_resid']

# Number of trailing observations withheld from the smoother.
overlap = 12
train_resids = resids[:-overlap]

# presumably the arguments are (series, season length, forecast horizon) —
# confirm against the holtwinters module. The horizon of 12 + overlap covers
# the withheld observations plus 12 further steps.
hw_forecast = hw.additive(list(train_resids), 12, 12 + overlap)

# Elements 1..3 and 4 of the result are reported as the smoothing parameters
# and the fit's RMSE respectively.
print("alpha is %s, beta is %s, gamma is %s." % hw_forecast[1:4])
print("rmse is %s." % hw_forecast[4])

# Training residuals followed by the forecast values (element 0 of the result).
temp_ls = list(train_resids) + hw_forecast[0]

# Full residual series versus training-plus-forecast (dashed).
plt.plot(list(resids))
plt.plot(temp_ls, '--')
alpha is 0.0, beta is 0.0, gamma is 1.0.
rmse is 0.0018246112687583863.
# Rebuild the forecast on the original 'brand' scale: linear log-trend plus the
# residual series (training residuals + Holt-Winters forecast), then undo the
# log and the normalisation by the 'brand' value in the row labelled 0.
line_ls = [m * step + b for step in range(len(temp_ls))]
new_log_model_ls = list(map(add, line_ls, temp_ls))
fc_ls = ts_df.loc[0, 'brand'] * np.exp(new_log_model_ls)

# Observed series versus reconstructed/forecast series (dashed).
plt.plot(ts_df['brand'])
plt.plot(fc_ls, '--')

# `import sklearn` on its own does not guarantee that the `metrics` submodule
# is loaded; import it explicitly instead of relying on another import's
# side effect.
import sklearn.metrics

# Score the forecast on the last `overlap` observations, i.e. the ones that
# were withheld from the Holt-Winters fit. Both sides use positional,
# end-exclusive slices so their lengths always agree; the previous
# `.loc[24:36]` was label-based and end-inclusive while `fc_ls[24:36]` was not.
holdout_start = len(ts_df) - overlap
holdout_stop = len(ts_df)
rmse = sklearn.metrics.mean_squared_error(
    ts_df['brand'].iloc[holdout_start:holdout_stop],
    fc_ls[holdout_start:holdout_stop],
) ** 0.5

# NOTE(review): 340. looks like a hand-picked typical 'brand' level used to
# express the RMSE as a fraction — confirm, or derive it from the data.
print(rmse, rmse / 340.)
2.305782393554999 0.006781712922220585