案例:2015/1/1至2015/2/6某餐厅销售数据进行建模
参考链接:
1.https://zhuanlan.zhihu.com/p/54985638
2.https://zhuanlan.zhihu.com/p/35128342
3.https://www.kaggle.com/pratyushakar/time-series-analysis-using-arima-sarima
statsmodels.tsa.arima_model文档:https://www.statsmodels.org/stable/search.html?q=statsmodels.tsa.arima_model
1:原始数据平稳性判断
2:差分数据平稳性判断
3:对差分数据建模,并得到预测值
4:返回原数据的预测值,并画出拟合图
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
excelFile = 'C:/Users/admin/Desktop/Jupyter/数据分析/python数据分析-从入门到精通/第12周/arima_data.xls'
data = pd.read_excel(excelFile, index_col = u'日期')
data = pd.DataFrame(data,dtype=np.float64)
plt.figure(figsize=(10, 6))
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
data["销量"].plot()
plt.xlabel('日期',fontsize=12,verticalalignment='top')
plt.ylabel('销量',fontsize=14,horizontalalignment='center')
plt.show()
data.head()
|
销量 |
日期 |
|
2015-01-01 |
3023.0 |
2015-01-02 |
3039.0 |
2015-01-03 |
3056.0 |
2015-01-04 |
3138.0 |
2015-01-05 |
3188.0 |
fig = plt.figure(figsize=(12,8))
ax1=fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(data,lags=20,ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(data,lags=20,ax=ax2)
plt.show()
temp = np.array(data["销量"])
t = adfuller(temp)
output=pd.DataFrame(index=['Test Statistic Value', "p-value", "Lags Used", "Number of Observations Used","Critical Value(1%)","Critical Value(5%)","Critical Value(10%)"],columns=['value'])
output['value']['Test Statistic Value'] = t[0]
output['value']['p-value'] = t[1]
output['value']['Lags Used'] = t[2]
output['value']['Number of Observations Used'] = t[3]
output['value']['Critical Value(1%)'] = t[4]['1%']
output['value']['Critical Value(5%)'] = t[4]['5%']
output['value']['Critical Value(10%)'] = t[4]['10%']
output
|
value |
Test Statistic Value |
1.81377 |
p-value |
0.998376 |
Lags Used |
10 |
Number of Observations Used |
26 |
Critical Value(1%) |
-3.71121 |
Critical Value(5%) |
-2.98125 |
Critical Value(10%) |
-2.63009 |
from statsmodels.stats.diagnostic import acorr_ljungbox
print(acorr_ljungbox(data["销量"], lags=1))
(array([ 32.0111333]), array([ 1.53291527e-08]))
data1= data["销量"].diff(1)
plt.figure(figsize=(10, 6))
data1.plot()
plt.xlabel('日期',fontsize=12,verticalalignment='top')
plt.ylabel('销量差分',fontsize=14,horizontalalignment='center')
plt.show()
temp = np.diff(data["销量"])
t = adfuller(temp)
output=pd.DataFrame(index=['Test Statistic Value', "p-value", "Lags Used", "Number of Observations Used","Critical Value(1%)","Critical Value(5%)","Critical Value(10%)"],columns=['value'])
output['value']['Test Statistic Value'] = t[0]
output['value']['p-value'] = t[1]
output['value']['Lags Used'] = t[2]
output['value']['Number of Observations Used'] = t[3]
output['value']['Critical Value(1%)'] = t[4]['1%']
output['value']['Critical Value(5%)'] = t[4]['5%']
output['value']['Critical Value(10%)'] = t[4]['10%']
output
|
value |
Test Statistic Value |
-3.15606 |
p-value |
0.0226734 |
Lags Used |
0 |
Number of Observations Used |
35 |
Critical Value(1%) |
-3.63274 |
Critical Value(5%) |
-2.94851 |
Critical Value(10%) |
-2.61302 |
sales = list(np.diff(data["销量"]))
data2 = {
"日期":data1.index[1:],
"销量":sales
}
df = pd.DataFrame(data2)
df['日期'] = pd.to_datetime(df['日期'])
data_diff = df.set_index(['日期'], drop=True)
data_diff.head()
|
销量 |
日期 |
|
2015-01-02 |
16.0 |
2015-01-03 |
17.0 |
2015-01-04 |
82.0 |
2015-01-05 |
50.0 |
2015-01-06 |
36.0 |
fig = plt.figure(figsize=(12,8))
ax1=fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(data_diff,lags=20,ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(data_diff,lags=20,ax=ax2)
plt.show()
sm.tsa.arma_order_select_ic(data_diff,max_ar=6,max_ma=4,ic='aic')['aic_min_order']
(0, 1)
```python
pmax = int(len(df) / 10)
qmax = int(len(df) / 10)
bic_matrix = []
for p in range(pmax +1):
temp= []
for q in range(qmax+1):
try:
temp.append(ARIMA(data, (p, 1, q)).fit().bic)
except:
temp.append(None)
bic_matrix.append(temp)
bic_matrix = pd.DataFrame(bic_matrix)
p,q = bic_matrix.stack().idxmin()
print(u'BIC 最小的p值 和 q 值:%s,%s' %(p,q))
BIC 最小的p值 和 q 值:0,1
model = ARIMA(data, (p,1,q)).fit()
model.summary2()
Model: |
ARIMA |
BIC: |
422.5101 |
Dependent Variable: |
D.销量 |
Log-Likelihood: |
-205.88 |
Date: |
2020-04-27 22:21 |
Scale: |
1.0000 |
No. Observations: |
36 |
Method: |
css-mle |
Df Model: |
2 |
Sample: |
01-02-2015 |
Df Residuals: |
34 |
|
02-06-2015 |
Converged: |
1.0000 |
S.D. of innovations: |
73.086 |
AIC: |
417.7595 |
HQIC: |
419.418 |
|
Coef. |
Std.Err. |
t |
P>|t| |
[0.025 |
0.975] |
const |
49.9564 |
20.1390 |
2.4806 |
0.0182 |
10.4847 |
89.4281 |
ma.L1.D.销量 |
0.6710 |
0.1648 |
4.0712 |
0.0003 |
0.3480 |
0.9941 |
|
Real |
Imaginary |
Modulus |
Frequency |
MA.1 |
-1.4902 |
0.0000 |
1.4902 |
0.5000 |
predictions_ARIMA_diff = pd.Series(model.fittedvalues, copy=True)
print(predictions_ARIMA_diff.head())
日期
2015-01-02 49.956427
2015-01-03 34.245128
2015-01-04 39.803780
2015-01-05 76.789439
2015-01-06 32.393775
dtype: float64
plt.figure(figsize=(10, 6))
plt.plot(predictions_ARIMA_diff,label="forecast_diff")
plt.plot(data_diff,label="diff")
plt.xlabel('日期',fontsize=12,verticalalignment='top')
plt.ylabel('销量差分',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
predictions = [i + j for i, j in zip(list(predictions_ARIMA_diff), list(data["销量"][:36]))]
prediction_sales = {
"日期":data1.index[1:],
"销量":predictions
}
prediction_sales = pd.DataFrame(prediction_sales)
prediction_sales['日期'] = pd.to_datetime(prediction_sales['日期'])
prediction_sales = prediction_sales.set_index(['日期'], drop=True)
prediction_sales.head()
|
销量 |
日期 |
|
2015-01-02 |
3072.956427 |
2015-01-03 |
3073.245128 |
2015-01-04 |
3095.803780 |
2015-01-05 |
3214.789439 |
2015-01-06 |
3220.393775 |
plt.figure(figsize=(10, 6))
plt.plot(prediction_sales,label="forecast")
plt.plot(data,label="real")
plt.xlabel('日期',fontsize=12,verticalalignment='top')
plt.ylabel('销量',fontsize=14,horizontalalignment='center')
plt.legend()
plt.show()
model.forecast(5)
(array([ 4873.9667493 , 4923.92317644, 4973.87960359, 5023.83603073,
5073.79245787]),
array([ 73.08574293, 142.32679918, 187.542821 , 223.80281869,
254.95704265]),
array([[ 4730.72132537, 5017.21217324],
[ 4644.96777602, 5202.87857687],
[ 4606.30242887, 5341.4567783 ],
[ 4585.19056646, 5462.48149499],
[ 4574.08583666, 5573.49907907]])