import pandas as pd
from statsmodels.tsa.arima_model import ARMA
import matplotlib.pyplot as plt
import pmdarima as pm
import warnings
import numpy as np
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error as MSE
from datetime import datetime #数据索引改为时间
import statsmodels.api as sm #acf,pacf图
from statsmodels.tsa.stattools import adfuller #adf检验
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from itertools import product # some useful functions
from tqdm import tqdm_notebook
from statsmodels.stats.diagnostic import acorr_ljungbox
# Suppress every warning raised from this point on.
warnings.filterwarnings('ignore')
# Package versions noted alongside this script:
# pmdarima==1.8.4
# numpy==1.19.5
# python3.6
def excelFile(path):
    """Load a time series from an Excel workbook.

    The workbook is read with its ``Date`` column as the index, and that
    index is then converted with ``pd.to_datetime``.

    Args:
        path: Location of the Excel file, handed straight to
            ``pd.read_excel``.

    Returns:
        The loaded ``DataFrame`` with a datetime index.
    """
    frame = pd.read_excel(path, index_col='Date')
    frame.index = pd.to_datetime(frame.index)
    return frame
def whiteNoiseCheck(data):
    """Run a lag-1 Ljung-Box test on ``data`` and report the outcome.

    The element at position 1 of ``acorr_ljungbox``'s result is printed and
    returned. The caller treats it as the p-value: below 0.05 the
    white-noise hypothesis is rejected at the 95% level, otherwise the
    series is considered white noise and not worth modelling.
    NOTE(review): this relies on ``acorr_ljungbox`` returning a
    ``(statistic, p-value)`` tuple - confirm for the installed statsmodels.

    Args:
        data: The series to test.

    Returns:
        ``acorr_ljungbox(data, lags=1)[1]``.
    """
    p_value = acorr_ljungbox(data, lags=1)[1]
    print(p_value)
    return p_value
def ADF_examine(data1):
    """Run an Augmented Dickey-Fuller test and return the key figures.

    ``adfuller`` yields, in order: the test statistic, the p-value, the
    number of lags used, the number of observations used and a dict of
    critical values keyed by ``'1%'``, ``'5%'`` and ``'10%'``. Only the
    statistic and the 1% critical value are needed by the caller, so the
    previously built (and never used) summary table is no longer created.

    Args:
        data1: The series to test; it is converted with ``np.array`` first.

    Returns:
        A tuple ``(test_statistic, critical_value_1pct)``.
    """
    adf_result = adfuller(np.array(data1))
    test_statistic = adf_result[0]
    critical_values = adf_result[4]
    return test_statistic, critical_values['1%']
def stableCheck(timeseries):
    """Plot rolling statistics of a series and print its ADF test summary.

    A 12-period rolling mean and rolling standard deviation are drawn on
    top of the original series, the figure is shown, and the result of
    ``adfuller(timeseries, autolag='AIC')`` is printed as a labelled Series.

    Args:
        timeseries: The series to inspect.

    Returns:
        None. The function only plots and prints.
    """
    # 12-period rolling statistics.
    rolling = timeseries.rolling(window=12)
    rolling_mean = rolling.mean()
    rolling_std = rolling.std()

    # Original series together with its rolling mean and deviation.
    plt.figure(figsize=(12, 8))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolling_mean, color='red', label='Rolling Mean')
    plt.plot(rolling_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # ADF test, reported with a readable label for every figure.
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    summary = pd.Series(adf_result[0:4], index=labels)
    for level, threshold in adf_result[4].items():
        summary['Critical Value (%s)' % level] = threshold
    print('ADF检验结果:')
    print(summary)
def season_fiff(data, n):
    """Apply a first difference followed by a seasonal difference.

    Args:
        data: A pandas Series or DataFrame.
        n: Lag of the seasonal difference taken on the first-differenced
            data.

    Returns:
        The twice-differenced data; rows containing NaN are dropped after
        each differencing step.
    """
    first_order = data.diff(periods=1).dropna()
    return first_order.diff(periods=n).dropna()
def parameter_generation(len_season, *, ps=range(2, 5), qs=range(2, 5),
                         Ps=range(0, 2), Qs=range(0, 2)):
    """Build the grid of SARIMA orders to search.

    Args:
        len_season: Season length. Accepted for interface compatibility;
            it does not influence the generated grid.
        ps: Candidate non-seasonal AR orders (default 2, 3, 4).
        qs: Candidate non-seasonal MA orders (default 2, 3, 4).
        Ps: Candidate seasonal AR orders (default 0, 1).
        Qs: Candidate seasonal MA orders (default 0, 1).

    Returns:
        A list of ``(p, q, P, Q)`` tuples covering every combination of the
        candidate orders, in ``itertools.product`` order.
    """
    return list(product(ps, qs, Ps, Qs))
def optimizeSARIMA(parameters_list, d, D, s, data1):
    """Fit a SARIMAX model per parameter combination and rank them by AIC.

    Args:
        parameters_list: Iterable of ``(p, q, P, Q)`` combinations.
        d: Non-seasonal differencing order.
        D: Seasonal differencing order.
        s: Season length.
        data1: The series the models are fitted on.

    Returns:
        A ``DataFrame`` with columns ``parameters`` and ``aic``, sorted by
        ascending AIC (lower is better) with a fresh 0-based index.

    Raises:
        ValueError: If no combination could be fitted at all.
    """
    results = []
    for param in tqdm_notebook(parameters_list):
        try:
            model = sm.tsa.statespace.SARIMAX(
                data1,
                order=(param[0], d, param[1]),
                seasonal_order=(param[2], D, param[3], s),
            ).fit(disp=-1)
        except Exception:
            # Some combinations cannot be fitted; skip them. Catching
            # Exception (not a bare except) keeps Ctrl-C working during
            # a long search.
            continue
        results.append([param, model.aic])

    if not results:
        raise ValueError(
            'no SARIMA model could be fitted for any of the given parameter combinations'
        )

    result_table = pd.DataFrame(results, columns=['parameters', 'aic'])
    return result_table.sort_values(by='aic', ascending=True).reset_index(drop=True)
def model_struct(parameters_list, data1, d, D, s):
    """Fit the SARIMAX model whose orders gave the lowest AIC.

    The candidates are ranked with ``optimizeSARIMA``; the top-ranked
    ``(p, q, P, Q)`` combination is then fitted again on ``data1``.

    Args:
        parameters_list: Candidate ``(p, q, P, Q)`` combinations.
        data1: The series to model.
        d: Non-seasonal differencing order.
        D: Seasonal differencing order.
        s: Season length.

    Returns:
        The fitted SARIMAX results object for the best combination.
    """
    ranking = optimizeSARIMA(parameters_list, d, D, s, data1)
    # Row label 0 holds the lowest-AIC combination.
    p, q, P, Q = ranking['parameters'][0]
    candidate = sm.tsa.statespace.SARIMAX(
        data1,
        order=(p, d, q),
        seasonal_order=(P, D, Q, s),
    )
    return candidate.fit(disp=-1)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two series.

    Args:
        y_true: Observed values.
        y_pred: Predicted values.

    Returns:
        The mean of ``|(y_true - y_pred) / y_true|`` multiplied by 100.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100
def plotSARIMA(series, model, n_steps, d, s):
    """Report the in-sample MAPE of a fitted model and forecast ahead.

    Changes from the earlier version of this function:

    * exactly ``n_steps`` forecasts are produced (the previous end index
      yielded ``n_steps + 1``);
    * the leading fitted values are masked with a single positional
      assignment instead of chained indexing;
    * dates are formatted with ``'%Y-%m-%d'`` rather than the
      platform-dependent ``'%F'``;
    * ``Series.append`` and the ``np.NaN`` alias are no longer used.

    Args:
        series: Single-column ``DataFrame`` holding the observed values; it
            is copied, not modified.
        model: Fitted results object exposing ``fittedvalues`` and
            ``predict(start=..., end=...)``.
        n_steps: Number of future periods to forecast.
        d: Non-seasonal differencing order.
        s: Season length.

    Returns:
        A tuple ``(output_x, output_y)``: forecast dates as ``YYYY-MM-DD``
        strings and the matching forecast values as a list. Both are empty
        when ``n_steps`` is less than 1.
    """
    data = series.copy()
    data.columns = ['actual']
    data['sarima_model'] = model.fittedvalues

    # The first s + d fitted values were not really observed by the model
    # because of the differencing, so they are excluded from the error.
    burn_in = s + d
    data.iloc[:burn_in, data.columns.get_loc('sarima_model')] = np.nan

    error = mean_absolute_percentage_error(data['actual'][burn_in:],
                                           data['sarima_model'][burn_in:])
    print(error)

    if n_steps < 1:
        return [], []

    # `start` and `end` are inclusive positions, so the last forecast sits
    # at n_obs + n_steps - 1.
    n_obs = data.shape[0]
    forecast = model.predict(start=n_obs, end=n_obs + n_steps - 1)

    output_x = [stamp.strftime('%Y-%m-%d') for stamp in forecast.index]
    output_y = forecast.values.tolist()
    return output_x, output_y
def model_struct_auto(time_series, parser_m):
    """Select and fit a seasonal ARIMA model with ``pmdarima.auto_arima``.

    The first 85% of ``time_series`` is used for fitting. The model summary
    is printed and the fitted model is returned so callers can forecast
    with it (previously the model was discarded).

    Args:
        time_series: The full series; only its leading 85% is fitted.
        parser_m: Seasonal period length passed as ``m``; with ``m=1`` no
            seasonality is considered.

    Returns:
        The fitted model returned by ``pm.auto_arima``.
    """
    # Hold out the trailing 15%; only the training part is used here.
    split_point = int(len(time_series) * 0.85)
    data_train = time_series[:split_point]

    built_arimamodel = pm.auto_arima(
        data_train,
        start_p=0,  # smallest p to try
        start_q=0,  # smallest q to try
        test='adf',  # use the ADF test to choose the differencing order d
        max_p=5,  # largest p to try
        max_q=5,  # largest q to try
        m=parser_m,  # seasonal period length
        d=None,  # let auto_arima determine d
        seasonal=True,
        start_P=0,
        D=1,
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        # stepwise=False turns the stepwise search off, so the candidate
        # orders are searched as a grid rather than step by step.
        stepwise=False,
    )
    print(built_arimamodel.summary())
    # Diagnostics plot, left disabled:
    # built_arimamodel.plot_diagnostics()
    # plt.show()
    return built_arimamodel
if __name__ == '__main__':
    # Season length and the non-seasonal / seasonal differencing orders.
    s = 12
    d = 1
    D = 1

    time_series = excelFile('降水.xlsx')
    data1 = time_series.y

    # Continue only when the ADF statistic lies above the 1% critical value.
    adf_statistic, critical_1pct = ADF_examine(data1)
    if adf_statistic > critical_1pct:
        differenced = season_fiff(time_series, s)
        p_value = whiteNoiseCheck(differenced)
        # A value below 0.05 means the differenced data is not white noise.
        if p_value < 0.05:
            parameters_list = parameter_generation(s)
            best_model = model_struct(parameters_list, data1, d, D, s)
            # Request a 20-step forecast.
            x, y = plotSARIMA(time_series, best_model, 20, d, s)
            print(x)
            print(y)
#pmdarima==1.8.4
#numpy==1.19.5
#python3.6
#代码有借鉴,有修改
# 数据集
# 链接:https://pan.baidu.com/s/1SKZwtLgh3tWBePKTJjFCkw
# 提取码:4kwb
# 如有问题,欢迎指出;我也是初学者。