时间序列季节回归模型预测

import pandas as pd
from statsmodels.tsa.arima_model import ARMA
import matplotlib.pyplot as plt
import pmdarima as pm
import warnings
import numpy as np
from statsmodels.tsa.ar_model import AR
from sklearn.metrics import mean_squared_error as MSE
from datetime import datetime   #数据索引改为时间
import statsmodels.api as sm     #acf,pacf图
from statsmodels.tsa.stattools import adfuller  #adf检验
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from itertools import product                    # some useful functions
from tqdm import tqdm_notebook
from statsmodels.stats.diagnostic import acorr_ljungbox
# 过滤warning
warnings.filterwarnings('ignore')
#pmdarima==1.8.4
#numpy==1.19.5
#python3.6
def excelFile(path):
    """Load a time series from an Excel workbook.

    The sheet is read with its ``Date`` column as the index, and that index
    is then converted with ``pd.to_datetime``.

    Args:
        path: Location of the Excel file to read.

    Returns:
        The DataFrame read from ``path``, indexed by the parsed ``Date``
        values.
    """
    frame = pd.read_excel(path, index_col=u'Date')
    # Make sure the index holds datetimes even if Excel stored it as text.
    frame.index = pd.to_datetime(frame.index)
    return frame
def whiteNoiseCheck(data):
    """Run a lag-1 Ljung-Box test and return its p-value.

    A p-value below 0.05 rejects the null hypothesis at the 95% level, i.e.
    the series is not white noise; otherwise the series is treated as white
    noise and not worth modelling.

    Args:
        data: Series (or single-column frame) to test.

    Returns:
        Array holding the p-value for lag 1. The value is also printed.
    """
    result = acorr_ljungbox(data, lags=1)
    if isinstance(result, pd.DataFrame):
        # Newer statsmodels releases return a DataFrame with named columns.
        temp = result['lb_pvalue'].values
    else:
        # Older releases return the tuple (statistics, p-values).
        temp = result[1]
    print(temp)
    return temp
def ADF_examine(data1):
    """Run the augmented Dickey-Fuller (ADF) test on a series.

    The ADF test checks for a unit root: if the series behaves like a random
    walk it is non-stationary.

    Args:
        data1: Sequence of observations; it is converted with ``np.array``
            before being passed to ``adfuller``.

    Returns:
        A ``(test_statistic, critical_value_1pct)`` tuple taken from the
        ``adfuller`` result (element 0 and the ``'1%'`` critical value).
    """
    adf_result = adfuller(np.array(data1))
    # The summary table the original built here was never read, and it was
    # filled through chained assignment, so it has been dropped.
    return adf_result[0], adf_result[4]['1%']
def stableCheck(timeseries):
    """Plot 12-period rolling statistics and print an ADF test report.

    Draws the original series together with its rolling mean and rolling
    standard deviation, then runs ``adfuller`` with ``autolag='AIC'`` and
    prints the statistic, p-value, lags used, number of observations and the
    critical values.

    Args:
        timeseries: Series to inspect.
    """
    rolling = timeseries.rolling(window=12)

    # Visual check: rolling mean / std against the raw series.
    plt.figure(figsize=(12, 8))
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolling.mean(), color='red', label='Rolling Mean')
    plt.plot(rolling.std(), color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Numerical check: augmented Dickey-Fuller test.
    print('Results of Dickey-Fuller Test:')
    adf = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    values = list(adf[0:4])
    # Append one labelled row per critical value reported by the test.
    for level, critical in adf[4].items():
        labels.append('Critical Value (%s)' % level)
        values.append(critical)
    report = pd.Series(values, index=labels)
    print('ADF检验结果:')
    print(report)
def season_fiff(data,n):
    time_series_diff1 = data.diff(1).dropna()
    # 在一阶差分基础上进行季节性差分差分
    time_series_diff2 = time_series_diff1.diff(n).dropna()
    #stableCheck_result2 = stableCheck(time_series_diff2)
    return time_series_diff2
def parameter_generation(len_season):
    """Build the grid of candidate (p, q, P, Q) orders for the SARIMA search.

    Args:
        len_season: Season length. Accepted for interface compatibility; the
            generated grid does not depend on it.

    Returns:
        List of ``(p, q, P, Q)`` tuples with ``p`` and ``q`` in 2..4 and
        ``P`` and ``Q`` in 0..1, ordered with the last element varying
        fastest.
    """
    ar_orders = range(2, 5)
    ma_orders = range(2, 5)
    seasonal_ar_orders = range(0, 2)
    seasonal_ma_orders = range(0, 2)
    return [
        (p, q, P, Q)
        for p in ar_orders
        for q in ma_orders
        for P in seasonal_ar_orders
        for Q in seasonal_ma_orders
    ]


def optimizeSARIMA(parameters_list, d, D, s, data1):
    """Fit a SARIMAX model for every candidate order and rank them by AIC.

    Args:
        parameters_list: Iterable of ``(p, q, P, Q)`` tuples to try.
        d: Order of non-seasonal differencing.
        D: Order of seasonal differencing.
        s: Season length.
        data1: Series the models are fitted to.

    Returns:
        DataFrame with columns ``parameters`` and ``aic``, sorted by ``aic``
        ascending (lower is better) with a fresh 0..n-1 index. The frame is
        empty, but still has both columns, when no candidate could be fitted.
    """
    results = []

    for param in tqdm_notebook(parameters_list):
        try:
            model = sm.tsa.statespace.SARIMAX(data1, order=(param[0], d, param[1]),
                                              seasonal_order=(param[2], D, param[3], s)).fit(disp=-1)
        except Exception:
            # Some combinations fail to fit or converge; skip those. Catching
            # Exception rather than everything lets Ctrl-C and SystemExit
            # still stop a long search.
            continue
        results.append([param, model.aic])

    # Passing the column names here keeps the frame well-formed even when
    # ``results`` is empty.
    result_table = pd.DataFrame(results, columns=['parameters', 'aic'])
    return result_table.sort_values(by='aic', ascending=True).reset_index(drop=True)

def model_struct(parameters_list,data1,d,D,s):
    """Select the lowest-AIC SARIMA order and return a model fitted with it.

    Args:
        parameters_list: Candidate ``(p, q, P, Q)`` tuples.
        data1: Series to fit.
        d: Order of non-seasonal differencing.
        D: Order of seasonal differencing.
        s: Season length.

    Returns:
        The fitted SARIMAX results object for the best-ranked order.
    """
    ranking = optimizeSARIMA(parameters_list, d, D, s, data1)
    # Row 0 holds the lowest AIC because the table is sorted ascending.
    best_order = ranking.parameters[0]
    p, q, P, Q = best_order
    candidate = sm.tsa.statespace.SARIMAX(data1, order=(p, d, q),
                                          seasonal_order=(P, D, Q, s))
    return candidate.fit(disp=-1)


def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Args:
        y_true: Observed values (array-like supporting element-wise math).
        y_pred: Predicted values aligned with ``y_true``.

    Returns:
        Mean of ``|(y_true - y_pred) / y_true|`` multiplied by 100.
    """
    relative_error = (y_true - y_pred) / y_true
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error) * 100


def plotSARIMA(series, model, n_steps,d,s):
    """Report the in-sample MAPE and forecast ``n_steps`` periods ahead.

    Args:
        series: Single-column DataFrame of observations; the column is
            renamed to ``actual`` on a copy.
        model: Fitted results object exposing ``fittedvalues`` and
            ``predict``.
        n_steps: Number of future periods to forecast.
        d: Order of non-seasonal differencing used by the model.
        s: Season length used by the model.

    Returns:
        ``(output_x, output_y)``: the forecast dates formatted as
        ``YYYY-MM-DD`` strings and the forecast values as a list. Both are
        empty when ``n_steps`` is less than 1.
    """
    data = series.copy()
    data.columns = ['actual']
    data['sarima_model'] = model.fittedvalues
    # The first s + d fitted values were not really observed by the model
    # because of the differencing, so they are masked out. ``iloc`` writes to
    # the frame itself instead of relying on chained assignment.
    data.iloc[:s + d, data.columns.get_loc('sarima_model')] = np.nan

    # In-sample error, again skipping the first s + d points.
    error = mean_absolute_percentage_error(data['actual'][s + d:], data['sarima_model'][s + d:])
    print(error)

    if n_steps < 1:
        return [], []

    # ``end`` is inclusive, so n_obs + n_steps - 1 yields exactly n_steps
    # out-of-sample predictions.
    n_obs = data.shape[0]
    forecast = model.predict(start=n_obs, end=n_obs + n_steps - 1)

    # '%Y-%m-%d' is the portable spelling of the platform-dependent '%F'.
    output_x = [stamp.strftime('%Y-%m-%d') for stamp in forecast.index]
    output_y = forecast.values.tolist()
    return output_x, output_y

def model_struct_auto(time_series,parser_m):
    """Fit a seasonal ARIMA model with ``pmdarima.auto_arima``.

    The first 85% of ``time_series`` is used as the training set; the
    remainder is left out of the fit.

    Args:
        time_series: Series to model.
        parser_m: Seasonal period length passed to ``auto_arima`` as ``m``
            (``m=1`` means no seasonality).

    Returns:
        The fitted model returned by ``auto_arima``. Its summary is also
        printed.
    """
    # Hold out the last 15% of observations; only the head is fitted.
    split_point = int(len(time_series) * 0.85)
    data_train = time_series[0:split_point]

    built_arimamodel = pm.auto_arima(data_train, start_p=0,  # smallest p tried
                                     start_q=0,  # smallest q tried
                                     test='adf',  # ADF test picks the differencing order d
                                     max_p=5,  # largest p tried
                                     max_q=5,  # largest q tried
                                     m=parser_m,  # seasonal period length
                                     d=None,  # let auto_arima determine d
                                     seasonal=True, start_P=0, D=1, trace=True,
                                     error_action='ignore', suppress_warnings=True,
                                     stepwise=False)  # False: grid search instead of the stepwise algorithm
    print(built_arimamodel.summary())
    # Returned so callers can forecast with it; previously it was discarded.
    return built_arimamodel

if __name__ == '__main__':
    # Season length and the differencing orders handed to the SARIMA search.
    s = 12
    d = 1
    D = 1

    time_series = excelFile('降水.xlsx')
    data1 = time_series.y

    adf_statistic, critical_value_1pct = ADF_examine(data1)
    # Continue only when the ADF statistic is above the 1% critical value.
    if adf_statistic > critical_value_1pct:
        differenced = season_fiff(time_series, s)
        p_value = whiteNoiseCheck(differenced)
        # Model only when the differenced series is not white noise.
        if p_value < 0.05:
            parameters_list = parameter_generation(s)
            best_model = model_struct(parameters_list, data1, d, D, s)
            # Request a forecast with n_steps = 20.
            forecast_dates, forecast_values = plotSARIMA(time_series, best_model, 20, d, s)
            print(forecast_dates)
            print(forecast_values)


#pmdarima==1.8.4
#numpy==1.19.5
#python3.6
#代码有借鉴,有修改

运行结果:(图:时间序列季节回归模型预测_第1张图片)

数据集
链接:https://pan.baidu.com/s/1SKZwtLgh3tWBePKTJjFCkw
提取码:4kwb
emmmmm,有问题的话请指出,我也是小白

你可能感兴趣的:(回归,python,机器学习)