关于Sklearn的SVM、XGBOOST、随机森林等算法回归预测建模的方法及数据提取

# -*- coding: utf-8 -*-
"""

@author: Hotil

"""

import pandas as pd#pandas库操作excel,csv
import os#系统操作库,可以用来获取当前代码路径
from sklearn.model_selection import train_test_split#随机分割训练集测试集
from sklearn.metrics import r2_score as R2
from sklearn.metrics import mean_squared_error as MSE#计算MSE,sklearn没有直接计算RMSE的方法,将MSE开根即可
import numpy as np#数组操作
import warnings#警示信息
from sklearn.multioutput import MultiOutputRegressor#有的模型不支持模拟、期末同时输出,因此需要用到此方法
import pydotplus 
from sklearn.tree import export_graphviz
import graphviz
#%%
#定义一个函数计算MAPE
def mape(actual, pred):
    """Return the mean absolute percentage error, in percent.

    NOTE(review): undefined (inf/nan) when any element of ``actual`` is 0.
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return 100.0 * np.abs((actual - pred) / actual).mean()


warnings.filterwarnings('ignore')
# The two csv files are expected to live in the same directory as this script.

currentPath = os.getcwd()  # directory the script is launched from

# Build the input paths portably; the original hard-coded the Windows '\'
# separator, which breaks on other platforms.
filePath1 = os.path.join(currentPath, '2021data.csv')
filePath2 = os.path.join(currentPath, '2020data.csv')
# Load the two years and stack them into one table; ignore_index avoids
# duplicated row labels after the concat.
dataBase1 = pd.read_csv(filePath1)
dataBase2 = pd.read_csv(filePath2)
dataBase = pd.concat([dataBase1, dataBase2], ignore_index=True)

# All columns except the last two are features; the last two are the two
# regression targets (presumably mock-exam and final score — see labels below).
featuresNames = dataBase.columns[:-2].values
labelsNames = dataBase.columns[-2:].values

# English aliases used later for the graphviz tree export.
tranFeaturesNames = ['num1_hw','num1_test','num2_hw','num3_hw','num4_hw','num2_test','num5_hw','num6_hw',
                     'num7_hw','num8_hw', 'num3_test', 'num9_hw', 'num10_hw', 'num4_test']
labelsNames = ['model','final']

features = dataBase.iloc[:, :-2].values
labels = dataBase.iloc[:, -2:].values
# Random train/test split, test:train = 2:8.
# NOTE(review): no random_state is set, so the split (and every metric
# printed below) changes between runs — pass random_state=... to reproduce.
xTrain, xTest, yTrain, yTest = train_test_split(features, labels, test_size=0.2)

# Rebuild a DataFrame of the test rows (features + true targets) in one
# vectorized step. The original row-by-row `saveBase.loc[rowI, :-2]` loop
# used a positional slice, which .loc does not support on string column
# labels; np.hstack sidesteps that and is O(1) pandas calls instead of O(n).
saveBase = pd.DataFrame(np.hstack([xTest, yTest]), columns=dataBase.columns.values)

#============================================================
#Decision Tree,ExtraTree,Random Forest,Adaboost,Gradient Boosting,XGBoost
#============================================================
#1 Decision Tree
from sklearn.tree import DecisionTreeRegressor  # tree-based regressor

# NOTE: the criterion alias 'mse' was removed in scikit-learn 1.2;
# 'squared_error' is the identical criterion under its current name and
# matches the Random Forest section of this file, which already uses it.
DTR = DecisionTreeRegressor(criterion='squared_error'
                            , splitter='best'
                            , max_depth=6
                            , random_state=500)  # instantiate with fixed seed
DTR.fit(xTrain, yTrain)  # fit on features + both targets
DTRPre = DTR.predict(xTest)  # predict both targets for the test split
DTRRMSE = np.sqrt(MSE(yTest, DTRPre))  # RMSE = sqrt of sklearn's MSE
print('='*60)
print('Decision Tree的评估指标RMSE值为:', DTRRMSE)
print('Decision Tree的评估指标MAPE值为:', mape(yTest, DTRPre))
print('Decision Tree的评估指标R-square值为:', R2(yTest, DTRPre))

DTRBase = saveBase.copy()  # copy of the test-set frame to hold predictions
DTRBase.loc[:, '预测模拟'] = DTRPre[:, 0]  # predicted first target
DTRBase.loc[:, '预测期末'] = DTRPre[:, 1]  # predicted second target

# os.path.join keeps the output path portable (the original '\...' string
# concatenation only worked on Windows and relies on '\D' not being an
# escape sequence — a DeprecationWarning on Python 3.12+).
DTRBase.to_csv(os.path.join(currentPath, 'Decision Tree预测结果.csv'), encoding='gb18030')
#============================================================
#2ExtraTree
from sklearn.tree import ExtraTreeRegressor  # extremely-randomized tree

# 'mse' was removed in scikit-learn 1.2; 'squared_error' is the same
# criterion and keeps this section consistent with the Random Forest one.
ETR = ExtraTreeRegressor(criterion='squared_error'
                         , splitter='random'  # ExtraTree picks split points at random
                         , max_depth=6
                         , random_state=500)
ETR.fit(xTrain, yTrain)  # fit on features + both targets
ETRPre = ETR.predict(xTest)  # predict both targets for the test split
ETRRMSE = np.sqrt(MSE(yTest, ETRPre))  # RMSE = sqrt of sklearn's MSE
print('='*60)
print('ExtraTree的评估指标RMSE值为:', ETRRMSE)
print('ExtraTree的评估指标MAPE值为:', mape(yTest, ETRPre))
print('ExtraTree的评估指标R-square值为:', R2(yTest, ETRPre))

ETRBase = saveBase.copy()  # copy of the test-set frame to hold predictions
ETRBase.loc[:, '预测模拟'] = ETRPre[:, 0]  # predicted first target
ETRBase.loc[:, '预测期末'] = ETRPre[:, 1]  # predicted second target

# Portable output path instead of Windows-only '\' concatenation.
ETRBase.to_csv(os.path.join(currentPath, 'ExtraTree预测结果.csv'), encoding='gb18030')
#============================================================
#3Random Forest
from sklearn.ensemble import RandomForestRegressor  # bagged-tree ensemble

RFR = RandomForestRegressor(n_estimators=99  # number of trees
                            , criterion='squared_error'
                            , max_depth=6
                            , random_state=300
                            )
RFR.fit(xTrain, yTrain)  # fit on features + both targets
RFRPre = RFR.predict(xTest)  # predict both targets for the test split
RFRRMSE = np.sqrt(MSE(yTest, RFRPre))  # RMSE = sqrt of sklearn's MSE
print('='*60)
print('Random Forest的评估指标RMSE值为:', RFRRMSE)
print('Random Forest的评估指标MAPE值为:', mape(yTest, RFRPre))
print('Random Forest的评估指标R-square值为:', R2(yTest, RFRPre))

RFRBase = saveBase.copy()  # copy of the test-set frame to hold predictions
RFRBase.loc[:, '预测模拟'] = RFRPre[:, 0]  # predicted first target
RFRBase.loc[:, '预测期末'] = RFRPre[:, 1]  # predicted second target

# Portable output path instead of Windows-only '\' concatenation.
RFRBase.to_csv(os.path.join(currentPath, 'Random Forest预测结果.csv'), encoding='gb18030')


# Render one tree of the forest (index 5) to PDF via graphviz.
tree = RFR.estimators_[5]
# NOTE(review): class_names only applies to classifiers and appears to be
# ignored for a regression tree — kept to mirror the original call.
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=tranFeaturesNames,
                           class_names=labelsNames,
                           filled=True, rounded=True,
                           special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("单颗树可视化.pdf")
#============================================================
#4Adaboost
from sklearn.ensemble import AdaBoostRegressor  # adaptive boosting
# AdaBoostRegressor is single-output only, so wrap it in
# MultiOutputRegressor to predict both targets at once.
ABR = MultiOutputRegressor(AdaBoostRegressor(n_estimators=99
                        , learning_rate=0.2
                        , loss='linear'
                        , random_state=500))

ABR.fit(xTrain, yTrain)  # fits one AdaBoost model per target column
ABRPre = ABR.predict(xTest)  # predict both targets for the test split
ABRRMSE = np.sqrt(MSE(yTest, ABRPre))  # RMSE = sqrt of sklearn's MSE

print('='*60)
print('Adaboost的评估指标RMSE值为:', ABRRMSE)
print('Adaboost的评估指标MAPE值为:', mape(yTest, ABRPre))
print('Adaboost的评估指标R-square值为:', R2(yTest, ABRPre))

ABRBase = saveBase.copy()  # copy of the test-set frame to hold predictions
ABRBase.loc[:, '预测模拟'] = ABRPre[:, 0]  # predicted first target
ABRBase.loc[:, '预测期末'] = ABRPre[:, 1]  # predicted second target

# Portable output path instead of Windows-only '\' concatenation.
ABRBase.to_csv(os.path.join(currentPath, 'Adaboost预测结果.csv'), encoding='gb18030')
#============================================================
#5Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor  # gradient boosting
# GradientBoostingRegressor is single-output only, so wrap it in
# MultiOutputRegressor to predict both targets at once.
GBR = MultiOutputRegressor(GradientBoostingRegressor(loss='squared_error'
                                                     , learning_rate=0.2
                                                     , n_estimators=99
                                                     , criterion='squared_error'
                                                     , random_state=500))
GBR.fit(xTrain, yTrain)  # fits one GBM per target column
GBRPre = GBR.predict(xTest)  # predict both targets for the test split
GBRRMSE = np.sqrt(MSE(yTest, GBRPre))  # RMSE = sqrt of sklearn's MSE
print('='*60)
print('Gradient Boosting的评估指标RMSE值为:', GBRRMSE)
print('Gradient Boosting的评估指标MAPE值为:', mape(yTest, GBRPre))
print('Gradient Boosting的评估指标R-square值为:', R2(yTest, GBRPre))

GBRBase = saveBase.copy()  # copy of the test-set frame to hold predictions
GBRBase.loc[:, '预测模拟'] = GBRPre[:, 0]  # predicted first target
GBRBase.loc[:, '预测期末'] = GBRPre[:, 1]  # predicted second target

# Portable output path instead of Windows-only '\' concatenation.
GBRBase.to_csv(os.path.join(currentPath, 'Gradient Boosting预测结果.csv'), encoding='gb18030')
#============================================================
#6XGBOOST
# import xgboost as xgb#引入xgboost,xgboost分为原生的独立库xgboost和sklearn里封装好的xgboost,二者本质无太大区别
# # from xgboost import XGBRegressor
# xgb.set_config(verbosity=0)#隐藏警告信息
# fileTrain = xgb.DMatrix(xTrain,yTrain)#要将训练集和测试集使用DMatrix方法封装成xgboost可接受的特殊形式
# fileTest = xgb.DMatrix(xTest,yTest)
# param = {'silent':True#xgboost原生库需要提前自定参数
#           ,'objective':'reg:linear'
#           ,"eta":0.2
#           ,"max_depth":7
#           ,"subsample":1
#           ,"gamma":0
#           ,"lambda":25
#           ,"alpha":0
#           ,"colsample_bytree":1
#           ,"colsample_bylevel":1
#           ,"colsample_bynode":1
#           }
# num_round = 99#树的数量
# XGBSR=xgb.train(param,fileTrain,num_boost_round=num_round)#训练模型

# XGBSRre = XGBSR.predict(fileTest)#调用predict方法用测试集进行预测
# XGBRRMSE = np.sqrt(MSE(yTest,XGBSRre))#MSE()计算真实值与预测值的均方误差,np.sqrt进行开方后获得RMSE
# print('='*60)
# print('XGBOOST的评估指标RMSE值为:',XGBRRMSE)#保存预测结果
#============================================================
#7SVM
from sklearn.svm import SVR  # support-vector regression
# SVR is single-output only, so wrap it in MultiOutputRegressor to
# predict both targets at once.
SVRM = MultiOutputRegressor(SVR(kernel='linear'
         , degree=3  # NOTE(review): degree is only used by the 'poly' kernel
         , gamma=0.1))
SVRM.fit(xTrain, yTrain)  # fits one SVR per target column
SVRMPre = SVRM.predict(xTest)  # predict both targets for the test split
SVRMRMSE = np.sqrt(MSE(yTest, SVRMPre))  # RMSE = sqrt of sklearn's MSE
print('='*60)
print('SVM的评估指标RMSE值为:', SVRMRMSE)
print('SVM的评估指标MAPE值为:', mape(yTest, SVRMPre))
print('SVM的评估指标R-square值为:', R2(yTest, SVRMPre))

SVRMBase = saveBase.copy()  # copy of the test-set frame to hold predictions
SVRMBase.loc[:, '预测模拟'] = SVRMPre[:, 0]  # predicted first target
SVRMBase.loc[:, '预测期末'] = SVRMPre[:, 1]  # predicted second target

# Portable output path instead of Windows-only '\' concatenation.
SVRMBase.to_csv(os.path.join(currentPath, 'SVM预测结果.csv'), encoding='gb18030')
#============================================================
#8K Neighbors
from sklearn.neighbors import KNeighborsRegressor
# Wrapped in MultiOutputRegressor to predict both targets at once,
# mirroring the other single-output models in this file.
KNNR = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=5
                                                , p=2))  # p=2 -> Euclidean distance
KNNR.fit(xTrain, yTrain)  # fits one KNN regressor per target column
KNNRPre = KNNR.predict(xTest)  # predict both targets for the test split
KNNRRMSE = np.sqrt(MSE(yTest, KNNRPre))  # RMSE = sqrt of sklearn's MSE
print('='*60)
print('KNNR的评估指标RMSE值为:', KNNRRMSE)
print('KNNR的评估指标MAPE值为:', mape(yTest, KNNRPre))
print('KNNR的评估指标R-square值为:', R2(yTest, KNNRPre))
print('='*60)

KNNRBase = saveBase.copy()  # copy of the test-set frame to hold predictions
KNNRBase.loc[:, '预测模拟'] = KNNRPre[:, 0]  # predicted first target
KNNRBase.loc[:, '预测期末'] = KNNRPre[:, 1]  # predicted second target

# Portable output path instead of Windows-only '\' concatenation.
KNNRBase.to_csv(os.path.join(currentPath, 'K Neighbors预测结果.csv'), encoding='gb18030')
#============================================================
#各模型平均性能得分为
# Average RMSE across the seven models trained above.
meanScore = np.mean([DTRRMSE, ETRRMSE, RFRRMSE, ABRRMSE, GBRRMSE, SVRMRMSE, KNNRRMSE])
print('各模型平均性能得分为:', meanScore)
print('='*60)
# Random Forest performed best; report its impurity-based importances.
fIm = RFR.feature_importances_  # one score per feature column
print('以下为随机森林各特征重要性:')
# zip pairs each feature name with its score — clearer than indexing
# dataBase.columns repeatedly inside a range(len(...)) loop.
for name, score in zip(dataBase.columns.values[:-2], fIm):
    print(name, '的特征重要性分数为:', score)
#============================================================
# Permutation importance: score drop when each feature is shuffled.
from sklearn.inspection import permutation_importance
PIScore = permutation_importance(RFR, xTrain, yTrain)  # evaluated on the training split
print(sum(PIScore['importances_mean']))
print('以下为随机森林各特征permutation_importance得分:')
for name, score in zip(dataBase.columns.values[:-2], PIScore['importances_mean']):
    print(name, '的permutation_importance特征重要性分数为:', score)
#============================================================
# dot_data = tree.export_graphviz(RFR, out_file=None) 
#%%

你可能感兴趣的:(sklearn,支持向量机,随机森林)