This post demonstrates the regression models in the Python machine learning package sklearn, together with the metrics used to evaluate them.
The demo uses the following features and prediction target:
Features: two numeric columns, gamma and max_lambda
Target: one numeric column, beta
The evaluation covers sklearn's main regression metrics.
Since space is limited and the goal here is to show usage, see the sklearn documentation for the exact formulas: {click here}
The code likewise walks through sklearn's regression models as exhaustively as practical. Here is the code.
# -*- coding: utf-8 -*-
# @Time : 2018/3/22 10:02
# @Author : timothy
'''
Predict beta from max_lambda and gamma.
'''
import pandas as pd
from sklearn.model_selection import train_test_split  # cross_validation was deprecated and later removed
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn import neighbors
from sklearn import ensemble
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
def read_data():
    '''
    Load the dataset.
    :return: train/test split of features and labels
    '''
    file_path = 'resources/ba.csv'
    df = pd.read_csv(file_path)
    attribute_list = ['gamma', 'max_lambda']  # the two predictor columns
    X = df.loc[:, attribute_list].values
    y = df.loc[:, 'beta'].values
    # Hold out 20% of the data as a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test
def try_different_model(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)
    # Report how accurate the predictions are
    print('explained variance (closer to 1 is better):', explained_variance_score(expected, predicted))
    print('r2_score (closer to 1 is better):', r2_score(expected, predicted))
    print('mean absolute error (smaller is better):', mean_absolute_error(expected, predicted))
    print('mean squared error (smaller is better):', mean_squared_error(expected, predicted))
    print('median absolute error (smaller is better):', median_absolute_error(expected, predicted))
def main():
    X_train, X_test, y_train, y_test = read_data()
    print('\n------ LinearRegression ------')
    clf = linear_model.LinearRegression()  # ordinary least squares
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ KNN ------')
    clf = neighbors.KNeighborsRegressor()  # k-nearest neighbors
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ SVM ------')
    clf = svm.SVR()  # support vector regression
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ DecisionTree ------')
    clf = tree.DecisionTreeRegressor()  # decision tree regression
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ ExtraTree ------')
    clf = tree.ExtraTreeRegressor()  # extremely randomized tree
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ RandomForest ------')
    clf = ensemble.RandomForestRegressor(n_estimators=20)  # random forest regression
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ GBDT ------')
    clf = ensemble.GradientBoostingRegressor(n_estimators=100)  # gradient boosting regression
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ AdaBoost ------')
    clf = ensemble.AdaBoostRegressor(n_estimators=50)  # AdaBoost regression
    try_different_model(clf, X_train, X_test, y_train, y_test)
    print('\n------ Bagging ------')
    clf = ensemble.BaggingRegressor()  # bagging regression
    try_different_model(clf, X_train, X_test, y_train, y_test)

if __name__ == '__main__':
    main()
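Note that the script expects a CSV file at resources/ba.csv containing the columns gamma, max_lambda and beta. If you want to run the demo without the original data, a hypothetical stand-in file can be generated; the linear relation used for beta below is made up purely so the script can run end to end:

# Hypothetical stand-in for resources/ba.csv: random gamma/max_lambda
# features and an arbitrary beta target (not the original data).
import os
import numpy as np
import pandas as pd

os.makedirs('resources', exist_ok=True)
rng = np.random.default_rng(0)
n = 500
gamma = rng.uniform(0, 1, n)
max_lambda = rng.uniform(1, 10, n)
beta = 0.5 * gamma + 0.1 * max_lambda + rng.normal(0, 0.05, n)  # invented relation
pd.DataFrame({'gamma': gamma, 'max_lambda': max_lambda, 'beta': beta}).to_csv(
    'resources/ba.csv', index=False)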
After comparing this many models, once you have picked one or a few regressors that perform well on your particular data, you can use grid search (GridSearchCV) to tune their hyper-parameters.
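Here is a minimal sketch of such tuning, reusing read_data() from the script above; the parameter grid for the random forest is illustrative, not a recommendation:

# Tune a random forest with GridSearchCV over an illustrative grid.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = read_data()
param_grid = {
    'n_estimators': [20, 50, 100],  # number of trees to try
    'max_depth': [None, 5, 10],     # depth limits to try
}
search = GridSearchCV(
    RandomForestRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',  # sklearn maximizes scores, so MSE is negated
    cv=5,                              # 5-fold cross-validation on the training set
)
search.fit(X_train, y_train)
print('best params:', search.best_params_)
print('test set r2:', search.best_estimator_.score(X_test, y_test))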