A demo roundup of sklearn regression methods

This post walks through the regression methods in the Python machine-learning package sklearn, together with their evaluation metrics.
The features used for prediction in this demo, and the prediction target, are:

Features: two numeric columns, gamma and max_lambda
Prediction target: one numeric column, beta
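
For reference, here is what resources/ba.csv is assumed to look like, given the column names used in the code below; the rows are made up purely to illustrate the expected layout:

gamma,max_lambda,beta
0.25,12.7,0.41
0.30,9.8,0.37
0.18,15.2,0.52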

The evaluation runs through the following sklearn regression metrics:

  1. explained_variance_score  # explained variance; closer to 1 is better
  2. r2_score                  # perfect score is 1; closer to 1 is better
  3. mean_absolute_error       # mean absolute error; smaller is better
  4. mean_squared_error        # mean squared error; smaller is better
  5. median_absolute_error     # median absolute error; smaller is better

Since space is limited, this post focuses on usage; for the exact formulas, see {click here}.
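
For a quick sense of what each metric returns, here is a minimal, self-contained sketch on a tiny made-up pair of label/prediction arrays (the numbers are illustrative only):

from sklearn.metrics import explained_variance_score, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error

y_true = [3.0, -0.5, 2.0, 7.0]  # made-up ground-truth labels
y_pred = [2.5, 0.0, 2.0, 8.0]   # made-up model predictions

print(explained_variance_score(y_true, y_pred))  # closer to 1 is better
print(r2_score(y_true, y_pred))                  # perfect score is 1
print(mean_absolute_error(y_true, y_pred))       # mean of |y_true - y_pred|; smaller is better
print(mean_squared_error(y_true, y_pred))        # penalizes large errors more heavily
print(median_absolute_error(y_true, y_pred))     # robust to outlier errors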

The demo covers as many of sklearn's regression methods as practical. The code follows.

# -*- coding: utf-8 -*-
# @Time    : 2018/3/22 10:02
# @Author  : timothy
'''
    Predict beta from max_lambda and gamma.
'''

import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated/removed
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn import neighbors
from sklearn import ensemble
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score


def read_data():
    '''
        Read the data set.
    :return: train/test inputs and labels
    '''
    file_path = 'resources/ba.csv'
    df = pd.read_csv(file_path)
    attribute_list = ['gamma', 'max_lambda']  # predict from these two attributes
    X = df.loc[:, attribute_list].values
    y = df.loc[:, 'beta'].values
    # hold out 20% of the samples as a test set (a random holdout split, not k-fold cross-validation)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test


def try_different_model(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    expected = y_test
    predicted = clf.predict(X_test)
    # evaluate the predictions
    print('explained variance (closer to 1 is better):', explained_variance_score(expected, predicted))
    print('r2_score (closer to 1 is better):', r2_score(expected, predicted))
    print('mean absolute error (smaller is better):', mean_absolute_error(expected, predicted))
    print('mean squared error (smaller is better):', mean_squared_error(expected, predicted))
    print('median absolute error (smaller is better):', median_absolute_error(expected, predicted))


def main():
    X_train, X_test, y_train, y_test = read_data()

    print('\n------ Linear regression -------')
    clf = linear_model.LinearRegression()
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------ KNN -------')
    clf = neighbors.KNeighborsRegressor()
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------ Support vector machine -------')
    clf = svm.SVR()
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------- Decision tree -------')
    clf = tree.DecisionTreeRegressor()
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------- ExtraTreeRegressor -------')
    clf = tree.ExtraTreeRegressor()
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------- Random forest -------')
    clf = ensemble.RandomForestRegressor(n_estimators=20)
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------- GBDT -------')
    clf = ensemble.GradientBoostingRegressor(n_estimators=100)  # gradient-boosted decision trees
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------- AdaBoost -------')
    clf = ensemble.AdaBoostRegressor(n_estimators=50)
    try_different_model(clf, X_train, X_test, y_train, y_test)

    print('\n------- Bagging -------')
    clf = ensemble.BaggingRegressor()
    try_different_model(clf, X_train, X_test, y_train, y_test)


if __name__ == '__main__':
    main()

Closing remarks:

After comparing this many methods and settling on one or a few regressors that work well for your particular data, you can tune their hyperparameters with grid search, as sketched below.
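
A minimal sketch of that tuning step with sklearn's GridSearchCV, applied here to the random forest from the demo and assuming the X_train/y_train produced by read_data() above (the grid values are illustrative, not recommendations):

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [20, 50, 100],  # illustrative grid values
    'max_depth': [None, 5, 10],
}
# exhaustively evaluates every parameter combination with 5-fold
# cross-validation, using the regressor's default r2 scoring
search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_)    # best parameter combination found
print(search.best_score_)     # its mean cross-validated score
clf = search.best_estimator_  # already refit on all of X_train, ready to predict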
