学习曲线和决策边界的绘制

评估机器学习模型好坏的时候,常常需要观察学习曲线的变化,以及最终分类结果(如二分类或多分类)的效果。一个好的可视化结果可以加深对模型的理解。下面总结一下决策边界和学习曲线的绘制代码,以便加深印象,方便查看。

# 决策边界的绘制

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
def plot_decision_boundary(clf, X, y, num_row=100, num_col=100):
    """
    Fit ``clf`` on ``(X, y)`` and plot its decision regions with the data on top.

    A regular grid covering the range of the two features (plus a margin) is
    classified by the fitted model; the predictions are drawn as a coloured
    background and the training samples are scattered over it, coloured by
    their true label. The figure is displayed with ``plt.show()``.

    :param clf: classifier exposing ``fit(X, y)`` and ``predict(X)``;
        it is (re)fitted inside this function
    :param X: input samples, array-like of shape (n_samples, 2)
    :param y: true class labels, one per sample; assumed numeric so they can
        be mapped through a colormap
    :param num_row: number of grid points along the first feature (x axis)
    :param num_col: number of grid points along the second feature (y axis)
    :raises ValueError: if ``X`` is not two-dimensional with exactly 2 columns
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2 or X.shape[1] != 2:
        # The grid below is built from exactly two features; fail early with a
        # clear message instead of a shape error deep inside predict().
        raise ValueError(
            "plot_decision_boundary needs X with shape (n_samples, 2), got %r"
            % (X.shape,)
        )

    clf.fit(X, y)

    margin = 1   # padding so that samples do not sit on the border of the plot
    x1_min, x1_max = np.min(X[:, 0]) - margin, np.max(X[:, 0]) + margin
    x2_min, x2_max = np.min(X[:, 1]) - margin, np.max(X[:, 1]) + margin
    t1 = np.linspace(x1_min, x1_max, num_row)
    t2 = np.linspace(x2_min, x2_max, num_col)
    x1, x2 = np.meshgrid(t1, t2)
    # Flatten the grid into an (n_points, 2) sample matrix; stacking the 2-D
    # meshgrid arrays directly would produce a 3-D array predict() rejects.
    x_test = np.column_stack((x1.ravel(), x2.ravel()))

    # Colours assume at most three classes: pale shades for the regions,
    # saturated shades for the samples.
    region_cmap = mpl.colors.ListedColormap(['#FFA0A0', '#A0FFA0', '#A0A0FF'])
    point_cmap = mpl.colors.ListedColormap(['r', 'g', 'b'])

    y_hat = np.asarray(clf.predict(x_test)).reshape(x1.shape)

    # Share one colour scale between background and points so a class gets the
    # same hue in both, even if the grid predicts only a subset of the classes.
    vmin, vmax = np.min(y), np.max(y)
    plt.pcolormesh(x1, x2, y_hat, cmap=region_cmap, vmin=vmin, vmax=vmax)   # background
    # Labels must go in ``c``; the third positional argument of scatter is the
    # marker size.
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=point_cmap,
                vmin=vmin, vmax=vmax)

    plt.xlabel('x label')
    plt.ylabel('y label')
    plt.title('decision-boundary')
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid()
    plt.show()

# 学习曲线的绘制

from sklearn.model_selection import learning_curve
import numpy as np
import matplotlib.pyplot as plt
def plot_learning_curve(estimator, X, y, n_jobs=1, cv=5, train_size=np.linspace(0.02, 1, 50), verbose=0,
                        scoring='accuracy'):
    """
    Plot the learning curve of ``estimator`` to compare training and validation scores.

    Scores are obtained from ``sklearn.model_selection.learning_curve``; for
    each training-set size the mean over the CV folds is drawn as a line and
    one standard deviation around it as a shaded band. The figure is displayed
    with ``plt.show()``.

    :param estimator: the model to evaluate
    :param X: training samples, forwarded to ``learning_curve``
    :param y: targets, forwarded to ``learning_curve``
    :param n_jobs: number of parallel jobs, forwarded to ``learning_curve``
    :param cv: cross-validation setting, forwarded to ``learning_curve``
    :param train_size: training-set sizes to evaluate, forwarded as
        ``train_sizes``
    :param verbose: verbosity level, forwarded to ``learning_curve``
    :param scoring: metric forwarded to ``learning_curve``; pick a suitable one from
        http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator=estimator, X=X, y=y, cv=cv, n_jobs=n_jobs,
                                                            train_sizes=train_size, scoring=scoring, verbose=verbose)
    # Aggregate over axis 1 to get one mean / std per training-set size.
    train_score_mean = np.mean(train_scores, axis=1)
    train_score_std = np.std(train_scores, axis=1)
    test_score_mean = np.mean(test_scores, axis=1)
    test_score_std = np.std(test_scores, axis=1)

    plt.figure()
    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(train_sizes, train_score_mean + train_score_std, train_score_mean - train_score_std,
                     color='blue', alpha=0.1)
    plt.fill_between(train_sizes, test_score_mean + test_score_std, test_score_mean - test_score_std,
                     color='red', alpha=0.1)
    plt.plot(train_sizes, train_score_mean, 'o-', color='blue', label='training score')
    plt.plot(train_sizes, test_score_mean, 'o-', color='red', label='testing score')

    plt.xlabel('xlabel')
    plt.ylabel('ylabel')
    plt.title('learning_curve')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

# 说明:以上部分代码可能来自网络,如有侵权,请联系删除
 

 

 

你可能感兴趣的:(数据可视化,机器学习)