# Purpose of cross-validation: assess how well an algorithm performs on a
# dataset, guiding hyper-parameter tuning and model selection.
# Cross-validation method:
# K-fold cross-validation (K >= 2): randomly partition the dataset into K
# (usually equal-sized) subsets; use one subset as the test set and the
# remaining K-1 subsets as the training set. Rotate so that every subset
# serves as the test set exactly once, producing K trained models and K
# test-set accuracies; the mean of the K accuracies is the performance
# estimate for the model. 10-fold cross-validation is the most common choice.
# -*- coding: utf-8 -*-
"""
场景:用鸢尾花数据集为例展示sklearn的一般流程,包括:
1. 数据的获取
2. 数据预处理
3. 模型的训练
4. 模型的评估
5. 模型的优化
6. 模型持久化
"""
import numpy as np
import pandas as pd
# 1. ----------- Load the dataset -----------
from sklearn import datasets

iris = datasets.load_iris()    # classic iris set: 150 samples, 4 features, 3 classes
X, y = iris.data, iris.target  # feature matrix and integer class labels
# --------- Optional: scatter-plot the raw samples (disabled) ---------
# Kept as comments rather than a bare triple-quoted string: a module-level
# string literal is still evaluated at runtime and is easily mistaken for
# a docstring.
# import matplotlib.pyplot as plt
# plt.scatter(X[:50, 0], X[:50, 1], color='red', marker='o', label='setosa')            # first 50 samples
# plt.scatter(X[50:100, 0], X[50:100, 1], color='blue', marker='x', label='versicolor')  # middle 50 samples
# plt.scatter(X[100:, 0], X[100:, 1], color='green', marker='+', label='Virginica')      # last 50 samples
# plt.xlabel('petal length')
# plt.ylabel('sepal length')
# plt.legend(loc=2)  # legend in the upper-left corner
# plt.show()
# 2. --------------- Normalize features to [0, 1] ---------------
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# NOTE(review): the scaler is fit on the FULL dataset before splitting,
# which leaks test-set statistics into training. Consider fitting on
# X_train only and transforming X_test with the same fitted scaler.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

# Hold out one third of the samples; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, random_state=0)
# 3. ------------------------ Train the model ------------------------
from sklearn.svm import SVC

# Linear-kernel SVM; probability=True enables predict_proba (slower to fit).
classifier = SVC(kernel="linear", probability=True)
classifier.fit(X_train, y_train)

# --------- Optional: compare different kernels (disabled) ---------
# Converted from a bare triple-quoted string to comments; the original
# disabled loop also shadowed its own list name (`for kernel in kernel`)
# and lacked body indentation, so it would not have run if re-enabled.
# kernels = ['linear', 'rbf', 'sigmoid']
# for kernel in kernels:
#     classifier = SVC(kernel=kernel, probability=True)
#     classifier.fit(X_train, y_train)
#     score = classifier.score(X_test, y_test)
#     print('核函数:%s 预测准确率:%f' % (kernel, score))

y_pred = classifier.predict(X_test)  # predictions on the held-out test set
# --------- Optional: cross-validated kernel comparison (disabled) ---------
# Kept as comments rather than a bare triple-quoted string (see note above
# the kernel-comparison block): a module-level string literal is still
# evaluated at runtime.
# import matplotlib.pyplot as plt
# from sklearn.model_selection import cross_val_score
# kernels = ['linear', 'rbf', 'sigmoid']
# k_scores = []
# for kernel in kernels:
#     classifier = SVC(kernel=kernel, probability=True)
#     classifier.fit(X_train, y_train)
#     scores = cross_val_score(classifier, X, y, cv=5, scoring='accuracy')  # 5-fold CV
#     k_scores.append(scores.mean())
# fig = plt.figure()
# ax = fig.add_subplot(111)
# ax.plot(range(3), k_scores)
# ax.set_xticks([0, 1, 2])
# ax.set_xticklabels(['linear', 'rbf', 'sigmoid'])
# ax.set_xlabel('kernel for SVC')
# ax.set_ylabel('Cross-Validated Accuracy')
# 4. ------------------------ Model evaluation ------------------------
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

params = classifier.get_params()                 # hyper-parameters of the fitted model
y_pred_proba = classifier.predict_proba(X_test)  # per-class probability for each sample
score = classifier.score(X_test, y_test)         # mean accuracy on the test set (~96%)
report = classification_report(y_test, y_pred)   # precision / recall / F1 per class

# Cross-validated accuracy on the whole dataset (5 folds).
scores = cross_val_score(classifier, X, y, cv=5, scoring='accuracy')
print(scores.mean())
# 5. ------------- Model optimization (hyper-parameter search) -------------
from sklearn.model_selection import GridSearchCV

# Base estimator (unfitted); GridSearchCV clones it for every candidate.
svc = SVC()
# Hyper-parameter space: 5 linear + 5*2 rbf candidates = 15 combinations.
param_grid = [{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']},
              {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.001, 0.01]}]
# Scoring function used to rank candidates.
scoring = 'accuracy'
# 5-fold CV over the grid; after fitting, clf acts as the best refitted model.
clf = GridSearchCV(svc, param_grid, scoring=scoring, cv=5)
# FIX: the original called clf.fit(X_train, y_train) twice in a row,
# running the whole grid search a second time for nothing; fit once
# (fit returns self, so grid_result is the fitted search object).
grid_result = clf.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, clf.best_params_))
y_pred_best = clf.predict(X_test)
# FIX: the original scored against its own predictions
# (clf.score(X_test, y_pred_best)), which is trivially 100% accuracy;
# score must be computed against the true labels y_test.
score_best = clf.score(X_test, y_test)
params_best = clf.get_params()  # parameters of the tuned search object
# 6. ------------------------ Model persistence ------------------------
# pickle: serialize the fitted grid-search model to disk, reload it, and
# check that the restored estimator still predicts.
import pickle

with open('clf.pkl', 'wb') as f:
    pickle.dump(clf, f)

with open('clf.pkl', 'rb') as f:
    clf2 = pickle.load(f)

y2_pred = clf2.predict(X_test)
# joblib: preferred over pickle for estimators holding large numpy arrays.
# FIX: `from sklearn.externals import joblib` was deprecated in scikit-learn
# 0.21 and removed in 0.23; import the standalone joblib package instead,
# with a fallback so very old sklearn installs keep working.
try:
    import joblib
except ImportError:  # sklearn < 0.23 bundled joblib under sklearn.externals
    from sklearn.externals import joblib

joblib.dump(clf, 'clf_joblib.pkl')
clf3 = joblib.load('clf_joblib.pkl')
y3_pred = clf3.predict(X_test)