import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()
# Bare expression: shows the available fields when run interactively (e.g. in
# a notebook cell); it has no visible effect when the file runs as a script.
digits.keys()
# 输出: dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
# 分离出训练集和测试集
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test split; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=666,
)
from sklearn.preprocessing import StandardScaler  # feature standardisation

# Fit the scaler on the training split only, so the statistics used for
# scaling are not influenced by the test split. fit() returns the scaler.
standardscaler = StandardScaler().fit(X_train)
# Per-feature mean learned from the training split.
standardscaler.mean_
# Per-feature scaling factor (standard deviation) learned from the training split.
standardscaler.scale_
# Standardise both splits with the same fitted scaler.
X_train, X_test = map(standardscaler.transform, (X_train, X_test))
# 网格搜索调参
# Hyper-parameter search space for the KNN grid search.
# It is a list of dicts: each dict is one independent grid, mapping a
# parameter name to the list of candidate values to try.
# NOTE: `p` is the power parameter of the Minkowski distance. This search
# space only varies it together with weights='distance'; the 'uniform' grid
# leaves `p` at the estimator's default.
param_grid = [
    {  # 1 x 10 = 10 combinations
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),  # k = 1..10 (np.arange would also work)
        # no 'p' entry in this grid
    },
    {  # 1 x 10 x 5 = 50 combinations
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),  # k = 1..10
        'p': list(range(1, 6)),  # Minkowski power p = 1..5
    },
]
# 调出KNN分类器
from sklearn.model_selection import GridSearchCV  # grid search scored by cross-validation
from sklearn.neighbors import KNeighborsClassifier

# Unconfigured classifier; the grid search supplies the hyper-parameters.
knn_clf = KNeighborsClassifier()
# Try every combination in param_grid, scoring each with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Best parameter combination among those listed in param_grid.
grid_search.best_params_
# Cross-validated score achieved by that best combination.
grid_search.best_score_
# KNN中的参数
# KNN中的第一个参数是k(n_neighbors),即参与投票的近邻个数
# Manual search over k (n_neighbors) for k = 1..10, scored on the test split.
# NOTE(review): choosing k by test-set score lets the test data influence
# model selection; a cross-validated search is the safer way to tune.
best_score = 0  # highest score seen so far
best_k = -1  # k that produced best_score (-1 means none found yet)
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)  # train on the training split
    score = knn_clf.score(X_test, y_test)  # evaluate on the test split
    # Strict '>' keeps the smallest k when several values tie for the best.
    if score > best_score:
        best_score = score
        best_k = k
print('最佳准确率为:', best_score)
print('最佳k值为:', best_k)
# 另一个参数是 weights:uniform / distance
# uniform(默认):所有近邻的权重相同,不考虑距离远近
# distance:按距离的倒数加权,距离越近的近邻权重越大
# Manual search over both the weighting scheme and k (1..10), scored on the
# test split.
# NOTE(review): as with any selection based on test-set score, the reported
# best_score is an optimistic estimate; prefer cross-validation for tuning.
best_score = 0.0  # highest score seen so far
best_k = -1  # k that produced best_score (-1 means none found yet)
best_weight = ''  # weighting scheme that produced best_score
for j in ['uniform', 'distance']:  # j: candidate `weights` value
    for i in range(1, 11):  # i: candidate `n_neighbors` value
        knn_clf = KNeighborsClassifier(n_neighbors=i, weights=j)
        knn_clf.fit(X_train, y_train)  # train on the training split
        score = knn_clf.score(X_test, y_test)  # evaluate on the test split
        # Strict '>' keeps the first combination found when scores tie.
        if score > best_score:
            best_score = score
            best_k = i
            best_weight = j
print('best_score = {}'.format(best_score))
print('best_k = {}'.format(best_k))
print('best_weight = {}'.format(best_weight))