数据集
Iris鸢尾花数据集: 包含 3 类分别为山鸢尾(Iris-setosa)、变色鸢尾(Iris-versicolor)和维吉尼亚鸢尾(Iris-virginica),共 150 条数据,每类各 50 个数据,每条记录都有 4 项特征:花萼长度、花萼宽度、花瓣长度、花瓣宽度,通常可以通过这4个特征预测鸢尾花卉属于哪一品种。
使用sklearn库中的KNN进行分类
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Load the iris dataset: 150 samples, 4 features, 3 classes.
iris_dataset = load_iris()
X = iris_dataset['data']    # feature matrix (sepal/petal length & width)
Y = iris_dataset['target']  # integer class labels
# Hold out 20% of the samples as a test set; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0)
# Fit a K-nearest-neighbours classifier with K = 5 on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
# Report accuracy on the held-out test set.
print("Test set score:{:.2f}".format(knn.score(X_test, Y_test)))
Test set score:0.97
# Spot-check a single test sample against its true label.
temp = 5  # index of the test sample to inspect
X_temp = [X_test[temp]]  # wrap in a list: predict expects a batch of samples
Y_temp = [Y_test[temp]]
prediction = knn.predict(X_temp)
print("Predicted target name:{}".format(iris_dataset['target_names'][prediction]))
print("Actual target name:{}".format(iris_dataset['target_names'][Y_temp]))
Predicted target name:['virginica']
Actual target name:['virginica']
自己编写KNN
# 定义距离
import math
def euclideanDistance(point1, point2, length):
    """Return the Euclidean distance between the first `length`
    coordinates of point1 and point2."""
    squared_sum = sum((point1[i] - point2[i]) ** 2 for i in range(length))
    return math.sqrt(squared_sum)
# 返回排序后的近邻
import operator
def sortNeighbors(trainingData, trainingTarget, testInstance):
    """Pair every training sample with its distance to testInstance.

    Returns a list of (training_point, distance, training_label) tuples
    sorted by ascending Euclidean distance.

    Bug fix: the original used `length = len(testInstance) - 1`, so the
    last feature was silently ignored in every distance computation — a
    leftover from tutorials whose data rows end with the class label.
    All features are now included, which is why the original myKNN
    scored below sklearn's KNeighborsClassifier on the same split.
    """
    neighbors = []
    for point, label in zip(trainingData, trainingTarget):
        # Full Euclidean distance over every feature dimension.
        dist = math.sqrt(sum((a - b) ** 2 for a, b in zip(testInstance, point)))
        neighbors.append((point, dist, label))
    # Nearest first.
    neighbors.sort(key=operator.itemgetter(1))
    return neighbors
# 获得预测结果
def getTarget(sortedNeighbors,k):
neighbors=[]
for x in range(k):
neighbors.append(sortedNeighbors[x])
classVotes={}# 以字典存储
# print(neighbors)
for x in range(len(neighbors)):
index=neighbors[x][-1]
if(index in classVotes):
classVotes[index] += 1
else:
classVotes[index] = 1
sortedVotes = sorted(classVotes.items(),key = operator.itemgetter(1),reverse=True)
# print(sortedVotes)
return [sortedVotes[0][0]]# 保证和sklearn返回值一样是ndarray
进行封装,使之更接近于sklearn库的knn
class myKNN:
    """Minimal KNN classifier mimicking sklearn's KNeighborsClassifier API."""

    def __init__(self, n_neighbors):
        # Number of neighbours K consulted in the majority vote.
        self.n_neighbors = n_neighbors

    def fit(self, X_train, Y_train):
        """Memorise the training data (KNN is a lazy learner)."""
        self.X_train = X_train
        self.Y_train = Y_train

    def predict(self, X_temp):
        """Predict the class of a single sample.

        Returns a one-element list containing the majority label of the
        K nearest training samples.
        """
        return getTarget(sortNeighbors(self.X_train, self.Y_train, X_temp),
                         self.n_neighbors)

    def score(self, X_test, Y_test):
        """Return the fraction of test samples classified correctly.

        Bug fix: the original compared the one-element list returned by
        predict() directly against a scalar label; that is always False
        for plain Python values and only happened to work through NumPy
        scalar broadcasting. Compare the single predicted label instead.
        """
        correct = 0
        for i in range(len(Y_test)):
            if self.predict(X_test[i])[0] == Y_test[i]:
                correct += 1
        return correct / len(Y_test)
经过了封装,故相较于前面使用的sklearn库,myKNN与sklearn.neighbors.KNeighborsClassifier中同名函数的输入与输出类型完全相同,此处代码只要改一下实例化语句即可进行复用
# Baseline run with sklearn's classifier, K = 5.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)
# Accuracy on the held-out test set.
print("Test set score:{:.2f}".format(knn.score(X_test, Y_test)))
Test set score:0.97
# Inspect one test sample: predicted vs actual species name.
temp = 5  # test-set index of the sample
X_temp = [X_test[temp]]  # batch of one sample, as predict expects
Y_temp = [Y_test[temp]]
prediction = knn.predict(X_temp)
print("Predicted target name:{}".format(iris_dataset['target_names'][prediction]))
print("Actual target name:{}".format(iris_dataset['target_names'][Y_temp]))
Predicted target name:['virginica']
Actual target name:['virginica']
# Same experiment with the hand-written classifier, K = 5.
MyKnn = myKNN(n_neighbors=5)
MyKnn.fit(X_train, Y_train)
# Accuracy on the held-out test set.
print("Test set score:{:.2f}".format(MyKnn.score(X_test, Y_test)))
Test set score:0.93
# Repeat the single-sample check with the hand-written classifier.
temp = 5  # same test-set index as before
X_temp = [X_test[temp]]
Y_temp = [Y_test[temp]]
prediction = MyKnn.predict(X_temp)
print("Predicted target name:{}".format(iris_dataset['target_names'][prediction]))
print("Actual target name:{}".format(iris_dataset['target_names'][Y_temp]))
Predicted target name:['virginica']
Actual target name:['virginica']
查看选择不同的K和不同的KNN算法时对准确率的影响
import matplotlib.pyplot as plt
# Sweep K from 1 to 30 and record the error rate (1 - accuracy)
# for both the sklearn classifier and the hand-written one.
k_range = range(1, 31)
k_error1 = []  # sklearn errors
k_error2 = []  # myKNN errors
for k in k_range:
    sk_model = KNeighborsClassifier(n_neighbors=k)
    sk_model.fit(X_train, Y_train)
    my_model = myKNN(n_neighbors=k)
    my_model.fit(X_train, Y_train)
    k_error1.append(1 - sk_model.score(X_test, Y_test))
    k_error2.append(1 - my_model.score(X_test, Y_test))
# Plot the two error curves side by side.
plt.figure(figsize=(18, 6))
plt.subplot(1, 2, 1)
plt.plot(k_range, k_error1)
plt.xlabel('Value of K for sklearnKNN')
plt.ylabel('Error')
plt.subplot(1, 2, 2)
plt.plot(k_range, k_error2)
plt.xlabel('Value of K for myKNN')
plt.show()
在实际运用中,应该在保证准确率的前提下尽可能地使用较小的K来节省算力