一、题目名称
实现knn分类算法
二、题目内容
原生Python实现knn分类算法,并使用鸢尾花数据集进行测试
三、算法分析
knn算法是最简单的机器学习算法之一,通过测量不同特征值之间的距离进行分类。其基本思路是:如果一个样本在特征空间中的k个最相似(即特征空间中最近邻)的样本中的大多数属于某一个类别,则该样本也属于这个类别。
本次作业主要模拟实现了knn测试数据与训练数据之间的距离求解、排序、最邻近k个元素的筛选。其中,空间距离采用“欧氏距离”进行计算,表达式如下:
$$\mathrm{dist}[i] = \sqrt{\sum_{j=1}^{n} \left(x_{t,j} - x_{i,j}\right)^{2}}$$
上式中dist[i] 为测试数据与下标为i的训练数据的距离,xt,xi 分别为测试数据和下标为i的训练数据,j 为特征下标,n 为特征维数。算法整体流程图如下:
图 1 knn算法流程图
四、调试截图
调试过程主要的任务是观察数据结构:Python中的嵌套结构较为复杂,需要清楚每一步输出结果的维度和具体数据结构
五、运行结果
本次作业中的输入为鸢尾花数据集,输出为预测后的鸢尾花类型。最初设想采用散点图输出,但原生Python散点图效果较差,故改为直接以字符串形式输出预测类别,运行结果如下:
图 4 原生Python散点图效果较差
图 5 改为直接字符串输出类别
六、问题及解决
实现过程中遇到的主要问题是数据结构的混淆。在knn实现类中,经过多次列表生成与嵌套,容易混淆各步结果的数据结构,从而出现下标越界、维数不匹配等错误。解决办法也很简单:通过debug查看数据结构,或者直接用print输出每一步的内容进行观察。
图 6 下标错误
七、源代码
1.knn.py
# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Project : machinelearning
# @File : knn.py
# @Author : yanchengxu
# @Contact : [email protected]
# @Time : 2019/10/7 16:14
# @IDE : PyCharm
import numpy as np
import math
class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Attributes:
        k: number of neighbours consulted for each prediction.
        X_train: training feature matrix stored by ``fit``.
        y_train: training labels stored by ``fit``.
        result: labels produced by the most recent ``vote``/``predict`` call.
    """

    def __init__(self, k=3):
        """Create an unfitted classifier.

        :param k: number of nearest neighbours that take part in the vote
            (defaults to 3)
        """
        self.k = k
        self.X_train = []
        self.y_train = []
        self.result = []

    def fit(self, X_train, y_train):
        """Store the training data; KNN has no separate training step.

        Both arguments are converted with ``np.asarray`` so plain nested
        lists are accepted as well as arrays.

        :param X_train: training features, one row per sample
        :param y_train: training labels, one entry per sample
        :return: self
        :raises AssertionError: if the sample counts differ, or if ``k`` is
            smaller than 1 or larger than the number of training samples
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        assert X_train.shape[0] == y_train.shape[0], '训练集特征与目标值个数不匹配'
        assert self.k >= 1, 'K值必须为正整数'
        assert self.k <= X_train.shape[0], 'K值超出训练数据范围'
        self.X_train = X_train
        self.y_train = y_train
        return self

    def get_distance(self, x_test):
        """Compute the Euclidean distance from each query to each training row.

        :param x_test: query points, one row per point, with the same number
            of features as the training data
        :return: list with one entry per query point; each entry is a list of
            float distances in the same order as the rows of ``X_train``
        """
        train = np.asarray(self.X_train, dtype=float)
        queries = np.asarray(x_test, dtype=float)
        # Every feature column contributes: the squared differences are summed
        # along axis 1 (the feature axis) before taking the square root.
        return [
            np.sqrt(np.sum((train - query) ** 2, axis=1)).tolist()
            for query in queries
        ]

    def get_k_nearest_dist(self, list_dist):
        """Keep the ``k`` smallest distances for every query point.

        :param list_dist: per-query lists of distances, as returned by
            ``get_distance``
        :return: list with one dict per query point mapping training-row
            index -> distance, inserted in ascending distance order
        """
        k = self.k
        list_k_nearest_dist = []
        for row in list_dist:
            # sorted() is stable, so equal distances keep training-row order.
            ranked = sorted(enumerate(row), key=lambda pair: pair[1])
            list_k_nearest_dist.append(dict(ranked[:k]))
        return list_k_nearest_dist

    def vote(self, k_nearest_dist):
        """Pick the most frequent label among each query point's neighbours.

        Ties are resolved in favour of the label whose first occurrence is
        closest to the query point, because labels are counted in ascending
        distance order and ``max`` returns the first maximal key.

        :param k_nearest_dist: per-query dicts of training-row index ->
            distance, as returned by ``get_k_nearest_dist``
        :return: list of predicted labels, one per query point (also stored
            in ``self.result``)
        """
        results = []
        for neighbours in k_nearest_dist:
            counts = {}
            for index in neighbours:
                label = self.y_train[index]
                counts[label] = counts.get(label, 0) + 1
            results.append(max(counts, key=counts.get))
        # Replace, rather than extend, the previous predictions so repeated
        # calls do not accumulate stale results.
        self.result = results
        return self.result

    def predict(self, X_predict):
        """Classify every row of ``X_predict``.

        :param X_predict: query points, one row per point
        :return: list of predicted labels, one per query point
        :raises AssertionError: if the number of features differs from the
            training data
        """
        X_predict = np.asarray(X_predict)
        assert X_predict.shape[1] == self.X_train.shape[1], '特征数不匹配'
        distances = self.get_distance(X_predict)
        k_nearest_dist = self.get_k_nearest_dist(distances)
        return self.vote(k_nearest_dist)
2.test.py
# !/usr/bin/env python
# -*- encoding: utf-8 -*-
# @Project : machinelearning
# @File : test.py
# @Author : yanchengxu
# @Contact : [email protected]
# @Time : 2019/10/7 16:57
# @IDE : PyCharm
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from myknn.knn import KNNClassifier
import numpy as np
import matplotlib.pyplot as plt
# import itertools
# import random
# Classifier under test: majority vote among the 3 nearest neighbours.
kn = KNNClassifier(3)

# Hand-made toy data set, kept for reference as an alternative to iris:
# X = [[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3], [3, 1], [3, 2], [3, 3],
# [6, 6], [6, 7], [6, 8], [7, 6], [7, 7], [7, 8], [8, 6], [8, 7], [8, 8],
# [11, 1], [11, 2], [11, 3], [12, 1], [12, 2], [12, 3], [13, 1], [13, 2], [13, 3]]
# Y = [['A'], ['A'], ['A'], ['A'], ['A'], ['A'], ['A'], ['A'], ['A'],
# ['B'], ['B'], ['B'], ['B'], ['B'], ['B'], ['B'], ['B'], ['B'],
# ['C'], ['C'], ['C'], ['C'], ['C'], ['C'], ['C'], ['C'], ['C']]
# # Random sample points
# random_list = list(itertools.product(range(1, 13), range(1, 8)))
# X = random.sample(random_list, len(Y))
# # print('random_list', X)
# print('shape y:', y_train.shape)

# Load iris and keep only the training part of the split; the held-out part
# is not used because the query points below are written by hand.
iris_dataset = load_iris()
# print(iris_dataset)
X_train, _, y_train, _ = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0)
X = np.asarray(X_train)
Y = np.asarray(y_train)
# print('X:', X)
# print('Y,shape', Y.shape)
# print('Y.type', type(Y))

# Fit the model on the training split.
kn.fit(X, Y)

# Classify three hand-written flower measurements.
x_test = [[5, 2.9, 1, 0.2], [6.7, 3.2, 5.2, 2.3], [5.6, 3.1, 4.5, 1.5]]
X_test = np.asarray(x_test)
prediction = kn.predict(X_test)

# Print each sample next to the name of its predicted species.
for sample, label in zip(x_test, prediction):
    print(sample, '->', iris_dataset['target_names'][label])
# # 散点图观察
# x1 = []
# y1 = []
#
# # 训练集
# for i in np.asarray(X):
# x1.append(i[0])
# y1.append(i[1])
#
# x2 = []
# y2 = []
# # 测试集
# for i in np.asarray(x_test):
# x2.append(i[0])
# y2.append(i[1])
#
# plt.plot(x1, y1, 'r*')
# plt.plot(x2, y2, 'g+')
# plt.show()