一、题目名称

  实现knn分类算法

  二、题目内容

  原生Python实现knn分类算法,并使用鸢尾花数据集进行测试

  三、算法分析

  knn算法是最简单的机器学习算法之一,通过测量不同特征值之间的距离进行分类。其基本思路是:如果一个样本在特征空间中的k个最相似(即特征空间中最近邻)的样本中的大多数属于某一个类别,则该样本也属于这个类别。

  本次作业主要模拟实现了knn测试数据与训练数据之间的距离求解、排序、最邻近k个元素的筛选。其中,空间距离采用“欧氏距离”进行计算,表达式如下:

  

dist\left [ i \right ] = \sqrt{\sum_{j=1}^{n}\left ( x_{t,j}-x_{i,j} \right )^2}


  上式中dist[i] 为测试数据与下标为i的训练数据的距离,xt,xi 分别为测试数据和下标为i的训练数据,n为特征维数,x_{t,j}、x_{i,j} 分别为二者的第j个特征值,算法整体流程图如下:

  

原生Python实现knn算法_第1张图片


  图 1 knn算法流程图

  四、调试截图

  调试过程主要的任务是观察数据结构:Python中的嵌套结构较为复杂,需要清楚每一步输出结果的维度和具体数据结构

  五、运行结果

  本次作业中的输入为鸢尾花数据集,输出为预测后的鸢尾花类型。最初设想采用散点图输出,但原生Python散点图效果较差,故改为直接字符串输出,输出类别即可,得出运行结果

  图 4 原生Python散点图效果较差

  =

  图 5 改为直接字符串输出类别

  六、问题及解决

  实现过程中遇到的主要问题是数据结构的混淆。在knn实现类中,经过多次列表生成、嵌套,容易混淆各步骤的数据结构,从而出现下标维数错误等问题。解决办法也很简单:用debug查看数据结构,或者直接print输出每步内容进行观察。

  图 6 下标错误

  七、源代码

  1.knn.py

  #!/usr/bin/env python

  # -*- encoding: utf-8 -*-

  # @Project : machinelearning

  # @File : knn.py

  # @Author : yanchengxu

  # @Contact : [email protected]

  # @Time : 2019/10/7 16:14

  # @IDE : PyCharm

  import numpy as np

  import math

  class KNNClassifier:
      """k-nearest-neighbours classifier (Euclidean distance, majority vote).

      Typical use: ``KNNClassifier(k).fit(X_train, y_train).predict(X_new)``.
      """

      def __init__(self, k=3):
          """Create an unfitted classifier.

          :param k: number of nearest training samples that vote; defaults to 3
          """
          self.k = k
          # Training features and labels; empty until fit() is called.
          self.X_train = []
          self.y_train = []
          # Labels produced by the most recent vote()/predict() call.
          self.result = []

      def fit(self, X_train, y_train):
          """Store the training data (kNN has no separate training step).

          :param X_train: training features, one row per sample
              (ndarray or nested sequence)
          :param y_train: training labels, one per row of ``X_train``
          :return: self, so calls can be chained
          :raises AssertionError: if the numbers of samples and labels differ,
              or if ``k`` is not between 1 and the number of training samples
          """
          # Accept plain nested lists as well as ndarrays.
          X_train = np.asarray(X_train)
          y_train = np.asarray(y_train)

          assert X_train.shape[0] == y_train.shape[0], '训练集特征与目标值个数不匹配'
          # k < 1 is rejected too: with no valid neighbour count there is
          # nothing meaningful to vote on.
          assert 1 <= self.k <= X_train.shape[0], 'K值超出训练数据范围'

          self.X_train = X_train
          self.y_train = y_train
          return self

      def get_distance(self, x_test):
          """Compute Euclidean distances from every query to every training row.

          All feature columns contribute to the distance, not just the first
          two.

          :param x_test: query samples, one row per sample
          :return: list with one entry per query; each entry is a list of
              floats whose position matches the row index in ``self.X_train``
          """
          train = np.asarray(self.X_train, dtype=float)
          queries = np.asarray(x_test, dtype=float)

          # Unfitted classifier: there is nothing to measure against.
          if train.size == 0:
              return [[] for _ in queries]

          # Broadcasting subtracts the query from every training row at once;
          # summing the squares along axis 1 collapses the feature dimension.
          return [
              np.sqrt(((train - query) ** 2).sum(axis=1)).tolist()
              for query in queries
          ]

      def get_k_nearest_dist(self, list_dist):
          """Keep the ``k`` smallest distances of every query.

          :param list_dist: per-query distance lists, as returned by
              ``get_distance``
          :return: list with one dict per query mapping training-row index to
              distance, ordered nearest first; it holds at most ``self.k``
              entries (none when ``self.k`` is not positive)
          """
          limit = max(self.k, 0)
          k_nearest = []
          for dists in list_dist:
              # sorted() is stable, so equal distances keep ascending index
              # order.
              ranked = sorted(enumerate(dists), key=lambda pair: pair[1])
              k_nearest.append(dict(ranked[:limit]))
          return k_nearest

      def vote(self, k_nearest_dist):
          """Pick the majority label among each query's nearest neighbours.

          Ties go to the label that occurs first in the neighbour dict, i.e.
          the label of the nearest tied neighbour when the dicts come from
          ``get_k_nearest_dist``.

          :param k_nearest_dist: per-query dicts keyed by training-row index
          :return: list with one predicted label per query; also stored in
              ``self.result``, replacing the result of any earlier call
          """
          result = []
          for neighbours in k_nearest_dist:
              counts = {}
              for train_index in neighbours:
                  label = self.y_train[train_index]
                  counts[label] = counts.get(label, 0) + 1
              # dicts preserve insertion order and max() returns the first
              # maximal key, which gives the tie-break described above.
              result.append(max(counts, key=counts.get))

          # Start from a fresh list on every call so repeated predictions do
          # not pile up.
          self.result = result
          return result

      def predict(self, X_predict):
          """Predict a label for every row of ``X_predict``.

          :param X_predict: query samples, one row per sample, with the same
              number of features as the training data
          :return: list of predicted labels, one per query row
          :raises AssertionError: if the feature counts do not match
          """
          X_predict = np.asarray(X_predict)
          assert X_predict.shape[1] == self.X_train.shape[1], '特征数不匹配'

          distances = self.get_distance(X_predict)
          k_nearest_dist = self.get_k_nearest_dist(distances)
          return self.vote(k_nearest_dist)

  2.test.py

  #!/usr/bin/env python

  # -*- encoding: utf-8 -*-

  # @Project : machinelearning

  # @File : test.py

  # @Author : yanchengxu

  # @Contact : [email protected]

  # @Time : 2019/10/7 16:57

  # @IDE : PyCharm

  from sklearn.datasets import load_iris

  from sklearn.model_selection import train_test_split

  from myknn.knn import KNNClassifier

  import numpy as np

  import matplotlib.pyplot as plt

  # import itertools

  # import random

  # Demo script: fit the hand-written kNN on the iris data set and print the
  # predicted species for three hand-picked flowers.
  kn = KNNClassifier(3)

  iris_dataset = load_iris()

  # Only the training part of the split is used here; the held-out part is
  # discarded because the samples to classify are written out by hand below.
  X_train, _, y_train, _ = train_test_split(
      iris_dataset['data'], iris_dataset['target'], random_state=0)

  X = np.asarray(X_train)
  Y = np.asarray(y_train)

  # "Training" a kNN model just stores the samples.
  kn.fit(X, Y)

  # Samples to classify: sepal length, sepal width, petal length, petal width.
  x_test = [[5, 2.9, 1, 0.2], [6.7, 3.2, 5.2, 2.3], [5.6, 3.1, 4.5, 1.5]]
  X_test = np.asarray(x_test)

  prediction = kn.predict(X_test)

  # Print each sample next to the species name of its predicted class.
  for sample, label in zip(x_test, prediction):
      print(sample, '->', iris_dataset['target_names'][label])

  # # 散点图观察

  # x1 = []

  # y1 = []

  #

  # # 训练集

  # for i in np.asarray(X):

  # x1.append(i[0])

  # y1.append(i[1])

  #

  # x2 = []

  # y2 = []

  # # 测试集

  # for i in np.asarray(x_test):

  # x2.append(i[0])

  # y2.append(i[1])

  #

  # plt.plot(x1, y1, 'r*')

  # plt.plot(x2, y2, 'g+')

  # plt.show()