K-Nearest Neighbors (KNN): A Simple Classification Implementation

Definition:

In the feature space, if the majority of the k samples nearest to a given sample (that is, its nearest neighbors in the feature space) belong to a certain class, then that sample is assigned to the same class.
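
To make the definition concrete, here is a minimal sketch of the idea (the points, labels and k value below are made up purely for illustration): measure the Euclidean distance from a query point to every training point, keep the k closest, and let their labels vote:

import numpy as np
from collections import Counter

# Toy training data, purely illustrative
points = np.array([[1.0, 1.0], [1.2, 0.8], [5.0, 5.0]])
labels = np.array([0, 0, 1])
# The sample whose class we want to guess
query = np.array([1.1, 0.9])

# Euclidean distance from the query to every training point
dists = np.sqrt(np.sum((points - query) ** 2, axis=1))
# Take the k nearest points and let their labels vote
k = 2
nearest = np.argsort(dists)[:k]
print(Counter(labels[nearest]).most_common(1)[0][0])  # prints 0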

Characteristics:

  1. An algorithm with no explicit model
  2. For consistency with other algorithms, the training set itself can be regarded as the model (this is how sklearn treats it; see the sketch after this list)
  3. It decides a sample's class by looking at which class is most common among its neighbors.
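
In other words, "fitting" a KNN classifier amounts to memorizing the training set. A minimal sketch of that idea (the class name is hypothetical, not part of sklearn):

class LazyKNN:
    '''The "model" is nothing more than the stored training data.'''

    def fit(self, x_train, y_train):
        # No parameters are learned; the data itself is kept for later voting
        self._x_train = x_train
        self._y_train = y_train
        return self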

Coding steps:

  1. Obtain the dataset
  2. Train
  3. Feed in a new sample
  4. Predict

Now, help yourself to the code:

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier

# Define the feature values
raw_x = [[3.3144558, 2.33542461],
         [3.75497175, 1.93856648],
         [1.38327539, 3.38724496],
         [3.09203999, 4.47090056],
         [2.58593831, 2.13055653],
         [7.41206251, 4.80305318],
         [5.912852, 3.72918089],
         [9.21547627, 2.8132231],
         [7.36039738, 3.35043406],
         [7.13698009, 0.40130301]]
# Define the target values (labels)
raw_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

x_train = np.array(raw_x)
y_train = np.array(raw_y)

# Plot the training data with matplotlib
# plt.scatter(x_train[y_train == 0, 0], x_train[y_train == 0, 1], color='g')
# plt.scatter(x_train[y_train == 1, 0], x_train[y_train == 1, 1], color='r')
# plt.show()

# The sample to predict
x = np.array([8.093607318, 3.365731514])
# plt.scatter(x[0], x[1], color='b')
# plt.show()

# Compute the squared distance from the new sample to every training point
# (the square root is skipped because it does not change the ordering)
distances = np.sum(np.square(x_train - x), axis=1)
# Sort the indices by distance, nearest first
nearest = np.argsort(distances)

# Choose k and work out the result:
k = 6
# Labels of the k nearest training samples
top_k = [y_train[i] for i in nearest[:k]]
# Count how many times each label appears
votes = Counter(top_k)
# The most frequent label is the manual prediction
predict_y = votes.most_common(1)[0][0]

# Create a KNN classifier instance
knn_classifier = KNeighborsClassifier(n_neighbors=6)
# Fit the training data
knn_classifier.fit(x_train, y_train)
# Reshape the single sample into a 2D array, since sklearn expects a matrix of samples
x1 = x.reshape(1, -1)
# Predict with KNN
y_predict = knn_classifier.predict(x1)
# Print the prediction
print(y_predict[0])
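
If everything is wired up correctly, the manual vote (predict_y) and sklearn's prediction (y_predict[0]) should both come out as 1, since the new point sits in the middle of the second cluster.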

After wrapping it into a class (the code below has a bug; if you understood the code above, you should be able to fix it. Right? Let's try!):

import numpy as np
from math import sqrt
from collections import Counter


class KNNClassifier:
    def __init__(self, k):
        '''Initialize the KNN classifier'''
        assert k >= 1, 'k must be valid'
        self.k = k
        self.x_train = None
        self.y_train = None

    def fit(self, x_train, y_train):
        '''Train the KNN classifier on the training data set'''
        # assert x_train.shape[0] == y_train.shape[0], 'the size of x_train must equal the size of y_train'
        # assert self.k <= x_train.shape[0], 'the size of x_train must be at least k'
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        '''Given the data set x_predict, return the predicted values'''
        assert self._x_train is not None and self._y_train is not None, 'must fit before predict'
        # assert x_predict.shape[1] == self._x_train.shape[1], 'the feature number of x_predict must equal that of x_train'
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        '''Given a single sample x, return its predicted label'''
        # assert x.shape[0] == self._x_train.shape[1], 'the feature number of x must be equal to X_train'
        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._x_train]
        nearest = np.argsort(distances)
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return f'KNN(k={self.k})'


if __name__ == '__main__':
    # Define the feature values
    raw_x = [[3.3144558, 2.33542461],
             [3.75497175, 1.93856648],
             [1.38327539, 3.38724496],
             [3.09203999, 4.47090056],
             [2.58593831, 2.13055653],
             [7.41206251, 4.80305318],
             [5.912852, 3.72918089],
             [9.21547627, 2.8132231],
             [7.36039738, 3.35043406],
             [7.13698009, 0.40130301]]

    # Define the target values
    raw_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    # The sample to predict
    predict_x = [8.093607318, 3.365731514]
    knn = KNNClassifier(10)
    # Fit the training data
    knn.fit(raw_x, raw_y)
    # Predict the input sample
    y_predict = knn.predict(predict_x)
    # Print the prediction
    print(y_predict[0])

What? The code above won't run, and you can't figure out how to fix it either?

All right!

Straight to the runnable version then. Come on, open wide, ahh~

import numpy as np
from collections import Counter

from sklearn.neighbors import KNeighborsClassifier


class ProKNNClassifier(object):

    def __init__(self, raw_x, raw_y, x, k):
        if len(raw_x) != len(raw_y):
            print('The feature values and the target values must have the same length')
            return
        if len(x) != 2:
            print('The sample to predict should be a list containing an x and a y value')
            return
        for i in x:
            if not isinstance(i, (float, int)):
                print('The sample to predict should contain numbers only')
                return
        if not isinstance(k, int) or k > len(raw_y):
            print('k should be an integer no larger than the number of samples')
            return
        self.train_x = np.array(raw_x)
        self.train_y = np.array(raw_y)
        self.predict_x = np.array(x)
        self.k = k

    def count_predict(self):
        '''The prediction worked out by hand, showing the principle'''
        # Compute the squared distance from the sample to every training point
        distances = np.sum(np.square(self.train_x - self.predict_x), axis=1)
        # Sort the indices by distance, nearest first
        nearest = np.argsort(distances)
        # Labels of the k nearest training samples
        top_k = [self.train_y[i] for i in nearest[:self.k]]
        # Count how many times each label appears
        votes = Counter(top_k)
        # The most frequent label is the prediction
        predict_y = votes.most_common(1)[0][0]
        # Print it
        print(predict_y)

    def train(self):
        '''How to use the KNN algorithm through sklearn'''
        # Create a KNN classifier instance
        knn_classifier = KNeighborsClassifier(n_neighbors=self.k)
        # Fit the training data
        knn_classifier.fit(self.train_x, self.train_y)
        # Reshape the single sample into a 2D array
        x1 = self.predict_x.reshape(1, -1)
        # Predict with KNN
        y_predict = knn_classifier.predict(x1)
        # Return the prediction
        return y_predict[0]


if __name__ == '__main__':
    # Define the feature values
    raw_x = [[3.3144558, 2.33542461],
             [3.75497175, 1.93856648],
             [1.38327539, 3.38724496],
             [3.09203999, 4.47090056],
             [2.58593831, 2.13055653],
             [7.41206251, 4.80305318],
             [5.912852, 3.72918089],
             [9.21547627, 2.8132231],
             [7.36039738, 3.35043406],
             [7.13698009, 0.40130301]]
    # Define the target values
    raw_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    # The sample to predict
    x = [8.093607318, 3.365731514]
    # Choose k
    k = 6
    knn = ProKNNClassifier(raw_x, raw_y, x, k)
    predict = knn.train()
    print(predict)
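
Calling knn.count_predict() here should print the same label using the hand-rolled distance-and-vote computation, so the two code paths can be cross-checked against each other.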
