定义:
在特征空间中,如果一个样本的 k 个最近邻(即特征空间中与它距离最近的 k 个样本)中的大多数属于某一个类别,则该样本也属于这个类别。
特点:
代码编辑步骤:
直接享用代码吧:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
# Training features: one [x, y] point per sample.
raw_x = [[3.3144558, 2.33542461],
         [3.75497175, 1.93856648],
         [1.38327539, 3.38724496],
         [3.09203999, 4.47090056],
         [2.58593831, 2.13055653],
         [7.41206251, 4.80305318],
         [5.912852, 3.72918089],
         [9.21547627, 2.8132231],
         [7.36039738, 3.35043406],
         [7.13698009, 0.40130301]]
# Class label of each training sample, in the same order as raw_x.
raw_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
x_train = np.array(raw_x)
y_train = np.array(raw_y)
# Optional: plot the two classes with matplotlib.
# plt.scatter(x_train[y_train == 0, 0], x_train[y_train == 0, 1], color='g')
# plt.scatter(x_train[y_train == 1, 0], x_train[y_train == 1, 1], color='r')
# plt.show()
# The sample whose class is to be predicted.
x = np.array([8.093607318, 3.365731514])
# plt.scatter(x[0], x[1], color='b')
# plt.show()

# --- Hand-written kNN -------------------------------------------------------
# Squared Euclidean distance from x to every training sample; the square
# root is skipped because it does not change the ordering.
distances = np.sum(np.square(x_train - x), axis=1)
# Indices of the training samples, closest first.
nearest = np.argsort(distances)
# Number of neighbours that take part in the vote.
k = 6
# Labels of the k closest training samples.
top_k = [y_train[i] for i in nearest[:k]]
# Count how often each label occurs among those neighbours.
votes = Counter(top_k)
# The most frequent label is the hand-computed prediction.
predict_y = votes.most_common(1)[0][0]

# --- The same prediction with scikit-learn ----------------------------------
# Reuse k so both implementations always vote among the same number of
# neighbours instead of repeating the literal.
knn_classifier = KNeighborsClassifier(n_neighbors=k)
# Fit on the training data.
knn_classifier.fit(x_train, y_train)
# predict() is given a 2-D array here: a single row holding the sample.
x1 = x.reshape(1, -1)
y_predict = knn_classifier.predict(x1)
# Print the predicted label of the single sample.
print(y_predict[0])
封装后(以下代码有bug,相信你看懂了上面的代码可以将它改掉的,Right ? Let's try ! ):
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
    """Minimal k-nearest-neighbours classifier.

    Uses Euclidean distance and a majority vote among the ``k`` closest
    training samples.
    """

    def __init__(self, k):
        """Create an unfitted classifier that votes among ``k`` neighbours."""
        assert k >= 1, 'k must be valid'
        self.k = k
        # Use the same attribute names that fit()/predict() read, so calling
        # predict() before fit() trips the assertion there instead of raising
        # AttributeError.
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Store the training data and return ``self``.

        Args:
            x_train: training features, one row per sample (list or array).
                A flat 1-D sequence is treated as single-feature samples.
            y_train: one label per training sample.

        Raises:
            ValueError: if the number of samples and labels differ.
        """
        # Convert to arrays so that plain Python lists are accepted; the
        # distance computation relies on NumPy broadcasting.
        features = np.asarray(x_train, dtype=float)
        labels = np.atleast_1d(np.asarray(y_train))
        if features.ndim == 1:
            features = features.reshape(-1, 1)
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                'x_train and y_train must contain the same number of samples')
        self._x_train = features
        self._y_train = labels
        return self

    def predict(self, x_predict):
        """Predict a label for every sample in ``x_predict``.

        Args:
            x_predict: 2-D collection of samples. When the training data has
                more than one feature, a single 1-D sample is also accepted
                and treated as one row.

        Returns:
            A NumPy array with one predicted label per sample.
        """
        assert self._x_train is not None and self._y_train is not None, '必须满足预测值'
        samples = np.asarray(x_predict, dtype=float)
        # A bare feature vector would otherwise be iterated feature by
        # feature; wrap it into a one-row matrix instead.
        if samples.ndim == 1 and samples.size and self._x_train.shape[1] > 1:
            samples = samples.reshape(1, -1)
        return np.array([self._predict(sample) for sample in samples])

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples nearest to ``x``."""
        # Euclidean distance from x to every training row in one vectorised step.
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        nearest = np.argsort(distances)
        # Feed labels closest-first: Counter.most_common keeps first-seen
        # order among equal counts, so a tie goes to the closer neighbour.
        votes = Counter(self._y_train[i] for i in nearest[:self.k])
        return votes.most_common(1)[0][0]

    def __repr__(self):
        return f'KNN(k={self.k})'
if __name__ == '__main__':
    # Training features: one [x, y] point per sample.
    raw_x = [[3.3144558, 2.33542461],
             [3.75497175, 1.93856648],
             [1.38327539, 3.38724496],
             [3.09203999, 4.47090056],
             [2.58593831, 2.13055653],
             [7.41206251, 4.80305318],
             [5.912852, 3.72918089],
             [9.21547627, 2.8132231],
             [7.36039738, 3.35043406],
             [7.13698009, 0.40130301]]
    # Class label of each training sample, in the same order as raw_x.
    raw_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    # The sample whose class is to be predicted.
    predict_x = [8.093607318, 3.365731514]
    # k = 10 would equal the size of the training set, so every vote would
    # be a 5/5 tie; use 6 neighbours, as the other listings do.
    knn = KNNClassifier(6)
    # The classifier does array arithmetic on the training rows, so hand it
    # NumPy arrays rather than plain lists.
    knn.fit(np.array(raw_x), np.array(raw_y))
    # predict() iterates over samples, so wrap the single sample in a
    # one-row 2-D array.
    y_predict = knn.predict(np.array([predict_x]))
    # Print the predicted label of the single sample.
    print(y_predict[0])
什么?上面的代码无法运行,然后也不会改?
All right !
直接上可运行版本,来,张嘴,啊~
import numpy as np
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
class ProKNNClassifier(object):
    """kNN demo that predicts the class of one 2-D point.

    Offers a hand-written vote (``count_predict``) and the scikit-learn
    equivalent (``train``).
    """

    def __init__(self, raw_x, raw_y, x, k):
        """Validate the inputs and store them as NumPy arrays.

        Args:
            raw_x: training features, one row per sample.
            raw_y: one label per training sample.
            x: the point to classify; must hold exactly two numbers.
            k: number of neighbours to vote; an int in ``1..len(raw_y)``.

        Raises:
            ValueError: if any argument fails validation. Raising keeps a
                half-initialised instance from escaping and failing later
                with an unrelated AttributeError.
        """
        if len(raw_x) != len(raw_y):
            raise ValueError('特征值与目标值的长度应该相等')
        if len(x) != 2:
            raise ValueError('被预测的值应为列表类型,包含x,y值')
        if not all(isinstance(value, (float, int)) for value in x):
            raise ValueError('被预测的值应为数字')
        # k must be an integer AND lie within 1..number of samples.
        if not isinstance(k, int) or not 1 <= k <= len(raw_y):
            raise ValueError('k应该小于y并且是数字类型')
        self.train_x = np.array(raw_x)
        self.train_y = np.array(raw_y)
        self.predict_x = np.array(x)
        self.k = k

    def count_predict(self):
        """Predict with the hand-written kNN vote.

        Prints the predicted label and also returns it.
        """
        # Squared Euclidean distance to every training sample; the square
        # root is skipped because it does not change the ordering.
        distances = np.sum(np.square(self.train_x - self.predict_x), axis=1)
        # Indices of the training samples, closest first.
        nearest = np.argsort(distances)
        # Labels of the k closest training samples.
        top_k = [self.train_y[i] for i in nearest[:self.k]]
        # The most frequent label among them is the prediction.
        votes = Counter(top_k)
        predict_y = votes.most_common(1)[0][0]
        print(predict_y)
        return predict_y

    def train(self):
        """Predict with scikit-learn's KNeighborsClassifier and return the label."""
        knn_classifier = KNeighborsClassifier(n_neighbors=self.k)
        knn_classifier.fit(self.train_x, self.train_y)
        # predict() is given a 2-D array here: a single row holding the sample.
        x1 = self.predict_x.reshape(1, -1)
        y_predict = knn_classifier.predict(x1)
        return y_predict[0]
if __name__ == '__main__':
    # Number of neighbours that take part in the vote.
    k = 6
    # The point whose class is to be predicted.
    x = [8.093607318, 3.365731514]
    # Class labels: five samples of class 0 followed by five of class 1.
    raw_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    # Training features, one [x, y] point per label above.
    raw_x = [
        [3.3144558, 2.33542461],
        [3.75497175, 1.93856648],
        [1.38327539, 3.38724496],
        [3.09203999, 4.47090056],
        [2.58593831, 2.13055653],
        [7.41206251, 4.80305318],
        [5.912852, 3.72918089],
        [9.21547627, 2.8132231],
        [7.36039738, 3.35043406],
        [7.13698009, 0.40130301],
    ]
    # Build the classifier, then print the scikit-learn based prediction.
    knn = ProKNNClassifier(raw_x=raw_x, raw_y=raw_y, x=x, k=k)
    print(knn.train())