# K-nearest-neighbour theory and derivation (K近邻法理论推导): https://blog.csdn.net/ACM_hades/article/details/89644882
# encoding=utf-8
from collections import Counter
import pandas as pd
import numpy as np
import cv2
import random
import time
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
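# Quickselect: the same partitioning idea as quicksort, but only the side containing position k is processed further.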
def partition_sort(arr, k, axis):
    """
    Rearrange ``arr`` in place so that position ``k`` holds the element it
    would hold if ``arr`` were sorted by ``element[axis]`` (quickselect).

    After the call every element left of ``k`` has a key not greater than
    ``arr[k][axis]`` and every element right of ``k`` has a key not smaller
    than it. The two sides themselves are not sorted.

    :param arr: mutable sequence of indexable records (modified in place)
    :param k: target position, must satisfy 0 <= k <= len(arr) - 1
    :param axis: index inside each record of the value used for comparison
    :return: None
    """
    lo, hi = 0, len(arr) - 1
    assert 0 <= k <= hi
    while True:
        left, right = lo, hi
        # Copy the pivot record: the slot it came from is overwritten below
        # (and a numpy row would otherwise just be a view on that slot).
        pivot = deepcopy(arr[lo])
        pivot_key = pivot[axis]
        while left < right:
            # Walk down from the right until a key smaller than the pivot.
            while left < right and arr[right][axis] >= pivot_key:
                right -= 1
            if left == right:
                break
            arr[left] = arr[right]
            left += 1
            # Walk up from the left until a key larger than the pivot.
            while left < right and arr[left][axis] <= pivot_key:
                left += 1
            if left == right:
                break
            arr[right] = arr[left]
            right -= 1
        # ``left`` is now the free slot where the pivot belongs.
        arr[left] = pivot
        if left == k:
            return
        if left < k:
            lo = left + 1
        else:
            hi = left - 1
class Node:
    """A single tree node: one stored record plus its depth and children."""

    def __init__(self, data, depth=0, lchild=None, rchild=None):
        # ``data`` is the record kept at this node; ``depth`` is its level
        # in the tree (0 by default).
        self.data, self.depth = data, depth
        # Left and right subtrees; None means the subtree is absent.
        self.lchild, self.rchild = lchild, rchild
class KdTree:
    """
    KD-tree over records whose last column is the class label, with a
    k-nearest-neighbour majority-vote query.
    """

    def __init__(self, dimens):
        self.KdTree = None     # root node, assigned by create()
        self.dimens = dimens   # number of leading columns cycled through as split axes
        self.nearest = None    # candidate table left behind by the last Search()
        self.Max_depth = -1    # largest node depth seen by preOrder()

    def create(self, dataSet, depth=0):
        """
        Recursively build the tree from ``dataSet`` and return the subtree root.

        ``dataSet`` is reordered in place by ``partition_sort``. The split
        axis at each level is ``depth % self.dimens``.

        :param dataSet: sequence of records (features followed by the label)
        :param depth: depth of the node being built (0 for the root)
        :return: the created node, or None when ``dataSet`` is empty
        """
        if len(dataSet) == 0:
            return None
        axis = depth % self.dimens           # split axis for this level
        mid = len(dataSet) >> 1              # index of the median record
        partition_sort(dataSet, mid, axis)   # median goes to ``mid``, rest split around it
        node = Node(dataSet[mid], depth)
        if depth == 0:                       # remember the root of the whole tree
            self.KdTree = node
        node.lchild = self.create(dataSet[:mid], depth + 1)
        node.rchild = self.create(dataSet[mid + 1:], depth + 1)
        return node

    def Search(self, x, count=1):
        """
        Classify ``x`` by majority vote among its ``count`` nearest stored
        records (Euclidean distance over the feature columns).

        If the tree holds fewer than ``count`` records, all of them vote.
        After the call ``self.nearest`` is a (count, 2) object array of
        ``[distance, node]`` rows sorted by distance; unused rows are
        ``[-1, None]``.

        :param x: feature vector (numpy array, without the label column)
        :param count: number of neighbours to use, must be >= 1
        :return: (label, number of nodes visited, number of pruned branches)
        :raises ValueError: if ``count`` < 1 or the tree is empty
        """
        if count < 1:
            raise ValueError("count must be at least 1")

        best = []  # (distance, node) pairs, ascending by distance, at most ``count`` long
        self.Count_node = 0
        self.Count_node1 = 0

        def recurve(node):  # depth-first descent
            if node is None:
                return
            self.Count_node += 1
            axis = node.depth % self.dimens
            daxis = x[axis] - node.data[axis]  # signed offset from the splitting plane
            if daxis < 0:
                near, far = node.lchild, node.rchild
            else:
                near, far = node.rchild, node.lchild
            recurve(near)  # first go down the side of the plane that contains x

            dist = np.sqrt(np.sum((x - node.data[:-1]) ** 2))
            # Insertion-sort step: place the node before the first candidate
            # that is strictly farther away. ``count`` is expected to be
            # small; a heap would be the choice for large values.
            pos = len(best)
            for i, (d, _) in enumerate(best):
                if dist < d:
                    pos = i
                    break
            if pos < count:
                best.insert(pos, (dist, node))
                del best[count:]  # keep at most ``count`` candidates

            # The far side can only be skipped once ``count`` candidates
            # exist AND the ball around x reaching the worst of them does not
            # cross the splitting plane. Pruning while the list is still
            # short would drop true neighbours.
            if len(best) < count or best[-1][0] > abs(daxis):
                recurve(far)
            else:
                self.Count_node1 += 1

        recurve(self.KdTree)

        # Publish the candidates in the padded table layout of ``self.nearest``.
        table = np.empty((count, 2), dtype=object)
        table[:, 0] = -1
        table[:, 1] = None
        for i, (d, nd) in enumerate(best):
            table[i, 0] = d
            table[i, 1] = nd
        self.nearest = table

        if not best:
            raise ValueError("cannot search an empty KdTree")

        belong = [nd.data[-1] for _, nd in best]  # labels of the found neighbours
        b = Counter(belong).most_common(1)[0][0]
        return b, self.Count_node, self.Count_node1

    def preOrder(self, node):
        """
        Walk the subtree rooted at ``node`` and raise ``self.Max_depth`` to
        the largest node depth encountered.

        :param node: subtree root (may be None)
        :return: None
        """
        if node is not None:
            self.preOrder(node.lchild)
            self.preOrder(node.rchild)
            if node.depth > self.Max_depth:
                self.Max_depth = node.depth
def Predict(testset, kdt, k):
    """
    Classify every record of ``testset`` with ``kdt.Search`` and collect the
    predicted labels.

    The last element of each record is dropped before the query. A progress
    line with the elapsed time is printed after every 1000 records.

    :param testset: iterable of records whose last element is the label
    :param kdt: object providing ``Search(features, k)`` returning a 3-tuple
                whose first item is the predicted label
    :param k: number of neighbours passed on to ``Search``
    :return: numpy array of predicted labels, in input order
    """
    labels = []
    tick = time.time()
    for seen, sample in enumerate(testset, start=1):
        # Only the label is needed; the two search counters are ignored here.
        label, _visited, _pruned = kdt.Search(sample[:-1], k)
        labels.append(label)
        if seen % 1000 == 0:
            print("当前样本数:", seen, " Cost: ", time.time() - tick)
            tick = time.time()
    return np.array(labels)
if __name__ == '__main__':
    # Demo: build a KD-tree on the iris data set and evaluate k-NN accuracy.
    print('Start read data')
    S = time.time()
    iris = load_iris()
    Data = iris.data
    Label = iris.target
    # Append the label as the last column so every record carries its class.
    Data_set = np.hstack((Data, np.reshape(Label, (-1, 1))))
    # Hold out 33% of the records for testing; the rest is used for training.
    Train_set, Test_set = train_test_split(Data_set, test_size=0.33, random_state=23323)
    print("Data shape:", Data.shape, type(Data))
    print("Label shape:", Label.shape, type(Label))
    print("Data_set shape:", Data_set.shape, type(Data_set))
    print("Train_set shape:", Train_set.shape, type(Train_set))
    # This line reports the test split, so it is labelled accordingly.
    print("Test_set shape:", Test_set.shape, type(Test_set))
    print('read data cost ', time.time() - S, ' second')

    print('Start Train (build KdTree)')
    S = time.time()
    # The last column is the label, so it is excluded from the split axes.
    kdt = KdTree(Train_set.shape[-1] - 1)
    kdt.create(Train_set)
    # kdt.preOrder(kdt.KdTree)
    # print("树的最大深度:",kdt.Max_depth)
    print('training cost ', time.time() - S, ' second')

    print('Start predicting')
    S = time.time()
    k = 5
    test_predict = Predict(Test_set, kdt, k)
    print('predicting cost ', time.time() - S, ' second')
    score = accuracy_score(Test_set[:, -1], test_predict)
    print("The accruacy socre is ", score)
# Result (sample program output):
# Start read data
# Data shape: (150, 4)
# Label shape: (150,)
# Data_set shape: (150, 5)
# Train_set shape: (100, 5)
# Train_set shape: (50, 5)
# read data cost 0.0020248889923095703 second
# Start Train (build KdTree)
# training cost 0.0019648075103759766 second
# Start predicting
# predicting cost 0.04487943649291992 second
# The accruacy socre is 0.98