Suppose we have three numbers $a_1, a_2, a_3$ with corresponding weights $p_1, p_2, p_3$. Their weighted average is $\bar{a}=\frac{p_1 a_1+p_2 a_2+p_3 a_3}{p_1+p_2+p_3}$.
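This is exactly what NumPy's np.average computes; a minimal sketch (the values and weights below are made up purely for illustration):

import numpy as np

a = np.array([80, 90, 95])     # the values a1, a2, a3
p = np.array([1.0, 2.0, 3.0])  # the weights p1, p2, p3
# np.average(a, weights=p) evaluates sum(p * a) / sum(p)
print(np.average(a, weights=p))  # 90.8333...

The same formula reappears at the end of this section as the decision rule for distance-weighted KNN regression.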
K-nearest neighbors (KNN): the name means "the k closest neighbors", i.e., every sample can be represented by its k nearest neighbors.
The algorithm's principle is fairly simple and rests mainly on distance computation. Its three key ingredients are:
the distance metric, the choice of k, and the decision rule.
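All of the toy implementations below use the Euclidean distance as the metric: for two points $x=(x_1,\dots,x_n)$ and $y=(y_1,\dots,y_n)$, $d(x,y)=\sqrt{\sum_{i=1}^{n}(x_i-y_i)^2}$, which for the two-feature samples below reduces to $\sqrt{(x_1-y_1)^2+(x_2-y_2)^2}$.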
# python3.7
# -*- coding: utf-8 -*-
#@Author : huinono
#@Software : PyCharm
import warnings
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.metrics import classification_report,mean_squared_error,r2_score
warnings.filterwarnings('ignore')
mpl.rcParams['font.sans-serif'] = 'SimHei'
mpl.rcParams['axes.unicode_minus'] = False  # a boolean, not the string 'False'
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
def knn_Vote():
    movie = ['romance', 'action']
    # columns: feature 1, feature 2, class label (an index into movie)
    data = np.array([
        [3, 104, 0], [2, 100, 0], [1, 81, 0],
        [101, 10, 1], [99, 5, 1], [98, 2, 1]
    ])
    # test sample
    x = [18, 90]
    k = 5  # k value
    dis = []
    for i in data:
        # Euclidean distance from the test sample to each training point
        d = np.sqrt((x[0] - i[0]) ** 2 + (x[1] - i[1]) ** 2)
        dis.append([d, i[2]])
    dis.sort(key=lambda x: x[0])
    print(dis)
    # majority vote among the k nearest neighbours
    count = {}
    for i in dis[0:k]:
        if count.get(i[1]) is None:
            count[i[1]] = 1
        else:
            count[i[1]] += 1
    max_key = max(count, key=count.get)
    print(count)
    print(movie[max_key])
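A plain majority vote weights all k neighbours equally. knn_VoteRight below lets closer neighbours count for more: each neighbour casts a vote of weight $w_i = 1/d_i$, and the class with the largest total weight wins.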
def knn_VoteRight():
    movie = ['romance', 'action']
    data = np.array([
        [3, 104, 0], [2, 100, 0], [1, 81, 0],
        [101, 10, 1], [99, 5, 1], [98, 2, 1]
    ])
    # test sample
    x = [18, 90]
    k = 5  # k value
    dis = []
    for i in data:
        d = np.sqrt((x[0] - i[0]) ** 2 + (x[1] - i[1]) ** 2)
        dis.append([d, i[2]])
    dis.sort(key=lambda x: x[0])
    print(dis)
    # each of the k nearest neighbours votes with weight 1/distance
    count = {}
    for i in dis[0:k]:
        if count.get(i[1]) is None:
            count[i[1]] = 1 / i[0]
        else:
            count[i[1]] += 1 / i[0]
    max_key = max(count, key=count.get)
    print(count)
    print(movie[max_key])
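For regression the decision rule changes from voting to averaging: knn_Regression predicts the plain mean of the k nearest neighbours' target values, $\hat{y}=\frac{1}{k}\sum_{i=1}^{k} y_i$ (the third column of data now holds a numeric target instead of a class label).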
def knn_Regression():
    # columns: feature 1, feature 2, numeric target
    data = np.array([
        [3, 104, 98], [2, 100, 93], [1, 81, 95],
        [101, 10, 16], [99, 5, 8], [98, 2, 7]
    ])
    # test sample
    x = [18, 90]
    k = 5  # k value
    dis = []
    for i in data:
        d = np.sqrt((x[0] - i[0]) ** 2 + (x[1] - i[1]) ** 2)
        dis.append([d, i[2]])
    dis.sort(key=lambda x: x[0])
    total = 0  # renamed from `sum` to avoid shadowing the builtin
    for i in dis[0:k]:
        total += i[1]
    print(total / k)  # the value predicted by regression
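knn_VoteRegression combines both refinements: it plugs the weights $p_i = 1/d_i$ into the weighted-average formula from the top of this section, predicting $\hat{y}=\frac{\sum_{i=1}^{k} y_i/d_i}{\sum_{i=1}^{k} 1/d_i}$.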
def knn_VoteRegression():
    data = np.array([
        [3, 104, 98], [2, 100, 93], [1, 81, 95],
        [101, 10, 16], [99, 5, 8], [98, 2, 7]
    ])
    # test sample
    x = [18, 90]
    k = 5  # k value
    dis = []
    for i in data:
        d = np.sqrt((x[0] - i[0]) ** 2 + (x[1] - i[1]) ** 2)
        dis.append([d, i[2]])
    dis.sort(key=lambda x: x[0])
    # keep the k nearest neighbours as [weight, target] pairs, weight = 1/distance
    dis = [[1 / i[0], i[1]] for i in dis][0:k]
    # weighted average: sum(w * y) / sum(w); note the denominator must sum
    # the weights i[0], not the targets i[1]
    a = 1 / sum(i[0] for i in dis)
    res = sum(i[0] * i[1] for i in dis)
    print(res * a)
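The four functions above spell out the mechanics by hand; a minimal driver to run them (assuming they sit in the same module as the imports at the top):

if __name__ == '__main__':
    knn_Vote()
    knn_VoteRight()
    knn_Regression()
    knn_VoteRegression()

In practice the same ground is covered by scikit-learn's KNeighborsClassifier and KNeighborsRegressor; the next two functions use them, letting GridSearchCV pick the best k (n_neighbors) by cross-validation.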
def knn_classification():
    iris_data = load_iris()
    x = iris_data['data']
    y = iris_data['target']
    # encode the labels (iris targets are already integers, so this is a no-op here)
    y = LabelEncoder().fit_transform(y)
    x = StandardScaler().fit_transform(x)
    # hold-out split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    # grid search with cross-validation to pick the best k
    param_grid = {'n_neighbors': [3, 4, 5, 6]}
    model = KNeighborsClassifier()
    knn = GridSearchCV(model, param_grid=param_grid, cv=5)
    knn.fit(x_train, y_train)
    print(knn.best_params_)
    y_ = knn.predict(x_test)
    print(classification_report(y_test, y_))
    # predicted class probabilities: for KNN, the fraction of neighbours in each class
    print(knn.predict_proba(x_test))
def knn_regression():
    data = pd.read_csv(r'data/Advertising.csv')
    x = data.iloc[:, :-1]
    y = data.iloc[:, -1:]
    x = StandardScaler().fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    param_grid = {'n_neighbors': [3, 4, 5, 6]}
    model = KNeighborsRegressor()
    knn = GridSearchCV(model, param_grid=param_grid)
    knn.fit(x_train, y_train)
    print(knn.best_params_)
    y_ = knn.predict(x_test)
    print(mean_squared_error(y_test, y_))
    print(r2_score(y_test, y_))
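Brute-force KNN computes the distance from the query to every training point, which is O(n) per query. A kd-tree partitions the training points along the coordinate axes so that most of them can be skipped during a nearest-neighbour search; scikit-learn exposes this structure directly as sklearn.neighbors.KDTree.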
Given a dataset in two-dimensional space: $T=\{(2,3),(5,4),(9,6),(4,7),(8,1),(7,2)\}$
Suppose we search for the nearest neighbours of the point (2, 5):
import numpy as np
from sklearn.neighbors import KDTree

if __name__ == '__main__':
    x = np.array([[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]])
    tree = KDTree(x, leaf_size=2)
    # indices and distances of the 5 nearest neighbours of (2, 5)
    dist, ind = tree.query(np.array([[2, 5]]), k=5)
    print(ind)
    print(dist)
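Working the Euclidean distances out by hand, the query should print something like ind = [[0 3 1 5 2]] and dist ≈ [[2. 2.83 3.16 5.83 7.07]]: the nearest training point to (2, 5) is (2, 3) at distance 2, and the farthest point, (8, 1), is the only one excluded.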