KNN
Euclidean distance:

$$d(x,y)=\sqrt{\sum_{k=1}^{n}(x_k-y_k)^2}$$
Manhattan distance:

$$d(x,y)=\sum_{k=1}^{n}|x_k-y_k|$$
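As a quick numeric check of both formulas, here is a minimal sketch (the two vectors are made-up values, not taken from the dataset):

```python
import numpy as np

# two hypothetical 4-dimensional feature vectors (illustrative values only)
a = np.array([5.1, 3.5, 1.4, 0.2])
b = np.array([6.7, 3.0, 5.2, 2.3])

euclidean = np.sqrt(np.sum((a - b) ** 2))  # square root of the sum of squared differences
manhattan = np.sum(np.abs(a - b))          # sum of absolute differences

print(euclidean)  # ~= 4.65
print(manhattan)  # = 8.0
```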
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the iris dataset and put it in a DataFrame for inspection
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['class'] = iris.target
df['class'] = df['class'].map({0: iris.target_names[0], 1: iris.target_names[1], 2: iris.target_names[2]})
df
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | class |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| … | … | … | … | … | … |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
df.describe()
| | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
x = iris.data
y = iris.target.reshape(-1, 1)

# Stratified 70/30 split keeps the class proportions equal in both subsets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=35, stratify=y)

def l1_distance(a, b):
    # Manhattan (L1) distance from each row of a to the vector b
    return np.sum(np.abs(a - b), axis=1)

def l2_distance(a, b):
    # Euclidean (L2) distance from each row of a to the vector b
    return np.sqrt(np.sum((a - b) ** 2, axis=1))
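Both helper functions rely on NumPy broadcasting: `a` is the whole training matrix and `b` a single test sample, so one call returns a distance per training row. A minimal sketch with made-up numbers:

```python
import numpy as np

a = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0]])  # stand-in training matrix, shape (3, 2)
b = np.array([1.0, 1.0])    # stand-in test sample, shape (2,)

# b is broadcast against every row of a, giving one distance per row
print(np.sum(np.abs(a - b), axis=1))          # L1 distances: [1. 5. 9.]
print(np.sqrt(np.sum((a - b) ** 2, axis=1)))  # L2 distances: approx. [1. 3.61 6.40]
```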
class kNN(object):
    def __init__(self, n_neighbors=1, dist_func=l1_distance):
        self.n_neighbors = n_neighbors
        self.dist_func = dist_func

    def fit(self, x, y):
        # kNN is a lazy learner: "fitting" only stores the training data
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)
        for i, x_test in enumerate(x):
            # distance from this test sample to every training sample
            distances = self.dist_func(self.x_train, x_test)
            # training indices sorted by increasing distance
            nn_index = np.argsort(distances)
            # labels of the k nearest neighbours
            nn_y = self.y_train[nn_index[:self.n_neighbors]].ravel()
            # majority vote: the most frequent label wins
            y_pred[i] = np.argmax(np.bincount(nn_y))
        return y_pred
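The majority vote inside `predict` is the `np.bincount` + `np.argmax` pair. A tiny sketch of just that step, using hypothetical neighbour labels:

```python
import numpy as np

nn_y = np.array([2, 1, 2])  # hypothetical labels of the 3 nearest neighbours
counts = np.bincount(nn_y)  # votes per class index
print(counts)               # -> [0 1 2]: class 0 gets 0 votes, class 1 gets 1, class 2 gets 2
print(np.argmax(counts))    # -> 2, the predicted class
```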
# Train and evaluate with k = 3 and the default L1 distance
knn = kNN(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

print(y_test.ravel())
print(y_pred.ravel())
accuracy = accuracy_score(y_test, y_pred)
print("Prediction accuracy:", accuracy)
[2 1 2 2 0 0 2 0 1 1 2 0 1 1 1 2 2 0 1 2 1 0 0 0 1 2 0 2 0 0 2 1 0 2 1 0 2
1 2 2 1 1 1 0 0]
[2 1 2 2 0 0 2 0 1 1 1 0 1 1 1 2 2 0 1 2 1 0 0 0 1 2 0 2 0 0 2 1 0 2 1 0 2
1 2 1 1 2 1 0 0]
Prediction accuracy: 0.9333333333333333
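As an optional sanity check (not part of the original run), the same split can be fed to scikit-learn's built-in `KNeighborsClassifier`; with `n_neighbors=3` and the Manhattan metric (`p=1`) it should behave very similarly to the hand-written class:

```python
from sklearn.neighbors import KNeighborsClassifier

# p=1 selects the Manhattan (L1) metric, matching l1_distance above
sk_knn = KNeighborsClassifier(n_neighbors=3, p=1)
sk_knn.fit(x_train, y_train.ravel())
sk_pred = sk_knn.predict(x_test)
print("sklearn accuracy:", accuracy_score(y_test.ravel(), sk_pred))
```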
# Sweep k and both distance functions on the same split
knn = kNN()
knn.fit(x_train, y_train)

result_list = []
for p in [1, 2]:
    knn.dist_func = l1_distance if p == 1 else l2_distance
    for k in range(1, 10, 2):
        knn.n_neighbors = k
        y_pred = knn.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        result_list.append([k, 'l1_distance' if p == 1 else 'l2_distance', accuracy])

df = pd.DataFrame(result_list, columns=['k', 'distance function', 'accuracy'])
df
| | k | distance function | accuracy |
|---|---|---|---|
| 0 | 1 | l1_distance | 0.933333 |
| 1 | 3 | l1_distance | 0.933333 |
| 2 | 5 | l1_distance | 0.977778 |
| 3 | 7 | l1_distance | 0.955556 |
| 4 | 9 | l1_distance | 0.955556 |
| 5 | 1 | l2_distance | 0.933333 |
| 6 | 3 | l2_distance | 0.933333 |
| 7 | 5 | l2_distance | 0.977778 |
| 8 | 7 | l2_distance | 0.977778 |
| 9 | 9 | l2_distance | 0.977778 |
Result analysis: accuracy peaks at 0.977778 when k is in the 5-9 range; Euclidean (L2) distance sustains that peak for k = 5, 7 and 9, while Manhattan (L1) distance reaches it only at k = 5.
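To make that comparison easier to read, one possible follow-up (not in the original notebook) is to plot accuracy against k for both distance functions, straight from the `df` built above:

```python
import matplotlib.pyplot as plt

# pivot the sweep results: rows = k, columns = distance function
pivot = df.pivot(index='k', columns='distance function', values='accuracy')
pivot.plot(marker='o')
plt.xlabel('k (number of neighbours)')
plt.ylabel('accuracy on the test set')
plt.title('kNN accuracy vs. k for L1 and L2 distance')
plt.show()
```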