import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
1. 数据预处理
# Load the iris dataset and wrap the feature matrix in a DataFrame whose
# columns carry the feature names.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Attach the integer targets, then replace the codes 0, 1 and 2 with the
# corresponding entries of iris.target_names.
df['class'] = iris.target
df['class'] = df['class'].map({code: iris.target_names[code] for code in range(3)})

# Notebook-style preview of the first rows.
df.head()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)   class
0                5.1               3.5                1.4               0.2  setosa
1                4.9               3.0                1.4               0.2  setosa
2                4.7               3.2                1.3               0.2  setosa
3                4.6               3.1                1.5               0.2  setosa
4                5.0               3.6                1.4               0.2  setosa
# Features stay as loaded; the targets get a trailing axis so that y is a
# column vector with one row per sample.
x = iris.data
y = iris.target[:, np.newaxis]

print("x shape: ", x.shape)
print("y shape: ", y.shape)
x shape: (150, 4)
y shape: (150, 1)
# Hold out 30% of the samples for testing. The split is stratified on y and
# uses a fixed random_state so that it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
2. 模型实现
def l1_distance(a, b):
    """Return the L1 (Manhattan) distance between each row of ``a`` and ``b``.

    The element-wise absolute differences of ``a - b`` are summed along
    axis 1, so the result has one entry per row of the broadcast difference.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)
def l2_distance(a, b):
    """Return the L2 (Euclidean) distance between each row of ``a`` and ``b``.

    The squared element-wise differences of ``a - b`` are summed along
    axis 1 and the square root of each sum is returned.
    """
    squared_diff = np.square(a - b)
    return np.sqrt(squared_diff.sum(axis=1))
class KnnModel(object):
    """Brute-force k-nearest-neighbours classifier with a pluggable distance.

    Attributes:
        k_neighbors: Number of nearest training samples that vote on a label.
        distance_func: Callable ``f(x_train, sample)`` returning one distance
            per training row (for example ``l1_distance`` or ``l2_distance``).
    """

    def __init__(self, k_neighbors=1, distance_func=l1_distance):
        self.k_neighbors = k_neighbors
        self.distance_func = distance_func

    def fit(self, x, y):
        """Store the training samples ``x`` and their labels ``y``.

        Both are converted with ``np.asarray`` so that plain sequences are
        accepted as well as arrays. No other computation happens here.
        """
        self.x_train = np.asarray(x)
        self.y_train = np.asarray(y)

    def predict(self, test):
        """Predict a label for every row of ``test`` by majority vote.

        Args:
            test: 2-D array-like with one sample per row.

        Returns:
            Array of shape ``(n_samples, 1)`` with the dtype of the training
            labels. When several labels share the highest vote count, the
            smallest one (in sorted order) is returned.
        """
        test = np.asarray(test)
        y_predict = np.zeros((test.shape[0], 1), dtype=self.y_train.dtype)
        for i, x_test in enumerate(test):
            distances = self.distance_func(self.x_train, x_test)
            nearest = np.argsort(distances)[:self.k_neighbors]
            neighbor_labels = self.y_train[nearest].ravel()
            # np.unique handles any sortable label type (strings, negative
            # ints, floats), whereas np.bincount only accepts non-negative
            # integers. Its sorted output keeps the smallest-label tie-break.
            labels, counts = np.unique(neighbor_labels, return_counts=True)
            y_predict[i] = labels[np.argmax(counts)]
        return y_predict
3. 测试
# Evaluate the classifier on the held-out split for every odd k in 1..19,
# once with each distance function.
knn = KnnModel(k_neighbors=9)
knn.fit(x_train, y_train)

result_list = []
# Iterate over (label, function) pairs so that the label written to the
# results always matches the distance function actually used. Comparing the
# pandas module `pd` to 1 is always False and would select l2_distance for
# both passes. A dedicated loop name also avoids overwriting the DataFrame
# `df`.
for dist_name, dist_func in (('l1_dist', l1_distance), ('l2_dist', l2_distance)):
    knn.distance_func = dist_func
    for k in range(1, 20, 2):
        knn.k_neighbors = k
        y_predict = knn.predict(x_test)
        acc = accuracy_score(y_test, y_predict) * 100  # accuracy in percent
        result_list.append([k, dist_name, acc])

result_df = pd.DataFrame(result_list, columns=['k', '距离函数', '准确率'])
print(result_df)
k 距离函数 准确率
0 1 l1_dist 93.333333
1 3 l1_dist 95.555556
2 5 l1_dist 97.777778
3 7 l1_dist 95.555556
4 9 l1_dist 95.555556
5 11 l1_dist 93.333333
6 13 l1_dist 93.333333
7 15 l1_dist 95.555556
8 17 l1_dist 95.555556
9 19 l1_dist 95.555556
10 1 l2_dist 93.333333
11 3 l2_dist 95.555556
12 5 l2_dist 97.777778
13 7 l2_dist 95.555556
14 9 l2_dist 95.555556
15 11 l2_dist 93.333333
16 13 l2_dist 93.333333
17 15 l2_dist 95.555556
18 17 l2_dist 95.555556
19 19 l2_dist 95.555556