KNeighborsClassifier(
n_neighbors=5, # 可选(默认为5),查询的邻居数量
weights='uniform',
algorithm='auto',
leaf_size=30,
p=2,
metric='minkowski',
metric_params=None,
n_jobs=1,
**kwargs
)
from sklearn.neighbors import KNeighborsClassifier # K邻近模型
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.spatial import distance # 求欧式距离
# Load the film table: three per-film shot counts plus the genre label.
data = pd.read_excel('FilmCategory.xlsx')
feature_columns = ['搞笑镜头', '拥抱镜头', '打斗镜头']
X = data[feature_columns]
y = data['电影类型']

# Fit a 3-nearest-neighbour classifier on integer-encoded genres.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X, y.map({'喜剧片': 0, '动作片': 1, '爱情片': 2}))

# Query with a DataFrame that carries the training column names: the model
# was fitted on a DataFrame, and a bare list makes scikit-learn warn that the
# input "does not have valid feature names". Print the result so it is
# visible when this runs as a script rather than in a notebook cell.
query = pd.DataFrame([[50, 30, 49]], columns=feature_columns)
print(model.predict(query))
距离度量
欧式距离(Euclidean Distance)
采用矩阵来提高计算效率: $d_{ab} = \sqrt{(a-b)^T(a-b)}$
欧式距离API:scipy.spatial.distance.cdist(XA, XB, metric='euclidean', *args, **kwargs)
# The package is `scipy`; the original `scrip` spelling raised ModuleNotFoundError.
from scipy.spatial import distance

# Plain Euclidean distance between two 2-feature points. The first feature
# differs by 5,000,000 and the second by 0.04, so the result is ~5,000,000.
ab = distance.cdist([[65000000, 1.74]], [[60000000, 1.70]], metric='euclidean')
print(ab)  # prints [[5000000.]]

# Here only the second feature differs (by 0.34).
ac = distance.cdist([[65000000, 1.74]], [[65000000, 1.40]], metric='euclidean')
print(ac)  # prints [[0.34]]
标准化欧式距离(Standardized Euclidean Distance)
标准化变量公式: $X^* = \frac{X - m}{s}$,其中 $m$ 为均值,$s = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(X_i - m)^2}$ 为标准差
标准欧式距离公式: $d_{12} = \sqrt{\sum_{k=1}^{n}\left(\frac{x_{1k} - x_{2k}}{s_k}\right)^2}$
# The package is `scipy`; the original `scrip` spelling raised ModuleNotFoundError.
from scipy.spatial import distance

# Standardized Euclidean distance. No variance vector `V` is passed, so cdist
# estimates the per-feature variance from the supplied points themselves.
ab = distance.cdist([[65000000, 1.74]], [[60000000, 1.70]], metric='seuclidean')
print(ab)  # prints [[2.]]

# The first feature is identical in both points, so its estimated variance is
# zero and the normalisation evaluates 0 / 0, which yields nan.
ac = distance.cdist([[65000000, 1.74]], [[65000000, 1.40]], metric='seuclidean')
print(ac)  # prints [[nan]]
曼哈顿距离(Manhattan Distance)
在曼哈顿街区要从一个十字路口开车到另一个十字路口,驾驶距离显然不是两点间的直线距离。这个实际驾驶距离就是“曼哈顿距离”。曼哈顿距离也称为“城市街区距离”(City Block distance)。
曼哈顿距离公式: $d_{12} = \sum_{k=1}^{n} \left| x_{1k} - x_{2k} \right|$
切比雪夫距离(Chebyshev Distance)
国际象棋中,国王可以直行、横行、斜行,所以国王走一步可以移动到相邻8个方格中的任意一个。国王从格子 $(x_1, y_1)$ 走到格子 $(x_2, y_2)$ 最少需要多少步?这个距离就叫切比雪夫距离。
切比雪夫距离公式: $d_{ab} = \max_{k}\left( \left| x_{1k} - x_{2k} \right| \right)$
闵可夫斯基距离(Minkowski Distance)
闵氏距离不是一种距离,而是一组距离的定义,是对多个距离度量公式的概括性的表述。
闵可夫斯基距离公式: $d_{ab} = \sqrt[p]{\sum_{k=1}^{n} \left| x_{1k} - x_{2k} \right|^p}$
闵可夫斯基距离公式参数:当 $p=1$ 时为曼哈顿距离;当 $p=2$ 时为欧式距离;当 $p \to \infty$ 时为切比雪夫距离。
闵可夫斯基距离的缺点:将各个分量的量纲(单位)同等看待;未考虑各个分量的分布(期望、方差等)可能不同。
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import operator # 运算符高效函数
def file_data(path):
    """
    Load every digit text file in a directory into NumPy arrays.

    Each file name must start with the integer label followed by an
    underscore (the label is parsed from the text before the first ``_``).
    Every character of every line in a file is parsed as one integer, and
    all lines of a file are flattened into a single 1-D sample.

    Args:
        path: Directory holding the sample files. A trailing path separator
            is optional.

    Returns:
        A ``(samples, labels)`` tuple of NumPy arrays. ``samples[i]`` is the
        flattened array for the i-th file (files are visited in sorted name
        order) and ``labels[i]`` is the integer parsed from its name.

    Raises:
        ValueError: If a file-name prefix or a character in a file cannot be
            parsed as an integer.
    """
    samples = []
    labels = []
    # os.listdir returns names in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(path)):
        labels.append(int(file_name.split("_")[0]))
        # os.path.join works whether or not `path` ends with a separator.
        with open(os.path.join(path, file_name), 'r', encoding='utf8') as f:
            # Strip only the line terminator so the last digit survives even
            # when the final line has no trailing newline.
            digits = [int(ch) for line in f for ch in line.rstrip('\r\n')]
        samples.append(np.array(digits))
    return np.array(samples), np.array(labels)
def file_txt(txt_path):
    """
    Load one digit text file into a flat NumPy array.

    Every character of every line is parsed as one integer; lines are
    concatenated in file order. Blank lines contribute nothing.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        A 1-D NumPy array holding the parsed integers.

    Raises:
        ValueError: If a character in the file cannot be parsed as an integer.
    """
    # The `with` block closes the file; no explicit close() is needed.
    with open(txt_path, 'r', encoding='utf8') as f:
        # Strip only the line terminator so the last digit survives even
        # when the final line has no trailing newline.
        digits = [int(ch) for line in f for ch in line.rstrip('\r\n')]
    return np.array(digits)
def file_img(img_path, img_size=32):
    """
    Load an image file and binarize it into a flat 0/1 NumPy array.

    The image is converted to grayscale, resized to ``img_size`` x
    ``img_size``, and thresholded: pixels brighter than 170 become 0 and all
    other pixels become 1. Rows are flattened in order.

    Args:
        img_path: Path of the image file to read.
        img_size: Side length, in pixels, of the square the image is
            resized to. Defaults to 32.

    Returns:
        A 1-D NumPy array of ``img_size * img_size`` zeros and ones.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports an unreadable path by returning None, not by
        # raising; fail here with a clear message instead of inside cvtColor.
        raise FileNotFoundError("cannot read image: {}".format(img_path))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, (img_size, img_size))
    # Vectorized threshold; ravel() flattens row by row, matching a nested
    # row/column loop.
    return np.where(img > 170, 0, 1).ravel()
def classfy(input_data, train_data, labels, k):
    """
    Predict a label by majority vote among the k nearest training samples.

    Args:
        input_data: The sample to classify.
        train_data: 2-D array with one training sample per row.
        labels: Label of each training row, in the same order.
        k: Number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the k nearest rows.
    """
    # Broadcasting subtracts every training row from the query sample.
    offsets = input_data - train_data
    euclidean = np.sqrt(np.sum(np.power(offsets, 2), axis=1))
    order = np.argsort(euclidean)

    # Tally the labels of the k closest rows.
    votes = {}
    for rank in range(k):
        neighbor_label = labels[order[rank]]
        votes[neighbor_label] = votes.get(neighbor_label, 0) + 1

    # Stable descending sort: on a tie, the label tallied first wins.
    ranked = sorted(votes, key=votes.get, reverse=True)
    return ranked[0]
# Build the training set from the digit text files.
path = "./data/trainingDigits/"
train_data, labels = file_data(path)

# Classify every test sample with k=3 and print the fraction that matched.
test_path = "./data/testDigits/"
test_data, test_label = file_data(test_path)
test_count = len(test_data)
success = sum(
    1
    for sample, expected in zip(test_data, test_label)
    if classfy(sample, train_data, labels, 3) == expected
)
print(success / test_count)

# For each image 0.png .. 9.png: print its 1024 values as 32 rows of 32,
# then the predicted digit followed by the digit in the file name.
for j in range(10):
    img_path = './data/selfDigits/image/{}.png'.format(j)
    input_data = file_img(img_path)
    for row_start in range(0, 1024, 32):
        print(''.join(str(input_data[row_start + offset]) for offset in range(32)))
    print(classfy(input_data, train_data, labels, k=3), j)
from sklearn.datasets import load_iris # 鸢尾花数据集
from sklearn.model_selection import train_test_split # train_test_split对数据分割成训练集和测试集;
from sklearn.preprocessing import StandardScaler # 特征工程,标准化,对数据进行归一化
from sklearn.neighbors import KNeighborsClassifier # K邻居模型
import numpy as np
import matplotlib.pyplot as plt
# Split the iris data 80/20 with a fixed seed.
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    data['data'],
    data['target'],
    test_size=0.2,
    random_state=22
)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split; refitting on the test data would scale the
# two splits differently and leak test statistics into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train one classifier for every k from 1 to the number of training samples
# and record its accuracy on both splits.
model_list = []
train_score = []
test_score = []
k_values = range(1, len(X_train) + 1)
for i in k_values:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(X_train, y_train)
    test_score.append(model.score(X_test, y_test))
    train_score.append(model.score(X_train, y_train))
    model_list.append(model)

# Report the k with the highest test accuracy (argmax is 0-based, k is 1-based).
score_max = np.max(test_score)
number = np.argmax(test_score) + 1
print("第{}个模型,训练成绩最好,成绩为:{},".format(number, score_max))

# Plot accuracy against k; pass k explicitly so the x-axis starts at 1.
plt.plot(k_values, train_score, label='train')
plt.plot(k_values, test_score, label='test')
plt.legend()
plt.show()
from sklearn.datasets import load_iris # 鸢尾花数据集
from sklearn.model_selection import train_test_split, GridSearchCV # train_test_split对数据分割成训练集和测试集; GridSearchCV对数据进行交叉验证
from sklearn.preprocessing import StandardScaler # 特征工程,标准化,对数据进行归一化
from sklearn.neighbors import KNeighborsClassifier # K邻居模型
import numpy as np
# Split the iris data 80/20 with a fixed seed.
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    data['data'],
    data['target'],
    test_size=0.2,
    random_state=22
)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split; refitting on the test data would scale the
# two splits differently and leak test statistics into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Search n_neighbors over 1..90 with 4-fold cross-validation on the training split.
model = KNeighborsClassifier()
model = GridSearchCV(estimator=model, param_grid={'n_neighbors': [i + 1 for i in range(90)]}, cv=4)
model.fit(X_train, y_train)

# Accuracy of the refitted best estimator on the held-out test split.
score = model.score(X_test, y_test)
print(score)

# After fitting:
#   model.best_score_      -> best mean cross-validated score
#   model.best_estimator_  -> estimator refitted with the best parameters