-
简述
-
Load_data.py
-
KNNclassifier.py
-
KNN_main.py
-
参阅
一、简述
KNN使用的是cifar的数据集,需要自己去这里进行下载。数据集不大,才一百多M。总共60000张图片。导入的时候由于自己电脑配置的原因可能会无法全部导入,所以在Load_data.py的load_cifar10函数里我将range(1,6)改成了range(1,2)(由原本的5份导入改成了1份导入)。
- 开发环境Pycharm
numpy的导入最好能有办法科学上网(手动滑稽)
二、首先要导入数据集(Load_data.py)
import pickle
import numpy as np
import os
def load_cifar_batch(filename):
    """Load a single pickled CIFAR-10 batch file.

    Parameters
    ----------
    filename : str
        Path to one batch of the "python version" CIFAR-10 distribution.

    Returns
    -------
    x : np.ndarray, float, shape (10000, 32, 32, 3)
        Image data, converted from the flat (10000, 3072) storage layout
        to channels-last images.
    y : np.ndarray, shape (10000,)
        Integer class labels (0-9), one per image.
    """
    with open(filename, 'rb') as f:
        # CIFAR-10 batches were pickled under Python 2; encoding='bytes'
        # makes the dict keys come back as b'data' / b'labels'.
        datadict = pickle.load(f, encoding='bytes')
    x = datadict[b'data']
    y = datadict[b'labels']
    # (10000, 3072) -> (10000, 3, 32, 32) -> channels-last (10000, 32, 32, 3)
    x = x.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
    # Labels arrive as a plain list; convert so fancy indexing works later.
    y = np.array(y)
    return x, y
def load_cifar10(root, num_batches=1):
    """Load the CIFAR-10 training and test sets.

    Parameters
    ----------
    root : str
        Directory containing 'data_batch_1' .. 'data_batch_5' and
        'test_batch'.
    num_batches : int, optional
        How many of the five training batches to load. Defaults to 1 so
        the data fits in memory on modest machines; pass 5 for the full
        50000-image training set.

    Returns
    -------
    tuple of np.ndarray
        (Xtrain, Ytrain, Xtest, Ytest).
    """
    xs = []
    ys = []
    for b in range(1, num_batches + 1):
        # os.path.join picks the correct path separator ("/" or "\")
        # for the current platform.
        f = os.path.join(root, 'data_batch_%d' % b)
        x, y = load_cifar_batch(f)
        xs.append(x)
        ys.append(y)
    # Stack the per-batch arrays into one contiguous training array.
    Xtrain = np.concatenate(xs)
    Ytrain = np.concatenate(ys)
    del x, y
    Xtest, Ytest = load_cifar_batch(os.path.join(root, 'test_batch'))
    return Xtrain, Ytrain, Xtest, Ytest
三、KNN的分类器(KNNclassifier.py)
import numpy as np
class K_Nearest_Neighbor:
    """k-Nearest-Neighbor classifier using L2 (Euclidean) distance."""

    def __init__(self):
        pass

    def train(self, x, y):
        """Memorize the training data (kNN has no real training phase).

        x: array of shape (num_train, D), one flattened example per row.
        y: array of shape (num_train,), integer labels for x.
        """
        self.X_train = x
        self.y_train = y

    def predict(self, X, k=1, num_loops=0):
        """Predict a label for each row of X.

        k: number of nearest neighbors that vote on the label.
        num_loops: which distance implementation to use (0 = fully
            vectorized, 1 = one explicit loop, 2 = two explicit loops);
            all three produce the same distance matrix.
        Raises ValueError for any other num_loops value.
        """
        if num_loops == 0:
            dists = self.compute_distances_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distances_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distances_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_loops)
        return self.predict_labels(dists, k=k)

    def compute_distances_two_loops(self, X):
        """Naive double loop over all (test, train) example pairs.

        Returns dists of shape (num_test, num_train), where dists[i, j]
        is the Euclidean distance between test example i and training
        example j.
        """
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                dists[i, j] = np.sqrt(np.sum((X[i, :] - self.X_train[j, :]) ** 2))
        return dists

    # Backward-compatible alias: existing callers use the original
    # misspelled method name.
    cumpute_distances_two_loops = compute_distances_two_loops

    def compute_distances_one_loop(self, X):
        """Half-vectorized version: one loop over test examples; the
        distances to all training examples are computed by broadcasting."""
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i, :]), axis=1))
        return dists

    def compute_distances_no_loops(self, X):
        """Fully vectorized version using the expansion
        ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2."""
        # -2 * X . X_train^T has shape (num_test, num_train).
        dists = np.multiply(np.dot(X, self.X_train.T), -2)
        # keepdims=True keeps sq1 a column vector so it broadcasts across
        # the train axis, while sq2 broadcasts across the test axis.
        sq1 = np.sum(np.square(X), axis=1, keepdims=True)
        sq2 = np.sum(np.square(self.X_train), axis=1)
        dists = np.add(dists, sq1)
        dists = np.add(dists, sq2)
        # Clamp tiny negative values caused by floating-point round-off
        # before taking the square root.
        dists = np.sqrt(np.maximum(dists, 0))
        return dists

    def predict_labels(self, dists, k=1):
        """Majority vote among the k nearest (smallest-distance) neighbors.

        dists: (num_test, num_train) distance matrix.
        Returns y_pred of shape (num_test,): for each test example, the
        most frequent label among its k closest training examples (ties
        are broken toward the smallest label, the np.bincount/np.argmax
        convention).
        """
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # argsort orders indices from smallest to largest distance;
            # take the first k and map them to their training labels.
            closest_y = self.y_train[np.argsort(dists[i])[:k]]
            # bincount counts how often each label occurs; argmax picks
            # the most frequent one.
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred
四、KNN_main.py
import numpy as np
import time
from Load_data import load_cifar10
import matplotlib.pyplot as plt
from KNNclassifier import K_Nearest_Neighbor
# Load the data set and report the array shapes.
x_train, y_train, x_test, y_test = load_cifar10('cifar-10-batches-py')
print('training data shape:', x_train.shape)
print('training labels shape:', y_train.shape)
print('test data shape:', x_test.shape)
print('test labels shape:', y_test.shape)
# Visualization: show a few random sample images for every class.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_claesses = len(classes)
samples_per_class = 7
for y, cls in enumerate(classes):  # y is the class index, cls its name
    # flatnonzero returns the positions of the nonzero entries of the
    # flattened array, i.e. the indices of training examples with label y.
    idxs = np.flatnonzero(y_train == y)
    # print(idxs)
    # Draw samples_per_class of them at random, without replacement.
    idxs = np.random.choice(idxs, samples_per_class, replace=False)
    for i, idx in enumerate(idxs):
        """
        Subplot index: column-major placement (1, 11, ..., 61),
        (2, 12, ..., 62), ..., so each class fills one column drawn
        top to bottom.
        """
        plt_idx = i * num_claesses + y + 1
        # print(plt_idx)
        # Grid of samples_per_class rows x num_claesses columns, slot plt_idx.
        plt.subplot(samples_per_class, num_claesses, plt_idx)
        plt.imshow(x_train[idx].astype('uint8'))  # render as uint8 image data
        plt.axis('off')  # hide the axes
        if i == 0:
            plt.title(cls)  # class name as the column title
plt.show()
num_training = 5000
mask = list(range(num_training))
print(mask)
x_train = x_train[mask]
y_train = y_train[mask]
num_test = 500
mask = list(range(num_test))
x_test = x_test[mask]
y_test = y_test[mask]
# 将图像数据拉长成行向量用于计算欧氏距离
"""
x_train.shape[0]在这里就是图片的个数,相当于将数据集按单张图片进行了划分
"""
x_train = np.reshape(x_train, (x_train.shape[0], -1)) # 将x_train变成x_train.shape[0]行、剩下列(自动补全-1所处维度)
# print(x_train)
x_test = np.reshape(x_test, (x_test.shape[0], -1)) # shape[0]为第一个维度
print('x_train.shape(5000):')
print(x_train.shape, x_test.shape)
classifier = K_Nearest_Neighbor()
classifier.train(x_train, y_train)
dists = classifier.cumpute_distances_two_loops(x_test)
print(dists.shape)
plt.imshow(dists, interpolation='none') # 距离矩阵可视化
plt.show()
# dists_one = classifier.compute_distances_one_loop(x_test)
# dists_no = classifier.compute_distances_no_loops(x_test)
print('dists', dists)
# print('dists-one', dists_one)
# print('dist-no', dists_no)
"""
将k设置为1使用最近邻算法
"""
y_test_pred = classifier.predict_labels(dists, k=5)
# 计算并打印准确率
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
# Verify the one-loop implementation against the two-loop baseline.
dists_one=classifier.compute_distances_one_loop(x_test)
"""
Compare the two distance matrices via the Frobenius norm of their difference.
"""
difference=np.linalg.norm(dists-dists_one,ord='fro')
print('difference was: %f' % difference)
if difference < 0.001:
    print('Good! The distance matrices are the same')
else:
    print('Uh-oh! The distance matrices are different')
# Same check for the fully vectorized implementation.
dists_no=classifier.compute_distances_no_loops(x_test)
difference=np.linalg.norm(dists-dists_no,ord='fro')
print('difference was: %f' % difference)
"""
Compare the execution speed of the three distance implementations.
"""
def time_function(f, *args):
    """Return the wall-clock time, in seconds, of a single call f(*args)."""
    start = time.time()
    f(*args)
    finish = time.time()
    return finish - start
# Time each distance implementation on the same input.
two_loop_time=time_function(classifier.cumpute_distances_two_loops,x_test)
print('two loops version took %f seconds' % two_loop_time)
one_loop_time=time_function(classifier.compute_distances_one_loop,x_test)
print('one loop version took %f seconds' % one_loop_time)
no_loops_time=time_function(classifier.compute_distances_no_loops,x_test)
print('no loops version took %f seconds' % no_loops_time)
"""交叉验证"""
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
X_train_folds = []
y_train_folds = []
X_train_folds = np.array_split(x_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
k_to_accuracies = {}
classifier = K_Nearest_Neighbor()
for k in k_choices:
accuracies = np.zeros(num_folds)
for fold in range(num_folds):
temp_X = X_train_folds[:]
temp_y = y_train_folds[:]
X_validate_fold = temp_X.pop(fold)
y_validate_fold = temp_y.pop(fold)
temp_X = np.array([y for x in temp_X for y in x])
temp_y = np.array([y for x in temp_y for y in x])
classifier.train(temp_X, temp_y)
y_test_pred = classifier.predict(X_validate_fold, k=k)
num_correct = np.sum(y_test_pred == y_validate_fold)
accuracy = float(num_correct)/num_test
accuracies[fold] = accuracy
k_to_accuracies[k] = accuracies
for k in sorted(k_to_accuracies):
for accuracy in k_to_accuracies[k]:
print('k = %d, accuracy = %f' % (k, accuracy))
for k in k_choices:
accuracies = k_to_accuracies[k]
plt.scatter([k]*len(accuracies), accuracies)
accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())])
accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('cross-validation on k')
plt.xlabel('k')
plt.ylabel('cross-validation accuracy')
plt.show()
"""
根据交叉验证得到的最优K进行分类
"""
best_k = 10
classifier = K_Nearest_Neighbor()
classifier.train(x_train, y_train)
y_test_pred = classifier.predict(x_test, k=best_k)
num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct)/num_test
print('Got %d/%d correct => accuracy: %f' %(num_correct, num_test, accuracy))
五、参阅
- Assignment 1-Q1 k-Nearest Neighbor (kNN) exercise
- 图像分类笔记(上)
- 图像分类笔记(下)