之前的近邻算法(NN)仅仅选择一个最近的图像标签,K-NN则是选出K个差值最小的图像标签,然后看哪个标签的数量最多,就选用哪个标签作为预测值,这样提高了泛化能力。
有时候,训练集数量较小(因此验证集的数量更小)。这时可以使用交叉验证:将训练集平均分成5份,其中4份用来训练,1份用来验证。然后我们循环取其中4份来训练,剩下1份来验证,最后取所有5次验证结果的平均值作为算法验证结果。
import numpy as np
import pickle
import matplotlib.pyplot as plt
'''
输入训练集及测试集
'''
# Root directory of the extracted cifar-10-batches-py archive; adjust to your machine.
file_path = "E:/cifar-10-python/cifar-10-batches-py/"
def unpickle(file):
    """Load one pickled CIFAR-10 batch file.

    :param file: path to a CIFAR-10 batch file
    :return: dict holding the batch contents (keys include 'data' and 'labels')
    """
    # encoding='latin1' is required to read the Python-2 pickles that the
    # official CIFAR-10 archive ships.
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='latin1')
    return batch
'''
加载数据集
'''
def load_CIFAR10(file):
    """Load the full CIFAR-10 dataset (5 training batches + 1 test batch).

    :param file: directory prefix of the extracted cifar-10-batches-py data
    :return: (dataTrain, labelTrain, dataTest, labelTest);
             data arrays are (N, 3072) uint8, label arrays are (N,) ints
    """
    # Collect all five training batches first, then stack once — avoids
    # re-copying the growing array on every iteration.
    data_parts = []
    label_parts = []
    for i in range(1, 6):
        batch = unpickle(file + "data_batch_" + str(i))
        data_parts.append(batch['data'])
        label_parts.append(batch['labels'])
    dataTrain = np.vstack(data_parts)
    # 'labels' are plain Python lists in the pickles; concatenate makes an ndarray.
    labelTrain = np.concatenate(label_parts)
    dictTest = unpickle(file + "test_batch")
    dataTest = dictTest['data']
    labelTest = np.array(dictTest['labels'])
    return dataTrain, labelTrain, dataTest, labelTest
class KNearestNeighbor(object):
    """k-nearest-neighbour classifier for flattened image vectors."""

    def __init__(self):
        # Memorised training data; filled in by train().
        self.X_train = None
        self.y_train = None

    def train(self, X_train, y_train):
        """Memorise the training set (KNN has no real training phase).

        :param X_train: (num_train, D) training vectors
        :param y_train: (num_train,) integer class labels
        """
        # asarray lets callers pass plain lists; fancy indexing in
        # predict_label requires ndarrays.
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

    def compute_distances_L1(self, X_test):
        """Manhattan (L1) distance from each test row to every training row.

        :param X_test: (num_test, D) test vectors
        :return: (num_test, num_train) L1 distance matrix
        """
        dists = np.zeros((X_test.shape[0], self.X_train.shape[0]))
        for i in range(X_test.shape[0]):
            # Broadcast one test row against the whole training matrix.
            dists[i] = np.sum(np.abs(self.X_train - X_test[i]), axis=1)
        return dists

    def compute_distances_L2(self, X_test):
        """Squared Euclidean (L2) distances, fully vectorised via
        ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2.

        The square root is omitted: it is monotonic, so the nearest-neighbour
        ranking used by predict_label is unchanged.

        :param X_test: (num_test, D) test vectors
        :return: (num_test, num_train) squared-distance matrix
        """
        value_2xy = np.multiply(X_test.dot(self.X_train.T), -2)
        value_x2 = np.sum(np.square(X_test), axis=1, keepdims=True)  # column vector
        value_y2 = np.sum(np.square(self.X_train), axis=1)
        dists = value_2xy + value_x2 + value_y2
        # Floating-point cancellation can leave tiny negative values; clamp them.
        return np.maximum(dists, 0)

    def predict_label(self, dists, k):
        """Majority vote among the k nearest training labels per test row.

        :param dists: (num_test, num_train) distance matrix
        :param k: number of neighbours to vote
        :return: (num_test,) predicted labels
        """
        y_pred = np.zeros(dists.shape[0])
        for i in range(dists.shape[0]):
            # Indices of the k smallest distances -> their labels.
            closest_y = self.y_train[np.argsort(dists[i, :])[:k]]
            # bincount requires non-negative ints (CIFAR labels are 0..9).
            y_pred[i] = np.argmax(np.bincount(closest_y.astype(np.int64)))
        return y_pred

    def predict(self, X_test, k, L=2):
        """Predict labels for X_test.

        :param X_test: (num_test, D) test vectors
        :param k: number of nearest neighbours to vote
        :param L: 1 -> Manhattan distance, otherwise Euclidean (default 2)
        :return: (num_test,) predicted labels
        """
        if L == 1:
            dists = self.compute_distances_L1(X_test)
        else:
            dists = self.compute_distances_L2(X_test)
        return self.predict_label(dists, k)
def Cross_validation(X_train, y_train):
    """5-fold cross-validation to choose the hyper-parameter k.

    Prints per-fold accuracy for each candidate k, scatters the raw
    accuracies, and draws an error-bar curve of mean +/- std.

    :param X_train: (N, D) training data
    :param y_train: (N,) training labels
    """
    num_folds = 5
    k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
    k_accuracy = {}
    # Split into num_folds roughly-equal parts (lists of ndarrays).
    X_train_folds = np.array_split(X_train, num_folds)
    y_train_folds = np.array_split(y_train, num_folds)
    for k in k_choices:
        k_accuracy[k] = []
        # Rotate each fold through the role of validation set.
        for index in range(num_folds):
            X_te = X_train_folds[index]
            y_te = y_train_folds[index]
            # np.concatenate handles unequal fold sizes, unlike the
            # np.array(...) + reshape trick, which silently requires
            # N to be divisible by num_folds.
            X_tr = np.concatenate(X_train_folds[:index] + X_train_folds[index + 1:])
            y_tr = np.concatenate(y_train_folds[:index] + y_train_folds[index + 1:])
            classify = KNearestNeighbor()
            classify.train(X_tr, y_tr)
            # L=2: cross-validate with Euclidean distance.
            y_te_pred = classify.predict(X_te, k, 2)
            accuracy = np.mean(y_te_pred == y_te)
            k_accuracy[k].append(accuracy)
    for k, accuracylist in k_accuracy.items():
        for accuracy in accuracylist:
            print("k = %d, accuracy = %.3f" % (k, accuracy))
    # Visualise: one scatter column of fold accuracies per k.
    for k in k_choices:
        accuracies = k_accuracy[k]
        plt.scatter([k] * len(accuracies), accuracies)
    accuracies_mean = np.array([np.mean(v) for k, v in sorted(k_accuracy.items())])
    accuracies_std = np.array([np.std(v) for k, v in sorted(k_accuracy.items())])
    # Error-bar plot built from the per-k mean and standard deviation.
    plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
    plt.title('Cross-validation on k')
    plt.xlabel('k')
    plt.ylabel('Cross-validation accuracy')
    plt.show()
# Load the full dataset, then cross-validate k on a small subset.
dataTrain, labelTrain, dataTest, labelTest = load_CIFAR10(file_path)
# Shape sanity checks (uncomment to inspect):
# print(dataTrain.shape)
# print(type(labelTrain))
# print(dataTest.shape)
# print(len(labelTest))
# Only the first 1000 samples are used to keep KNN cross-validation fast.
Cross_validation(dataTrain[:1000,:], labelTrain[:1000])
'''
# find hyperparameters that work best on the validation set
validation_accuracies = []
AccuaracyL1L2 = np.zeros([2, 8])
k_value = [1,2,4,8,16,32,64,128]
for k in range(len(k_value)):
knn = KNearestNeighbor()
knn.train(dataTrain[:5000,:], labelTrain[:5000])
label_predict = knn.predict(dataTest[:50,:], k_value[k], 1)
AccuaracyL1L2[0][k] = np.mean( label_predict == labelTest[:50] )
label_predict = knn.predict(dataTest[:50, :], k_value[k], 2)
AccuaracyL1L2[1][k] = np.mean( label_predict == labelTest[:50] )
accuracy = np.mean(AccuaracyL1L2, axis = 1)
print(AccuaracyL1L2)
if(accuracy[0] > accuracy[1]):
print("L1 准确率大于 L2")
print("最好的K取值为%d,最大准确率为 %f" % (k_value[ np.argmax(AccuaracyL1L2[0]) ], np.max(AccuaracyL1L2[0])))
else:
print("L2 准确率大于 L1")
print("最好的K取值为", np.max(AccuaracyL1L2[1]))
print("最好的K取值为%d,最大准确率为 %f" % (k_value[np.argmax(AccuaracyL1L2[1])], np.max(AccuaracyL1L2[1])))
'''
可以看到每个K对应5个验证结果,这里使用的是L1距离(曼哈顿距离),下图是对应的可视化图像。相对于L2距离(欧氏距离),L1在交叉验证、总体验证时的效果要比L2好。
下图是L2距离的效果验证
参考https://zhuanlan.zhihu.com/p/20900216