Machine Learning & Image Classification: KNN Implementation and sklearn Usage

KNN is the simplest algorithm in image classification: take the k training images closest to the image under test, count how often each class appears among them, and pick the most common class as the classification result for the new image.
KNN therefore has a distinctive trait: there is essentially no training phase, while prediction is expensive, since every test image must be matched against the entire training set.
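For reference, the two distance measures used throughout this post, written out on flattened image vectors (a minimal sketch; x and y are 1-D NumPy arrays):

import numpy as np

def euclidean(x, y):
    # L2 distance; the sqrt can be dropped when only the neighbor ranking matters
    return np.sqrt(np.sum((x - y) ** 2))

def manhattan(x, y):
    # L1 distance: sum of absolute per-pixel differences
    return np.sum(np.abs(x - y))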
Below is a KNN implementation using MNIST as the training set, recorded here for reference.

import torch
import operator
import numpy as np
from torch.utils.data import DataLoader
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

class Knn:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        # KNN has no real training step: it just memorizes the training data
        self.Xtr = X_train
        self.ytr = y_train

    def predict(self, k, dis, X_test):
        assert dis in ('E', 'M'), "dis must be 'E' (Euclidean) or 'M' (Manhattan)"
        num_test = X_test.shape[0]
        labellist = []
        # Euclidean distance
        if dis == 'E':
            for i in range(num_test):
                # Squared Euclidean distance to every training image
                # (sqrt omitted: it does not change the ranking)
                distance = np.sum((self.Xtr - np.tile(X_test[i], (self.Xtr.shape[0], 1))) ** 2, axis=1)
                # Pick the k nearest training images
                nearest_k = np.argsort(distance)
                topK = nearest_k[:k]
                classCount = {}
                for j in topK:  # j, so the outer loop variable i is not shadowed
                    classCount[self.ytr[j]] = classCount.get(self.ytr[j], 0) + 1
                sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
                labellist.append(sortedClassCount[0][0])  # most frequent label wins
            return np.array(labellist)

        # Manhattan distance
        elif dis == 'M':
            for i in range(num_test):
                distance = np.sum(np.abs(self.Xtr - np.tile(X_test[i], (self.Xtr.shape[0], 1))), axis=1)
                nearest_k = np.argsort(distance)
                topK = nearest_k[:k]
                classCount = {}
                for j in topK:
                    classCount[self.ytr[j]] = classCount.get(self.ytr[j], 0) + 1
                sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
                labellist.append(sortedClassCount[0][0])
            return np.array(labellist)


def getXmean(x_train):
    x_train = np.reshape(x_train, (x_train.shape[0], -1))  # flatten each image to a 1-D vector
    mean_image = np.mean(x_train, axis=0)  # per-pixel mean over all images, i.e. the mean of each column
    return mean_image


def centralized(x_test, mean_image):
    x_test = np.reshape(x_test, (x_test.shape[0], -1))
    x_test = x_test.astype(float)  # np.float is deprecated (removed in NumPy 1.24)
    x_test -= mean_image  # subtract the mean image to obtain zero-mean data
    return x_test

batch_size = 100
# MNIST
train_dataset = dsets.MNIST(root='D:/ML/MNIST', train=True, transform=None, download=True)
test_dataset = dsets.MNIST(root='D:/ML/MNIST', train=False, transform=None, download=True)

# Load & shuffle
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

# digit=train_loader.dataset.data[0]
# plt.imshow(digit,cmap=plt.cm.binary)
# plt.show()
# print(train_loader.dataset.targets[0])

if __name__ == '__main__':
    """Training set"""
    X_train = train_loader.dataset.data.numpy()
    mean_image = getXmean(X_train)
    X_train = centralized(X_train, mean_image)
    # X_train = X_train.reshape(X_train.shape[0], 28*28)  # not needed: centralized already flattens
    Y_train = train_loader.dataset.targets.numpy()

    """Test set"""
    X_test = test_dataset.data[:1000].numpy()  # the original test_dataset.data.data repeated .data redundantly
    # X_test = X_test.reshape(X_test.shape[0], 28*28)
    X_test = centralized(X_test, mean_image)
    Y_test = test_loader.dataset.targets[:1000].numpy()
    num_test = Y_test.shape[0]
    # y_test_pred = KNN_classify(5, 'M', X_train, Y_train, X_test)

    """Run KNN"""
    classifier = Knn()
    classifier.fit(X_train, Y_train)
    y_test_pred = classifier.predict(5, 'M', X_test)
    num_correct = np.sum(y_test_pred == Y_test)
    accuracy = float(num_correct) / num_test
    print('Got %d/%d correct\naccuracy=%f' % (num_correct, num_test, accuracy))
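As an aside, the per-test-image loop in predict is where all the time goes. The squared Euclidean distances between every test and training image can instead be computed in a single vectorized step via ||x - y||² = ||x||² - 2·x·y + ||y||². A minimal sketch (the function name is illustrative, not part of the code above):

import numpy as np

def all_pairs_sq_euclidean(Xtr, Xte):
    # Returns a (num_test, num_train) matrix of squared Euclidean distances
    tr_sq = np.sum(Xtr ** 2, axis=1)                 # (num_train,)
    te_sq = np.sum(Xte ** 2, axis=1, keepdims=True)  # (num_test, 1)
    cross = Xte @ Xtr.T                              # (num_test, num_train)
    return te_sq - 2 * cross + tr_sq                 # broadcasts to the full matrix

A single np.argsort along axis=1 then yields the k nearest training indices for every test image at once.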

2020.12.6

sklearn provides a KNN class, which makes it convenient to call:
class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1, **kwargs)
n_neighbors: how many neighbors to use (i.e., k)
weights: whether every neighbor votes equally ('uniform') or closer neighbors get a bigger say ('distance')
p and metric: the distance measure; with the default metric='minkowski', p=1 is Manhattan distance and p=2 is Euclidean distance

algorithm: (an optimization-level choice, to be expanded on later; see the sketch after this list)

  • 'ball_tree' will use BallTree
  • 'kd_tree' will use KDTree
  • 'brute' will use a brute-force search
  • 'auto' (the default) attempts to pick the most appropriate of the above

leaf_size: only relevant for ball_tree and kd_tree
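A hedged one-liner showing how algorithm and leaf_size are passed (the values here are illustrative, not tuned):

from sklearn.neighbors import KNeighborsClassifier
# kd-tree neighbor search; leaf_size trades tree build time against query time
knn_tree = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', leaf_size=30)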

A concrete example follows (the data loading and preprocessing from the code above are omitted):

# Achieves the same result as the code above
from sklearn.neighbors import KNeighborsClassifier as KNN
knn = KNN(n_neighbors=5, p=1)
knn.fit(X_train, Y_train)
y_test_pred = knn.predict(X_test)
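Evaluation can mirror the earlier code, or lean on score, which returns the mean accuracy on the given data (a sketch, assuming X_test, Y_test, and num_test from the code above):

num_correct = np.sum(y_test_pred == Y_test)
print('Got %d/%d correct' % (num_correct, num_test))
accuracy = knn.score(X_test, Y_test)  # the same mean accuracy, computed by sklearn
print('accuracy=%f' % accuracy)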

12.13
