The KNN (k-nearest neighbors) algorithm can be used for both classification and regression; for a detailed introduction to the theory see http://www.cnblogs.com/v-July-v/archive/2012/11/20/3125419.html
This post covers two parts: recognizing handwritten digit images and implementing KNN. KNN classification finds the nearest neighbors by distance; common distance metrics include Euclidean, Manhattan, and Minkowski distance, and this post uses Euclidean distance.
Prediction works by finding the k nearest neighbors: compute the distance between the test sample and every sample in the training set, take the k smallest distances, and decide the class by majority vote among those neighbors.
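For reference, with each image flattened into an n-dimensional 0/1 vector (here n = 28*28 = 784 pixels), the Euclidean distance between a test sample x and a training sample y is

d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^2}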
(1) Processing the handwritten digit images
Data download link: https://pan.baidu.com/s/1c1GXdJi
The sample images are bitmaps (.bmp) with a black background and white handwritten digits, and every image has a fixed size of 28*28 pixels.
Processing approach:
Randomly sample the images to generate the training-set data files and the test-set data files.
Code:
import os
import datetime
from PIL import Image
import pandas as pd

# Image-processing function: convert one image into a text file of 0/1 pixel values
def Picture_handle(file_route, final_route):
    img = Image.open(file_route).convert('L')  # convert to grayscale so getpixel returns a single value
    # img = img.rotate(270)
    width = img.size[0]
    height = img.size[1]
    with open(final_route, 'w') as fh:  # 'w' so re-running the script does not append duplicate rows
        for i in range(0, height):  # write row by row: iterate over the height, write one value per column, then a newline
            for j in range(0, width):
                rgb = img.getpixel((j, i))
                if rgb < 50:
                    fh.write("1")
                else:
                    fh.write("0")
            fh.write("\n")
def Get_route(file_name):
    file_list = os.listdir(file_name)  # list all file names in the directory
    # Randomly split the files into a training set and a test set (2/3 vs. 1/3, without overlap)
    sample_PC = pd.DataFrame(file_list)
    train_PC = sample_PC.sample(frac=2/3)
    test_PC = sample_PC.drop(train_PC.index)  # the remaining 1/3 becomes the test set
    return train_PC, test_PC
def Main():  # build the input/output paths and process every sampled image
    file_name = r"E:/data/t10k-images-bmp/t10k-images"
    write_route = r"E:/data/digit_data/"
    train_PC, test_PC = Get_route(file_name)
    for i in train_PC.values:
        file_route = file_name + '/' + str(i[0])
        name = str(i[0]).split('.')[0]
        final_route = write_route + 'train/' + str(name) + '.txt'
        Picture_handle(file_route, final_route)  # convert the image with the image-processing function
    for j in test_PC.values:
        file_route = file_name + '/' + str(j[0])
        name = str(j[0]).split('.')[0]
        final_route = write_route + 'test/' + str(name) + '.txt'
        Picture_handle(file_route, final_route)

if __name__ == "__main__":
    start_time = datetime.datetime.now()
    Main()
    end_time = datetime.datetime.now()
    print('Elapsed time:', end_time - start_time)
This code batch-generates the training and test sample files; you can also process a single handwritten image of your own, as in the sketch below.
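A minimal usage sketch for a single image of your own, assuming the script above has been run or imported; the image path and output name here are hypothetical placeholders, and the output name follows the "<label>_<index>.txt" convention that sep_label expects later:

# Hypothetical example: convert one of your own 28*28 .bmp images to a text file,
# then drop it into the test/ directory so it gets classified like any other sample.
Picture_handle(r"E:/data/my_digit.bmp", r"E:/data/digit_data/test/7_9999.txt")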
(2) Implementing the KNN algorithm
# KNN classifier: classify a test sample by computing its distance to every training sample; k is the number of nearest neighbors used
# Hand-written implementation of KNN classification (an sklearn-based version is also possible; see the sketch at the end of this post)
from numpy import *
import operator
import os
import pandas as pd
from datetime import datetime

def Knn(k, test_data, train_data, labels):
    # Euclidean distance from the test sample to every training sample
    train_data_size = train_data.shape[0]
    dif = tile(test_data, (train_data_size, 1)) - train_data
    sq_dif = dif ** 2
    sum_sq_dif = sq_dif.sum(axis=1)
    distance = sum_sq_dif ** 0.5
    sort_distance = distance.argsort()
    # majority vote among the k nearest neighbors
    count = {}
    for i in range(k):
        vote = labels[sort_distance[i]]
        count[vote] = count.get(vote, 0) + 1
    sort_count = sorted(count.items(), key=operator.itemgetter(1), reverse=True)
    return sort_count[0][0]
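# A quick toy check of Knn (not in the original post): with training points
#   train = array([[0, 0], [0, 1], [10, 10], [10, 11]]) and labels = [0, 0, 1, 1],
# Knn(3, [9, 10], train, labels) returns 1, because the three nearest neighbors
# are (10, 10), (10, 11) and (0, 1), and the majority label among them is 1.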
def data_array(file_name):
    # read one converted text file and flatten its 0/1 characters into a list
    array = []
    with open(file_name) as fh:
        data = fh.readlines()
    col_num = len(data[0].replace('\n', ''))  # number of columns per row
    for this_line in data:
        for j in range(col_num):
            array.append(int(this_line[j]))
    return array

# extract the label from the file name (files are named "<label>_<index>.txt")
def sep_label(file_name):
    filestr = file_name.split(".")[0]
    label = int(filestr.split("_")[0])
    return label
# build the training data
def train_data():
    labels = []
    train_route = r"E:/data/digit_data/train/"
    train_file = os.listdir(train_route)
    num = len(train_file)
    # store the training data in an array: one row per file, 28*28 columns (one per pixel)
    train_array = zeros((num, 28**2))  # initialise with zeros
    for i in range(num):
        this_file_name = train_file[i]
        this_labels = sep_label(this_file_name)
        labels.append(this_labels)
        train_array[i, :] = data_array(train_route + this_file_name)
    return train_array, labels
def test_data():
    # classify every file in the test directory and record the real vs. predicted label
    train_array, labels = train_data()
    test_route = r"E:/data/digit_data/test/"
    test_list = os.listdir(test_route)
    test_num = len(test_list)
    file, real_data, forecast_data = [], [], []
    for i in range(0, test_num):
        this_test_file = test_list[i]
        test_array = data_array(test_route + this_test_file)
        result = Knn(3, test_array, train_array, labels)
        ref_labels = sep_label(test_list[i])
        file.append(this_test_file)
        real_data.append(ref_labels)
        forecast_data.append(result)
    df = pd.DataFrame({'file_name': file, 'real_data': real_data, 'forecast_data': forecast_data})
    return df
# build the confusion matrix (counts of real vs. predicted labels)
# and compute the evaluation metric: accuracy
def Evaluation_Index():
    df = test_data()
    # accuracy = correctly classified samples / all samples
    Accuracy = len(df[df.real_data == df.forecast_data]) / len(df)
    Confusion_Matrix = df.groupby(['real_data', 'forecast_data']).count()
    Confusion_Matrix = Confusion_Matrix.unstack()
    Confusion_Matrix = Confusion_Matrix.fillna(0).astype(int)
    Rec_Matrix = df[df.real_data != df.forecast_data]  # the misclassified samples
    print('Confusion matrix:\n', Confusion_Matrix)
    print('Accuracy:', round(Accuracy, 4))
    # print('Misclassified samples:\n', Rec_Matrix)
    return Confusion_Matrix, round(Accuracy, 4), Rec_Matrix

if __name__ == "__main__":
    start_time = datetime.now()
    Confusion_Matrix, Accuracy, Rec_Matrix = Evaluation_Index()
    end_time = datetime.now()
    print('Elapsed time:', end_time - start_time)
Results:
Accuracy: 0.9703
Confusion matrix:
Predicted 0 1 2 3 4 5 6 7 8 9
Actual
0 312 0 0 0 0 3 1 0 0 0
1 0 380 0 0 0 0 0 0 0 0
2 2 2 315 1 0 0 1 5 1 0
3 0 0 1 307 0 3 0 2 2 1
4 0 4 0 0 315 0 1 0 0 11
5 1 1 0 7 1 293 5 0 0 0
6 2 1 0 0 0 0 340 0 0 0
7 0 7 1 0 1 0 0 333 0 3
8 2 5 1 1 1 4 2 0 295 1
9 0 3 1 0 2 1 0 4 0 344
Elapsed time: 2 minutes 58 seconds. This is quite slow, the program uses a lot of memory while running, and it is only the simplest possible implementation of KNN classification.
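As mentioned in the comment at the top of the script, the same classification can also be done with scikit-learn. Below is a minimal sketch, assuming scikit-learn is installed; it reuses the train_data, data_array and sep_label helpers defined above, and it was not part of the original timing:

from sklearn.neighbors import KNeighborsClassifier

# load the converted text files with the helpers defined above
train_array, labels = train_data()

clf = KNeighborsClassifier(n_neighbors=3)  # k = 3, same as the hand-written version
clf.fit(train_array, labels)

test_route = r"E:/data/digit_data/test/"
test_list = os.listdir(test_route)
test_array = [data_array(test_route + f) for f in test_list]
real = [sep_label(f) for f in test_list]

pred = clf.predict(test_array)
accuracy = sum(p == r for p, r in zip(pred, real)) / len(real)
print('sklearn accuracy:', round(accuracy, 4))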