import numpy as np
import os
from os import listdir
import operator
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Distance is the Euclidean distance between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; a flat sequence or a single-row
            2-D array with as many values as ``dataSet`` has columns.
        dataSet: 2-D array-like holding one training sample per row.
        labels: Sequence in which ``labels[i]`` is the class of row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training rows are clamped to that number.

    Returns:
        The label that received the most votes. On a tie the label that
        was voted for first (i.e. by the nearer neighbour) wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    # Accept plain nested lists as well as ndarrays.
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    # Asking for more neighbours than there are samples would index past
    # the end of the sorted distances, so let every sample vote instead.
    k = min(k, dataSetSize)

    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndices = distances.argsort()

    # Tally the labels of the k closest rows, nearest first.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable even with reverse=True, so tied labels keep the
    # order in which they were first voted for.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row, so the pixel at line ``i``,
    column ``j`` ends up at index ``32 * i + j``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``numpy`` array of shape ``(1, 1024)`` holding the pixel values.

    Raises:
        ValueError: If the file has fewer than 32 lines, a line has fewer
            than 32 characters, or a character is not a digit.
    """
    returnVect = np.zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            if len(lineStr) < 32:
                # Covers both a short line and a file that ended early
                # (readline() returns "" at end of file).
                raise ValueError(
                    "{}: line {} has fewer than 32 pixels".format(
                        filename, i + 1))
            # Store the whole row of 32 pixels in one assignment.
            returnVect[0, 32 * i:32 * (i + 1)] = [
                int(ch) for ch in lineStr[:32]]
    return returnVect
def handwritingClassTest(
        trainingDir="E:\\A21\\New_python\\机器学习新\\trainingDigits",
        testDir="E:\\A21\\New_python\\机器学习新\\testDigits",
        k=3):
    """Evaluate the kNN digit classifier on a directory of test images.

    Every file name in both directories must look like
    ``<label>_<anything>.<ext>``; the integer before the first underscore
    is taken as the true class of that image. Each training file becomes
    one row of the training matrix, then every test file is classified
    with ``classify0`` and compared with its true class. The per-file
    result and the overall error rate are printed.

    Args:
        trainingDir: Directory containing the training image files.
        testDir: Directory containing the test image files.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        The fraction of test files that were misclassified, or ``0.0``
        when ``testDir`` contains no files.
    """
    hwLabels = []
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    trainngMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # "1_35.txt" -> "1_35" -> "1" -> 1
        fileStr = fileNameStr.split(".")[0]
        classNumStr = int(fileStr.split("_")[0])
        hwLabels.append(classNumStr)
        trainngMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    errorCount = 0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split(".")[0]
        classNumStr = int(fileStr.split("_")[0])
        vectortUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classierResult = classify0(vectortUnderTest, trainngMat, hwLabels, k)
        print("the classifier came back with:{},the real answer is {}"
              .format(classierResult, classNumStr))
        if classierResult != classNumStr:
            errorCount += 1

    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / mTest if mTest else 0.0
    print("the total error rate is {}".format(errorRate))
    return errorRate
if __name__ == "__main__":
    # Run the evaluation only when executed as a script, not on import.
    handwritingClassTest()
classify0(inX,dataSet,labels,k):
求取待分类点inX与训练集中各个点的距离,取距离最近的k个点投票表决inX属于哪个类别
tile:
将inX作为整体,沿行方向重复dataSetSize次(列方向重复1次),得到与dataSet形状相同的矩阵,以便与dataSet逐元素相减
distances即计算待预测点与各个点的距离,结果为numpy数组
argsort:
返回数组数值由小到大的索引值
classCount:
创建字典为了得到标签对应的次数
voteIlabel记录了sortedDistIndices[i]的标签(此标签顺序,即距离从小到大点的顺序)
classCount则将标签加入字典:有此键则在此键值基础上加1;无此键则get返回默认值0,加1后以计数1创建该键
sortedClassCount:
classCount.items()将字典转换为(标签,次数)元组的序列;key = operator.itemgetter(1)表示根据元组的第二个元素(次数)进行排序,默认升序,reverse=True改为降序
img2vector(filename):
完成图像向一个向量的转换
**打开一个文档,按行读取,将一行的每个元素依次存入returnVect中(每次存入32个,共32行)**,得到表示该图像的1×1024向量
listdir:得到该路径下所有文件名,此处得到的为trainingDigits中所有txt文档名
handwritingClassTest():
将目录中的每个图像文件转换为向量并组成可供计算的矩阵(每行为一个样本的特征向量),进行模型预测,评估结果
fileStr = fileNameStr.split(".")[0]和classNumStr = int(fileStr.split("_")[0]):将文件名转换为此文件的标签:
例如:“1_35.txt” ---- [“1_35”,“txt”] ----“1_35” ---- [“1”,“35”] ----“1” 。 因而,需要用int强制转换后存入hwLabels作为标签
trainngMat[i,:]:每行保存一个图像转换后的向量值
classierResult = classify0(vectortUnderTest,
trainngMat,hwLabels,3):
用于预测每个测试向量(vectortUnderTest)所属的类别
最后计算错误率:预测值不等于标签值,则errorCount+1