Knn算法—识别手写数字(机器学习实战)
1.通俗的说就是:对于给定的输入向量在训练集中找到与该输入实例最近的k个实例,统计这k个实例中每个实例(按照标签分类)所属的类。根据我们统计结果,把输入向量划分到该类中,预测值便是该类别(类中包含最多的实例)。
代码如下(示例):
from numpy import *
import operator
代码如下(示例):
#createDataSet()返回值为:数据集(矩阵),标签列表(向量)。用来创建一个简单的数据集和标签
def createDataSet():
group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
labels = ['A','A','B','B']
return group,labels
#classify0(),返回值为标签,输入一个向量,和数据集来判断输入向量的结果,一个三个形参
#inX是要预测的数据,dataSet为数据集,labels为数据集的对应标签列表(一维向量),k是kNN算法的k个最近邻。
def classify0(inX,dataSet,Labels,k):
dataSetSize = dataSet.shape[0]
diffMat = tile(inX,(dataSetSize,1)) - dataSet
sqDiffMat = diffMat **2
sqDistances = sqDiffMat.sum(axis = 1)
distance = sqDistances ** 0.5
sortedDistIndices = sqDistances.argsort() #argsort()实现升序排列数据,返回的是原数据排列之前的索引值列表
classCount = {
} #创建一个字典用于统计要预测的inX最近的k个近邻Labels标签
for i in range(k): #for循环将k个最近的邻居存入到classCount{
}字典中。形如{
A:3,B:4}
voteLabel = Labels[sortedDistIndices[i]]
classCount[voteLabel] = classCount.get(voteLabel,0) + 1
sortedClassCount = sorted(classCount.items(),key = operator.itemgetter(1),reverse = True)
return sortedClassCount[0][0]
#将文本文件转换成可以处理的矩阵
def file2matrix(filename):
fr = open(filename) #打开文件
arrayOfLines = fr.readlines() #将行文件逐行读出,readlines()返回值是以文件内容行为单位的列表
numberOfLines = len(arrayOfLines) #长度计算函数len()用来判断序列类型的长度,返回值是整数
returnMat = zeros((numberOfLines,3))
classLabelVector =[]
index = 0
for line in arrayOfLines:
line = line.strip() #剔除文件行两端的空字符
listFromLine = line.split("\t") #文件的行一“\t"字符为分割符,分成最多分,然后存储到特征矩阵中
returnMat[index,:] = listFromLine[0:3]
classLabelVector.append(int(listFromLine[-1]))#用append()函数将文件的标签列表追加到classLableVector向量中
index += 1
return returnMat,classLabelVector #将处理好的矩阵数据和标签向量返回
#归一化特征值
def autoNorm(dataSet):
minValues = dataSet.min(0)
maxValues = dataSet.max(0)
ranges = maxValues - minValues
#多余的初始化normData = zeros(shape(dataSet))
m = dataSet.shape[0]
normDataSet = dataSet - tile(minValues,(m,1))
normDataSet = normDataSet/tile(ranges,(m,1))
return normDataSet,ranges,minValues
#测试分类错误率
def datingClassTest():
hoRatio = 0.1
dataMatrix,dataLabels = file2matrix("datingTestSet2.txt")
normMatrix,ranges,minValues = autoNorm(dataMatrix)
m = normMatrix.shape[0]
useNumber = int(hoRatio*m)
errorCounter = 0
for i in range(useNumber):
testResult = classify0(normMatrix[i,:],normMatrix[useNumber:m,:],dataLabels[useNumber:m],3)
print("the classifier came back with:{},the real result is:{}".format(testResult,dataLabels[i]))
if testResult != dataLabels[i]:
errorCounter +=1
print(errorCounter)
print("the total error rate is:{}".format(errorCounter/(m*hoRatio)))
#将照片转换成1*1024的向量(二维)
def img2Vector(filename):
returnVector = zeros((1,1024))
fr = open(filename)
for i in range(32):
lineStr = fr.readline()
for j in range(32):
returnVector[0,32*i+j] = lineStr[j]
return returnVector
'''a = convertImg("trainingDigits/0_13.txt")
print(a[32:64])'''
from os import listdir
def handwritingClassTest():
hwLabels = []
trainingFileList = listdir("trainingDigits")
m = len(trainingFileList)
trainingMat = zeros((m,1024))
for i in range(m):
fileNameStr = trainingFileList[i]
fileStr = fileNameStr.split(".")[0]
classNumStr = int(fileStr.split("_")[0])
hwLabels.append(classNumStr)
trainingMat[i,:] = img2Vector("trainingDigits/%s" % fileNameStr)
testFileList = listdir("testDigits")
errorCount = 0.0
mTest = len(testFileList)
for i in range(mTest):
fileNameStr = testFileList[i]
fileStr = fileNameStr.split('.')[0]
classNumStr = int(fileStr.split('_')[0])
vectorUnderTest = img2Vector("testDigits/%s" % fileNameStr)
classifierResult = classify0(vectorUnderTest,trainingMat,hwLabels,3)
print("the classifier came back with :{},the real answer is:{}".format(classifierResult,classNumStr))
if classifierResult != classNumStr : errorCount += 1.0
print("the total number of error is:{}".format(errorCount))
print("the total error rate is:{}".format(errorCount/float(mTest)))
handwritingClassTest()
链接:(数据集)https://pan.baidu.com/s/1P2YqVIBXk826oFv2pO37Ag
提取码:w2mr