# k-nearest neighbors (kNN) algorithm
from numpy import * # python里的计算包Numpy
import operator # 运算符模块
import os
# 数据准备所需的函数
def createDataSet():
    """Return the toy training set used by the demos.

    Returns:
        A ``(samples, classes)`` pair: a 4x2 NumPy array of points and the
        list of class labels, where ``classes[i]`` is the label of row ``i``.
    """
    samples = [[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]]
    classes = list('AABB')  # two points of class 'A' followed by two of 'B'
    return array(samples), classes
# Build the sample data set when the module is executed and print it for
# inspection; `group` and `labels` are reused by the classifier check below.
group, labels = createDataSet()
print("训练数据:", group)
print("标签:", labels)
# k-近邻算法核心函数
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; a sequence or array of shape
            ``(d,)`` or ``(1, d)``.
        dataSet: 2-D NumPy array of training samples, one row per sample.
        labels: Sequence of hashable class labels; ``labels[i]`` is the
            label of row ``i`` of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label occurring most often among the ``k`` nearest rows. When
        several labels tie, the one encountered first (i.e. belonging to the
        closest of the tied neighbours) wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    # Local import so the function does not depend on file-level imports.
    from collections import Counter

    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (dataSetSize, k)
        )

    # Broadcasting subtracts every training row from inX without
    # materialising dataSetSize tiled copies of the input vector.
    diffMat = asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # Indices of the k closest rows, nearest first.
    nearest = distances.argsort()[:k]

    # Counter keeps first-seen order, and most_common() orders equal counts
    # by first encounter, so ties resolve in favour of the nearest label.
    votes = Counter(labels[i] for i in nearest)
    return votes.most_common(1)[0][0]
# tile() demo: repeat the list `a` using three different `reps` arguments.
a = [0, 1, 2]
print(tile(a, 2))  # [0 1 2 0 1 2]
print(tile(a, (1, 2)))  # [[0 1 2 0 1 2]]
b = tile(a, (2, 1))  # two stacked copies of `a`
print(b)  # [[0 1 2],[0 1 2]]
# Quick check: classify the point [1.2, 1] against the sample data set
# using the 3 nearest neighbours, then print the predicted label.
result = classify0([1.2, 1], group, labels, 3)
print("预测标签为:", result)
# 手写识别系统
# 将图像格式转化为向量 32*32 --> 1*1024
def img2vector(filename):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    Only the first 32 characters of each of the first 32 lines are used;
    the character at row ``i``, column ``j`` is stored at index
    ``32 * i + j``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape ``(1, 1024)`` holding the parsed digits.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If one of the first 32 lines has fewer than 32 characters
            (including when the file has fewer than 32 lines).
        ValueError: If a character cannot be converted with ``int()``.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even when parsing fails part-way;
    # previously the handle was never closed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
# Convert one sample image when the module is executed and print two slices
# of the resulting vector. Slice 0:31 covers indices 0..30 and slice 32:63
# covers indices 32..62, i.e. the first 31 values of image rows 0 and 1
# (each image row occupies 32 consecutive entries of the vector).
testVector = img2vector('testnumber/0_13.txt')
print(testVector[0, 0:31])
print(testVector[0, 32:63])
# 手写数字识别系统的测试代码
def _digitFromFileName(fileNameStr):
    """Return the integer class encoded before the first '_' of a file name."""
    return int(fileNameStr.split('.')[0].split('_')[0])


def handwritingClassTest(trainingDir='trainingnumber', testDir='testnumber', k=3):
    """Evaluate the kNN digit classifier on a directory of test images.

    Every file in ``trainingDir`` is loaded with ``img2vector`` to build the
    training matrix; its label is the integer before the first ``_`` in the
    file name. Each file in ``testDir`` is then classified with ``classify0``
    and compared with the label taken from its own file name. The result of
    every prediction, the total number of errors and the error rate are
    printed.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. Results are reported through ``print``.
    """
    trainingFileList = os.listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = os.listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("预测结果为:%d,真实值为:%d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1

    print("预测错误的总数为:%d" % errorCount)
    # An empty test directory used to raise ZeroDivisionError here; with
    # nothing to misclassify the rate is reported as 0.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("手写数字识别系统的错误率为:%f" % errorRate)
# Run the handwritten-digit recognition test when the module is executed.
handwritingClassTest()