本文对《机器学习实战》一书的 KNN 部分进行整理。代码分为两部分:myKNN.py 包含所需的函数;myKNN_run.py 对这些函数进行调用。Python 版本为 3.x。
myKNN.py:
# coding: utf-8
# In[3]:
from numpy import *
import operator
import os
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean and are computed against every row of
    ``dataSet`` in one vectorised expression.

    Args:
        inX: Feature vector to classify (e.g. shape ``(n,)`` or ``(1, n)``).
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label whose first vote
        came from the nearer neighbour wins (dicts keep insertion order and
        ``sorted`` is stable).

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    # Plain comparison instead of min(): the file's `from numpy import *`
    # replaces the builtin min with numpy's, which has a different signature.
    if k > dataSetSize:
        k = dataSetSize

    # Repeat inX once per training row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # Indices of the training samples ordered from nearest to farthest.
    sortedDistIndices = distances.argsort()

    # Count how often each label occurs among the k nearest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # The most frequent label is the prediction.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    # Kept from the original implementation: shows the vote tally per call.
    print(sortedClassCount)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Load the tab-separated dating data file into a matrix and label list.

    Each non-blank line must hold at least three numeric feature columns and
    end with an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is a float
        array with one row per data line and three feature columns,
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a field cannot be converted to a number or a line has
            fewer than three feature columns.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # would otherwise fail the numeric conversion.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        # The label is the last column; store it as an integer.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Min-max normalise every feature column so all features weigh equally.

    Each column is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all identical has a range of zero; it is
    normalised to 0 instead of producing NaN from a division by zero.

    Args:
        dataSet: 2-D array with one sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised data, the
        per-column range (``max - min``, possibly containing zeros) and the
        per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1; their numerator is already 0.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest(filename, hoRatio=0.10, k=3):
    """Estimate the classifier's error rate on the dating data set.

    The first ``hoRatio`` fraction of the (normalised) samples is held out
    as the test set; every held-out sample is classified against the
    remaining samples and compared with its true label. Per-sample results,
    the error rate and the error count are printed.

    Args:
        filename: Path of the dating data file, as read by ``file2matrix``.
        hoRatio: Fraction of samples held out for testing.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the hold-out fraction yields no test samples, which
            would otherwise end in a division by zero.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        raise ValueError(
            "no test samples: %d rows with hoRatio=%s" % (m, hoRatio))

    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the training set; row i is the test sample.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print ("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print ("the total error rate is: %f" % (errorCount/float(numTestVecs)))
    print (errorCount)
def classifyPerson(filename):
    """Interactively predict how much the user will like a person.

    Prompts for three feature values, normalises them with the minimum and
    range of the training data, classifies them with the 3 nearest
    neighbours and prints the predicted category.

    Args:
        filename: Path of the dating data file, as read by ``file2matrix``.
    """
    # Indexed with (label - 1) below, so labels are presumably 1, 2, 3 --
    # verify against the data file.
    resultList = ['not at all', 'in small doses', 'in large doses']
    percenTats = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): the order here must match the feature columns of the
    # data file -- confirm that miles come first, then game time, then ice cream.
    inArr = array([ffMiles, percenTats, iceCream])
    # Scale the query the same way the training data was scaled.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print('You will probably like person: ', resultList[classifierResult - 1])
def img2vector(filename):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    The file must contain at least 32 lines of at least 32 digit characters
    each; the character at row ``i``, column ``j`` is stored at index
    ``32 * i + j``.

    Args:
        filename: Path of the text image file.

    Returns:
        A float array of shape ``(1, 1024)``.
    """
    reVect = zeros((1, 1024))
    # The context manager closes the file; this function is called once per
    # image, so leaked handles would accumulate quickly.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                reVect[0, 32 * i + j] = int(lineStr[j])
    return reVect
def _digitLabel(filename):
    """Return the integer class encoded in a name such as ``0_189.txt``."""
    stem = filename.split('.')[0]       # e.g. '0_189'
    return int(stem.split('_')[0])      # e.g. 0


def handwritingClassTest(dirtrain, dirtest, k=3):
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in ``dirtrain`` becomes one training row; every file in
    ``dirtest`` is classified against them. The true class of a file is the
    number before the underscore in its name. Per-file results, the number
    of errors and the error rate are printed.

    Args:
        dirtrain: Directory containing the training image files.
        dirtest: Directory containing the test image files.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``dirtest`` contains no files, which would otherwise
            end in a division by zero after all the training data was loaded.
    """
    testlist = os.listdir(dirtest)
    mtest = len(testlist)
    if mtest == 0:
        raise ValueError('no test files found in %s' % dirtest)

    # Build the training matrix: one 1024-element row per image file.
    trainlist = os.listdir(dirtrain)
    m = len(trainlist)
    trainmat = zeros((m, 1024))
    hwlabels = []
    for i, filename in enumerate(trainlist):
        hwlabels.append(_digitLabel(filename))
        trainmat[i, :] = img2vector(os.path.join(dirtrain, filename))

    errorcount = 0.0
    for filename in testlist:
        classname = _digitLabel(filename)
        testvector = img2vector(os.path.join(dirtest, filename))
        classresult = classify0(testvector, trainmat, hwlabels, k)
        print('the classifier came back with: %d, the real answer is: %d'%(classresult, classname))
        if classresult != classname:
            errorcount += 1.0
    print('\nthe total number of error is: %d' % errorcount)
    print('\nthe total error rate is: %f' %(errorcount/float(mtest)))
myKNN_run.py(调用 myKNN.py 中的函数):
import myKNN
import matplotlib.pyplot as plt
from numpy import *
# --- Dating-site data ---
DATING_FILE = 'datingTestSet2.txt'
mat, lab = myKNN.file2matrix(DATING_FILE)
fig = plt.figure()
ax = fig.add_subplot(111)
# Marker size and colour both scale with the class label, so the classes
# show up in different colours.
ax.scatter(mat[:, 1], mat[:, 2], 15.0 * array(lab), 15.0 * array(lab))
plt.show()
# Named minVals rather than min so the builtin/numpy min is not shadowed.
nor, ranges, minVals = myKNN.autoNorm(mat)
myKNN.datingClassTest(DATING_FILE)  # hold-out test on the dating data
myKNN.classifyPerson(DATING_FILE)   # interactive prediction

# --- Handwritten-digit data ---
# Single definition of the data root; adjust to the local checkout.
DIGITS_DIR = 'E:/ML/ML_Learn/机器学习实战(中文版+英文版+源代码)/机器学习实战源代码/machinelearninginaction/Ch02'
train = DIGITS_DIR + '/trainingDigits'
test = DIGITS_DIR + '/testDigits'
shan = myKNN.img2vector(train + '/0_0.txt')
print(shan[0, 0:39])
myKNN.handwritingClassTest(train, test)
上述代码,应用了两套数据,第一套数据为约会数据,第二套为手写数字分类。
其中部分结果为:
思考:
1、需保存全部数据集,占内存;
2、计算效率低:如手写数据集约有2000个训练样本,则每个测试样本都需计算2000个距离,每个距离涉及1024维;约900个测试样本总计约 1024*2000*900 次运算。
3、无法给出数据的基础结构信息,如平均样例,典型样例。(抽象了)
具体数据及代码链接为:https://pan.baidu.com/s/1G4DWmV2Nc5t6kR8d-178dA,提取码:ycrc