The book's source code and data: http://www.manning-source.com/books/pharrington/MLiA_SourceCode.zip
Sample data:
Each sample represents one dating candidate; the class label is how much user A likes that candidate, where 1, 2, 3 stand for 'not at all', 'in small doses', and 'in large doses' respectively.
 # | Frequent flyer miles earned per year | Percentage of time spent playing video games | Liters of ice cream consumed per week | Class label
 1 | 40920 | 8.326976 | 0.953952 | 3
 2 | 26052 | 1.441871 | 0.805124 | 1
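Judging from the split('\t') call in file2matrix() below, each line of datingTestSet.txt stores the three feature values followed by the class label, separated by tabs; row 1 above would therefore look like:

40920	8.326976	0.953952	3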
Problem description:
User A enters the features of a prospective date, and the KNN classifier assigns it a class, i.e. predicts how much A will like that person.
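The classifier below measures similarity with the plain Euclidean distance between feature vectors. Here is a minimal sketch of that computation, using the two table rows above as training points and a made-up query; it also shows why the features must be rescaled first: the flyer-miles column is thousands of times larger than the other two and would dominate the distance on its own, which is exactly what autoNorm() in the code corrects.

# -*- coding:utf-8 -*-
# Illustrative sketch only: the query values are invented; the real script
# below works on the full datingTestSet.txt via classify() and autoNorm().
from numpy import array

train = array([[40920, 8.326976, 0.953952],   # table row 1, label 3
               [26052, 1.441871, 0.805124]])  # table row 2, label 1
query = array([30000, 5.0, 0.7])              # hypothetical new candidate

# Raw Euclidean distances -- almost entirely determined by the miles column,
# which is why every feature is first scaled to [0, 1] by autoNorm().
distances = (((train - query) ** 2).sum(axis=1)) ** 0.5
print distances   # roughly [10920.0, 3948.0]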
Data visualization:
Sample input:
Code (knn.py):
# -*- coding:utf-8 -*-
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt

# Parse the tab-separated data file into a feature matrix and a label list
def file2matrix(filename):
    fr = open(filename)
    arrayOfLines = fr.readlines()
    numberOfLines = len(arrayOfLines)
    returnMat = zeros((numberOfLines, 3))
    classLabelVector = []
    index = 0
    for line in arrayOfLines:
        line = line.strip()
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        # the labels can't be plotted unless converted to int
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat, classLabelVector

# Scatter plot of two features, colored and sized by class label
def plotpic(dataset, labels):
    zwfont = matplotlib.font_manager.FontProperties(fname='/usr/share/fonts/truetype/arphic/ukai.ttc')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # swap the first two arguments (columns 0/1/2 of dataset) to plot different feature pairs
    ax.scatter(dataset[:, 1], dataset[:, 2], 15.0 * array(labels), 15.0 * array(labels))
    # plt.xlabel(u'每年获得的飞行常客里程数', fontproperties=zwfont)
    # plt.xlabel(u'玩视频游戏所耗时间百分比', fontproperties=zwfont)
    # plt.ylabel(u'每周消费的冰淇淋公升数', fontproperties=zwfont)
    plt.show()

# The feature value ranges differ greatly, which would badly skew the distance
# calculation, so the feature values must be normalized
def autoNorm(dataset):
    minVals = dataset.min(0)  # column-wise minimums
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataset))
    m = dataset.shape[0]
    normDataSet = dataset - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals

# KNN classification function
def classify(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndices = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Print each classification result and the overall error rate
def datingClassTest():
    k = 4
    hoRatio = 0.10  # hold out 10% of the dataset for testing
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], k)
        print "the result is : %d, the real answer is : %d" % (classifierResult, datingLabels[i])
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print "the total error rate is : %f" % (errorCount / float(numTestVecs))

# Interactive prediction
def classifyPerson():
    k = 4
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(raw_input("percentage of time spent playing video games?"))
    ffMiles = float(raw_input("frequent flier miles earned per year?"))
    iceCream = float(raw_input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify((inArr - minVals) / ranges, normMat, datingLabels, k)
    print "You will probably like this person: ", resultList[classifierResult - 1]

if __name__ == '__main__':
    # plot the data
    # dataset, labels = file2matrix('datingTestSet.txt')
    # plotpic(dataset, labels)
    # hold-out test
    # datingClassTest()
    classifyPerson()
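Note that the script targets Python 2 (print statements, raw_input, dict.iteritems); under Python 3 these would need to become print(...), input(), and classCount.items(). Running python knn.py prompts for the three feature values and prints the predicted liking level; uncommenting the corresponding lines in __main__ draws the scatter plot or runs the 10% hold-out error test instead.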