Machine learning: a k-nearest neighbor (kNN) classifier in Python

# -*- coding: utf-8 -*-
from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Parse the data file into a feature matrix and a label vector
def file2matrix(filename):
    fr=open(filename) # open the file
    arrayOLines=fr.readlines() # read the file into a list of strings, one string per line
    numberOFlines=len(arrayOLines) # number of lines read, i.e. the number of samples
    returnMat=zeros((numberOFlines,3)) # create a numberOFlines-by-3 NumPy matrix of zeros
    classLabelVector=[] # list that will hold the class labels
    index=0
    for line in arrayOLines:
        line=line.strip() # strip whitespace (including the newline) from both ends of the line
        listFormLine=line.split('\t') # split the line into fields on tab characters
        returnMat[index,:]=listFormLine[0:3] # store the first three fields (the features) in row index of returnMat
        classLabelVector.append(int(listFormLine[-1])) # the last field of each line is the class label
        index+=1
    return returnMat,classLabelVector # return the feature matrix and the label vector
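# Illustrative note (format inferred from the code above, not stated in the
# original post): each line of the input file is assumed to hold three
# tab-separated numeric features followed by an integer class label, e.g. a
# line such as "40920  8.326976  0.953952  3" (fields separated by tabs) would
# be parsed into the feature row [40920.0, 8.326976, 0.953952] with label 3.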

# Min-max normalize the data set
def autoNorm(dataset):
    minVals=dataset.min(0) # column-wise minimums
    maxVals=dataset.max(0) # column-wise maximums
    ranges=maxVals-minVals
    normDataSet=zeros(shape(dataset)) # zero matrix with the same shape as the feature matrix
    m=dataset.shape[0] # m is the number of rows in dataset, i.e. the number of samples
    normDataSet=dataset-tile(minVals,(m,1)) # tile() repeats minVals into an m-by-3 matrix the same size as dataset
    normDataSet=normDataSet/tile(ranges,(m,1))
    return normDataSet,ranges,minVals # return the normalized feature matrix, the per-column ranges, and the per-column minimums
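# Worked example of the min-max scaling above (illustrative numbers): for a
# column with min=0 and max=10, the value 4 maps to (4-0)/(10-0)=0.4, so every
# feature ends up in [0,1] and a large-range feature such as flier miles no
# longer dominates the Euclidean distance computed in classify().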

# k-nearest neighbor classification
def classify(inX,dataSet,labels,k):
    dataSetSize=dataSet.shape[0] # number of training samples (rows) in the feature matrix
    diffMat=tile(inX,(dataSetSize,1))-dataSet # difference between the query point and every training sample (inX is tiled to dataSetSize rows)
    sqDiffMat=diffMat**2 # squared differences
    sqDistances=sqDiffMat.sum(axis=1) # sum the squared differences along each row
    distances=sqDistances**0.5 # Euclidean distance from the query point to every training sample
    sortedDistIndicies=distances.argsort() # indices of the training samples sorted by distance, from nearest to farthest
    classCount={} # dictionary used to count the votes per class
    for i in range(k):
        voteIlabel=labels[sortedDistIndicies[i]] # label of the i-th nearest neighbor
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1 # get() returns the current count for voteIlabel, or 0 if it has not been
                                                              # seen yet; each of the k nearest neighbors casts one vote for its own class
    # note: the sorting and the return must sit outside the loop so that all k votes are counted
    sortedClassCount=sorted(classCount.iteritems(),key=operator.itemgetter(1),reverse=True) # sort the (label,count) pairs by the count (item 1), descending
    return sortedClassCount[0][0] # return the label with the highest vote count
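# Worked example of the voting step (illustrative): with k=3 and nearest-neighbor
# labels [2,3,2], classCount becomes {2:2, 3:1}; sorting the (label,count) pairs
# by count in descending order gives [(2,2),(3,1)], so classify() returns 2.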


# Evaluate the classifier on a hold-out portion of the dating data
def datingClassTest():
    hoRatio=0.10 # hold out the first 10% of the samples as test queries
    datingDataMat,datingLabels=file2matrix(r'F:\ML_use\datingTestSet2.txt')
    normMat,ranges,minVals=autoNorm(datingDataMat)
    m=normMat.shape[0]
    numTestVecs=int(m*hoRatio)
    errorCount=0.0
    for i in range(numTestVecs):
        classifierResult=classify(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],3) # classify each test row against the remaining rows
        print "the classifier came back with: %d, the real answer is: %d"%(classifierResult,datingLabels[i])
        if(classifierResult!=datingLabels[i]):
            errorCount+=1.0
    print "the total error rate is: %f"%(errorCount/float(numTestVecs))


# Classify a new person from interactively entered feature values
def classifyPerson():
    resultList=['not at all','in small doses','in large doses']
    percentTats=float(raw_input("percentage of time spent playing video games?"))
    ffMines=float(raw_input("frequent flier miles earned per year?"))
    iceCream=float(raw_input("liters of ice cream consumed per year?"))

    datingDataMat,datingLabels=file2matrix(r'F:\ML_use\datingTestSet2.txt')

    normMat,ranges,minVals=autoNorm(datingDataMat)
    inArr=array([ffMines,percentTats,iceCream]) # feature order assumed to match the data file columns: miles, game percentage, ice cream
    classifierResult=classify((inArr-minVals)/ranges,normMat,datingLabels,3) # scale the query with the training min/range before classifying
    print "you will probably like this person:",resultList[classifierResult-1] # labels are 1-3, so subtract 1 for the 0-based list index


    # 3-D scatter plot of the three classes plus the query point
    dataArr = array(datingDataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []; zcord1 = []
    xcord2 = []; ycord2 = []; zcord2 = []
    xcord3 = []; ycord3 = []; zcord3 = []
    for i in range(n):
        if int(datingLabels[i]) == 1:
            xcord1.append(dataArr[i,0]); ycord1.append(dataArr[i,1]); zcord1.append(dataArr[i,2])
        elif int(datingLabels[i]) == 2:
            xcord2.append(dataArr[i,0]); ycord2.append(dataArr[i,1]); zcord2.append(dataArr[i,2])
        elif int(datingLabels[i]) == 3:
            xcord3.append(dataArr[i,0]); ycord3.append(dataArr[i,1]); zcord3.append(dataArr[i,2])
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_title('KNN')
    type1=ax.scatter(xcord1, ycord1, zcord1, s=30, c='red', marker='s')
    type2=ax.scatter(xcord2, ycord2, zcord2, s=30, c='green', marker='o')
    type3=ax.scatter(xcord3, ycord3, zcord3, s=30, c='b', marker='+')
    ax.scatter(inArr[0], inArr[1], inArr[2], s=100, c='k', marker='8') # the person that was just classified
    plt.figtext(0.02,0.92,'class1: Did Not Like',color='red')
    plt.figtext(0.02,0.90,'class2: Liked in Small Doses',color='green')
    plt.figtext(0.02,0.88,'class3: Liked in Large Doses',color='b')

    # axis labels follow the column order plotted above: x=column 0, y=column 1, z=column 2
    ax.set_xlabel('frequent flier miles earned per year')
    ax.set_ylabel('percentage of time spent playing video games')
    ax.set_zlabel('liters of ice cream consumed per year')

    plt.show()



if __name__=='__main__':
    classifyPerson()
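# A minimal sanity check for classify() on a hand-made 2-D data set (a sketch,
# not part of the original script; the values are made up so the expected
# answer is easy to verify by eye):
#
#   group=array([[1.0,1.1],[1.0,1.0],[0.0,0.0],[0.0,0.1]])
#   labels=['A','A','B','B']
#   print classify([0.0,0.0],group,labels,3) # two of the three nearest neighbors are 'B', so this prints 'B'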
