KNN(一)

对《机器学习实战》一书的KNN部分进行整理。代码分为两部分:一部分是myKNN.py,包含所需的函数;另一部分是myKNN_run.py,对这些函数进行调用。Python版本为3.x。

myKNN.py:

# coding: utf-8

# In[3]:


from numpy import *
import operator
import os

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean and computed against every row of ``dataSet``.

    Args:
        inX: Feature vector to classify. Anything that broadcasts against
            the rows of ``dataSet`` works, e.g. shape ``(n_features,)`` or
            ``(1, n_features)``.
        dataSet: Training samples, one row per sample.
        labels: Label of each row of ``dataSet`` (same order, hashable).
        k: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label that was seen
        first while walking the neighbours from nearest to farthest wins.

    Raises:
        IndexError: If no neighbour can vote (``k < 1`` or empty
            ``dataSet``).
    """
    from collections import Counter

    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    # Broadcasting subtracts inX from every row without materialising a
    # tiled copy of it; the sign of the difference vanishes when squared.
    diffMat = dataSet - asarray(inX)
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Indices of the training rows ordered from nearest to farthest.
    sortedDistIndices = distances.argsort()
    # Never ask for more neighbours than there are training samples.
    neighbourCount = k if k < dataSetSize else dataSetSize
    # Counter keeps insertion order, so ties resolve to the nearer label.
    votes = Counter(
        labels[sortedDistIndices[i]] for i in range(neighbourCount)
    )
    return votes.most_common(1)[0][0]

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric feature columns;
    the last column of the line is parsed as the integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n_rows, 3)`` float array built from the first three columns and
        ``classLabelVector`` is a list with one ``int`` label per row.

    Raises:
        ValueError: If a line has too few columns or a field is not numeric.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (typically a trailing newline at end of file) carry no
        # sample, so they are skipped instead of crashing the parser.
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` so features weigh equally.

    Each column is mapped with ``(value - min) / (max - min)``. A column
    whose values are all identical has a range of zero; it is scaled with a
    divisor of 1 instead, so it becomes all zeros rather than NaN.

    Args:
        dataSet: 2-D array-like, one row per sample and one column per
            feature.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the scaled data, the
        per-column ``max - min`` (zero for constant columns) and the
        per-column minimum.
    """
    dataSet = asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Avoid 0/0 for constant columns; the reported ranges stay untouched.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def datingClassTest(filename, hoRatio=0.10, k=3):
    """Measure the kNN error rate on a hold-out slice of the dating data.

    The file is loaded with ``file2matrix`` and scaled with ``autoNorm``.
    The first ``int(rows * hoRatio)`` rows are used as test samples and the
    remaining rows as training samples. Each prediction, the overall error
    rate and the raw error count are printed.

    Args:
        filename: Path of the data file, passed to ``file2matrix``.
        hoRatio: Fraction of the rows, taken from the start of the file,
            that is held out for testing.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(
            normMat[i, :],
            normMat[numTestVecs:m, :],
            datingLabels[numTestVecs:m],
            k,
        )
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With too few rows the hold-out set is empty; report a rate of 0.0
    # instead of dividing by zero.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    print(errorCount)
    

def classifyPerson(filename):
    """Interactively predict how much the user would like a described person.

    Three feature values are read from standard input, scaled with the
    minimum and range computed from the data in ``filename``, classified
    with ``classify0`` (k=3) and the matching description is printed.

    Args:
        filename: Path of the data file, passed to ``file2matrix``.
    """
    # Index ``label - 1`` below assumes the file's labels are 1, 2 and 3
    # -- TODO confirm against the data file.
    resultList = ['not at all', 'in small doses', 'in large doses']
    percenTats = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): this order presumably mirrors the column order of the
    # data file (miles, game time, ice cream) -- verify against the file.
    inArr = array([ffMiles, percenTats, iceCream])
    # A constant training column has range 0; divide by 1 there so the
    # query vector does not turn into NaN/inf.
    safeRanges = where(ranges == 0, 1, ranges)
    classifierResult = classify0((inArr - minVals) / safeRanges, normMat, datingLabels, 3)
    print('You will probably like person: ', resultList[classifierResult - 1])

def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.

    Args:
        filename: Path of the text image file.

    Returns:
        A ``(1, 1024)`` float array; element ``32 * row + col`` holds the
        digit found at that row and column of the file.

    Raises:
        IndexError: If a line has fewer than 32 characters (including a
            missing line).
        ValueError: If one of the characters is not a digit.
    """
    reVect = zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                reVect[0, 32 * i + j] = int(lineStr[j])
    return reVect


def _labelFromFilename(filename):
    """Return the integer class encoded in a name such as ``0_189.txt``."""
    # Drop the extension ("0_189"), then keep the part before "_" ("0").
    return int(filename.split('.')[0].split('_')[0])


def handwritingClassTest(dirtrain, dirtest, k=3):
    """Evaluate kNN digit classification on two directories of text images.

    Every file in ``dirtrain`` becomes one training row (via
    ``img2vector``) labelled with the integer before the first underscore
    of its name. Every file in ``dirtest`` is classified with ``classify0``
    and compared with the label from its own name. Each prediction, the
    number of errors and the error rate are printed.

    Args:
        dirtrain: Directory holding the training image files.
        dirtest: Directory holding the test image files.
        k: Number of neighbours passed to ``classify0``.
    """
    trainlist = os.listdir(dirtrain)
    m = len(trainlist)
    trainmat = zeros((m, 1024))
    hwlabels = []
    for i, filename in enumerate(trainlist):
        hwlabels.append(_labelFromFilename(filename))
        trainmat[i, :] = img2vector(os.path.join(dirtrain, filename))

    testlist = os.listdir(dirtest)
    mtest = len(testlist)
    errorcount = 0.0
    for filename in testlist:
        classname = _labelFromFilename(filename)
        testvector = img2vector(os.path.join(dirtest, filename))
        classresult = classify0(testvector, trainmat, hwlabels, k)
        print('the classifier came back with: %d, the real answer is: %d'%(classresult, classname))
        if classresult != classname:
            errorcount += 1.0
    print('\nthe total number of error is: %d' % errorcount)
    # An empty test directory would otherwise divide by zero.
    errorrate = errorcount / float(mtest) if mtest else 0.0
    print('\nthe total error rate is: %f' % errorrate)

myKNN_run.py(对myKNN.py中的函数进行调用):

import myKNN
import matplotlib.pyplot as plt
from numpy import *

# Dating-site data: load it and draw a scatter plot of columns 1 and 2.
mat, lab = myKNN.file2matrix('datingTestSet2.txt')
fig = plt.figure()
ax = fig.add_subplot(111)
# Marker size and colour are both derived from the class label so that the
# classes are visually distinct.
ax.scatter(mat[:, 1], mat[:, 2], 15.0 * array(lab), 15.0 * array(lab))
plt.show()

# Named minVals rather than min so the builtin min() is not shadowed.
nor, ranges, minVals = myKNN.autoNorm(mat)


myKNN.datingClassTest('datingTestSet2.txt')  # hold-out test on the dating data
myKNN.classifyPerson('datingTestSet2.txt')  # interactive prediction

# Handwritten-digit data: all paths share one base directory.
digitsDir = 'E:/ML/ML_Learn/机器学习实战(中文版+英文版+源代码)/机器学习实战源代码/machinelearninginaction/Ch02'
shan = myKNN.img2vector(digitsDir + '/trainingDigits/0_0.txt')
print(shan[0, 0:39])
train = digitsDir + '/trainingDigits'
test = digitsDir + '/testDigits'
myKNN.handwritingClassTest(train, test)

上述代码使用了两套数据:第一套为约会网站数据,第二套为手写数字数据(用于手写数字分类)。

其中部分结果为:

KNN(一)_第1张图片

思考:

1、需保存全部数据集,占内存;

2、如手写数据集有约2000个训练样本,则每个测试样本都需计算约2000个距离,每个距离又涉及1024维的向量;约900个测试样本合计约需1024*2000*900次运算,计算效率低。

3、无法给出数据的基础结构信息,因此无法得知平均样例、典型样例具有什么特征。(这一点比较抽象)

 

具体数据及代码链接为:https://pan.baidu.com/s/1G4DWmV2Nc5t6kR8d-178dA,提取码:ycrc

 

 

 

 

 

你可能感兴趣的:(机器学习)