# KNN implementation

from array import array
from os import listdir
from numpy import *
import operator


def createDataSet():
    """Build the tiny hard-coded demo data set.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 numpy array of
        sample points and ``labels`` is the list of class names, one per row.
    """
    # Keep each point next to its class so the pairing is obvious.
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = array([point for point, _ in samples])
    labels = [name for _, name in samples]
    return group, labels


# Core of the k-nearest-neighbours algorithm
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: input vector to classify.
        dataSet: training samples, one row per sample (numpy array).
        labels: label vector; ``labels[i]`` is the class of ``dataSet[i]``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` nearest rows.
    """
    sample_count = dataSet.shape[0]  # number of rows in the training set

    # Euclidean distance from inX to every training row.
    deltas = tile(inX, (sample_count, 1)) - dataSet
    dist = ((deltas ** 2).sum(axis=1)) ** 0.5
    nearest_first = dist.argsort()

    # Tally the labels of the k closest rows.
    votes = {}
    for rank in range(k):
        label = labels[nearest_first[rank]]
        votes[label] = votes.get(label, 0) + 1

    # The sort is stable, so with an insertion-ordered dict (Python 3.7+)
    # a tie goes to the label that was encountered first.
    ranking = list(votes.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    winner, _ = ranking[0]
    return winner


# Parse a tab-separated text file into a feature matrix and a label list
def file2matrix(filename):
    """Load a tab-separated data file.

    Each non-blank line supplies one sample: the first three fields are
    converted to floats and become one row of the matrix, and the last field
    is kept verbatim (as a string) as that sample's label.

    Args:
        filename: path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` numpy float array and ``classLabelVector`` is a list of
        ``n`` label strings.

    Raises:
        ValueError: if a feature field cannot be converted to float or a
            line has fewer than three feature fields.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # sample and are skipped instead of crashing the float conversion.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(fields[-1])  # raw label text, not mapped
    return returnMat, classLabelVector


# Normalise feature values so that features with different ranges are comparable
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / column_range``.

    Args:
        dataSet: 2-D numpy array, one row per sample and one column per
            feature.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` has
        the shape of ``dataSet``, ``ranges`` is the per-column ``max - min``
        and ``minVals`` is the per-column minimum. A column whose values are
        all equal has its range reported as 1 instead of 0, so ``ranges`` is
        always safe to divide by and that column normalises to 0.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column would give 0/0 = NaN, which then poisons every
    # distance computed from the normalised data; use a divisor of 1 there.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


# Hold-out test of the classifier
def datingClassTest():
    """Measure the classifier's error rate on 'datingTestSet.txt'.

    The first 10% of the normalised rows are used as test vectors and the
    remaining rows as the training set; each test vector is classified with
    k = 3. Every prediction and the final error rate are printed.
    """
    hoRatio = 0.10
    features, answers = file2matrix('datingTestSet.txt')
    normMat, _ranges, _minVals = autoNorm(features)
    total = normMat.shape[0]
    numTestVecs = int(total * hoRatio)

    # The training part is the same for every test vector.
    train_rows = normMat[numTestVecs:total, :]
    train_labels = answers[numTestVecs:total]

    errorCount = 0.0
    for row in range(numTestVecs):
        expected = answers[row]
        predicted = classify0(normMat[row, :], train_rows, train_labels, 3)
        print("the classifier came back with: %s, the real answer is: %s" % (predicted, expected))
        if predicted != expected:
            errorCount += 1.0
    print("the total error rate is : %f " % (errorCount / float(numTestVecs)))


# Prediction function for the dating site
def classifyPerson():
    """Ask for three feature values and print the predicted preference.

    The training data is read from 'datingTestSet2.txt', normalised, and the
    query is scaled with the same minimum and range before being classified
    with k = 3.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    # Prompts are asked in this exact order.
    prompts = (
        "percentage of time spent playing video games?",
        "frequent flier miles earned per year?",
        "liters of ice cream consumed per year?",
    )
    percentTats, ffMiles, iceCream = [float(input(text)) for text in prompts]

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # NOTE(review): the query is assembled as [miles, game %, ice cream];
    # presumably this matches the column order of the data file - confirm.
    inArr = array([ffMiles, percentTats, iceCream])
    scaled_query = (inArr - minVals) / ranges
    classifierResult = classify0(scaled_query, normMat, datingLabels, 3)

    # The label is converted with int() and used as a 1-based index, so the
    # labels in the data file are assumed to be the numbers 1..3 - confirm.
    choice = int(classifierResult) - 1
    print("You will probably like this person:", resultList[choice])


# Convert a text image into a test vector
def img2vector(filename):
    """Read a 32x32 character image file into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted to
    numbers and stored row by row, so character ``j`` of line ``i`` ends up
    at index ``32 * i + j``.

    Args:
        filename: path of the text file holding the image.

    Returns:
        numpy array of shape ``(1, 1024)`` with float entries.

    Raises:
        IndexError: if a line has fewer than 32 characters or the file has
            fewer than 32 lines.
        ValueError: if a character is not numeric.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file; the original left it open, which
    # leaks one handle per image when thousands of files are loaded.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            start = 32 * i
            returnVect[0, start:start + 32] = [float(lineStr[j]) for j in range(32)]
    return returnVect


def handwritingClassTest():
    """Run the digit classifier over the test images and print the errors.

    Every file in 'digits/trainingDigits' becomes one 1024-element training
    row; its label is the integer before the first underscore of the file
    name. Each file in 'digits/testDigits' is then classified with k = 3 and
    compared against the label parsed from its own name. Each prediction,
    the number of errors and the error rate are printed.
    """
    def label_from_name(file_name):
        # '<label>_<anything>.<ext>' -> int(label)
        stem = file_name.split('.')[0]
        return int(stem.split('_')[0])

    # Build the training matrix and the matching label list.
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    hwLabels = []
    for row, train_name in enumerate(trainingFileList):
        hwLabels.append(label_from_name(train_name))
        trainingMat[row, :] = img2vector('digits/trainingDigits/%s' % train_name)

    # Classify every test image and count the mismatches.
    testFileList = listdir('digits/testDigits')
    mTest = len(testFileList)
    errorCount = 0.0
    for test_name in testFileList:
        expected = label_from_name(test_name)
        vectorUnderTest = img2vector('digits/testDigits/%s' % test_name)
        predicted = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d ,the real answer is : %d"%(int(predicted),int(expected)))
        if predicted != expected:
            errorCount += 1.0

    print("\nthe total number of errors is: %d"% errorCount)
    print("\nthe total error rate is: %f" % (errorCount/float(mTest)))


# You may also be interested in: Machine Learning in Action, machine learning, python, KNN