Python3 实现 k-近邻算法(《机器学习实战》中的代码)

from numpy import *
import operator

def classify0(inX, dateSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean. If several labels tie for the most votes, the
    label that was met first while walking the neighbours from nearest to
    farthest wins.

    Args:
        inX: Feature vector to classify (one value per feature).
        dateSet: Training samples, one row per sample (2-D array-like).
        labels: Label of each training row, indexable by row number.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training rows are treated as "use every row".

    Returns:
        The label with the most votes among the nearest neighbours.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dateSet`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    dateSet = asarray(dateSet)
    if dateSet.shape[0] == 0:
        raise ValueError("dateSet must contain at least one sample")

    # Broadcasting subtracts every row from inX; no tiled copies are needed.
    diffMat = asarray(inX) - dateSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5

    # Slicing (instead of indexing 0..k-1) caps k at the number of rows.
    nearest = distances.argsort()[:k]

    # Count how often each label occurs among the nearest neighbours.
    classCount = {}
    for sampleIndex in nearest:
        voteIlabel = labels[sampleIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so labels with equal counts keep insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to 0
    instead of being divided by zero.

    Args:
        dataSet: 2-D numpy array, one row per sample, one column per feature.

    Returns:
        Tuple ``(normDataSet, ranges, minVals)``: the rescaled data, the
        per-column ``max - min`` and the per-column minimum.
    """
    minVals = dataSet.min(0)
    ranges = dataSet.max(0) - minVals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN; the
    # returned ``ranges`` still reports their true width of 0.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def file2matrix(filename, numFeatures=3):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least ``numFeatures`` numeric fields
    followed by an integer class label in the last field. Blank lines are
    skipped.

    Args:
        filename: Path of the text file to read.
        numFeatures: Number of leading fields taken as features (default 3).

    Returns:
        Tuple ``(returnMat, classLabelVector)``: a float array with one row per
        data line and ``numFeatures`` columns, and the list of integer labels
        in file order.

    Raises:
        ValueError: If a field cannot be parsed as a number or a line has
            fewer than ``numFeatures`` fields.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), numFeatures))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Convert explicitly instead of relying on numpy's string-to-float cast.
        returnMat[index, :] = [float(value) for value in fields[0:numFeatures]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector

def dataClassTest(filename='datingTestSet2.txt', hoRatio=0.1, k=3):
    """Run a hold-out test of the kNN classifier and print its error rate.

    The data file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``int(rows * hoRatio)`` rows are classified with
    ``classify0`` against the remaining rows, and each prediction is printed
    next to the true label.

    Args:
        filename: Data file to evaluate (default ``'datingTestSet2.txt'``).
        hoRatio: Fraction of rows held out for testing (default 0.1).
        k: Number of neighbours passed to ``classify0`` (default 3).

    Raises:
        ValueError: If the hold-out set would be empty, which would make the
            error rate undefined.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs < 1:
        raise ValueError(
            "hold-out set is empty: {} rows with hoRatio={}".format(m, hoRatio))

    # The training part is the same for every test row, so slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print ("the classifier came back with: {}, the real answer is: {}".format(classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print ("the total error rate is: %f" % (errorCount / float(numTestVecs)))

你可能感兴趣的:(python3,机器学习)