[置顶] k近邻法 kNN算法 in Python




k值 + 距离测度 + 判断策略

NOTE: k值较大,可减小估计误差,但学习的近似误差会增大,同时意味着模型变得简单。k值通常采用交叉验证来确定。


NOTE: 多数表决策略等价于损失函数为0-1损失函数时的经验风险最小化。

NOTE: 需要考虑高维度对距离衡量的影响:变量数越多,欧氏距离的区分能力就越差。


NOTE: 一般距离测度除了欧氏距离等,还有皮尔逊相关系数、余弦相似度。

欧氏距离 linalg.norm(inA-inB)

皮尔逊相关系数 corrcoef(inA,inB,rowvar=0)[0][1],度量的是两向量之间的相似度,相对欧氏距离的优势在于,对输入向量的量级并不敏感,详见P258

余弦相似度 float(inA.T*inB) / (linalg.norm(inA)*linalg.norm(inB))


3.1 易于理解、易于实现,无需训练学习,无需估计参数

3.2 对异常值不敏感、没有数据输入假定,支持增量学习

3.3 计算时间和空间线性于训练集的规模(在一些场合不算太大)。

3.4 适合对稀有事件进行分类(例如当流失率很低时,比如低于0.5%,构造流失预测模型)

3.5 特别适合于多分类问题,例如根据基因特征来判断其功能分类,kNN比SVM的表现要好

3.6 由于KNN方法主要靠周围有限的邻近的样本,而不是靠判别类域的方法来确定所属类别的,因此对于类域的交叉或重叠较多的待分样本集来说,KNN方法较其他方法更为适合。

3.7 该算法比较适用于样本容量比较大的类域的自动分类,而那些样本容量较小的类域采用这种算法比较容易产生误分。


4.1 懒惰算法,每次预测都需重新计算,计算复杂度高、空间复杂度高

4.2 不适用于高维数据

4.3 可解释性较差,无法给出决策树那样的规则

4.4 该算法在分类时有个主要的不足是,当样本不平衡时,如一个类的样本容量很大,而其他类样本容量很小时,有可能导致当输入一个新样本时,该样本的K个邻居中大容量类的样本占多数。该算法只计算“最近的”邻居样本,某一类的样本数量很大,那么或者这类样本并不接近目标样本,或者这类样本很靠近目标样本。无论怎样,数量并不能影响运行结果。可以采用权值的方法(和该样本距离小的邻居权值大)来改进。


5.1 准备数据



5.2 分析数据

5.3 训练算法


5.4 测试算法


5.5 使用算法


5.6 改进


6、kNN in Python

from numpy import *
from os import listdir
import operator

### preparing data ###
# for the first case_dating
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-empty line must hold three numeric features followed by the
    class label in its last tab-separated field.

    filename : path of the text file to read.

    Returns (returnMat, classLabelVector): an N*3 float array with the
    features and a list with the N integer class labels, both in file order.

    Raises ValueError if a feature is not numeric or a label is not an
    integer.
    """
    # 'with' closes the file even when parsing fails further down
    with open(filename) as fr:
        strippedLines = [line.strip() for line in fr]
    # blank lines (e.g. a trailing newline) carry no sample
    arrayOfLines = [line for line in strippedLines if line]
    returnMat = zeros((len(arrayOfLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOfLines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # the callers in this file print labels with %d and use them as a
        # list index, so labels are stored as ints
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

# for the second case
def img2vector(filename):
    """Flatten a 32x32 text image into a 1*1024 row vector.

    The file is read as 32 lines; the first 32 characters of each line are
    converted with int() and stored row by row.

    filename : path of the text image.

    Returns a float array of shape (1, 1024).
    """
    returnVect = zeros((1, 32*32))
    # 'with' guarantees the handle is released; this function is called once
    # per image, so leaked handles add up quickly
    with open(filename) as fr:
        for ii in range(32):
            lineStr = fr.readline()
            for jj in range(32):
                returnVect[0, 32*ii + jj] = int(lineStr[jj])
    return returnVect

# normalizing the data of dataSet
def autoNorm(dataSet):
    """Rescale every column of dataSet to the interval [0, 1].

    dataSet : N*n array, one sample per row.

    Returns (normDataSet, ranges, minVals):
      normDataSet : (dataSet - minVals) / ranges, same shape as dataSet
      ranges      : per-column max - min, with 0 replaced by 1
      minVals     : per-column minimum

    A new sample x is brought onto the same scale with (x - minVals) / ranges.
    """
    minVals = dataSet.min(0) # column-wise minimum
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # a constant column has range 0; dividing by it would produce nan and
    # poison every distance computed later, so such a column is divided by 1
    # and ends up as all zeros
    ranges = where(ranges == 0, 1, ranges)
    # broadcasting applies the per-column vectors to every row
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals

### classifying function ###
# inX : input variable x, 1*n
# dataSet : x[i] in training set T, N*n
# labels : y[i] in training set T, 1*N
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training samples.

    inX     : sample to classify, n values (1*n)
    dataSet : training samples, N*n array
    labels  : class label of every training sample, length N
    k       : number of neighbours that vote; values above N are reduced to N

    Distances are Euclidean. Returns the label with the most votes.

    Raises ValueError if the training set is empty, k < 1, or labels has
    fewer entries than dataSet has rows.
    """
    dataSetSize = dataSet.shape[0] # N
    if dataSetSize == 0 or k < 1:
        raise ValueError("classify0 needs a non-empty training set and k >= 1")
    if len(labels) < dataSetSize:
        raise ValueError("labels must hold one entry per row of dataSet")
    # there cannot be more voters than training samples
    k = min(k, dataSetSize)
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet # N*n array, each row is x - x[i]
    sqDiffMat = diffMat**2 # element-wise square (valid for arrays, not for matrix objects)
    sqDistances = sqDiffMat.sum(axis=1) # squared distance to every training sample
    distances = sqDistances**0.5
    sorteDistIndicies = distances.argsort() # training indices, nearest first
    classCount = {}
    for ii in range(k):
        voteIlabel = labels[sorteDistIndicies[ii]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() works on Python 2 and 3; iteritems() exists on Python 2 only
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

### testing ###
# for the first case
def datingClassTest(testRatio, filename=r'F:\ResearchData\MyCode\Python\kNN\datingTestSet.txt', k=3):
    """Hold-out test of the kNN classifier on the dating data set.

    The first int(N*testRatio) samples of the file are classified against
    the remaining samples; every prediction and the final error rate are
    printed.

    testRatio : fraction of the N samples used for testing
    filename  : data file passed to file2matrix
    k         : number of neighbours passed to classify0

    Raises ValueError if testRatio leaves no test sample or no training
    sample.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    N = normMat.shape[0]
    numTestVecs = int(N*testRatio) # the number of samples for testing
    # without this check a tiny ratio ends in a division by zero and a ratio
    # of 1 or more leaves nothing to classify against
    if numTestVecs < 1 or numTestVecs >= N:
        raise ValueError("testRatio must leave at least one test and one training sample")
    trainingMat = normMat[numTestVecs:N, :]
    trainingLabels = datingLabels[numTestVecs:N]
    errorCount = 0.0
    for ii in range(numTestVecs):
        classifierResult = classify0(normMat[ii, :], trainingMat, trainingLabels, k)
        # print(...) with a single argument behaves the same on Python 2 and 3
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[ii]))
        if classifierResult != datingLabels[ii]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount/float(numTestVecs)))

# for the second case
def handwritingClassTest(trainingDir=r'F:\ResearchData\MyCode\Python\kNN\trainingDigits',
                         testDir=r'F:\ResearchData\MyCode\Python\kNN\testDigits',
                         k=3):
    """Test the kNN classifier on the handwritten-digit text images.

    Every file in trainingDir becomes one training sample and every file in
    testDir is classified against them. The true class of a file is the
    integer before the first '_' in its name. Each prediction, the number
    of errors and the error rate are printed.

    trainingDir : directory with the training images
    testDir     : directory with the test images
    k           : number of neighbours passed to classify0
    """
    import os # only listdir is imported at file level; os.path.join is needed here

    trainingFileList = listdir(trainingDir)
    N = len(trainingFileList)
    hwLabels = []
    trainingMat = zeros((N, 1024))
    for ii, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]
        # the training labels must be collected here, otherwise classify0 has
        # nothing to vote with
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[ii, :] = img2vector(os.path.join(trainingDir, fileNameStr))
    testFileList = listdir(testDir)
    NTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        # print(...) with a single argument behaves the same on Python 2 and 3
        print("the classifier came back with: %d, the realanswer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    # an empty test directory has no error rate; skip the division
    if NTest > 0:
        print("\nthe total error rate is: %f" % (errorCount/float(NTest)))

### predicting ###
# for the first case
def classifyPerson(filename=r'F:\ResearchData\MyCode\Python\kNN\datingTestSet.txt'):
    """Ask for the three features of a person and print the predicted class.

    The answers are read from standard input, scaled with the minimum and
    range returned by autoNorm for the data in filename, and classified
    with classify0 using 3 neighbours.

    filename : data file passed to file2matrix

    Raises ValueError if an answer cannot be converted to float.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input exists on Python 2 only; Python 3 provides the same
    # behaviour under the name input
    try:
        readAnswer = raw_input
    except NameError:
        readAnswer = input
    percentTats = float(readAnswer("percentage of time spent playing video games:"))
    ffMiles = float(readAnswer("frequent flier miles earned per years:"))
    iceCream = float(readAnswer("liters of ice cream consumed per years:"))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr-minVals)/ranges, normMat, datingLabels, 3)
    # labels start at 1 while resultList is indexed from 0
    # a single formatted argument prints the same text on Python 2 and 3
    print("%s %s" % ("You will probably like this person: ", resultList[classifierResult - 1]))

