

另外,还需要了解 cross validation(交叉验证),可参考我的另一篇博客:交叉验证

A. 算法原理




B. 实现


# !/usr/bin/env python2
# -*- coding: utf-8 -*-
__author__ = 'jacket'

# standard module
import sys

# third-party module
import numpy as np
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

def getDataSet():
    """Load the Iris data set and split it into train and test parts.

    30% of the samples are held out for testing.  The split is returned
    exactly as train_test_split produces it; callers in this file unpack
    it as [train_x, test_x, train_y, test_y].
    """
    iris = load_iris()
    features, labels = iris.data, iris.target
    # Passing the np.random module makes the split use numpy's global
    # random state, so it differs between runs unless that state is seeded.
    return train_test_split(features, labels, test_size=0.3, random_state=np.random)

def insertSort(array, data, len_upper_bound=None):
    """Insert *data* into *array*, keeping it sorted ascending by item[0].

    This is one step of a bounded insertion sort: calling it for each of
    N items costs O(N*K) comparisons/shifts, where K is len_upper_bound.

    array           -- list of indexable items, already sorted by item[0];
                       modified in place.
    data            -- item to insert; data[0] is the sort key.
    len_upper_bound -- maximum length of *array*.  A falsy value (None or 0)
                       means "unbounded".  When the list is already full,
                       the item with the largest key is dropped, which may
                       be *data* itself.

    Items with equal keys keep their insertion order (new items go after
    existing ones).  Returns None.
    """
    key = data[0]
    has_room = (not len_upper_bound) or (len(array) < len_upper_bound)

    if has_room:
        # Grow the list by one slot; a plain list cannot be assigned past
        # its end, so the new slot has to be created explicitly.
        array.append(data)
    elif not array or not (key < array[-1][0]):
        # Full (or non-positive bound) and the new key is not smaller than
        # the current largest: nothing to insert.
        return

    # The last slot is now free to overwrite: it holds either the
    # placeholder just appended or the largest item that is being evicted.
    pos = len(array) - 2
    while pos >= 0 and key < array[pos][0]:
        array[pos + 1] = array[pos]
        pos -= 1

    array[pos + 1] = data

def classifyOne(train_x, train_y, test_x, K):
    """Predict the label of one sample by majority vote of its nearest neighbours.

    train_x -- training samples, one per row.
    train_y -- labels, indexed in step with the rows of train_x.
    test_x  -- the sample to classify.
    K       -- upper bound on the number of neighbours, passed to insertSort.

    Returns the label that received the most votes, or None if no label
    received any vote.  On a tie the label met first while iterating the
    vote table wins.
    """
    # Vectorised distance to every training row: the Euclidean distance
    # sqrt(sum((x1 - x2)**2)) scaled by a constant 1/2.  The scaling does
    # not change which neighbours are nearest.
    offsets = train_x - test_x
    distances = np.sqrt(np.sum(offsets ** 2, axis=1)) / 2

    # Feed every (distance, label) pair to the bounded insertion sort.
    nearest = []
    for idx, dist in enumerate(distances):
        insertSort(nearest, (dist, train_y[idx]), K)

    # One vote per retained neighbour; every known label starts at zero.
    votes = dict.fromkeys(train_y, 0)
    for neighbour in nearest:
        votes[neighbour[1]] += 1

    # Pick the label with the strictly highest vote count.
    winner = None
    best_count = 0
    for label, count in votes.items():
        if count > best_count:
            winner, best_count = label, count

    return winner

def predict(train_x, train_y, test_x, K):
    """Classify every row of test_x with classifyOne.

    Returns a numpy array with one predicted label per test row, using the
    same dtype as train_y.
    """
    sample_count = test_x.shape[0]
    predictions = np.zeros(sample_count, dtype=train_y.dtype)

    for row, sample in enumerate(test_x):
        label = classifyOne(train_x, train_y, sample, K)
        predictions[row] = label

    return predictions

def measure(true_labels, predict_labels):
    """Return the fraction of predictions that match the true labels.

    true_labels    -- sequence or numpy array of expected labels.
    predict_labels -- sequence or numpy array of predicted labels, same length.

    Both arguments are converted to numpy arrays first, so plain lists are
    compared element by element rather than as whole objects.
    """
    true_labels = np.asarray(true_labels)
    predict_labels = np.asarray(predict_labels)

    # Element-wise comparison yields a boolean array; summing counts the hits.
    correct = np.sum(true_labels == predict_labels)
    # "* 1.0" forces float division under Python 2 as well.
    return correct * 1.0 / len(true_labels)


# !/usr/bin/env python2
# -*- coding: utf-8 -*-
__author__ = 'jacket'

# standard module
import sys

# self-define module
import KNN

def main(args):
    """Run KNN with K=2 on the data set and print the results.

    Prints one "Actual | Predict" line per test sample followed by the
    overall correct rate.  *args* is accepted but not used.
    """
    train_x, test_x, train_y, test_y = KNN.getDataSet()
    predicted = KNN.predict(train_x, train_y, test_x, 2)

    for guess, actual in zip(predicted, test_y):
        print('Actual: {0} | Predict: {1}'.format(actual, guess))

    accuracy = KNN.measure(test_y, predicted)
    print('correct_rate = {0}'.format(accuracy))

# Script entry point: run the demo only when executed directly, not on import.
if __name__ == '__main__':
    main(sys.argv)

C. 结果和复杂度分析


复杂度方面,首先KNN不需要训练。测试时,设训练集大小为M,测试集大小为N。对于每个test_sample,需要遍历一遍训练集来计算距离,复杂度为O(M);然后要对这M个距离进行排序,可以用python或numpy的sort函数(python内置的sort是Timsort,numpy的sort默认是快排),平均复杂度都是O(Mlog M)。而我自己实现时用的是变形的插入排序,只保留最小的K个距离,最坏情况下复杂度为O(KM),其中K就是KNN中的K。所以我上面实现的KNN测试每个test_sample的复杂度为:O((K+1)M),故而总的算法复杂度为O((K+1)MN)(因为有N个test_sample,每个都要做一遍)。
