Python实现kNN(k nearest neighbor algorithm)

下面的代码是根据 http://blog.csdn.net/bdss58/article/details/40928827 这篇文章介绍的算法实现的,想了解算法原理请查看该文章,这里不再详细介绍算法。

__author__ = 'jianyong'

# 从csv文件中加载数据,并且将数据分成训练集和测试集,训练集和测试集比例是split
import  csv
import random
def loadDataset(filename, split, trainingset=None, testset=None):
    """Load a CSV file and randomly split its rows into two lists.

    Every column except the last one is converted to ``float``; the last
    column is kept unchanged as the class label. Each non-empty row is
    appended to ``trainingset`` when ``random.random() < split`` and to
    ``testset`` otherwise.

    Args:
        filename: Path of the CSV file to read.
        split: Probability that a row is assigned to the training set.
        trainingset: Optional list that receives the training rows; it is
            extended in place. A new list is created when omitted.
        testset: Optional list that receives the test rows; it is extended
            in place. A new list is created when omitted.

    Returns:
        The ``(trainingset, testset)`` pair, so that callers relying on the
        default arguments can still get at the result.

    Raises:
        ValueError: If a feature column cannot be parsed as a float.
    """
    # Create fresh lists per call: a mutable default value would be shared
    # by every call that omits the argument.
    if trainingset is None:
        trainingset = []
    if testset is None:
        testset = []
    # Text mode, because csv.reader requires str rows on Python 3 (a file
    # opened with 'rb' yields bytes there and the reader raises an error).
    with open(filename, 'r') as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                # Skip blank lines (such as a trailing empty line) explicitly
                # rather than blindly dropping the last row of the file.
                continue
            record = [float(value) for value in row[:-1]]
            record.append(row[-1])
            if random.random() < split:
                trainingset.append(record)
            else:
                testset.append(record)
    return trainingset, testset

# 计算欧几里得几何距离
import math
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance between two instances.

    Only the first ``length`` components of each instance are compared, so
    trailing non-numeric fields (such as a class label) can be excluded by
    the caller.

    Args:
        instance1: Indexable sequence whose first ``length`` items are numbers.
        instance2: Indexable sequence whose first ``length`` items are numbers.
        length: Number of leading components to include.

    Returns:
        The distance as a float.
    """
    squared = 0.0
    for x in range(length):
        diff = instance1[x] - instance2[x]
        squared += diff * diff
    # The square root turns the sum of squares into the actual Euclidean
    # distance; without it the function returns the squared distance.
    return math.sqrt(squared)

# 根据欧几里得距离确定k个邻居
import operator
def getNeighbors(trainingset, testinstance, k):
    """Return the ``k`` training rows closest to ``testinstance``.

    Distances are computed with ``euclideanDistance`` over every component
    of ``testinstance`` except the last one (the last component is excluded
    from the distance).

    Args:
        trainingset: Sequence of training rows.
        testinstance: The row to find neighbours for.
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` training rows, nearest first. Fewer rows are
        returned when the training set holds fewer than ``k`` rows.
    """
    length = len(testinstance) - 1
    # Pair each individual training row (not the whole set) with its distance.
    distances = [
        (row, euclideanDistance(row, testinstance, length))
        for row in trainingset
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing keeps this safe when k exceeds the number of training rows.
    return [row for row, _ in distances[:k]]

# 找出邻居中的主要成分
def getResponse(neighbors):
    """Return the most frequent class label among ``neighbors``.

    The label of a neighbour is its last element.

    Args:
        neighbors: Non-empty sequence of rows whose last item is the label.

    Returns:
        The label that occurs most often.

    Raises:
        ValueError: If ``neighbors`` is empty, since there is nothing to
            vote on.
    """
    if not neighbors:
        raise ValueError('getResponse() requires at least one neighbor')
    votes = {}
    for neighbor in neighbors:
        response = neighbor[-1]
        votes[response] = votes.get(response, 0) + 1
    # dict.items() exists on both Python 2 and 3 (iteritems() is 2-only).
    sortedvotes = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedvotes[0][0]

# 计算正确率
def getAccuracy(testset, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    The actual label of a row is its last element; it is compared with the
    prediction at the same position.

    Args:
        testset: Sequence of rows whose last item is the actual label.
        predictions: Sequence of predicted labels, aligned with ``testset``.

    Returns:
        The accuracy as a float between 0.0 and 100.0. An empty ``testset``
        yields 0.0. Rows that have no matching prediction count as wrong.
    """
    if not testset:
        # Avoid dividing by zero when there is nothing to evaluate.
        return 0.0
    correct = 0
    for row, predicted in zip(testset, predictions):
        # Compare by value: 'is' tests object identity, which is unreliable
        # for equal strings that are distinct objects.
        if row[-1] == predicted:
            correct += 1
    return (correct / float(len(testset))) * 100.0

# 运行一下试试
def main():
    """Run the kNN classifier on ``iris.data`` and print the results.

    Loads the data set with a 0.67 training probability, predicts the label
    of every test row from its 3 nearest training rows, prints each
    prediction next to the actual label, and finally prints the accuracy.
    """
    trainingset = []
    testset = []
    split = 0.67
    loadDataset('iris.data', split, trainingset, testset)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('training set:' + repr(len(trainingset)))
    print('test set:' + repr(len(testset)))
    predictions = []
    k = 3
    for testinstance in testset:
        neighbors = getNeighbors(trainingset, testinstance, k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('predicted:' + repr(result) + ',actual:' + repr(testinstance[-1]))
    accuracy = getAccuracy(testset, predictions)
    print('accuracy:' + repr(accuracy) + '%')


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()


你可能感兴趣的:(Python实现kNN(k nearest neighbor algorithm))