机器学习实战(一)——kNN算法

最近选修了计算机学院的模式识别课,其中讲到了Parzen窗和kNN这两种对总体分布的非参数估计方法。之前我在机器学习实战中用kNN算法做过手写数字识别,今天打算继续用基于Parzen窗原理的PNN(概率神经网络)算法写一个Python程序,和kNN比较一下优劣。
PNN算法将是我写的第一个需要训练的机器学习程序。
2016.3.15

———分割线,以下为之前内容,kNN算法—————————————————————

kNN.py

# -*- coding: utf-8 -*-
###########
#KNN: the k nearest neighbours

###########


from numpy import *
import operator
import os
##############
#new_input : a matrix with [1,1024]
#dataset : a matrix with [num,1024]
#labels : 0:9
# k : the k in knn
def kNNClassify(new_input, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    new_input : array-like
        Feature vector of the sample to classify. It is broadcast against
        the rows of ``dataset``, so both a flat vector and a single-row
        matrix are accepted.
    dataset : array-like, shape (num, n_features)
        Training samples, one per row.
    labels : sequence
        ``labels[i]`` is the class of ``dataset[i]``; labels must be hashable.
    k : int
        Number of nearest neighbours that vote. A value larger than the
        number of training samples uses every sample.

    Returns
    -------
    The label that collected the most votes. On a tie, the first such label
    in the vote dictionary's iteration order wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``dataset`` has no rows.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    samples = asarray(dataset)
    if samples.shape[0] == 0:
        raise ValueError('dataset contains no samples')

    # Squared Euclidean distance from new_input to every training sample.
    # The square root is skipped on purpose: it is monotonic, so it cannot
    # change which neighbours are nearest.
    diff = samples - asarray(new_input)
    distance = (diff ** 2).sum(axis=1)

    # Indices of the training samples, nearest first.
    sortedDistIndice = argsort(distance)

    # Count the votes of the k nearest neighbours; slicing clamps k to the
    # number of available samples.
    voteCount = {}
    for neighbour in sortedDistIndice[:k]:
        voteLabel = labels[neighbour]
        voteCount[voteLabel] = voteCount.get(voteLabel, 0) + 1

    # Pick the label with the highest vote count (first one wins on a tie).
    maxCount = 0
    maxIndex = None
    for key, value in voteCount.items():
        if value > maxCount:
            maxCount = value
            maxIndex = key

    return maxIndex


def img2Vector(filename, rows=32, cols=32):
    """Read a text image of digit characters into a single row vector.

    The file is expected to hold at least ``rows`` lines, each starting with
    at least ``cols`` characters that ``int()`` can parse. Characters beyond
    ``cols`` on a line (such as the newline) are ignored.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    rows, cols : int, optional
        Image height and width in characters; both default to 32.

    Returns
    -------
    A NumPy array of shape ``(1, rows * cols)`` holding the image in
    row-major order.

    Raises
    ------
    ValueError
        If a line has fewer than ``cols`` characters (including a missing
        line) or contains a character that is not a digit.
    """
    imgVector = zeros([1, rows * cols])
    # The context manager guarantees the file is closed even on errors.
    with open(filename) as fileIn:
        for row in range(rows):
            lineStr = fileIn.readline()
            if len(lineStr) < cols:
                raise ValueError(
                    '%s: line %d has fewer than %d characters'
                    % (filename, row + 1, cols))
            start = row * cols
            imgVector[0, start:start + cols] = [int(ch) for ch in lineStr[:cols]]

    return imgVector

def _loadDigitDir(dirName):
    """Load every file in ``dirName`` as one sample row plus its label.

    The label is the integer before the first ``'_'`` in the file name.
    Returns ``(samples, labels)`` where ``samples`` has one 1024-column row
    per file and ``labels`` is a list in the same order.
    """
    rows = 32
    cols = 32
    # os.listdir returns names in arbitrary order; sorting makes the sample
    # order reproducible between runs and machines.
    fileNames = sorted(os.listdir(dirName))
    samples = zeros((len(fileNames), rows * cols))
    labels = []
    for i, name in enumerate(fileNames):
        samples[i, :] = img2Vector(os.path.join(dirName, name))
        labels.append(int(name.split('_')[0]))
    return samples, labels


def loadDataSet():
    """Load the training set from ``./1/`` and the test set from ``./2/``.

    Returns
    -------
    train_x, train_y, test_x, test_y
        ``train_x`` / ``test_x`` are arrays with one 1024-column row per
        file; ``train_y`` / ``test_y`` are the matching lists of integer
        labels parsed from the file names.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('getting training set')
    train_x, train_y = _loadDigitDir('./1/')

    print('getting test set')
    test_x, test_y = _loadDigitDir('./2/')

    return train_x, train_y, test_x, test_y

def testHandWriting():
    """Evaluate the kNN classifier (k=3) on the test set and print accuracy.

    Loads the data with ``loadDataSet()``, classifies every test sample
    against the whole training set with ``kNNClassify`` and prints the share
    of correct predictions as a percentage with two decimals.

    Raises
    ------
    ValueError
        If the test set contains no samples, since accuracy is undefined.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('loading data')
    train_x, train_y, test_x, test_y = loadDataSet()

    # Nothing is computed in this step: the classifier uses the training
    # set directly at prediction time.
    print('traning')

    print('testing')

    numTestSamples = test_x.shape[0]
    if numTestSamples == 0:
        raise ValueError('no test samples found')

    matchCount = 0
    for sample, expected in zip(test_x, test_y):
        if kNNClassify(sample, train_x, train_y, 3) == expected:
            matchCount += 1

    accuracy = float(matchCount) / numTestSamples

    print('show the result...\n')
    print('%.2f%%' % (accuracy * 100))

test_kNN.py

# Driver script: run the kNN handwritten-digit evaluation defined in kNN.py.
from kNN import testHandWriting

testHandWriting()

你可能感兴趣的:(python,ML)