Getting Started with Kaggle: Digit Recognizer

The classic digit recognition problem, tackled with three approaches: KNN, random forest, and SVM with PCA.

The main libraries used are sklearn, pandas, and numpy.


1. KNN. I followed someone else's blog post and reproduced it by hand; I later found that this way of loading the data is far too verbose (a more concise version is sketched after the KNN discussion below).

from numpy import *
import operator
import csv
def loadTrainData():
    l = []
    with open('train.csv') as file:
        lines = csv.reader(file)
        for line in lines:
            l.append(line)
    l.remove(l[0])
    l = array(l)
    label = l[:,0]
    data = l[:,1:]
    return normalizing(toInt(data)), toInt(label)
    #label 1*42000 data 42000*784
    #return data label

def toInt(array):
    # convert every entry of the string matrix read from csv to an int
    array = mat(array)
    m,n = shape(array)
    newArray = zeros((m,n))
    for i in range(m):
        for j in range(n):
            newArray[i,j] = int(array[i,j])
    return newArray

def normalizing(array):
    # binarize the pixels: any non-zero grey value becomes 1
    m,n = shape(array)
    for i in range(m):
        for j in range(n):
            if array[i,j] != 0:
                array[i,j] = 1
    return array

def loadTestData():
    l = []
    with open('test.csv') as file:
        lines = csv.reader(file)
        for line in lines:
            l.append(line)
            #28001*784
    l.remove(l[0])  #del first row
    data = array(l)
    return normalizing(toInt(data))  #28000*784

# core kNN classifier: brute-force L2 distance to every training sample, then majority vote among the k nearest
def classify(inX,dataSet,labels,k):
    inX = mat(inX)
    dataSet = mat(dataSet)
    labels = mat(labels)
    dataSetSize = dataSet.shape[0]
    diffMat = tile(inX,(dataSetSize,1))-dataSet
    sqDiffMat = array(diffMat)**2
    sqDistance = sqDiffMat.sum(axis=1)
    distances = sqDistance**0.5
    sortedDistIndex = distances.argsort()
    classCount = {}
    for i in range(k):
        votelabel = labels[sortedDistIndex[i],0]
        classCount[votelabel] = classCount.get(votelabel,0)+1
    sortedClassCount = sorted(classCount.items(),key = operator.itemgetter(1),reverse = True)
    return  sortedClassCount[0][0]

def saveResult(result):
    # write one predicted label per row
    # (a Kaggle submission also needs an ImageId column and the 'ImageId,Label'
    #  header, as in the savetxt calls further below)
    with open('result.csv','w',newline='') as myFile:
        myWriter = csv.writer(myFile)
        for i in result:
            myWriter.writerow([i])


def handwritingClassTest():
    trainData,trainlabel = loadTrainData()
    testData = loadTestData()
    m,n = shape(testData)
    resultlist=[]
    for i in range(m):
        classifierResult = classify(testData[i],trainData[0:20000],trainlabel.transpose()[0:20000],5)
        print "the classifier came back with: %d" % (classifierResult)
        resultlist.append(classifierResult)
    saveResult(resultlist)

handwritingClassTest()
Because only 20,000 training samples were used, the submitted accuracy is only 90.9%, and the run time is extremely long, since every prediction has to scan the entire training set.
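
As promised above, here is the more concise route: load the data with pandas and let sklearn's KNeighborsClassifier handle the neighbour search. This is a minimal sketch rather than a tuned solution; the binarization mirrors normalizing() above and n_neighbors=5 matches the k used there.

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# read the data with pandas instead of the manual csv loop
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

label = train.iloc[:,0].values
# binarize the pixels, same effect as normalizing() above
traindata = (train.iloc[:,1:].values > 0).astype(int)
testdata = (test.values > 0).astype(int)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(traindata,label)
pred = knn.predict(testdata)

np.savetxt('result_knn.csv',np.c_[range(1,len(testdata)+1),pred],delimiter=',',header='ImageId,Label',comments='',fmt='%d')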


2. Random forest

import pandas as pd
import numpy as np

dataset = pd.read_csv('train.csv')
label = dataset.iloc[:,0].values
traindata = dataset.iloc[:,1:].values
testdata = pd.read_csv('test.csv').values

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10)
rf.fit(traindata,label)
pred = rf.predict(testdata)

np.savetxt('result_RF.csv',np.c_[range(1,len(testdata)+1),pred],delimiter=',',header='ImageId,Label',comments='',fmt='%d')

#accuracy = 0.96329
Reading the CSV files with pandas is much simpler than the approach above.

The random forest runs in only about a minute, and accuracy improves to 96%.
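
Before submitting, the accuracy can be estimated locally with cross-validation on the training set. A minimal sketch, assuming the traindata and label arrays from the script above:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation on the training data only;
# the mean score is a rough proxy for the leaderboard accuracy
rf = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(rf,traindata,label,cv=3)
print(scores.mean())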


3.SVM & PCA

Since each sample has 784 features, PCA is applied first to reduce the dimensionality.

import pandas as pd
from sklearn.svm import SVC
import numpy as np
from sklearn.decomposition import PCA

dataset = pd.read_csv('train.csv')
label = dataset.iloc[:,0].values
traindata = dataset.iloc[:,1:].values
test = pd.read_csv('test.csv')
#pca
pca = PCA(n_components=50,whiten=True)
pca.fit(traindata)
traindata_pca = pca.transform(traindata)
test_pca = pca.transform(test)
#svc
svc = SVC()
svc.fit(traindata_pca,label)
pred = svc.predict(test_pca)

np.savetxt('submission_SVM.csv',np.c_[range(1,len(test)+1),pred],delimiter=',',header='ImageId,Label',comments='',fmt='%d')

#accuracy = 0.98229
SVM combined with PCA reaches 98% accuracy.
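
The choice of n_components=50 is somewhat arbitrary. One way to check it is to look at PCA's cumulative explained variance ratio; a minimal sketch, assuming the traindata array from the script above:

from sklearn.decomposition import PCA
import numpy as np

# fit with more components than needed, then inspect how much
# of the variance the first 50 components actually capture
pca = PCA(n_components=100)
pca.fit(traindata)
cumulative = np.cumsum(pca.explained_variance_ratio_)
print(cumulative[49])   # fraction of variance explained by the first 50 components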
