经典的手写数字识别问题,分别使用 KNN、随机森林(Random Forest)、SVM + PCA 这 3 种方法。
主要利用的是sklearn库,pandas库, numpy库
1. KNN:这部分是看了别人的博客之后自己动手复现的。后来发现这种读取数据的方式太冗长了,后续会贴出更精炼的代码。
# k-nearest-neighbour digit classifier.
# Uses only constructs that are valid on both Python 2 and Python 3, and
# avoids numpy's ``mat``/matrix type.
from numpy import *  # noqa: F401,F403 - kept from the original; code below uses ``np.`` explicitly
import operator
import csv
import sys
from collections import Counter

import numpy as np


def _open_csv(path, mode):
    """Open *path* the way the ``csv`` module expects on this Python version.

    Python 3 wants text mode with ``newline=''``; Python 2 wants binary mode.
    *mode* is ``'r'`` or ``'w'``.
    """
    if sys.version_info[0] >= 3:
        return open(path, mode, newline='')
    return open(path, mode + 'b')


def _read_data_rows(path):
    """Return every non-empty CSV row of *path* except the first (header) row."""
    with _open_csv(path, 'r') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # drop the header row; tolerate an empty file
        return [row for row in reader if row]


def loadTrainData(path='train.csv'):
    """Load the labelled training set.

    The first CSV row is treated as a header and skipped; in every other row
    the first column is the label and the remaining columns are pixel values.

    Args:
        path: CSV file to read. Defaults to ``'train.csv'`` as in the original.

    Returns:
        ``(data, label)`` where ``data`` is the binarised pixel matrix (one
        row per sample) and ``label`` is a float array of shape
        ``(1, n_samples)``. The original author's note gives 42000x784 and
        1x42000 for the Kaggle file.

    Raises:
        ValueError: if the file has no data rows or the rows are ragged.
    """
    rows = np.array(_read_data_rows(path))
    if rows.ndim != 2:
        raise ValueError(
            "%s must contain a header row followed by equally long data rows" % path)
    label = rows[:, 0]
    data = rows[:, 1:]
    return nomalizing(toInt(data)), toInt(label)


def toInt(array):
    """Convert an array of integer-like values (e.g. CSV strings) to numbers.

    A 1-D input is promoted to a single row, so the result is at least 2-D.
    Values go through ``int`` conversion but are stored as floats.

    Args:
        array: array-like of values convertible to ``int``.

    Returns:
        A new float ``ndarray`` holding the integer values.
    """
    values = np.atleast_2d(np.asarray(array))
    # One vectorised conversion instead of a Python loop over every cell.
    return values.astype(int).astype(float)


def nomalizing(array):
    """Binarise *array* in place: every non-zero entry becomes 1.

    Args:
        array: numpy array to modify.

    Returns:
        The same array object, after modification.
    """
    array[array != 0] = 1
    return array


def loadTestData(path='test.csv'):
    """Load the unlabelled test set as a binarised pixel matrix.

    The first CSV row is treated as a header and skipped (the original
    author's note gives 28001 rows before and 28000x784 after for the Kaggle
    file).

    Args:
        path: CSV file to read. Defaults to ``'test.csv'`` as in the original.

    Returns:
        Float ``ndarray`` with one row per sample and entries 0 or 1.
    """
    data = np.array(_read_data_rows(path))
    return nomalizing(toInt(data))


# Core of the KNN classifier.
def classify(inX, dataSet, labels, k):
    """Return the majority label among the *k* samples nearest to *inX*.

    Args:
        inX: one feature vector (1-D, or a single row).
        dataSet: 2-D array-like with one training sample per row.
        labels: one label per row of *dataSet*; a flat sequence, a row
            vector or a column vector are all accepted.
        k: number of neighbours that vote. Values larger than the number of
            samples use every sample.

    Returns:
        The winning label as a plain Python value. When several labels have
        the same vote count, the one that entered the vote first (i.e. from
        the nearest neighbour) wins on Python 3.7+, where dicts keep
        insertion order.

    Raises:
        IndexError: if no sample takes part in the vote (empty *dataSet* or
            ``k == 0``).
    """
    query = np.asarray(inX, dtype=float).reshape(1, -1)
    samples = np.atleast_2d(np.asarray(dataSet, dtype=float))
    flat_labels = np.asarray(labels).ravel()

    # Broadcasting subtracts the query from every row. Squared distances rank
    # the neighbours in the same order as Euclidean distances, so no sqrt.
    squared_distances = ((samples - query) ** 2).sum(axis=1)
    # A stable sort keeps equally distant samples in their dataset order.
    nearest = np.argsort(squared_distances, kind='mergesort')[:k]

    votes = Counter(flat_labels[nearest].tolist())
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]


def saveResult(result, path='result.csv'):
    """Write each item of *result* as its own single-cell CSV row.

    Args:
        result: iterable of values to write, one per row.
        path: output file. Defaults to ``'result.csv'`` as in the original.
    """
    with _open_csv(path, 'w') as myFile:
        myWriter = csv.writer(myFile)
        myWriter.writerows([value] for value in result)


def handwritingClassTest(train_size=20000, k=5):
    """Classify every test sample with KNN and save the predictions.

    Reads ``train.csv`` and ``test.csv``, classifies each test row against
    the first *train_size* training rows, prints each prediction and writes
    them all to ``result.csv``.

    Args:
        train_size: number of leading training rows to use (default 20000).
        k: number of neighbours that vote (default 5).
    """
    trainData, trainlabel = loadTrainData()
    testData = loadTestData()
    train_subset = trainData[:train_size]
    label_subset = trainlabel.ravel()[:train_size]

    resultlist = []
    for sample in testData:
        # Labels are stored as floats; write digits as integers ("3", not "3.0").
        classifierResult = int(classify(sample, train_subset, label_subset, k))
        print("the classifier came back with: %d" % classifierResult)
        resultlist.append(classifierResult)
    saveResult(resultlist)
handwritingClassTest()
因为只用了 20000 个训练样本,所以提交后的准确率只有 90.9%;而且运行时间特别长,特别长,特别长,因为每次预测都需要遍历整个训练集。
2.随机森林
import pandas as pd
import numpy as np


def run_random_forest(train_path='train.csv', test_path='test.csv',
                      output_path='result_RF.csv', classifier=None):
    """Train a classifier on the training CSV and write a submission file.

    The first column of the training CSV is used as the label and all other
    columns as features. Predictions for the test CSV are written to
    *output_path* with an ``ImageId,Label`` header and 1-based image ids.

    Args:
        train_path: training CSV (header row, label in the first column).
        test_path: test CSV (header row, feature columns only).
        output_path: submission file to write.
        classifier: object with ``fit(X, y)`` and ``predict(X)``. When
            omitted, scikit-learn's ``RandomForestClassifier(n_estimators=10)``
            is used, as in the original script.

    Returns:
        The predictions as a numpy array.
    """
    dataset = pd.read_csv(train_path)
    # Select by position with .iloc: DataFrame.ix no longer exists in current
    # pandas, and dataset[[0]] would look for a column *named* 0.
    label = dataset.iloc[:, 0].values.ravel()
    traindata = dataset.iloc[:, 1:].values
    testdata = pd.read_csv(test_path).values

    if classifier is None:
        # Imported lazily so scikit-learn is only needed for the default model.
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(n_estimators=10)

    classifier.fit(traindata, label)
    pred = np.asarray(classifier.predict(testdata))

    image_ids = np.arange(1, len(testdata) + 1)
    np.savetxt(output_path, np.c_[image_ids, pred], delimiter=',',
               header='ImageId,Label', comments='', fmt='%d')
    return pred


if __name__ == '__main__':
    run_random_forest()
    # accuracy = 0.96329 (score reported by the original author)
    # Reading the CSV files with pandas is much simpler than the csv-module
    # loading used in the KNN section earlier in this file.
RF运行时间只有1min左右,准确率提高到了96%
3.SVM & PCA
考虑到每个样本的特征太多(有 784 个),所以先用 PCA 进行降维。
import pandas as pd
import numpy as np


def run_svm_pca(train_path='train.csv', test_path='test.csv',
                output_path='submission_SVM.csv', n_components=50,
                reducer=None, classifier=None):
    """Reduce the features, train a classifier and write a submission file.

    The first column of the training CSV is used as the label and all other
    columns as features. The reducer is fitted on the training features only
    and then applied to both training and test features. Predictions are
    written to *output_path* with an ``ImageId,Label`` header and 1-based
    image ids.

    Args:
        train_path: training CSV (header row, label in the first column).
        test_path: test CSV (header row, feature columns only).
        output_path: submission file to write.
        n_components: number of components kept by the default PCA reducer
            (default 50); ignored when *reducer* is supplied.
        reducer: object with ``fit(X)`` and ``transform(X)``. When omitted,
            scikit-learn's ``PCA(n_components=n_components, whiten=True)``
            is used, as in the original script.
        classifier: object with ``fit(X, y)`` and ``predict(X)``. When
            omitted, scikit-learn's ``SVC()`` is used, as in the original.

    Returns:
        The predictions as a numpy array.
    """
    dataset = pd.read_csv(train_path)
    # Select by position with .iloc: DataFrame.ix no longer exists in current
    # pandas, and dataset[[0]] would look for a column *named* 0.
    label = dataset.iloc[:, 0].values.ravel()
    traindata = dataset.iloc[:, 1:].values
    # Use the bare array for the test set too, so the reducer sees the same
    # input type at transform time as at fit time.
    test = pd.read_csv(test_path).values

    # pca
    if reducer is None:
        # Imported lazily so scikit-learn is only needed for the defaults.
        from sklearn.decomposition import PCA
        reducer = PCA(n_components=n_components, whiten=True)
    reducer.fit(traindata)
    traindata_pca = reducer.transform(traindata)
    test_pca = reducer.transform(test)

    # svc
    if classifier is None:
        from sklearn.svm import SVC
        classifier = SVC()
    classifier.fit(traindata_pca, label)
    pred = np.asarray(classifier.predict(test_pca))

    image_ids = np.arange(1, len(test) + 1)
    np.savetxt(output_path, np.c_[image_ids, pred], delimiter=',',
               header='ImageId,Label', comments='', fmt='%d')
    return pred


if __name__ == '__main__':
    run_svm_pca()
    # accuracy = 0.98229 (score reported by the original author)
    # SVM combined with PCA reaches about 98% accuracy.