import pylab as pl
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split,StratifiedKFold,cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
import sklearn.preprocessing as pp
def dsplit(train_init, target_init):
    """Split features and labels into train/test subsets.

    Holds out 10% of the data with a fixed random_state so the split
    is reproducible across runs.

    Parameters
    ----------
    train_init : array-like feature matrix.
    target_init : array-like label vector aligned with train_init.

    Returns
    -------
    (train, test, train_target, test_target) as produced by
    sklearn's train_test_split.
    """
    train, test, train_target, test_target = train_test_split(
        train_init, target_init, test_size=0.1, random_state=42)
    # Debug output of the split sizes. The print(...) call form works on
    # both Python 2 and Python 3, unlike the original `print x` statements.
    print(len(train_init))
    print(len(train))
    print(len(train_target))
    return train, test, train_target, test_target
def dopca(train, train_target, test_init):
    """Project train and test data onto 12 whitened principal components.

    The PCA is fitted on the training data only; the same learned
    projection is then applied to the test data so both live in the
    same reduced space.

    Parameters
    ----------
    train : array-like training feature matrix.
    train_target : labels; kept for interface compatibility but unused,
        since PCA is an unsupervised transform and ignores y.
    test_init : array-like test feature matrix.

    Returns
    -------
    (train, test_init) both transformed to shape (n_samples, 12).
    """
    pca = PCA(n_components=12, whiten=True)
    # PCA.fit_transform ignores its y argument, so passing train_target
    # (as the original code did) was misleading; fit on X alone.
    train = pca.fit_transform(train)
    test_init = pca.transform(test_init)
    return train, test_init
def classifier(train, train_target):
    """Fit and return a k-nearest-neighbours classifier.

    Configuration: 13 neighbours, kd-tree search, uniform weights,
    Manhattan distance (p=1).

    Parameters
    ----------
    train : array-like feature matrix.
    train_target : label vector aligned with train.

    Returns
    -------
    The fitted KNeighborsClassifier instance.
    """
    # Commented-out evaluation code (classification_report on train/test)
    # was removed; evaluate with cross_val_score or a held-out split instead.
    kclass = KNeighborsClassifier(n_neighbors=13, algorithm='kd_tree',
                                  weights='uniform', p=1)
    kclass.fit(train, train_target)
    return kclass
# ---- Script entry: load data, reduce with PCA, classify, write submission ----
train_init = np.genfromtxt(open('train.csv', 'rb'), delimiter=',')
target_init = np.genfromtxt(open('trainLabels.csv', 'rb'), delimiter=',')
test_init = np.genfromtxt(open('test.csv', 'rb'), delimiter=',')
# Fit PCA on the full training set and project both train and test.
train, test_init = dopca(train_init, target_init, test_init)
kclass = classifier(train, target_init)
res = kclass.predict(test_init)
# Build the Id column from the actual number of predictions rather than
# the hard-coded range 1..9000, so a differently-sized test set still works.
idcol = np.arange(1, len(res) + 1)
res2 = np.column_stack((idcol, res))
np.savetxt('prediction.csv', res2, fmt='%d', delimiter=",")
# Notes
# -----
# Used a KNN classifier with kd-tree search, plus a PCA decomposition.
# Experiments showed that applying PCA improves classification accuracy;
# without PCA the score was 0.87221.
#
# Key calls used:
#   Data split:
#     train_test_split(train_init, target_init, test_size=0.1, random_state=42)
#   PCA:
#     pca = PCA(n_components=12, whiten=True)
#     train = pca.fit_transform(train, train_target)
#     test_init = pca.transform(test_init)
#   Classifier:
#     kclass = KNeighborsClassifier(n_neighbors=13, algorithm='kd_tree',
#                                   weights='uniform', p=1)
#     kclass.fit(train, train_target)
#     res = kclass.predict(test_init)