Kaggle-Data Science London-1

import pylab as pl
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split,StratifiedKFold,cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
import sklearn.preprocessing as pp

def dsplit(train_init, target_init):
    """Split features/labels into a 90% train and 10% held-out test set.

    Parameters
    ----------
    train_init : array-like
        Full feature matrix.
    target_init : array-like
        Labels aligned row-for-row with ``train_init``.

    Returns
    -------
    tuple
        ``(train, test, train_target, test_target)`` from sklearn's
        ``train_test_split``; ``random_state=42`` makes the split reproducible.
    """
    train, test, train_target, test_target = train_test_split(
        train_init, target_init, test_size=0.1, random_state=42)
    # Parenthesized print is valid under both Python 2 and Python 3
    # (the original bare `print x` statements are Python-2-only).
    print(len(train_init))
    print(len(train))
    print(len(train_target))
    return train, test, train_target, test_target

def dopca(train, train_target, test_init):
    """Project both feature sets onto 12 whitened principal components.

    PCA is fitted on the training features only; the test features are
    then mapped through the same fitted components, so both outputs live
    in the identical reduced space.
    """
    decomposer = PCA(n_components=12, whiten=True)
    reduced_train = decomposer.fit_transform(train, train_target)
    reduced_test = decomposer.transform(test_init)
    return reduced_train, reduced_test

def classifier(train, train_target):
    """Fit and return a k-nearest-neighbors classifier.

    Uses a KD-tree index with k=13 uniform-weighted neighbors; ``p=1``
    selects the Manhattan (L1) distance metric.

    Parameters
    ----------
    train : array-like
        Training feature matrix.
    train_target : array-like
        Training labels.

    Returns
    -------
    KNeighborsClassifier
        The fitted estimator, ready for ``predict``.
    """
    # Commented-out evaluation code (predict + classification_report on
    # the train/test splits) removed as dead clutter.
    kclass = KNeighborsClassifier(n_neighbors=13, algorithm='kd_tree',
                                  weights='uniform', p=1)
    kclass.fit(train, train_target)
    return kclass

# --- Script body: load CSVs, PCA-reduce, fit KNN, write predictions ---
train_init = np.genfromtxt(open('train.csv', 'rb'), delimiter=',')
target_init = np.genfromtxt(open('trainLabels.csv', 'rb'), delimiter=',')
test_init = np.genfromtxt(open('test.csv', 'rb'), delimiter=',')

# Reduce both train and test features into the same 12-component space.
train, test_init = dopca(train_init, target_init, test_init)

kclass = classifier(train, target_init)

res = kclass.predict(test_init)
# Derive the Id column from the number of predictions instead of the
# hard-coded 9001, so the script still works if test.csv changes size.
idcol = np.arange(start=1, stop=len(res) + 1)
res2 = np.column_stack((idcol, res))

np.savetxt('prediction.csv', res2, fmt='%d', delimiter=",")

Public score=0.92399

使用了KD-Tree的KNN算法,并做了PCA分解。

实验过后发现,采用PCA可以提高分类的正确性。未使用PCA时,Public score=0.87221。


函数:

分解数据:

train_test_split(train_init,target_init,test_size=0.1,random_state=42)

PCA:

pca = PCA(n_components=12,whiten=True)
train = pca.fit_transform(train,train_target)
test_init =pca.transform(test_init)

KNN:

kclass = KNeighborsClassifier(n_neighbors=13,algorithm='kd_tree',weights='uniform',p=1)
kclass.fit(train,train_target)
res = kclass.predict(test_init)


你可能感兴趣的:(Machine,Learning,Python)