Python source code:
#coding=utf-8
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
#-------------download data
news=fetch_20newsgroups(subset='all')
#-------------take the first 3000 samples and split them, 25% reserved for testing
X_train,X_test,y_train,y_test=train_test_split(news.data[:3000],news.target[:3000],test_size=0.25,random_state=33)
#-------------use a Pipeline to simplify the workflow: chain the word vectorizer to the classifier
clf=Pipeline([('vect',TfidfVectorizer(stop_words='english',analyzer='word')),('svc',SVC())])
#-------------build geometric progressions: 4*3 = 12 parameter combinations in total
parameters={'svc__gamma':np.logspace(-2,1,4),'svc__C':np.logspace(-1,1,3)}
gs=GridSearchCV(clf,parameters,verbose=2,refit=True,cv=3)
gs.fit(X_train,y_train)
print(gs.best_params_,gs.best_score_)
print('Best accuracy is:',gs.score(X_test,y_test))
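As a quick check of the search space, np.logspace builds the geometric progressions the comment above refers to; together with cv=3 folds, the 12 combinations account for the 36 fitting tasks mentioned in the results below:

import numpy as np

print(np.logspace(-2, 1, 4))  # gamma candidates: 0.01, 0.1, 1, 10
print(np.logspace(-1, 1, 3))  # C candidates: 0.1, 1, 10
# 4 gamma values x 3 C values = 12 combinations; with cv=3,
# GridSearchCV trains the pipeline 12 * 3 = 36 times.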
Result: the single-threaded search finds the best hyper-parameter combination, which scores 82.27% on the test set (full log omitted).

The same grid search can be run in parallel on all available CPU cores simply by adding n_jobs=-1:
#coding=utf-8
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
#-------------download data
news=fetch_20newsgroups(subset='all')
#-------------take the first 3000 samples and split them, 25% reserved for testing
X_train,X_test,y_train,y_test=train_test_split(news.data[:3000],news.target[:3000],test_size=0.25,random_state=33)
#-------------use a Pipeline to simplify the workflow: chain the word vectorizer to the classifier
clf=Pipeline([('vect',TfidfVectorizer(stop_words='english',analyzer='word')),('svc',SVC())])
#-------------build geometric progressions: 4*3 = 12 parameter combinations in total
parameters={'svc__gamma':np.logspace(-2,1,4),'svc__C':np.logspace(-1,1,3)}
#-------------n_jobs=-1 means use all available CPU cores
gs=GridSearchCV(clf,parameters,verbose=2,refit=True,cv=3,n_jobs=-1)
gs.fit(X_train,y_train)
print(gs.best_params_,gs.best_score_)
print('Best accuracy is:',gs.score(X_test,y_test))
Result:
This is the same grid search, but using multithreaded parallel search it finished the same 36 tasks in only 1.2 min in total, and the best hyper-parameter combination it found still reaches the same top classification accuracy of 82.27% on the test set. With no loss of accuracy, the computation runs nearly 3x faster!
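To measure the speedup yourself, you can time gs.fit with the standard library (a minimal sketch; the variable names follow the script above):

import time

start = time.perf_counter()
gs.fit(X_train, y_train)  # run the (serial or parallel) grid search
print('Grid search took %.1f s' % (time.perf_counter() - start))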
P.S.: Pipeline will be covered separately in a later post.
See: http://scikit-learn.org/stable/modules/pipeline.html#pipeline-chaining-estimators
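For a rough idea of what the two-step pipeline above replaces, here is a minimal hand-written equivalent (a sketch only, reusing the X_train/X_test/y_train/y_test variables from the script):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

vect = TfidfVectorizer(stop_words='english', analyzer='word')
X_train_vec = vect.fit_transform(X_train)  # learn the vocabulary and vectorize the training text
X_test_vec = vect.transform(X_test)        # reuse the fitted vocabulary on the test text

svc = SVC()
svc.fit(X_train_vec, y_train)
print('Accuracy:', svc.score(X_test_vec, y_test))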