# Source: https://mlnote.wordpress.com/2015/12/16/python%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E8%B7%B5%E4%B8%8Ekaggle%E5%AE%9E%E6%88%98-machine-learning-for-kaggle-competition-in-python/
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier

# Load the iris data and separate features from labels.
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Use only the first two feature columns.
X = X_iris[:, :2]
y = y_iris

# Hold out 25% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33)

# Standardise the raw features. This step matters a lot but is often
# overlooked by competitors. The scaler is fitted on the training part only.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SGD classifier: suited to large-scale data, parameters are estimated by
# stochastic gradient descent.
clf = SGDClassifier()
clf.fit(X_train, y_train)

# In-sample check: accuracy measured on the training samples.
y_train_predict = clf.predict(X_train)
print(metrics.accuracy_score(y_train, y_train_predict))

# Proper out-of-sample check: accuracy measured on the held-out test samples.
y_predict = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_predict))
0.660714285714 0.684210526316
# For a more detailed performance report (precision, recall, f1-score per
# class) use the function below.
print(metrics.classification_report(y_test, y_predict, target_names=iris.target_names))
precision recall f1-score support setosa 1.00 1.00 1.00 8 versicolor 0.43 0.27 0.33 11 virginica 0.65 0.79 0.71 19 avg / total 0.66 0.68 0.66 38
# To examine SGDClassifier more thoroughly we should make full use of the
# data: split it into N parts and let every part serve once as the test fold.
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import sem
import numpy as np

# A Pipeline keeps model construction compact: before fit, data usually has
# to go through steps such as feature extraction and preprocessing.
# Default parameters are used for both steps here.
clf = Pipeline([('scaler', StandardScaler()), ('sgd_classifier', SGDClassifier())])

# 5-fold cross-validation over the whole data set.
cv = KFold(X.shape[0], 5, shuffle=True, random_state=33)
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)

# Overall performance: mean score and standard deviation.
print('%s %s' % (scores.mean(), scores.std()))

# The spread reported here is the standard error of the mean, which is not
# the same quantity as the standard deviation; see
# http://www.graphpad.com/guides/prism/6/statistics/index.htm?stat_semandsdnotsame.htm
print('%s %s' % (np.mean(scores), sem(scores)))
[ 0.56666667 0.73333333 0.83333333 0.76666667 0.8 ] 0.74 0.0928559218479 0.74 0.0464279609239
from sklearn.datasets import fetch_olivetti_faces
# This data set is not bundled with the package; fetch_* functions like this
# one download it on demand.
faces = fetch_olivetti_faces()
# The data is stored in a dict-like object, like most of the sample data sets.
faces.keys()
['images', 'data', 'target', 'DESCR']
# Check the size of the data through the shape attributes.
print(faces.data.shape)
print(faces.target.shape)
(400L, 4096L) (400L,)
from scipy.stats import sem
from sklearn.cross_validation import KFold, cross_val_score, train_test_split
from sklearn.svm import SVC

# Split the data in the same way as before: 25% is kept for testing.
X_train, X_test, y_train, y_test = train_test_split(
    faces.data, faces.target, test_size=0.25, random_state=0)
# Helper that makes it easy to cross-validate a model.
def evaluate_cross_validation(clf, X, y, K):
    """Print the K-fold cross-validation scores of ``clf`` and their mean.

    Args:
        clf: estimator (or pipeline) accepted by ``cross_val_score``.
        X: training samples.
        y: labels; ``len(y)`` is used as the number of samples.
        K: number of folds.

    Returns:
        None. The per-fold scores and a ``mean (+/- standard error)`` line
        are printed.
    """
    # KFold takes: number of samples, number of folds, whether to shuffle.
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # Evaluate with the folds defined above. For classification the score
    # defaults to accuracy; it can be changed to something else.
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores)))
# SVC with a linear kernel (different kernels are discussed later and can
# give very different results).
svc_linear = SVC(kernel='linear')

# 5-fold cross-validation, K = 5.
evaluate_cross_validation(svc_linear, X_train, y_train, K=5)
[ 0.93333333 0.86666667 0.91666667 0.93333333 0.91666667] Mean score: 0.913 (+/-0.012)
from sklearn.datasets import fetch_20newsgroups
# As with the faces data, the 20 newsgroups data is downloaded on demand by
# a fetch function.
news = fetch_20newsgroups(subset='all')
# Inspect the data: again dict-like, 18846 samples in total.
print('%s %s' % (len(news.data), len(news.target)))
print(news.target)
18846 18846 [10 3 17 ..., 3 1 7]
# Inspect the news categories and how many there are.
print(news.target_names)
print(len(news.target_names))
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] 20
# Again, 25% of the data is held out to test model performance.
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25)
print(len(X_train))
print(len(y_train))
print(len(X_test))
14134 14134 4712
# 许多原始数据无法直接被分类器所使用,图像可以直接使用pixel信息,文本则需要进一步处理成数值化的信息
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import *
from scipy.stats import sem
# On top of a naive Bayes classifier, compare several feature-extraction
# methods; Pipeline keeps the training flow short.
clf_1 = Pipeline([
    ('count_vec', CountVectorizer()),
    ('mnb', MultinomialNB()),
])
clf_2 = Pipeline([
    ('hash_vec', HashingVectorizer(non_negative=True)),
    ('mnb', MultinomialNB()),
])
clf_3 = Pipeline([
    ('tfidf_vec', TfidfVectorizer()),
    ('mnb', MultinomialNB()),
])
# Helper that makes it easy to cross-validate a model.
def evaluate_cross_validation(clf, X, y, K):
    """Print the K-fold cross-validation scores of ``clf`` and their mean.

    Args:
        clf: estimator (or pipeline) accepted by ``cross_val_score``.
        X: training samples.
        y: labels; ``len(y)`` is used as the number of samples.
        K: number of folds.

    Returns:
        None. The per-fold scores and a ``mean (+/- standard error)`` line
        are printed.
    """
    # KFold takes: number of samples, K, whether to shuffle.
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # Evaluate with the folds defined above. For classification the score
    # defaults to accuracy; it can be changed to something else.
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    print('Mean score: %.3f (+/-%.3f)' % (scores.mean(), sem(scores)))
# Cross-validate each candidate pipeline on the training split.
clfs = [clf_1, clf_2, clf_3]
for clf in clfs:
    evaluate_cross_validation(clf, X_train, y_train, 5)
[ 0.83516095 0.83374602 0.84471171 0.83622214 0.83227176] Mean score: 0.836 (+/-0.002) [ 0.76052352 0.72727273 0.77538026 0.74778918 0.75194621] Mean score: 0.753 (+/-0.008) [ 0.84435798 0.83409975 0.85496993 0.84082066 0.83227176] Mean score: 0.841 (+/-0.004)
# The results above show that the two commonly used feature-extraction
# methods perform about equally well. Pick one of them and try to improve
# further through finer feature filtering (English stop words are dropped).
clf_4 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB())])
evaluate_cross_validation(clf_4, X_train, y_train, 5)
[ 0.87053414 0.86664308 0.887867 0.87371772 0.86553432] Mean score: 0.873 (+/-0.004)
# Tuning the smoothing parameter of the naive Bayes classifier as well may
# push the performance up another notch.
clf_5 = Pipeline([('tfidf_vec_adv', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB(alpha=0.01))])
evaluate_cross_validation(clf_5, X_train, y_train, 5)
[ 0.90060134 0.89741776 0.91651928 0.90909091 0.90410474] Mean score: 0.906 (+/-0.003)
# pandas is introduced here to make the data handling easier.
import pandas as pd
import numpy as np

titanic = pd.read_csv(
    'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt'
)

# Take a look at the data: all kinds of features are present -- numeric,
# categorical, strings, and even missing values.
titanic.head()

# pandas keeps the data in its own DataFrame format (a 2-D table); info()
# shows the basic properties of every column.
titanic.info()
Int64Index: 1313 entries, 0 to 1312 Data columns (total 11 columns): row.names 1313 non-null int64 pclass 1313 non-null object survived 1313 non-null int64 name 1313 non-null object age 633 non-null float64 embarked 821 non-null object home.dest 754 non-null object room 77 non-null object ticket 69 non-null object boat 347 non-null object sex 1313 non-null object dtypes: float64(1), int64(2), object(8) memory usage: 123.1+ KB
# This survey data holds real personal and boarding information of the
# Titanic passengers; it is used to predict whether a passenger survived.
# There are 1313 records. Some features are complete (e.g. pclass, survived,
# name), some have missing values; some are numeric (age: float64), others
# are strings.
# Feature selection is a step beginners tend to neglect: it is
# time-consuming but very important, and it relies on background knowledge.
# From what we know about the disaster, sex, age and pclass are all likely
# to be key factors for survival.
# we keep pclass, age, sex.
# An explicit copy is taken so that later edits to X do not operate on a
# slice of `titanic`.
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
X.info()
Int64Index: 1313 entries, 0 to 1312 Data columns (total 3 columns): pclass 1313 non-null object age 633 non-null float64 sex 1313 non-null object dtypes: float64(1), object(2) memory usage: 41.0+ KB
# Remaining data-preparation tasks:
# 1) the age column has only 633 non-missing values;
# 2) sex and pclass are categorical and have to be turned into numeric
#    0/1 features.
# First fill in the missing ages; using the mean or the median is the
# strategy that distorts the model the least.
# The filled column is assigned into a new frame instead of calling
# fillna(inplace=True) on X['age'], which acts on a temporary column object
# and raises SettingWithCopyWarning.
X = X.assign(age=X['age'].fillna(X['age'].mean()))
C:\Anaconda2\lib\site-packages\pandas\core\generic.py:2748: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
# Inspect the frame again after the age column has been filled.
X.info()
Int64Index: 1313 entries, 0 to 1312 Data columns (total 3 columns): pclass 1313 non-null object age 1313 non-null float64 sex 1313 non-null object dtypes: float64(1), object(2) memory usage: 41.0+ KB
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
# Use scikit-learn's feature_extraction to vectorise the records.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
# 'records' is the full name of the orient; the abbreviation 'record' is not
# accepted by every pandas version.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)
# Every categorical feature is split out into its own column per value,
# while numeric features are kept unchanged.
['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']
# Vectorise the test records with the vectoriser fitted on the training data
# ('records' is the full orient name).
X_test = vec.transform(X_test.to_dict(orient='records'))
from sklearn.tree import DecisionTreeClassifier
# Shallow entropy-based decision tree.
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
dtc.fit(X_train, y_train)
# Score on the held-out test split.
dtc.score(X_test, y_test)
0.79331306990881456
from sklearn.ensemble import RandomForestClassifier

# Random forest with the same depth and leaf-size limits as the decision tree.
rfc = RandomForestClassifier(
    max_depth=3,
    min_samples_leaf=5,
)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)
0.77203647416413379
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with the same depth and leaf-size limits.
gbc = GradientBoostingClassifier(
    max_depth=3,
    min_samples_leaf=5,
)
gbc.fit(X_train, y_train)
gbc.score(X_test, y_test)
0.79027355623100304
from sklearn.metrics import classification_report
y_predict = gbc.predict(X_test)
# classification_report takes the true labels first and the predictions
# second; with the arguments swapped, precision and recall are exchanged.
print(classification_report(y_test, y_predict))
# This function produces a per-class performance report (precision, recall,
# f1-score, support); here the task is binary (survived 0 / 1).
precision recall f1-score support 0 0.93 0.78 0.84 241 1 0.57 0.83 0.68 88 avg / total 0.83 0.79 0.80 329
# Load the house-price data first.
from sklearn.datasets import load_boston

boston = load_boston()

# Check the size of the data.
print(boston.data.shape)
(506L, 13L)
# Getting to know what each feature means is a good habit too.
print(boston.feature_names)
print(boston.DESCR)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
'B' 'LSTAT']
Boston House Prices dataset
Notes
------
Data Set Characteristics:
:Number of Instances: 506
:Number of Attributes: 13 numeric/categorical predictive
:Median Value (attribute 14) is usually the target
:Attribute Information (in order):
- CRIM per capita crime rate by town
- ZN proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS proportion of non-retail business acres per town
- CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX nitric oxides concentration (parts per 10 million)
- RM average number of rooms per dwelling
- AGE proportion of owner-occupied units built prior to 1940
- DIS weighted distances to five Boston employment centres
- RAD index of accessibility to radial highways
- TAX full-value property-tax rate per $10,000
- PTRATIO pupil-teacher ratio by town
- B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT % lower status of the population
- MEDV Median value of owner-occupied homes in $1000's
:Missing Attribute Values: None
:Creator: Harrison, D. and Rubinfeld, D.L.
This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing
This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980. N.B. Various transformations are used in the table on
pages 244-261 of the latter.
The Boston house-price data has been used in many machine learning papers that address regression
problems.
**References**
- Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
- Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
- many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
# One extra step: check whether the data is already normalised (usually it
# is not) by looking at the range and mean of the target.
import numpy as np

print(np.max(boston.target))
print(np.min(boston.target))
print(np.mean(boston.target))
50.0 5.0 22.5328063241
from sklearn.cross_validation import train_test_split
# As before, split the data into a training and a test part.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = 0.25, random_state=33)
from sklearn.preprocessing import StandardScaler
# Normalisation keeps features with very different raw ranges from ending up
# with badly unbalanced weights after training.
scalerX = StandardScaler().fit(X_train)
X_train = scalerX.transform(X_train)
X_test = scalerX.transform(X_test)
# StandardScaler works on 2-D (n_samples, n_features) input, so the 1-D
# target is reshaped to a single column for scaling and flattened again
# afterwards.
scalery = StandardScaler().fit(y_train.reshape(-1, 1))
y_train = scalery.transform(y_train.reshape(-1, 1)).ravel()
y_test = scalery.transform(y_test.reshape(-1, 1)).ravel()
# Write the evaluation helper first. It still uses 5-fold cross-validation,
# but the metric is no longer accuracy: per the original author's note it is
# R2, which roughly says what share of the target the model can explain.
from sklearn.cross_validation import *


def train_and_evaluate(clf, X_train, y_train):
    """Print the mean 5-fold cross-validation score of ``clf``.

    Args:
        clf: estimator accepted by ``cross_val_score``.
        X_train: training samples; ``X_train.shape[0]`` is used as the
            number of samples.
        y_train: training targets.

    Returns:
        None. The mean of the five fold scores is printed.
    """
    cv = KFold(X_train.shape[0], 5, shuffle=True, random_state=33)
    scores = cross_val_score(clf, X_train, y_train, cv=cv)
    label = 'Average coefficient of determination using 5-fold cross validation:'
    print('%s %s' % (label, np.mean(scores)))
# Finally, let us see how many regression models can be used (there are
# actually more than these).
# Three of them are fairly representative.
# Start with a linear model, SGDRegressor.
from sklearn import linear_model
# There is a regularisation option `penalty`; with so few features (13 here)
# it probably does not make much difference.
clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
train_and_evaluate(clf_sgd, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.710809853468
# Switch the penalty of the SGD regressor to l2. The result barely changes:
# with so few features, regularisation has little to do.
clf_sgd_l2 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)
train_and_evaluate(clf_sgd_l2, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.71081206667
# Next, the SVM regressor (all default parameters).
from sklearn.svm import SVR

# The linear kernel brings no real improvement, but because there are few
# features we can consider mapping them to a higher dimension.
clf_svr = SVR(kernel='linear')
train_and_evaluate(clf_svr, X_train=X_train, y_train=y_train)
Average coefficient of determination using 5-fold cross validation: 0.707838419194
clf_svr_poly = SVR(kernel='poly')
# Raising the dimension helps noticeably, but use it with care: with many
# features the CPU cost becomes prohibitive (memory is the smaller problem).
# By this point we can no longer directly interpret what the features mean.
train_and_evaluate(clf_svr_poly, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.779288545488
clf_svr_rbf = SVR(kernel='rbf')
# RBF (the radial basis function kernel) does even better.
train_and_evaluate(clf_svr_rbf, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.833662221567
# Now something stronger: an extremely randomized trees regressor.
from sklearn import ensemble
clf_et = ensemble.ExtraTreesRegressor()
train_and_evaluate(clf_et, X_train, y_train)
Average coefficient of determination using 5-fold cross validation: 0.853006383633
# Finally, check the performance on the test set: refit on the whole
# training split, then score on the held-out part.
clf_et.fit(X_train, y_train)
clf_et.score(X_test, y_test)
0.83781467779895469
import numpy as np
# A small warm-up first.
M = np.array([[1, 2], [2, 4]])
M
# Rank of M; the second row is twice the first, so the rank is 1.
np.linalg.matrix_rank(M, tol=None)
# Load the pixel data of the handwritten-digit images. For image processing,
# apart from the various heuristic feature extractors applied later, the
# most direct and common features are the pixels themselves: every pixel is
# a number that encodes its colour.
from sklearn.datasets import load_digits
digits = load_digits()
# These classic data sets are stored in a very uniform format. That is a
# good habit: a common interface makes them quick to use.
digits
# The usual routine: separate features and labels.
X_digits, y_digits = digits.data, digits.target
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

# The key parameter is n_components: keep 2 principal components.
estimator = PCA(n_components=2)
X_pca = estimator.fit_transform(X_digits)
# scikit-learn's interfaces are designed very consistently.
# Clustering problems often need an intuitive view of the data, and that is
# one direct purpose of dimensionality reduction, so a few plots are shown.


def plot_pca_scatter():
    """Scatter-plot the two PCA components of the digits, one colour per digit.

    Reads the module-level ``X_pca``, ``y_digits`` and ``digits`` and shows
    the figure with ``plt.show()``. Returns None.
    """
    colors = ['black', 'blue', 'purple', 'yellow', 'white', 'red', 'lime', 'cyan', 'orange', 'gray']
    for i, color in enumerate(colors):
        # Select the samples whose label equals the colour's position.
        mask = y_digits == i
        plt.scatter(X_pca[mask, 0], X_pca[mask, 1], c=color)
    plt.legend(digits.target_names)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()


plot_pca_scatter()
# This code has the same effect as chapter 4 of the original book, but
# making full use of pandas is more concise, so it was rewritten to be
# clearer and shorter.
import pandas as pd
import numpy as np

titanic = pd.read_csv(
    'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt'
)
print(titanic.info())

# Still the same data set.
titanic.head()
Int64Index: 1313 entries, 0 to 1312 Data columns (total 11 columns): row.names 1313 non-null int64 pclass 1313 non-null object survived 1313 non-null int64 name 1313 non-null object age 633 non-null float64 embarked 821 non-null object home.dest 754 non-null object room 77 non-null object ticket 69 non-null object boat 347 non-null object sex 1313 non-null object dtypes: float64(1), int64(2), object(8) memory usage: 123.1+ KB None
# Drop columns that are too specific to each passenger to reveal common
# patterns (row.names, name) and separate out the column to predict.
y = titanic['survived']
X = titanic.drop(
    ['row.names', 'name', 'survived'],
    axis=1,
)
# For the continuous numeric feature, fill in the missing values. The filled
# column is assigned back instead of calling fillna(inplace=True) on
# X['age'], which acts on a temporary column object.
X['age'] = X['age'].fillna(X['age'].mean())
# Every remaining missing value gets the placeholder category 'UNKNOWN'.
X.fillna('UNKNOWN', inplace=True)
# The remaining categorical data is vectorised directly; this way a blank
# value in a column is treated as a feature of its own.
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
# 'records' is the full name of the orient; the abbreviation 'record' is not
# accepted by every pandas version.
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
print(len(vec.feature_names_))
474
# Show the vectorised training matrix as a dense array.
X_train.toarray()
array([[ 31.19418104, 0. , 0. , ..., 0. , 0. , 1. ], [ 31.19418104, 0. , 0. , ..., 0. , 0. , 0. ], [ 31.19418104, 0. , 0. , ..., 0. , 0. , 1. ], ..., [ 12. , 0. , 0. , ..., 0. , 0. , 1. ], [ 18. , 0. , 0. , ..., 0. , 0. , 1. ], [ 31.19418104, 0. , 0. , ..., 0. , 0. , 1. ]])
from sklearn.tree import DecisionTreeClassifier

# Test accuracy when all features are used.
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
dt.score(X_test, y_test)
# 采用所有特征的测试精度
0.81762917933130697
from sklearn import feature_selection

# Test accuracy when only the 20% most predictive features (chi2) are kept.
fs = feature_selection.SelectPercentile(
    feature_selection.chi2,
    percentile=20,
)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)
# 采用20%高预测性特征的测试精度
0.82370820668693012
from sklearn.cross_validation import cross_val_score
import pylab as pl

# Try every odd percentile from 1 to 99 and record the mean 5-fold
# cross-validation score obtained with the features selected at that level.
percentiles = range(1, 100, 2)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    results.append(scores.mean())
results = np.array(results)
print(results)

# Position of the best score; argmax returns a single index (the first one
# on ties), so it can be used directly to index `percentiles`.
opt = np.argmax(results)
# Note that the value printed is a percentile, not a feature count.
print('Optimal number of features %d' % percentiles[opt])

pl.plot(percentiles, results)
pl.show()
[ 0.85063904 0.85673057 0.87501546 0.88622964 0.86590394 0.87097506 0.87303649 0.86997526 0.87097506 0.87300557 0.86997526 0.86893424 0.87098536 0.86490414 0.86385281 0.86791383 0.86488353 0.86892393 0.86791383 0.86284271 0.86487322 0.86792414 0.86894455 0.87303649 0.86892393 0.86998557 0.86689342 0.86488353 0.86895485 0.86689342 0.87198516 0.8638322 0.86488353 0.87402597 0.87299526 0.87098536 0.86997526 0.86892393 0.86794475 0.86486291 0.87096475 0.86587302 0.86387343 0.86083282 0.86589363 0.8608019 0.86492476 0.85774067 0.8608122 0.85779221] Optimal number of features 7
from sklearn import feature_selection

# Test accuracy with the best feature percentile found by the search.
fs = feature_selection.SelectPercentile(
    feature_selection.chi2,
    percentile=7,
)
X_train_fs = fs.fit_transform(X_train, y_train)
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
dt.score(X_test_fs, y_test)
# 选取搜索到的最好特征比例的测试精度
0.8571428571428571
# 由此可见,这个技术对于工程上提升精度还是非常有帮助的。
from sklearn.datasets import fetch_20newsgroups
import numpy as np
# Load the complete 20 newsgroups data set for the grid-search experiments.
news = fetch_20newsgroups(subset='all')
# First use the single-core version of grid search.
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
# Only the first 3000 documents are used; 25% of them are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
# The two hyper-parameters under test take 4 and 3 values respectively:
# svc__gamma ranges over 10^-2, 10^-1, ..., 10^1 and svc__C over 10^-1 ... 10^1.
# That gives 12 hyper-parameter combinations, i.e. 12 different models.
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
# Each model is cross-validated 3 times, so 36 fits are needed in total.
# According to the output below, one fit takes about 5 seconds on a single thread.
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3)
# NOTE(review): %time is an IPython magic; this line only runs inside
# IPython / Jupyter, not as plain Python.
%time _=gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
# Score of the refitted best model on the held-out test split.
print gs.score(X_test, y_test)
Fitting 3 folds for each of 12 candidates, totalling 36 fits [CV] svc__gamma=0.01, svc__C=0.1 ..................................... [CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.1s [CV] svc__gamma=0.01, svc__C=0.1 ..................................... [CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.3s [CV] svc__gamma=0.01, svc__C=0.1 ..................................... [CV] ............................ svc__gamma=0.01, svc__C=0.1 - 5.2s [CV] svc__gamma=0.1, svc__C=0.1 ...................................... [CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.1s [CV] svc__gamma=0.1, svc__C=0.1 ...................................... [CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.2s [CV] svc__gamma=0.1, svc__C=0.1 ...................................... [CV] ............................. svc__gamma=0.1, svc__C=0.1 - 5.3s [CV] svc__gamma=1.0, svc__C=0.1 ...................................... [CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.7s [CV] svc__gamma=1.0, svc__C=0.1 ...................................... [CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.8s [CV] svc__gamma=1.0, svc__C=0.1 ...................................... [CV] ............................. svc__gamma=1.0, svc__C=0.1 - 5.9s [CV] svc__gamma=10.0, svc__C=0.1 ..................................... [CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.4s [CV] svc__gamma=10.0, svc__C=0.1 ..................................... [CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.5s [CV] svc__gamma=10.0, svc__C=0.1 ..................................... [CV] ............................ svc__gamma=10.0, svc__C=0.1 - 5.5s [CV] svc__gamma=0.01, svc__C=1.0 ..................................... [CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.2s [CV] svc__gamma=0.01, svc__C=1.0 ..................................... [CV] ............................ 
svc__gamma=0.01, svc__C=1.0 - 5.3s [CV] svc__gamma=0.01, svc__C=1.0 ..................................... [CV] ............................ svc__gamma=0.01, svc__C=1.0 - 5.3s [CV] svc__gamma=0.1, svc__C=1.0 ...................................... [CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.2s [CV] svc__gamma=0.1, svc__C=1.0 ...................................... [CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.3s [CV] svc__gamma=0.1, svc__C=1.0 ...................................... [CV] ............................. svc__gamma=0.1, svc__C=1.0 - 5.4s [CV] svc__gamma=1.0, svc__C=1.0 ...................................... [CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.3s [CV] svc__gamma=1.0, svc__C=1.0 ...................................... [CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.4s [CV] svc__gamma=1.0, svc__C=1.0 ...................................... [CV] ............................. svc__gamma=1.0, svc__C=1.0 - 5.5s [CV] svc__gamma=10.0, svc__C=1.0 ..................................... [CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.4s [CV] svc__gamma=10.0, svc__C=1.0 ..................................... [CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.3s [CV] svc__gamma=10.0, svc__C=1.0 ..................................... [CV] ............................ svc__gamma=10.0, svc__C=1.0 - 5.4s [CV] svc__gamma=0.01, svc__C=10.0 .................................... [CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.2s [CV] svc__gamma=0.01, svc__C=10.0 .................................... [CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.2s [CV] svc__gamma=0.01, svc__C=10.0 .................................... [CV] ........................... svc__gamma=0.01, svc__C=10.0 - 5.3s [CV] svc__gamma=0.1, svc__C=10.0 ..................................... [CV] ............................ 
svc__gamma=0.1, svc__C=10.0 - 5.3s [CV] svc__gamma=0.1, svc__C=10.0 ..................................... [CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.4s [CV] svc__gamma=0.1, svc__C=10.0 ..................................... [CV] ............................ svc__gamma=0.1, svc__C=10.0 - 5.4s [CV] svc__gamma=1.0, svc__C=10.0 ..................................... [CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.3s [CV] svc__gamma=1.0, svc__C=10.0 ..................................... [CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.5s [CV] svc__gamma=1.0, svc__C=10.0 ..................................... [CV] ............................ svc__gamma=1.0, svc__C=10.0 - 5.7s [CV] svc__gamma=10.0, svc__C=10.0 .................................... [CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.6s [CV] svc__gamma=10.0, svc__C=10.0 .................................... [CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.6s [CV] svc__gamma=10.0, svc__C=10.0 .................................... [CV] ........................... svc__gamma=10.0, svc__C=10.0 - 5.9s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 5.1s [Parallel(n_jobs=1)]: Done 36 out of 36 | elapsed: 3.3min finished
Wall time: 3min 27s 0.822666666667
# Then run the same search in parallel (n_jobs=-1) and observe how much the
# run time improves.
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(news.data[:3000], news.target[:3000], test_size=0.25, random_state=33)
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
clf = Pipeline([('vect', TfidfVectorizer(stop_words='english', analyzer='word')), ('svc', SVC())])
parameters = {'svc__gamma': np.logspace(-2, 1, 4), 'svc__C': np.logspace(-1, 1, 3)}
gs = GridSearchCV(clf, parameters, verbose=2, refit=True, cv=3, n_jobs=-1)
# NOTE(review): %time is an IPython magic; this line only runs inside
# IPython / Jupyter, not as plain Python.
%time _=gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
# The parallel search finds the same optimum, but the training time drops
# roughly in proportion to the number of CPU cores.
print gs.score(X_test, y_test)
[Parallel(n_jobs=-1)]: Done 1 jobs | elapsed: 8.4s [Parallel(n_jobs=-1)]: Done 22 out of 36 | elapsed: 30.3s remaining: 19.2s [Parallel(n_jobs=-1)]: Done 36 out of 36 | elapsed: 46.8s finished
Fitting 3 folds for each of 12 candidates, totalling 36 fits Wall time: 56.5 s 0.822666666667
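# For reference, this is the configuration of the machine that produced the
# results above, so that readers can put the parallel speed-up in context:
#   CPU: i7, quad-core, 2.4 GHz
#   Memory: DDR3-1600, 32 GB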