加载数据
import pandas as pd

# Load the UCI Wine data set. The file has no header row, so the column
# names are supplied explicitly: the class label first, then 13 features.
df_wine = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label', 'Alcohol',
        'Malic acid', 'Ash',
        'Alcalinity of ash', 'Magnesium',
        'Total phenols', 'Flavanoids',
        'Nonflavanoid phenols', 'Proanthocyanins',
        'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ],
)

# Target vector: the class label of every sample as a NumPy array.
y = df_wine['Class label'].values
特征选择
为了方便后面的可视化,我们只选取2个特征,依据各自变量与因变量y的相关系数来选择
# Rank every column by its Pearson correlation with the class label.
# pearsonr returns the correlation coefficient together with a p-value;
# a small p-value (e.g. p < 0.01) indicates a statistically significant
# correlation between the two variables.
from scipy.stats import pearsonr

lable = df_wine.values[:, 0]

# Each entry is [pearsonr result, column index]; sorting therefore orders
# the entries by correlation coefficient, ascending.
lr = sorted(
    [pearsonr(lable, column), index]
    for index, column in enumerate(df_wine.values.T)
)

# lr[0] is the most negatively correlated column. The label column itself is
# part of the ranking and correlates perfectly with itself, so it sorts last;
# lr[-2] is therefore the most positively correlated feature.
X = df_wine[[df_wine.columns[lr[0][1]], df_wine.columns[lr[-2][1]]]].values
还可以通过PCA降维来选择,本例降维后分类效果并不好
# Reduce the feature columns to their first two principal components.
# (The original snippet here was a verbatim copy of the Pearson-based
# selection above and performed no PCA at all.)
from sklearn.decomposition import PCA

# Feature matrix: every column except the class label in column 0.
X = df_wine.iloc[:, 1:].values

# NOTE: PCA is variance-based and the features are not standardized here;
# scaling them first (e.g. with StandardScaler) may give a different projection.
pca = PCA(n_components=2)
X = pca.fit_transform(X)
因为这里有标签,还可以通过LDA来降维选择,效果比较好,数据分类达到100%正确
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Feature matrix: all columns after the class label in column 0.
X = df_wine.iloc[:, 1:].values

# Supervised reduction to two discriminant axes, fitted and applied on the
# same samples.
# NOTE(review): the projection is fitted on all samples before the
# train/test split performed later, so test labels influence it.
lda = LinearDiscriminantAnalysis(n_components=2)
X = lda.fit_transform(X, y)
调参,这里只调一个决策树深度参数
from sklearn.preprocessing import LabelEncoder
# train_test_split and GridSearchCV live in sklearn.model_selection; the old
# sklearn.cross_validation / sklearn.grid_search modules were removed.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1)

# Tune only the tree depth: try depths 1..19 with 10-fold cross-validation.
param_test1 = {'max_depth': list(range(1, 20))}
gsearch1 = GridSearchCV(
    estimator=DecisionTreeClassifier(criterion="entropy", random_state=10),
    param_grid=param_test1,
    cv=10)
gsearch1.fit(X_train, y_train)

# Per-candidate scores are available in gsearch1.cv_results_.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(gsearch1.best_params_)
print(gsearch1.best_score_)
输出
{'max_depth': 8}
0.822580645161
度量单个决策树的准确性
# Baseline: accuracy of a single decision tree grown to the depth chosen by
# the grid search.
from sklearn.metrics import accuracy_score

best_depth = gsearch1.best_params_['max_depth']
# NOTE(review): no random_state is passed here, unlike the estimator used in
# the grid search, so the reported accuracies may vary between runs.
tree = DecisionTreeClassifier(criterion="entropy", max_depth=best_depth)
tree = tree.fit(X_train, y_train)

# Predict on both splits, then score each against its true labels.
y_train_pred, y_test_pred = tree.predict(X_train), tree.predict(X_test)
tree_train, tree_test = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))
Decision tree train/test accuracies 0.984/0.815
# Build a bagging ensemble of 50 decision trees; see the scikit-learn
# documentation for the full list of parameters.
# The base tree is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot holds the base estimator in both old and new versions.
bag = BaggingClassifier(tree, n_estimators=50,
                        max_samples=1.0, max_features=1.0,
                        bootstrap=True, bootstrap_features=False,
                        n_jobs=1, random_state=1)

# Measure the accuracy of the bagging classifier on both splits.
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test))
Bagging分类器的效果的确要比单个决策树的效果好,提高了一点
Bagging train/test accuracies 1.000/0.852
Boosting分类器:Bagging是投票平均模式,而Boosting是串行地训练弱分类器,每一轮加大被前一轮错分样本的权重,最后按各分类器的权重组合结果
# AdaBoostClassifier was never imported in the original snippet, which made
# this block fail with a NameError.
from sklearn.ensemble import AdaBoostClassifier

# Boost 1000 rounds on top of the tuned decision tree.
# The base tree is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot holds the base estimator in both old and new versions.
ada = AdaBoostClassifier(tree, n_estimators=1000, learning_rate=0.1, random_state=0)

# Measure the accuracy of the boosted classifier on both splits.
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))