第一部分:三种分类算法(Pipeline)的性能比较
代码:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

# Render figures on screen and on disk at 200 dpi.
plt.rcParams.update({'figure.dpi': 200, 'savefig.dpi': 200})

# Default font for all figure text.
font = dict(family='Times New Roman', weight='light')
plt.rc("font", **font)
# Section 1: Load data and split data into train/test datasets
iris = datasets.load_iris()

# Drop the first 50 samples and keep only feature columns 1 and 2, so the
# remaining data is a two-class problem described by two features.
X = iris.data[50:, [1, 2]]
y = iris.target[50:]

# Re-encode the remaining class labels as consecutive integers starting at 0.
le = LabelEncoder()
y = le.fit_transform(y)

# 50/50 split, stratified on y so both halves keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=1,
    stratify=y,
)
# Section 2: Model performance among different classifiers
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
import numpy as np

# The three base learners that are compared (and later combined).
clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=1)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and KNN are wrapped in a pipeline that standardizes the
# features first; the decision tree is used without scaling.
pipe1 = Pipeline([["sc", StandardScaler()], ["clf", clf1]])
pipe3 = Pipeline([["sc", StandardScaler()], ["clf", clf3]])

clf_labels = ["Logistic Regression", "Decision Tree", "KNN"]

print("10-fold Cross Validation:")
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    # One ROC AUC value per fold, computed on the training half only.
    scores = cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        cv=10,
        scoring="roc_auc",
    )
    print("ROC AUC: %.2f (+/- %.2f) [%s]" % (np.mean(scores), np.std(scores), label))
结果:
10-fold Cross Validation:
ROC AUC: 0.87 (+/- 0.17) [Logistic Regression]
ROC AUC: 0.89 (+/- 0.16) [Decision Tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]
第二部分:多数投票方式
代码
# Section 3: Combine individual classifiers via majority voting
from sklearn.ensemble import VotingClassifier

# Notes on VotingClassifier.transform (class labels or probabilities for X
# from each estimator):
#   voting='soft', flatten_transform=True:
#       array-like of shape (n_classifiers, n_samples * n_classes), the class
#       probabilities calculated by each classifier.
#   voting='soft', flatten_transform=False:
#       array-like of shape (n_classifiers, n_samples, n_classes).
#   voting='hard':
#       array-like of shape (n_samples, n_classifiers), the class labels
#       predicted by each classifier.

# Soft-voting ensemble built from the three base learners.
mv_clf = VotingClassifier(
    estimators=[('pipe1', pipe1), ('clf2', clf2), ('pipe3', pipe3)],
    voting='soft',
)

clf_labels += ['Majority Voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        cv=10,
        scoring="roc_auc",
    )
    # The metric requested above is ROC AUC, so it is reported under that
    # name (the previous "Accuracy" label was misleading).
    print("ROC AUC: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))
结果(注:交叉验证使用的是 scoring="roc_auc",因此下面输出中标为 "Accuracy" 的数值实际是 ROC AUC,而不是准确率)
Accuracy: 0.87 (+/- 0.17) [Logistic Regression]
Accuracy: 0.89 (+/- 0.16) [Decision Tree]
Accuracy: 0.88 (+/- 0.15) [KNN]
Accuracy: 0.94 (+/- 0.13) [Majority Voting]
对比上述结果可知,多数投票(软投票)集成分类器的 ROC AUC 均值最高(0.94),标准差最小(0.13),其性能优于任一单个分类器,结果也更稳定。
第三部分:ROC 曲线
在第一、二部分的基础上(需要用到第二部分定义的 all_clf 和 clf_labels),进一步添加如下代码。
代码:
# Section 4: Evaluate the classifiers on the test set with ROC curves
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

colors = ['black', 'orange', 'blue', 'green']
linestyle = [':', '--', '-.', '-']

print("\n")
for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyle):
    # Fit on the training half, then score the held-out half.
    clf.fit(X_train, y_train)
    # Assume the label of the positive class is 1: take that column of the
    # predicted probabilities as the ranking score.
    y_pred = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
    roc_auc = auc(fpr, tpr)
    plt.plot(
        fpr, tpr,
        color=clr,
        linestyle=ls,
        label='%s (auc=%.2f)' % (label, roc_auc),
    )

plt.legend(loc='lower right')

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)

plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel("True Positive Rate (TPR)")

# Save before show().
plt.savefig('./fig1.png')
plt.show()
第四部分:超参调优
在上述各部分的基础上(需要用到第二部分定义的 mv_clf),进一步添加如下部分代码片段。
代码
# Section 5: Hyperparameter tuning via grid search
# List every tunable parameter name of the ensemble; nested parameters are
# addressed as "<estimator name>__<parameter>".
print(mv_clf.get_params())
from sklearn.model_selection import GridSearchCV

# Grid over the decision tree depth and the logistic regression C.
params = {
    'clf2__max_depth': [1, 2, 4],
    'pipe1__clf__C': [0.001, 0.01, 10],
}
grid = GridSearchCV(
    estimator=mv_clf,
    param_grid=params,
    cv=10,
    scoring='roc_auc',
)
grid.fit(X_train, y_train)

# Mean and standard deviation of the cross-validated ROC AUC per combination.
# The loop variable is named `combo` so it does not overwrite the `params`
# grid defined above.
cv_results = grid.cv_results_
for combo, mean_score, std_score in zip(cv_results['params'],
                                        cv_results['mean_test_score'],
                                        cv_results['std_test_score']):
    print("%0.3f+/-%.2f %r" % (mean_score, std_score, combo))

print("Best Parameters: %s" % grid.best_params_)
# best_score_ is the mean cross-validated score (ROC AUC here) of the best
# combination, not an accuracy measured on the training set.
print("Best CV ROC AUC: %.2f" % grid.best_score_)
结果
超参名称获取:
mv_clf.get_params()
Out[2]:
{
'estimators': [('pipe1', Pipeline(memory=None,
steps=[['sc',
StandardScaler(copy=True, with_mean=True, with_std=True)],
['clf',
LogisticRegression(C=0.001, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=1, solver='warn',
tol=0.0001, verbose=0, warm_start=False)]],
verbose=False)),
('clf2',
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=1, splitter='best')),
('pipe3', Pipeline(memory=None,
steps=[['sc',
StandardScaler(copy=True, with_mean=True, with_std=True)],
['clf',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=1, p=2,
weights='uniform')]],
verbose=False))],
'flatten_transform': True,
'n_jobs': None,
'voting': 'soft',
'weights': None,
'pipe1': Pipeline(memory=None,
steps=[['sc',
StandardScaler(copy=True, with_mean=True, with_std=True)],
['clf',
LogisticRegression(C=0.001, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=1, solver='warn',
tol=0.0001, verbose=0, warm_start=False)]],
verbose=False),
'clf2': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False,
random_state=1, splitter='best'),
'pipe3': Pipeline(memory=None,
steps=[['sc',
StandardScaler(copy=True, with_mean=True, with_std=True)],
['clf',
KNeighborsClassifier(algorithm='auto', leaf_size=30,
metric='minkowski', metric_params=None,
n_jobs=None, n_neighbors=1, p=2,
weights='uniform')]],
verbose=False),
'pipe1__memory': None,
'pipe1__steps': [['sc',
StandardScaler(copy=True, with_mean=True, with_std=True)],
['clf',
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=1, solver='warn', tol=0.0001, verbose=0,
warm_start=False)]],
'pipe1__verbose': False,
'pipe1__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
'pipe1__clf': LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=1, solver='warn', tol=0.0001, verbose=0,
warm_start=False),
'pipe1__sc__copy': True,
'pipe1__sc__with_mean': True,
'pipe1__sc__with_std': True,
'pipe1__clf__C': 0.001,
'pipe1__clf__class_weight': None,
'pipe1__clf__dual': False,
'pipe1__clf__fit_intercept': True,
'pipe1__clf__intercept_scaling': 1,
'pipe1__clf__l1_ratio': None,
'pipe1__clf__max_iter': 100,
'pipe1__clf__multi_class': 'warn',
'pipe1__clf__n_jobs': None,
'pipe1__clf__penalty': 'l2',
'pipe1__clf__random_state': 1,
'pipe1__clf__solver': 'warn',
'pipe1__clf__tol': 0.0001,
'pipe1__clf__verbose': 0,
'pipe1__clf__warm_start': False,
'clf2__class_weight': None,
'clf2__criterion': 'entropy',
'clf2__max_depth': 1,
'clf2__max_features': None,
'clf2__max_leaf_nodes': None,
'clf2__min_impurity_decrease': 0.0,
'clf2__min_impurity_split': None,
'clf2__min_samples_leaf': 1,
'clf2__min_samples_split': 2,
'clf2__min_weight_fraction_leaf': 0.0,
'clf2__presort': False,
'clf2__random_state': 1,
'clf2__splitter': 'best',
'pipe3__memory': None,
'pipe3__steps': [['sc',
StandardScaler(copy=True, with_mean=True, with_std=True)],
['clf',
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=1, p=2,
weights='uniform')]],
'pipe3__verbose': False,
'pipe3__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
'pipe3__clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=1, p=2,
weights='uniform'),
'pipe3__sc__copy': True,
'pipe3__sc__with_mean': True,
'pipe3__sc__with_std': True,
'pipe3__clf__algorithm': 'auto',
'pipe3__clf__leaf_size': 30,
'pipe3__clf__metric': 'minkowski',
'pipe3__clf__metric_params': None,
'pipe3__clf__n_jobs': None,
'pipe3__clf__n_neighbors': 1,
'pipe3__clf__p': 2,
'pipe3__clf__weights': 'uniform'}
各参数组合的寻优结果,包含 10 折交叉验证 ROC AUC 的均值和标准差。
0.933+/-0.14 {
'clf2__max_depth': 1, 'pipe1__clf__C': 0.001}
0.947+/-0.14 {
'clf2__max_depth': 1, 'pipe1__clf__C': 0.01}
0.973+/-0.07 {
'clf2__max_depth': 1, 'pipe1__clf__C': 10}
0.947+/-0.14 {
'clf2__max_depth': 2, 'pipe1__clf__C': 0.001}
0.947+/-0.14 {
'clf2__max_depth': 2, 'pipe1__clf__C': 0.01}
0.973+/-0.07 {
'clf2__max_depth': 2, 'pipe1__clf__C': 10}
0.933+/-0.14 {
'clf2__max_depth': 4, 'pipe1__clf__C': 0.001}
0.947+/-0.14 {
'clf2__max_depth': 4, 'pipe1__clf__C': 0.01}
0.973+/-0.07 {
'clf2__max_depth': 4, 'pipe1__clf__C': 10}
Best Parameters: {
'clf2__max_depth': 1, 'pipe1__clf__C': 10}
Accuracy in Train: 0.97
(注:该值为 grid.best_score_,即最优参数组合在 10 折交叉验证下的平均 ROC AUC,并非训练集上的准确率。)
参考文献
Sebastian Raschka, Vahid Mirjalili. Python机器学习第二版. 南京:东南大学出版社,2018.
附录
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

# Render figures on screen and on disk at 200 dpi.
plt.rcParams.update({'figure.dpi': 200, 'savefig.dpi': 200})

# Default font for all figure text.
font = dict(family='Times New Roman', weight='light')
plt.rc("font", **font)
# Section 1: Load data and split data into train/test datasets
iris = datasets.load_iris()

# Drop the first 50 samples and keep only feature columns 1 and 2, so the
# remaining data is a two-class problem described by two features.
X = iris.data[50:, [1, 2]]
y = iris.target[50:]

# Re-encode the remaining class labels as consecutive integers starting at 0.
le = LabelEncoder()
y = le.fit_transform(y)

# 50/50 split, stratified on y so both halves keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=1,
    stratify=y,
)
# Section 2: Model performance among different classifiers
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline
import numpy as np

# The three base learners that are compared (and later combined).
clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=1)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and KNN are wrapped in a pipeline that standardizes the
# features first; the decision tree is used without scaling.
pipe1 = Pipeline([["sc", StandardScaler()], ["clf", clf1]])
pipe3 = Pipeline([["sc", StandardScaler()], ["clf", clf3]])

clf_labels = ["Logistic Regression", "Decision Tree", "KNN"]

print("10-fold Cross Validation:")
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    # One ROC AUC value per fold, computed on the training half only.
    scores = cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        cv=10,
        scoring="roc_auc",
    )
    print("ROC AUC: %.2f (+/- %.2f) [%s]" % (np.mean(scores), np.std(scores), label))
# Section 3: Combine individual classifiers via majority voting
from sklearn.ensemble import VotingClassifier

# Notes on VotingClassifier.transform (class labels or probabilities for X
# from each estimator):
#   voting='soft', flatten_transform=True:
#       array-like of shape (n_classifiers, n_samples * n_classes), the class
#       probabilities calculated by each classifier.
#   voting='soft', flatten_transform=False:
#       array-like of shape (n_classifiers, n_samples, n_classes).
#   voting='hard':
#       array-like of shape (n_samples, n_classifiers), the class labels
#       predicted by each classifier.

# Soft-voting ensemble built from the three base learners.
mv_clf = VotingClassifier(
    estimators=[('pipe1', pipe1), ('clf2', clf2), ('pipe3', pipe3)],
    voting='soft',
)

clf_labels += ['Majority Voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(
        estimator=clf,
        X=X_train,
        y=y_train,
        cv=10,
        scoring="roc_auc",
    )
    # The metric requested above is ROC AUC, so it is reported under that
    # name (the previous "Accuracy" label was misleading).
    print("ROC AUC: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))
# Section 4: Evaluate the classifiers on the test set with ROC curves
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

colors = ['black', 'orange', 'blue', 'green']
linestyle = [':', '--', '-.', '-']

print("\n")
for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyle):
    # Fit on the training half, then score the held-out half.
    clf.fit(X_train, y_train)
    # Assume the label of the positive class is 1: take that column of the
    # predicted probabilities as the ranking score.
    y_pred = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
    roc_auc = auc(fpr, tpr)
    plt.plot(
        fpr, tpr,
        color=clr,
        linestyle=ls,
        label='%s (auc=%.2f)' % (label, roc_auc),
    )

plt.legend(loc='lower right')

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)

plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel("True Positive Rate (TPR)")

# Save before show().
plt.savefig('./fig1.png')
plt.show()
# Section 5: Hyperparameter tuning via grid search
# List every tunable parameter name of the ensemble; nested parameters are
# addressed as "<estimator name>__<parameter>".
print(mv_clf.get_params())
from sklearn.model_selection import GridSearchCV

# Grid over the decision tree depth and the logistic regression C.
params = {
    'clf2__max_depth': [1, 2, 4],
    'pipe1__clf__C': [0.001, 0.01, 10],
}
grid = GridSearchCV(
    estimator=mv_clf,
    param_grid=params,
    cv=10,
    scoring='roc_auc',
)
grid.fit(X_train, y_train)

# Mean and standard deviation of the cross-validated ROC AUC per combination.
# The loop variable is named `combo` so it does not overwrite the `params`
# grid defined above.
cv_results = grid.cv_results_
for combo, mean_score, std_score in zip(cv_results['params'],
                                        cv_results['mean_test_score'],
                                        cv_results['std_test_score']):
    print("%0.3f+/-%.2f %r" % (mean_score, std_score, combo))

print("Best Parameters: %s" % grid.best_params_)
# best_score_ is the mean cross-validated score (ROC AUC here) of the best
# combination, not an accuracy measured on the training set.
print("Best CV ROC AUC: %.2f" % grid.best_score_)