Machine Learning: Majority Voting with MajorityVotingClassifier

Section I: Code Bundle and Result Analyses

Part I: Performance Comparison of Three Classifier Pipelines

Code

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

plt.rcParams['figure.dpi']=200
plt.rcParams['savefig.dpi']=200
font = {'family': 'Times New Roman',
        'weight': 'light'}
plt.rc("font", **font)

#Section 1: Load data and split data into train/test datasets
iris=datasets.load_iris()
X,y=iris.data[50:,[1,2]],iris.target[50:]
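#Rows 50: keep only classes 1 and 2 (versicolor, virginica), giving a binary
#problem suited to ROC AUC; columns [1,2] are sepal width and petal length.
#LabelEncoder below remaps the class labels {1,2} to {0,1}.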
le=LabelEncoder()
y=le.fit_transform(y)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.5,random_state=1,stratify=y)

#Section 2: Model performance among different classifiers
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy as np

clf1=LogisticRegression(penalty='l2',
                        C=0.001,
                        random_state=1)
clf2=DecisionTreeClassifier(max_depth=1,
                            criterion='entropy',
                            random_state=1)
clf3=KNeighborsClassifier(n_neighbors=1,
                          p=2,
                          metric='minkowski')

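#LogisticRegression and KNN are sensitive to feature scale, so both are wrapped
#with a StandardScaler in a Pipeline; the decision tree is scale-invariant and
#is used as-is.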
pipe1=Pipeline([["sc",StandardScaler()],["clf",clf1]])
pipe3=Pipeline([["sc",StandardScaler()],["clf",clf3]])
clf_labels=["Logistic Regression","Decision Tree","KNN"]

print("10-fold Cross Validation:")
for clf,label in zip([pipe1,clf2,pipe3],clf_labels):
    scores=cross_val_score(estimator=clf,
                           X=X_train,
                           y=y_train,
                           cv=10,
                           scoring="roc_auc")
    print("ROC AUC: %.2f (+/- %.2f) [%s]" % (scores.mean(),scores.std(),label))

Results

10-fold Cross Validation:
ROC AUC: 0.87 (+/- 0.17) [Logistic Regression]
ROC AUC: 0.89 (+/- 0.16) [Decision Tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]

Part II: Majority Voting

Code

#Section 3: Combine individual classifiers via majority voting
from sklearn.ensemble import VotingClassifier

"""Return class labels or probabilities for X for each estimator.
probabilities_or_labels
    If `voting='soft'` and `flatten_transform=True`:
        returns array-like of shape (n_classifiers, n_samples *
        n_classes), being class probabilities calculated by each
        classifier.
    If `voting='soft' and `flatten_transform=False`:
        array-like of shape (n_classifiers, n_samples, n_classes)
    If `voting='hard'`:
        array-like of shape (n_samples, n_classifiers), being
        class labels predicted by each classifier.
"""
mv_clf=VotingClassifier(estimators=[('pipe1',pipe1),('clf2',clf2),('pipe3',pipe3)],
                        voting='soft')
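
#Optional sanity check of the transform() shapes quoted above: with the
#50-sample test split, 3 base classifiers, and 2 classes used here, soft
#voting with flatten_transform=True (the default) stacks each classifier's
#class probabilities column-wise, one row per sample
mv_clf.fit(X_train,y_train)
print(mv_clf.transform(X_test).shape)   #expected: (50, 6)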

clf_labels+=['Majority Voting']
all_clf=[pipe1,clf2,pipe3,mv_clf]

for clf,label in zip(all_clf,clf_labels):
    scores=cross_val_score(estimator=clf,
                           X=X_train,
                           y=y_train,
                           cv=10,
                           scoring="roc_auc")
    print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(),scores.std(),label))

Results

ROC AUC: 0.87 (+/- 0.17) [Logistic Regression]
ROC AUC: 0.89 (+/- 0.16) [Decision Tree]
ROC AUC: 0.88 (+/- 0.15) [KNN]
ROC AUC: 0.94 (+/- 0.13) [Majority Voting]

Comparing these results, the majority-voting classifier is the more robust choice: its mean ROC AUC (0.94) exceeds that of every individual classifier.
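
One classic way to see why the ensemble helps (a sketch; both the independence assumption and the 0.25 error rate below are illustrative assumptions, not measured from this data): if n independent base classifiers each err with probability ε, the majority vote errs only when more than half of them do, which for small ε is much rarer.

import numpy as np
from scipy.special import comb

def ensemble_error(n_classifier,error):
    #Probability that at least ceil(n/2) of n independent classifiers err
    k_start=int(np.ceil(n_classifier/2.))
    probs=[comb(n_classifier,k)*error**k*(1-error)**(n_classifier-k)
           for k in range(k_start,n_classifier+1)]
    return sum(probs)

print(ensemble_error(n_classifier=3,error=0.25))   #about 0.156, well below 0.25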

Part III: ROC Curves

Building on the previous parts, add the following code.

Code

#Section 4: Evaluate and tune the ensemble classifier
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

colors=['black','orange','blue','green']
linestyle=[':','--','-.','-']
print("\n")

for clf,label,clr,ls in zip(all_clf,clf_labels,colors,linestyle):
    #Assume the label of the positive class is 1
    y_pred=clf.fit(X_train,y_train).predict_proba(X_test)[:,1]
    fpr,tpr,thresholds=roc_curve(y_true=y_test,y_score=y_pred)
    roc_auc=auc(fpr,tpr)
    plt.plot(fpr,tpr,color=clr,
             linestyle=ls,
             label='%s (auc=%.2f)' % (label,roc_auc))

plt.legend(loc='lower right')
plt.plot([0,1],[0,1],
         linestyle='--',
         color='gray',
         linewidth=2)
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.grid(alpha=0.5)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel("True Positive Rate (TPR)")
plt.savefig('./fig1.png')
plt.show()

Results

(Figure 1: ROC curves of the four classifiers on the test set, saved as fig1.png.)
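
As an aside, auc(fpr,tpr) computed from a full ROC curve equals what roc_auc_score returns directly from the scores; a minimal check using the last y_pred from the loop above (the majority-voting ensemble):

from sklearn.metrics import roc_auc_score
#Same value as the corresponding auc(fpr,tpr) above, without building the curve
print(roc_auc_score(y_true=y_test,y_score=y_pred))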

Part IV: Hyperparameter Tuning

Building on the previous parts, further add the following code snippet.

Code

#Section 5: Hyperparameter tuning via GridSearchCV
from sklearn.model_selection import GridSearchCV

print(mv_clf.get_params())

params={'clf2__max_depth':[1,2,4],'pipe1__clf__C':[0.001,0.01,10]}

grid=GridSearchCV(estimator=mv_clf,
                  param_grid=params,
                  cv=10,
                  scoring='roc_auc')
grid.fit(X_train,y_train)

#Use a loop variable distinct from the params grid to avoid shadowing it
for param_set,mean_score,std_score in zip(grid.cv_results_['params'],
                                          grid.cv_results_['mean_test_score'],
                                          grid.cv_results_['std_test_score']):
    print("%0.3f+/-%.2f %r" % (mean_score,std_score,param_set))

print("Best Parameters: %s" % grid.best_params_)
print("ROC AUC in Train: %.2f" % grid.best_score_)

Results

Hyperparameter names retrieved via get_params():

mv_clf.get_params()
Out[2]: 
{'estimators': [('pipe1', Pipeline(memory=None,
            steps=[['sc',
                    StandardScaler(copy=True, with_mean=True, with_std=True)],
                   ['clf',
                    LogisticRegression(C=0.001, class_weight=None, dual=False,
                                       fit_intercept=True, intercept_scaling=1,
                                       l1_ratio=None, max_iter=100,
                                       multi_class='warn', n_jobs=None,
                                       penalty='l2', random_state=1, solver='warn',
                                       tol=0.0001, verbose=0, warm_start=False)]],
            verbose=False)),
  ('clf2',
   DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, presort=False,
                          random_state=1, splitter='best')),
  ('pipe3', Pipeline(memory=None,
            steps=[['sc',
                    StandardScaler(copy=True, with_mean=True, with_std=True)],
                   ['clf',
                    KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                         metric='minkowski', metric_params=None,
                                         n_jobs=None, n_neighbors=1, p=2,
                                         weights='uniform')]],
            verbose=False))],
 'flatten_transform': True,
 'n_jobs': None,
 'voting': 'soft',
 'weights': None,
 'pipe1': Pipeline(memory=None,
          steps=[['sc',
                  StandardScaler(copy=True, with_mean=True, with_std=True)],
                 ['clf',
                  LogisticRegression(C=0.001, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='warn', n_jobs=None,
                                     penalty='l2', random_state=1, solver='warn',
                                     tol=0.0001, verbose=0, warm_start=False)]],
          verbose=False),
 'clf2': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=1, splitter='best'),
 'pipe3': Pipeline(memory=None,
          steps=[['sc',
                  StandardScaler(copy=True, with_mean=True, with_std=True)],
                 ['clf',
                  KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                       metric='minkowski', metric_params=None,
                                       n_jobs=None, n_neighbors=1, p=2,
                                       weights='uniform')]],
          verbose=False),
 'pipe1__memory': None,
 'pipe1__steps': [['sc',
   StandardScaler(copy=True, with_mean=True, with_std=True)],
  ['clf',
   LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='warn', n_jobs=None, penalty='l2',
                      random_state=1, solver='warn', tol=0.0001, verbose=0,
                      warm_start=False)]],
 'pipe1__verbose': False,
 'pipe1__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe1__clf': LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=1, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 'pipe1__sc__copy': True,
 'pipe1__sc__with_mean': True,
 'pipe1__sc__with_std': True,
 'pipe1__clf__C': 0.001,
 'pipe1__clf__class_weight': None,
 'pipe1__clf__dual': False,
 'pipe1__clf__fit_intercept': True,
 'pipe1__clf__intercept_scaling': 1,
 'pipe1__clf__l1_ratio': None,
 'pipe1__clf__max_iter': 100,
 'pipe1__clf__multi_class': 'warn',
 'pipe1__clf__n_jobs': None,
 'pipe1__clf__penalty': 'l2',
 'pipe1__clf__random_state': 1,
 'pipe1__clf__solver': 'warn',
 'pipe1__clf__tol': 0.0001,
 'pipe1__clf__verbose': 0,
 'pipe1__clf__warm_start': False,
 'clf2__class_weight': None,
 'clf2__criterion': 'entropy',
 'clf2__max_depth': 1,
 'clf2__max_features': None,
 'clf2__max_leaf_nodes': None,
 'clf2__min_impurity_decrease': 0.0,
 'clf2__min_impurity_split': None,
 'clf2__min_samples_leaf': 1,
 'clf2__min_samples_split': 2,
 'clf2__min_weight_fraction_leaf': 0.0,
 'clf2__presort': False,
 'clf2__random_state': 1,
 'clf2__splitter': 'best',
 'pipe3__memory': None,
 'pipe3__steps': [['sc',
   StandardScaler(copy=True, with_mean=True, with_std=True)],
  ['clf',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                        metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                        weights='uniform')]],
 'pipe3__verbose': False,
 'pipe3__sc': StandardScaler(copy=True, with_mean=True, with_std=True),
 'pipe3__clf': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                      weights='uniform'),
 'pipe3__sc__copy': True,
 'pipe3__sc__with_mean': True,
 'pipe3__sc__with_std': True,
 'pipe3__clf__algorithm': 'auto',
 'pipe3__clf__leaf_size': 30,
 'pipe3__clf__metric': 'minkowski',
 'pipe3__clf__metric_params': None,
 'pipe3__clf__n_jobs': None,
 'pipe3__clf__n_neighbors': 1,
 'pipe3__clf__p': 2,
 'pipe3__clf__weights': 'uniform'}
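
The double underscore in these names is the convention GridSearchCV and set_params use to address nested parameters: <component name>__<parameter>, chained through each level of a pipeline. A small sketch of the same access pattern:

#Set the LogisticRegression C inside pipe1 and the tree depth by nested name
mv_clf.set_params(pipe1__clf__C=10,clf2__max_depth=2)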

Grid-search scores for each parameter combination, showing the mean ROC AUC and its standard deviation:

0.933+/-0.14 {'clf2__max_depth': 1, 'pipe1__clf__C': 0.001}
0.947+/-0.14 {'clf2__max_depth': 1, 'pipe1__clf__C': 0.01}
0.973+/-0.07 {'clf2__max_depth': 1, 'pipe1__clf__C': 10}
0.947+/-0.14 {'clf2__max_depth': 2, 'pipe1__clf__C': 0.001}
0.947+/-0.14 {'clf2__max_depth': 2, 'pipe1__clf__C': 0.01}
0.973+/-0.07 {'clf2__max_depth': 2, 'pipe1__clf__C': 10}
0.933+/-0.14 {'clf2__max_depth': 4, 'pipe1__clf__C': 0.001}
0.947+/-0.14 {'clf2__max_depth': 4, 'pipe1__clf__C': 0.01}
0.973+/-0.07 {'clf2__max_depth': 4, 'pipe1__clf__C': 10}
Best Parameters: {'clf2__max_depth': 1, 'pipe1__clf__C': 10}
ROC AUC in Train: 0.97
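
Because GridSearchCV refits the best parameter combination on the whole training set by default (refit=True), the tuned ensemble can then be scored on the held-out test data; a minimal sketch:

from sklearn.metrics import roc_auc_score
best_clf=grid.best_estimator_   #already refit on X_train with the best parameters
y_score=best_clf.predict_proba(X_test)[:,1]
print("ROC AUC in Test: %.2f" % roc_auc_score(y_test,y_score))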

References
Sebastian Raschka, Vahid Mirjalili. Python Machine Learning, 2nd ed. Nanjing: Southeast University Press, 2018.

Appendix: Complete Code Listing

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

plt.rcParams['figure.dpi']=200
plt.rcParams['savefig.dpi']=200
font = {'family': 'Times New Roman',
        'weight': 'light'}
plt.rc("font", **font)

#Section 1: Load data and split data into train/test datasets
iris=datasets.load_iris()
X,y=iris.data[50:,[1,2]],iris.target[50:]
le=LabelEncoder()
y=le.fit_transform(y)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.5,random_state=1,stratify=y)

#Section 2: Model performance among different classifiers
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy as np

clf1=LogisticRegression(penalty='l2',
                        C=0.001,
                        random_state=1)
clf2=DecisionTreeClassifier(max_depth=1,
                            criterion='entropy',
                            random_state=1)
clf3=KNeighborsClassifier(n_neighbors=1,
                          p=2,
                          metric='minkowski')

pipe1=Pipeline([["sc",StandardScaler()],["clf",clf1]])
pipe3=Pipeline([["sc",StandardScaler()],["clf",clf3]])
clf_labels=["Logistic Regression","Decision Tree","KNN"]

print("10-fold Cross Validation:")
for clf,label in zip([pipe1,clf2,pipe3],clf_labels):
    scores=cross_val_score(estimator=clf,
                           X=X_train,
                           y=y_train,
                           cv=10,
                           scoring="roc_auc")
    print("ROC AUC: %.2f (+/- %.2f) [%s]" % (scores.mean(),scores.std(),label))

#Section 3: Combine individual classifiers via majority voting
from sklearn.ensemble import VotingClassifier

"""Return class labels or probabilities for X for each estimator.
probabilities_or_labels
    If `voting='soft'` and `flatten_transform=True`:
        returns array-like of shape (n_classifiers, n_samples *
        n_classes), being class probabilities calculated by each
        classifier.
    If `voting='soft' and `flatten_transform=False`:
        array-like of shape (n_classifiers, n_samples, n_classes)
    If `voting='hard'`:
        array-like of shape (n_samples, n_classifiers), being
        class labels predicted by each classifier.
"""
mv_clf=VotingClassifier(estimators=[('pipe1',pipe1),('clf2',clf2),('pipe3',pipe3)],
                        voting='soft')

clf_labels+=['Majority Voting']
all_clf=[pipe1,clf2,pipe3,mv_clf]

for clf,label in zip(all_clf,clf_labels):
    scores=cross_val_score(estimator=clf,
                           X=X_train,
                           y=y_train,
                           cv=10,
                           scoring="roc_auc")
    print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(),scores.std(),label))

#Section 4: Evaluate and tune the ensemble classifier
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

colors=['black','orange','blue','green']
linestyle=[':','--','-.','-']
print("\n")

for clf,label,clr,ls in zip(all_clf,clf_labels,colors,linestyle):
    #Assume the label of the positive class is 1
    y_pred=clf.fit(X_train,y_train).predict_proba(X_test)[:,1]
    fpr,tpr,thresholds=roc_curve(y_true=y_test,y_score=y_pred)
    roc_auc=auc(fpr,tpr)
    plt.plot(fpr,tpr,color=clr,
             linestyle=ls,
             label='%s (auc=%.2f)' % (label,roc_auc))

plt.legend(loc='lower right')
plt.plot([0,1],[0,1],
         linestyle='--',
         color='gray',
         linewidth=2)
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.grid(alpha=0.5)
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel("True Positive Rate (TPR)")
plt.savefig('./fig1.png')
plt.show()

#Section 5: Hyperparameter tuning via GridSearchCV
from sklearn.model_selection import GridSearchCV

print(mv_clf.get_params())

params={'clf2__max_depth':[1,2,4],'pipe1__clf__C':[0.001,0.01,10]}

grid=GridSearchCV(estimator=mv_clf,
                  param_grid=params,
                  cv=10,
                  scoring='roc_auc')
grid.fit(X_train,y_train)

#Use a loop variable distinct from the params grid to avoid shadowing it
for param_set,mean_score,std_score in zip(grid.cv_results_['params'],
                                          grid.cv_results_['mean_test_score'],
                                          grid.cv_results_['std_test_score']):
    print("%0.3f+/-%.2f %r" % (mean_score,std_score,param_set))

print("Best Parameters: %s" % grid.best_params_)
print("ROC AUC in Train: %.2f" % grid.best_score_)
