Ensemble Learning: Python Code

For the underlying theory, see:

https://blog.csdn.net/zwqjoy/article/details/80431496

https://sklearn.apachecn.org/docs/0.21.3/12.html

Bagging:

import itertools
# a large collection of numerical and mathematical routines
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# arrange subplot layout and sizing

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score,train_test_split
# mlxtend plotting helpers (also handy for stacking later):
# plot learning curves
from mlxtend.plotting import plot_learning_curves
# plot a classifier's decision regions
from mlxtend.plotting import plot_decision_regions

np.random.seed(0)

iris = datasets.load_iris()
X,y = iris.data[:,0:2],iris.target

clf1 = DecisionTreeClassifier(criterion='entropy',max_depth=1)
# the decision tree uses information entropy as its split criterion
clf2 = KNeighborsClassifier(n_neighbors=1)

bagging1 = BaggingClassifier(base_estimator=clf1,n_estimators=10,max_samples=0.8,max_features=0.8)
bagging2 = BaggingClassifier(base_estimator=clf2,n_estimators=10,max_samples=0.8,max_features=0.8)
label = ['Decision Tree','K-NN','Bagging Tree','Bagging K-NN']

clf_list =[clf1,clf2,bagging1,bagging2]

fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2) # a 2 x 2 grid of subplots
grid = itertools.product([0,1],repeat=2) # iterating yields (0, 0), (0, 1), (1, 0), (1, 1)
# Cartesian product; with repeat=2 this is equivalent to product([0, 1], [0, 1])
for clf,label,grd in zip(clf_list,label,grid): # see the zip() notes at the end of this listing
    scores = cross_val_score(clf,X,y,cv=3,scoring='accuracy')
    # cross-validation; cv is the number of folds, scoring='accuracy' is the metric (can be omitted to use the default)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"  % (scores.mean(), scores.std(), label))
    # mean and standard deviation of the CV scores
    clf.fit(X,y)
    ax = plt.subplot(gs[grd[0],grd[1]])
    fig = plot_decision_regions(X=X,y=y,clf=clf,legend=2)
    plt.title(label)

plt.show()
# The figure above shows the decision boundaries of the decision tree and k-NN classifiers,
# together with their bagging ensembles, applied to the Iris dataset.
# The decision tree produces axis-parallel boundaries.
# k-NN is not very sensitive to perturbations of the training samples and is therefore called a stable learner;
# since bagging helps unstable learners more, the bagged decision-tree ensemble achieves a larger
# accuracy gain than the bagged k-NN ensemble.

#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,bagging1,print_model=False,style='ggplot')
plt.show()
# The figure above shows the learning curve of the bagged tree ensemble. The average error on the
# training data stays around 0.3, while the test-error curve is U-shaped.
# The smallest gap between training and test error occurs at a training-set size of roughly 80%.
""">>>a = [1,2,3]
>>> b = [4,5,6]
>>> c = [4,5,6,7,8]
>>> zipped = zip(a,b)     # 返回一个对象
>>> zipped

>>> list(zipped)  # list() 转换为列表
[(1, 4), (2, 5), (3, 6)]
>>> list(zip(a,c))              # 元素个数与最短的列表一致
[(1, 4), (2, 5), (3, 6)]
>>> a1, a2 = zip(*zip(a,b))          # 与 zip 相反,zip(*) 可理解为解压,返回二维矩阵式
>>> list(a1)
[1, 2, 3]
>>> list(a2)
[4, 5, 6]
"""

Boosting:

import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions

iris = datasets.load_iris()
X,y = iris.data[:,0:2],iris.target

clf = DecisionTreeClassifier(criterion='entropy',max_depth=1)

num_est = [1,2,3,10]
label = ['AdaBoost(n_est=1)','AdaBoost(n_est=2)','AdaBoost(n_est=3)','AdaBoost(n_est=10)']

fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)

for n_est,label,grd in zip(num_est,label,grid):
    boosting = AdaBoostClassifier(base_estimator=clf,n_estimators=n_est)
    # n_estimators: number of boosting rounds / base trees (sklearn's default is 50)
    boosting.fit(X,y)
    ax = plt.subplot(gs[grd[0],grd[1]])
    fig = plot_decision_regions(X=X,y=y,clf=boosting,legend=2)
    plt.title(label)
    # Each base learner is a decision tree of depth 1 (a decision stump),
    # which classifies the data using a single feature threshold.
    # That threshold splits the feature space into two regions,
    # separated by a linear decision boundary parallel to one of the axes.
plt.show()

#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)
boosting = AdaBoostClassifier(base_estimator=clf,n_estimators=10)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,boosting,print_model=False,style='ggplot')
plt.show()
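
To see more concretely that each AdaBoost base learner is a single-threshold stump, and how accuracy changes as rounds are added, here is a minimal sketch reusing the train/test split and the stump clf defined above:

from sklearn.metrics import accuracy_score

boosting = AdaBoostClassifier(base_estimator=clf, n_estimators=10)
boosting.fit(X_train, y_train)

# each fitted base learner is a depth-1 tree: one feature index, one threshold
for i, stump in enumerate(boosting.estimators_):
    print("stump %2d: feature=%d threshold=%.2f" % (i, stump.tree_.feature[0], stump.tree_.threshold[0]))

# staged_predict yields predictions after 1, 2, ..., n_estimators boosting rounds
for i, y_pred in enumerate(boosting.staged_predict(X_test), 1):
    print("n_estimators=%2d  test accuracy=%.2f" % (i, accuracy_score(y_test, y_pred)))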

Stacking:

import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions

iris = datasets.load_iris()
X,y = iris.data[:,1:3],iris.target

clf1 = KNeighborsClassifier(n_neighbors=1)  # n_neighbors: number of nearest neighbours to use
clf2 = RandomForestClassifier(random_state=1)  # random_state: seed for the random number generator
clf3 = GaussianNB()  # Gaussian naive Bayes
lr = LogisticRegression()  # will serve as the meta-classifier
sclf = StackingClassifier(classifiers=[clf1,clf2,clf3],meta_classifier=lr)
label = ['KNN','RandomForest','Naive Bayes','Stacking Classifier']
clf_list = [clf1,clf2,clf3,sclf]

fig = plt.figure(figsize=(10,8))
gs = gridspec.GridSpec(2,2)
grid = itertools.product([0,1],repeat=2)

clf_cv_mean,clf_cv_std= [],[]
for clf,label,grd in zip(clf_list,label,grid):
    scores = cross_val_score(clf,X,y,cv=3,scoring='accuracy')
    print("Accuracy:均值:%.2f(+/- 标准差:%.2f) [%s]" %(scores.mean(),scores.std(),label))
    #标准差(方差)越小越稳定
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())

    clf.fit(X,y)
    ax = plt.subplot(gs[grd[0],grd[1]])
    fig = plot_decision_regions(X=X,y=y,clf=clf)
    plt.title(label)
plt.show()

#plot learning curves
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)
plt.figure()
plot_learning_curves(X_train,y_train,X_test,y_test,sclf,print_model=True,style='ggplot')
plt.show()
# We can see that stacking achieves higher accuracy than the individual classifiers,
# and the learning curves show no sign of overfitting.
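
mlxtend's StackingClassifier can also pass the base classifiers' predicted class probabilities, rather than their hard labels, to the meta-classifier via use_probas=True, which often gives the meta-learner a richer input. A minimal sketch, reusing the base classifiers defined above:

sclf_proba = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                use_probas=True,        # stack predicted probabilities instead of class labels
                                average_probas=False,   # keep each classifier's probabilities as separate features
                                meta_classifier=LogisticRegression())
scores = cross_val_score(sclf_proba, X, y, cv=3, scoring='accuracy')
print("Accuracy: %.2f (+/- %.2f) [Stacking with probabilities]" % (scores.mean(), scores.std()))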

 
