集成学习: 三个臭皮匠顶个诸葛亮。相当于是多个相同或者不同的模型使用相同或者不同的训练集进行“并行或递进”式的进行模型集成,达到“1+1>2”的效果。
1. 多模型投票:
训练多个分类器模型(一般是不同的模型,如KNN、决策树、逻辑回归、朴素贝叶斯)然后进行多数投票。每个模型使用的训练集都是一样的,都是使用的全部训练集。
2. Bagging:
每个模型使用的训练集是在全部训练集中有放回的随机抽样+多模型投票。
3. Boosting:
各个模型递进式(串行)训练,后一个模型着重拟合前面模型分错的样本(如AdaBoost通过提高错分样本的权重),最终将各弱模型加权组合成强模型。
4. stacking:
叠加法,第一层输出多个预测→逻辑回归→最终预测输出
from sklearn.ensemble import VotingClassifier
# coding=utf-8
# coding=utf-8
import time
import numpy as np
# 朴素贝叶斯分类模型
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
# 决策树
from sklearn.tree import DecisionTreeClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# 管道简化工作流
from sklearn.pipeline import Pipeline
# 10折交叉验证评价模型
from sklearn.model_selection import cross_val_score
# 集成学习:多数投票方法
from sklearn.ensemble import VotingClassifier
def loadData(fileName):
    '''
    Load a MNIST-style CSV file.

    Each line has the form: label,pixel_1,...,pixel_n (comma separated).

    :param fileName: path of the file to load
    :return: (data matrix, label array); pixel values are scaled to [0, 1]
    '''
    dataArr = []   # feature rows
    labelArr = []  # labels
    # 'with' guarantees the file handle is closed even on error
    # (the original left it open).
    with open(fileName) as fr:
        # Iterate the file line by line instead of readlines() to avoid
        # materializing the whole file in memory.
        for line in fr:
            # strip() drops the trailing newline, then split on commas.
            curLine = line.strip().split(',')
            # curLine[0] is the label; the rest are pixel values.
            # Normalize pixels from [0, 255] down to [0, 1].
            dataArr.append([int(num) / 255 for num in curLine[1:]])
            labelArr.append(int(curLine[0]))
    # np.asmatrix keeps the original np.matrix return type for callers
    # (np.mat was removed from the main namespace in NumPy 2.0).
    return np.asmatrix(dataArr), np.ravel(labelArr)
def voting_model(X_train, y_train):
    '''
    Train three heterogeneous base classifiers plus a soft-voting ensemble.

    :param X_train: training features
    :param y_train: training labels
    :return: (GaussianNB, DecisionTreeClassifier, KNeighborsClassifier,
              VotingClassifier), each fitted on the full training set
    '''
    # Base models; each one is also fitted standalone for comparison.
    clf1 = GaussianNB()
    clf2 = DecisionTreeClassifier(criterion='entropy', random_state=0)
    clf3 = KNeighborsClassifier(n_neighbors=25, p=2, metric="minkowski")
    # Soft voting averages the class probabilities of the three base models
    # (all of them support predict_proba).
    mv_clf = VotingClassifier(estimators=[('clf1', clf1), ('clf2', clf2), ('clf3', clf3)], voting='soft')
    all_clf = [clf1, clf2, clf3, mv_clf]
    print('train models :\n')
    # The original zipped a label list here but never used the labels;
    # the dead variables were removed.
    for clf in all_clf:
        clf.fit(X_train, y_train)
    return clf1, clf2, clf3, mv_clf
def model_test(model, X_test, Y_test):
    '''Return the mean accuracy of *model* on the given test set.'''
    return model.score(X_test, Y_test)
if __name__ == '__main__':
    # Start timing the whole run.
    start = time.time()
    # Load the training set.
    # NOTE(review): paths assume the MNIST CSVs live in ../Mnist — confirm.
    trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv')
    # Load the test set.
    testDataList, testLabelList = loadData('../Mnist/mnist_test.csv')
    # Train the three base classifiers and the voting ensemble.
    clf1, clf2, clf3, mv_clf = voting_model(trainDataList, trainLabelList)
    # Evaluate every model separately on the held-out test set.
    print('start test')
    clf1_accuracy = model_test(clf1, testDataList, testLabelList)
    clf2_accuracy = model_test(clf2, testDataList, testLabelList)
    clf3_accuracy = model_test(clf3, testDataList, testLabelList)
    mv_clf_accuracy = model_test(mv_clf, testDataList, testLabelList)
    print('the clf1_accuracy is:{}\n the clf2_accuracy is:{}\n the clf3_accuracy is:{}\n the mv_clf_accuracy is:{}\n'
          .format(clf1_accuracy, clf2_accuracy, clf3_accuracy, mv_clf_accuracy))
    # Report total wall-clock time.
    end = time.time()
    print('time span:', end - start)
from sklearn.ensemble import BaggingClassifier
# coding=utf-8
# coding=utf-8
import time
import numpy as np
# 朴素贝叶斯分类模型
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
# 决策树
from sklearn.tree import DecisionTreeClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# 管道简化工作流
from sklearn.pipeline import Pipeline
# 10折交叉验证评价模型
from sklearn.model_selection import cross_val_score
# 集成学习:bagging方法
from sklearn.ensemble import BaggingClassifier
def loadData(fileName):
    '''
    Load a MNIST-style CSV file.

    Each line has the form: label,pixel_1,...,pixel_n (comma separated).

    :param fileName: path of the file to load
    :return: (data matrix, label array); pixel values are scaled to [0, 1]
    '''
    dataArr = []   # feature rows
    labelArr = []  # labels
    # 'with' guarantees the file handle is closed even on error
    # (the original left it open).
    with open(fileName) as fr:
        # Iterate the file line by line instead of readlines() to avoid
        # materializing the whole file in memory.
        for line in fr:
            # strip() drops the trailing newline, then split on commas.
            curLine = line.strip().split(',')
            # curLine[0] is the label; the rest are pixel values.
            # Normalize pixels from [0, 255] down to [0, 1].
            dataArr.append([int(num) / 255 for num in curLine[1:]])
            labelArr.append(int(curLine[0]))
    # np.asmatrix keeps the original np.matrix return type for callers
    # (np.mat was removed from the main namespace in NumPy 2.0).
    return np.asmatrix(dataArr), np.ravel(labelArr)
def bagging_model(X_train, y_train):
    '''
    Train three standalone classifiers plus a Bagging ensemble.

    :param X_train: training features
    :param y_train: training labels
    :return: tuple (bayes, descion_tree, knn, bagging_clf), each fitted
             on the full training set
    '''
    # Standalone baseline models, fitted on the full training set below
    # for comparison against the ensemble.
    bayes = GaussianNB()
    descion_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    knn = KNeighborsClassifier(n_neighbors=25, p=2, metric="minkowski")
    # Bagging ensemble (the original comment wrongly said "majority voting"):
    # 10 decision trees, each trained on a bootstrap sample of 80% of the
    # rows, using all features (max_features=1.0, no feature bootstrap).
    bagging_clf = BaggingClassifier(base_estimator=descion_tree, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True,
                                    bootstrap_features=False, n_jobs=1, random_state=1)
    all_clf = [bayes, descion_tree, knn, bagging_clf]
    print('training models :\n')
    for clf in all_clf:
        clf.fit(X_train, y_train)
    return (bayes, descion_tree, knn, bagging_clf)
def model_test(model, X_test, Y_test):
    '''Return the mean accuracy of *model* on the given test set.'''
    return model.score(X_test, Y_test)
if __name__ == '__main__':
    # Start timing the whole run.
    start = time.time()
    # Load the training set.
    # NOTE(review): paths assume the MNIST CSVs live in ../Mnist — confirm.
    trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv')
    # Load the test set.
    testDataList, testLabelList = loadData('../Mnist/mnist_test.csv')
    # Train the three base classifiers and the bagging ensemble.
    all_clf = bagging_model(trainDataList, trainLabelList)
    # Evaluate every model separately on the held-out test set.
    print('start test')
    for i, clf in enumerate(all_clf):
        accuracy = model_test(clf, testDataList, testLabelList)
        print('the clf{}_accuracy is:{}\n '.format(i, accuracy))
    # Report total wall-clock time.
    end = time.time()
    print('time span:', end - start)
from sklearn.ensemble import AdaBoostClassifier
# coding=utf-8
import time
import numpy as np
# 朴素贝叶斯分类模型
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
# 决策树
from sklearn.tree import DecisionTreeClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# 管道简化工作流
from sklearn.pipeline import Pipeline
# 10折交叉验证评价模型
from sklearn.model_selection import cross_val_score
# 集成学习:AdaBoost方法
from sklearn.ensemble import AdaBoostClassifier
def loadData(fileName):
    '''
    Load a MNIST-style CSV file.

    Each line has the form: label,pixel_1,...,pixel_n (comma separated).

    :param fileName: path of the file to load
    :return: (data matrix, label array); pixel values are scaled to [0, 1]
    '''
    dataArr = []   # feature rows
    labelArr = []  # labels
    # 'with' guarantees the file handle is closed even on error
    # (the original left it open).
    with open(fileName) as fr:
        # Iterate the file line by line instead of readlines() to avoid
        # materializing the whole file in memory.
        for line in fr:
            # strip() drops the trailing newline, then split on commas.
            curLine = line.strip().split(',')
            # curLine[0] is the label; the rest are pixel values.
            # Normalize pixels from [0, 255] down to [0, 1].
            dataArr.append([int(num) / 255 for num in curLine[1:]])
            labelArr.append(int(curLine[0]))
    # np.asmatrix keeps the original np.matrix return type for callers
    # (np.mat was removed from the main namespace in NumPy 2.0).
    return np.asmatrix(dataArr), np.ravel(labelArr)
def adaboosting_model(X_train, y_train):
    '''
    Train three standalone classifiers plus an AdaBoost ensemble.

    :param X_train: training features
    :param y_train: training labels
    :return: tuple (bayes, descion_tree, knn, adaboost_clf), each fitted
             on the full training set
    '''
    # Standalone baseline models, fitted on the full training set below
    # for comparison against the ensemble.
    bayes = GaussianNB()
    descion_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    knn = KNeighborsClassifier(n_neighbors=25, p=2, metric="minkowski")
    # AdaBoost ensemble (the original comment wrongly said "majority voting"):
    # 10 boosting rounds at learning_rate=0.1.
    # NOTE(review): the base estimator is an unpruned tree (no max_depth);
    # AdaBoost conventionally uses weak learners such as stumps — confirm
    # this is intentional.
    adaboost_clf = AdaBoostClassifier(base_estimator=descion_tree, n_estimators=10, learning_rate=0.1, random_state=1)
    all_clf = [bayes, descion_tree, knn, adaboost_clf]
    print('training models :\n')
    for clf in all_clf:
        clf.fit(X_train, y_train)
    return (bayes, descion_tree, knn, adaboost_clf)
def model_test(model, X_test, Y_test):
    '''Return the mean accuracy of *model* on the given test set.'''
    return model.score(X_test, Y_test)
if __name__ == '__main__':
    # Start timing the whole run.
    start = time.time()
    # Load the training set.
    # NOTE(review): paths assume the MNIST CSVs live in ../Mnist — confirm.
    trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv')
    # Load the test set.
    testDataList, testLabelList = loadData('../Mnist/mnist_test.csv')
    # Train the three base classifiers and the AdaBoost ensemble.
    all_clf = adaboosting_model(trainDataList, trainLabelList)
    # Evaluate every model separately on the held-out test set.
    print('start test')
    for i, clf in enumerate(all_clf):
        accuracy = model_test(clf, testDataList, testLabelList)
        print('the clf{}_accuracy is:{}\n '.format(i+1, accuracy))
    # Report total wall-clock time.
    end = time.time()
    print('time span:', end - start)
# sklearn.tree.DecisionTreeClassifier
# coding=utf-8
# coding=utf-8
import time
import numpy as np
# 朴素贝叶斯分类模型
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
# 决策树
from sklearn.tree import DecisionTreeClassifier
# KNN
from sklearn.neighbors import KNeighborsClassifier
# 管道简化工作流
from sklearn.pipeline import Pipeline
# 10折交叉验证评价模型
from sklearn.model_selection import cross_val_score
# 逻辑回归
from sklearn.linear_model import LogisticRegression
# 集成学习:AdaBoost方法
from sklearn.ensemble import StackingClassifier
from mlxtend.classifier import StackingClassifier as SC
def loadData(fileName):
    '''
    Load a MNIST-style CSV file.

    Each line has the form: label,pixel_1,...,pixel_n (comma separated).

    :param fileName: path of the file to load
    :return: (data matrix, label array); pixel values are scaled to [0, 1]
    '''
    dataArr = []   # feature rows
    labelArr = []  # labels
    # 'with' guarantees the file handle is closed even on error
    # (the original left it open).
    with open(fileName) as fr:
        # Iterate the file line by line instead of readlines() to avoid
        # materializing the whole file in memory.
        for line in fr:
            # strip() drops the trailing newline, then split on commas.
            curLine = line.strip().split(',')
            # curLine[0] is the label; the rest are pixel values.
            # Normalize pixels from [0, 255] down to [0, 1].
            dataArr.append([int(num) / 255 for num in curLine[1:]])
            labelArr.append(int(curLine[0]))
    # np.asmatrix keeps the original np.matrix return type for callers
    # (np.mat was removed from the main namespace in NumPy 2.0).
    return np.asmatrix(dataArr), np.ravel(labelArr)
def stacking_model(X_train, y_train):
    '''
    Train three base classifiers plus an sklearn StackingClassifier.

    :param X_train: training features
    :param y_train: training labels
    :return: tuple (bayes, descion_tree, knn, stacking_clf), each fitted
             on the full training set
    '''
    # Standalone baseline models, fitted on the full training set below
    # for comparison against the ensemble.
    bayes = GaussianNB()
    descion_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    knn = KNeighborsClassifier(n_neighbors=25, p=2, metric="minkowski")
    estimators = [('by', bayes), ('dt', descion_tree), ('kn', knn)]
    # Stacking: the three base models form the first layer; a logistic
    # regression combines their outputs into the final prediction.
    stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    all_clf = [bayes, descion_tree, knn, stacking_clf]
    print('training models :\n')
    for clf in all_clf:
        clf.fit(X_train, y_train)
    return (bayes, descion_tree, knn, stacking_clf)
def mx_stacking(X_train, y_train):
    '''
    Train a stacking ensemble using mlxtend's StackingClassifier (imported as SC).

    :param X_train: training features
    :param y_train: training labels
    :return: the fitted mlxtend stacking classifier
    '''
    # Same three base models as the sklearn variant, with logistic
    # regression as the meta-classifier.
    bayes = GaussianNB()
    descion_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
    knn = KNeighborsClassifier(n_neighbors=25, p=2, metric="minkowski")
    sclf = SC(classifiers=[bayes, descion_tree, knn], meta_classifier=LogisticRegression())
    sclf.fit(X_train, y_train)
    return sclf
def model_test(model, X_test, Y_test):
    '''Return the mean accuracy of *model* on the given test set.'''
    return model.score(X_test, Y_test)
if __name__ == '__main__':
    # Start timing the whole run.
    start = time.time()
    # Load the training set.
    # NOTE(review): paths assume the MNIST CSVs live in ../Mnist — confirm.
    trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv')
    # Load the test set.
    testDataList, testLabelList = loadData('../Mnist/mnist_test.csv')
    # Train the base classifiers and the sklearn stacking ensemble.
    all_clf = stacking_model(trainDataList, trainLabelList)
    # Train and evaluate the mlxtend stacking variant as well.
    sclf = mx_stacking(trainDataList, trainLabelList)
    accuracy = model_test(sclf, testDataList, testLabelList)
    print('the sclf accuracy is:{}\n '.format(accuracy))
    # Evaluate every model separately on the held-out test set.
    print('start test')
    for i, clf in enumerate(all_clf):
        accuracy = model_test(clf, testDataList, testLabelList)
        print('the clf{}_accuracy is:{}\n '.format(i+1, accuracy))
    # Report total wall-clock time.
    end = time.time()
    print('time span:', end - start)
ps:本博客仅供自己复习理解之用,不保证对其他人具有参考价值;本博客参考了大量优质资源,如有侵权请联系删除。