Machine Learning with Python: Ensemble Algorithms

from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Bagging
'''Bagging draws multiple bootstrap samples (sampling with replacement) from the
training set, trains one model on each sample, and combines the models'
predictions by majority vote.'''
filename = 'pima_data.csv'
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
data = read_csv(filename,names = names)
array = data.values
X = array[:,0:8]
Y = array[:,8]
num_folds = 10
seed = 7
# shuffle must be enabled when a random_state is set (enforced since scikit-learn 0.24)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
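'''Note: with an integer cv, cross_val_score uses stratified splits for
classifiers; passing the plain KFold above opts out of that. A stratified
alternative, if per-fold class balance matters (optional swap-in):'''
# from sklearn.model_selection import StratifiedKFold
# kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)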

# Bagged decision trees
'''Works best when the base model has high variance, e.g. an unpruned decision tree.'''
cart = DecisionTreeClassifier()
num_tree = 100
# the estimator keyword was named base_estimator before scikit-learn 1.2
model = BaggingClassifier(estimator=cart, n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, X, Y, cv=kfold)
print('Bagged decision trees: %.3f' % result.mean())
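'''Aside: because each tree only sees a bootstrap sample, bagging also yields a
free out-of-bag accuracy estimate from the held-out rows; a minimal sketch
(oob_score is a standard BaggingClassifier option, complementing the
cross-validation score above):'''
oob_model = BaggingClassifier(estimator=DecisionTreeClassifier(),
                              n_estimators=num_tree, oob_score=True, random_state=seed)
oob_model.fit(X, Y)
print('Bagging OOB estimate: %.3f' % oob_model.oob_score_)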

# Random forest
'''A forest of decision trees, each grown on a bootstrap sample with only a
random subset of features considered at each split; the predicted class is the
one that receives the most votes across the trees.'''
num_tree = 100
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
result = cross_val_score(model, X, Y, cv=kfold)
print('Random forest: %.3f' % result.mean())
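'''Aside: a fitted forest exposes feature_importances_, which is often half the
reason to use it; a minimal sketch reusing the column names loaded above:'''
forest = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
forest.fit(X, Y)
for feat, imp in zip(names[0:8], forest.feature_importances_):
    print('%s: %.3f' % (feat, imp))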

# Extra trees (extremely randomized trees)
'''Each tree is trained on the same full training set (no bootstrap sampling by
default); randomness comes instead from choosing split thresholds at random.'''
num_tree = 100
max_features = 7
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
result = cross_val_score(model, X, Y, cv=kfold)
print('Extra trees: %.3f' % result.mean())
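'''If bagging-style sampling is wanted on top of the random splits, bootstrap=True
is a one-flag change (shown for illustration, not as a recommendation):'''
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed,
                             max_features=max_features, bootstrap=True)
result = cross_val_score(model, X, Y, cv=kfold)
print('Extra trees with bootstrap: %.3f' % result.mean())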


# Voting
'''Combines two or more different model types into a single ensemble: each
sub-model predicts, and the class with the most votes (or the highest averaged
probability) wins; the sub-models can also be weighted.'''
models = []
model_log = LogisticRegression()
models.append(('log', model_log))
model_cart = DecisionTreeClassifier()
models.append(('cart', model_cart))
model_svc = SVC()
models.append(('svm', model_svc))
ensemble_model = VotingClassifier(estimators=models)
result = cross_val_score(ensemble_model, X, Y, cv=kfold)
print('Hard voting: %.3f' % result.mean())
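'''The weighting mentioned above goes through voting='soft' plus a weights list;
the weights below are illustrative, not tuned, and SVC needs probability=True to
contribute class probabilities:'''
soft_models = [
    ('log', LogisticRegression()),
    ('cart', DecisionTreeClassifier()),
    ('svm', SVC(probability=True)),
]
soft_ensemble = VotingClassifier(estimators=soft_models, voting='soft', weights=[2, 1, 2])
result = cross_val_score(soft_ensemble, X, Y, cv=kfold)
print('Weighted soft voting: %.3f' % result.mean())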


# Boosting
'''Trains a sequence of models, where each new model tries to correct the errors
of the models before it.'''
# AdaBoost
'''Reweights the training samples after every round so that examples the current
ensemble misclassifies get more attention in the next round.'''
num_tree = 30
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, X, Y, cv=kfold)
print('AdaBoost: %.3f' % result.mean())
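'''AdaBoost's default weak learner is a depth-1 decision stump; writing it out
makes the "sequence of weak models" idea concrete (max_depth=1 is the library
default, shown only for clarity; the estimator keyword was base_estimator before
scikit-learn 1.2):'''
stump = DecisionTreeClassifier(max_depth=1)
model = AdaBoostClassifier(estimator=stump, n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, X, Y, cv=kfold)
print('AdaBoost with explicit stump: %.3f' % result.mean())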

# Stochastic gradient boosting (GBM)
num_tree = 100
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, X, Y, cv=kfold)
print('Gradient boosting: %.3f' % result.mean())
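'''With the defaults above every tree sees all rows (subsample=1.0), so the
boosting is not actually stochastic; dropping subsample below 1.0 gives
stochastic gradient boosting proper (0.8 is an illustrative value, not tuned):'''
model = GradientBoostingClassifier(n_estimators=num_tree, subsample=0.8, random_state=seed)
result = cross_val_score(model, X, Y, cv=kfold)
print('Stochastic gradient boosting: %.3f' % result.mean())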
