GridSearchCV has k-fold cross-validation built in, so there is no need to run a separate k-fold step after the grid search.
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
#load dataset
data = pd.read_csv('train.csv')
# keep three feature columns plus column 80 (SalePrice, the target)
data = data.iloc[:, [1, 3, 4, 80]]
# replace missing values with 0
data = data.fillna(0)
# random forest
clf = RandomForestRegressor()  # default parameters; the grid search will tune them
# print(clf.feature_importances_)  # feature importances (available after fitting)
# print(clf.get_params())  # current parameter settings
# grid search for the best hyperparameters
param_grid = {
    'n_estimators': np.arange(10, 201, 10),
    'max_features': ['auto', 'sqrt', 'log2'],  # note: 'auto' was removed in scikit-learn 1.3
    'max_depth': np.arange(3, 13),
    'bootstrap': [True, False]
}
# run the grid search with 5-fold cross-validation
grid = GridSearchCV(clf, param_grid=param_grid, cv=5)
# fit on the three feature columns, with SalePrice as the target
grid.fit(data.iloc[:, :3], data.SalePrice)
# print the best parameter values, the best CV score,
# the refit best estimator, and the index of the best candidate
print(grid.best_params_,
grid.best_score_,
grid.best_estimator_,
grid.best_index_,
)
Output (the search takes a long time to run):
{'bootstrap': True, 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 40} 0.4125496968640675 RandomForestRegressor(max_depth=7, max_features='log2', n_estimators=40) 283
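Beyond the single best candidate, the full set of results can be inspected: cv_results_ holds one row per parameter combination (1,200 here: 20 × 3 × 10 × 2). A minimal sketch, assuming `grid` has been fit as above:
results = pd.DataFrame(grid.cv_results_)
# sort by rank and show the top candidates with their mean and std CV scores
print(results.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']].head())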
Configure the instantiated model with the best parameters:
# plug the parameters found by the grid search into a new model instance
clf = RandomForestRegressor(bootstrap=True,
max_depth=7,
max_features='log2',
n_estimators=40)
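As a quick check, the re-instantiated model can be scored with the `cross_val_score` imported at the top (a sketch, reusing the same three feature columns and target):
# 5-fold cross-validation of the tuned random forest
scores = cross_val_score(clf, data.iloc[:, :3], data.SalePrice, cv=5)
print(scores.mean())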
Example: AdaBoost
'''
AdaBoost tuned with two rounds of GridSearchCV:
the booster's parameters, then the base estimator's, then the booster's again.
'''
import numpy as np
from sklearn.datasets import load_wine
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier  # base estimator used in the later examples
class UseAdaBoost():
    def __init__(self):  # run the booster grid search on instantiation
self.ParaGrid_ABC()
#load dataset
def LoadDataset(self):
wine = load_wine()
X = wine.data
y = wine.target
return X,y
#model
def UseABC(self):
acf = AdaBoostClassifier(n_estimators=100)
return acf
#gridsearchcv
def ParaGrid_ABC(self):
param_grid = {
"n_estimators": np.arange(10, 201, 10),
"learning_rate": np.arange(0.1, 1.1, 0.1)
}
        X, y = self.LoadDataset()
        grid = GridSearchCV(self.UseABC(), param_grid=param_grid, cv=5)
        grid.fit(X, y)
print(grid.best_params_,
grid.best_score_,
grid.best_estimator_,
grid.best_index_,
)
if __name__ == '__main__':
UseAdaBoost()
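A search like this fits 200 candidates (20 n_estimators values × 10 learning_rate values) five times each, i.e. 1,000 fits. Passing `n_jobs=-1` to GridSearchCV spreads the fits across all CPU cores; a sketch of the modified line in ParaGrid_ABC:
        # use every available core for the 1,000 candidate fits
        grid = GridSearchCV(self.UseABC(), param_grid=param_grid, cv=5, n_jobs=-1)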
Score the model with the parameters found by the search:
class UseAdaBoost():
    def __init__(self):  # run cross-validation scoring on instantiation
#self.ParaGrid_ABC()
self.CVS()
#load dataset
def LoadDataset(self):
wine = load_wine()
X = wine.data
y = wine.target
return X,y
#model
def UseABC(self):
        acf = AdaBoostClassifier(learning_rate=0.7,  # np.arange produced 0.7000000000000001; rounded here
                                 n_estimators=190)
return acf
#gridsearchcv
# def ParaGrid_ABC(self):
# param_grid = {
# "n_estimators": np.arange(10, 201, 10),
# "learning_rate": np.arange(0.1, 1.1, 0.1)
# }
# grid = GridSearchCV(self.UseABC(),param_grid=param_grid,cv = 5)
# grid.fit(self.LoadDataset()[0],self.LoadDataset()[1])
# print(grid.best_params_,
# grid.best_score_,
# grid.best_estimator_,
# grid.best_index_,
# )
    # k-fold cross-validation
    def CVS(self):
        X, y = self.LoadDataset()
        scores = cross_val_score(self.UseABC(), X, y, cv=5)
print(scores)
if __name__ == '__main__':
UseAdaBoost()
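`cross_val_score` returns one accuracy per fold; summarizing them as mean ± standard deviation is the usual way to report the result. A small extension of CVS:
        # summarize the five per-fold scores
        print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))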
Grid search the parameters of the base estimator (a decision tree here; any estimator works):
class UseAdaBoost():
    def __init__(self):  # run the base-estimator grid search on instantiation
#self.ParaGrid_ABC()
#self.CVS()
self.ParmGrid_CLF()
#load dataset
def LoadDataset(self):
wine = load_wine()
X = wine.data
y = wine.target
return X,y
#base model
def UseCLF(self):
clf = DecisionTreeClassifier(random_state=30)
return clf
#boosting model
def UseABC(self):
        acf = AdaBoostClassifier(learning_rate=0.7,  # np.arange produced 0.7000000000000001; rounded here
                                 n_estimators=190)
return acf
def ParmGrid_CLF(self):
param_grid = {
"max_depth": np.arange(3,13),
"min_samples_leaf": np.arange(1,20)
}
        X, y = self.LoadDataset()
        grid = GridSearchCV(self.UseCLF(), param_grid=param_grid, cv=5)
        grid.fit(X, y)
print(grid.best_params_,
grid.best_score_,
grid.best_estimator_,
grid.best_index_,
)
#gridsearchcv
# def ParaGrid_ABC(self):
# param_grid = {
# "n_estimators": np.arange(10, 201, 10),
# "learning_rate": np.arange(0.1, 1.1, 0.1)
# }
# grid = GridSearchCV(self.UseABC(),param_grid=param_grid,cv = 5)
# grid.fit(self.LoadDataset()[0],self.LoadDataset()[1])
# print(grid.best_params_,
# grid.best_score_,
# grid.best_estimator_,
# grid.best_index_,
# )
    # k-fold cross-validation
# def CVS(self):
# scores = cross_val_score(self.UseABC(), self.LoadDataset()[0],self.LoadDataset()[1], cv=5)
# print(scores)
if __name__ == '__main__':
UseAdaBoost()
Grid search the boosting parameters again, now with the tuned base estimator:
class UseAdaBoost():
    def __init__(self):  # rerun the booster grid search on instantiation
self.ParaGrid_ABC()
#self.CVS()
#self.ParmGrid_CLF()
#load dataset
def LoadDataset(self):
wine = load_wine()
X = wine.data
y = wine.target
return X,y
#base model
def UseCLF(self):
clf = DecisionTreeClassifier(max_depth=4,min_samples_leaf=1,random_state=30)
return clf
#boosting model
def UseABC(self):
        acf = AdaBoostClassifier(estimator=self.UseCLF(),  # named 'base_estimator' before scikit-learn 1.2
                                 learning_rate=0.7,  # np.arange produced 0.7000000000000001; rounded here
                                 n_estimators=190)
return acf
# def ParmGrid_CLF(self):
# param_grid = {
# "max_depth": np.arange(3,13),
# "min_samples_leaf": np.arange(1,20)
# }
# grid = GridSearchCV(self.UseCLF(), param_grid=param_grid, cv=5)
# grid.fit(self.LoadDataset()[0], self.LoadDataset()[1])
# print(grid.best_params_,
# grid.best_score_,
# grid.best_estimator_,
# grid.best_index_,
# )
#gridsearchcv
def ParaGrid_ABC(self):
param_grid = {
"n_estimators": np.arange(10, 201, 10),
"learning_rate": np.arange(0.1, 1.1, 0.1)
}
        X, y = self.LoadDataset()
        grid = GridSearchCV(self.UseABC(), param_grid=param_grid, cv=5)
        grid.fit(X, y)
print(grid.best_params_,
grid.best_score_,
grid.best_estimator_,
grid.best_index_,
)
    # k-fold cross-validation
# def CVS(self):
# scores = cross_val_score(self.UseABC(), self.LoadDataset()[0],self.LoadDataset()[1], cv=5)
# print(scores)
if __name__ == '__main__':
UseAdaBoost()
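The two-stage search above can also be collapsed into one: scikit-learn routes parameters to a nested estimator via a double-underscore prefix, so the tree depth and the booster settings can be tuned jointly. A minimal sketch on the same wine data (with scikit-learn >= 1.2; older versions use the base_estimator__ prefix):
abc = AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=30))
param_grid = {
    'estimator__max_depth': np.arange(3, 13),  # routed to the inner decision tree
    'n_estimators': np.arange(10, 201, 10),    # applies to AdaBoost itself
}
grid = GridSearchCV(abc, param_grid=param_grid, cv=5, n_jobs=-1)
X, y = load_wine(return_X_y=True)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)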
Configure the model with the best parameters from both rounds of search, then score it with cross-validation:
class UseAdaBoost():
    def __init__(self):  # run the final cross-validation scoring on instantiation
#self.ParaGrid_ABC()
self.CVS()
#self.ParmGrid_CLF()
#load dataset
def LoadDataset(self):
wine = load_wine()
X = wine.data
y = wine.target
return X,y
#base model
def UseCLF(self):
clf = DecisionTreeClassifier(max_depth=4,min_samples_leaf=1,random_state=30)
return clf
#boosting model
def UseABC(self):
        acf = AdaBoostClassifier(estimator=self.UseCLF(),  # named 'base_estimator' before scikit-learn 1.2
                                 learning_rate=0.1,
                                 n_estimators=50)
return acf
# def ParmGrid_CLF(self):
# param_grid = {
# "max_depth": np.arange(3,13),
# "min_samples_leaf": np.arange(1,20)
# }
# grid = GridSearchCV(self.UseCLF(), param_grid=param_grid, cv=5)
# grid.fit(self.LoadDataset()[0], self.LoadDataset()[1])
# print(grid.best_params_,
# grid.best_score_,
# grid.best_estimator_,
# grid.best_index_,
# )
#gridsearchcv
def ParaGrid_ABC(self):
param_grid = {
"n_estimators": np.arange(10, 201, 10),
"learning_rate": np.arange(0.1, 1.1, 0.1)
}
        X, y = self.LoadDataset()
        grid = GridSearchCV(self.UseABC(), param_grid=param_grid, cv=5)
        grid.fit(X, y)
print(grid.best_params_,
grid.best_score_,
grid.best_estimator_,
grid.best_index_,
)
    # k-fold cross-validation
    def CVS(self):
        X, y = self.LoadDataset()
        scores = cross_val_score(self.UseABC(), X, y, cv=5)
print(scores)
if __name__ == '__main__':
UseAdaBoost()
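The scores above come from cross-validation on the full wine data; as in the breast-cancer example below, a held-out test set gives a final sanity check. A sketch, reusing the tuned parameters:
from sklearn.model_selection import train_test_split

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
acf = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=4, min_samples_leaf=1, random_state=30),
                         learning_rate=0.1, n_estimators=50)
acf.fit(X_train, y_train)
print('Test accuracy: %.3f' % acf.score(X_test, y_test))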
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
'''
Load the breast cancer dataset.
The first two columns hold the sample ID and the diagnosis (M = malignant, B = benign);
columns 3-32 contain the 30 features.
'''
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'
+ '/breast-cancer-wisconsin/wdbc.data',
header=None)
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
# convert the class labels from strings (M, B) to integers (0, 1)
y = le.fit_transform(y)
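The learned mapping can be checked directly: classes_ lists the original labels in the order of their integer codes (LabelEncoder sorts them, so B becomes 0 and M becomes 1):
print(le.classes_)               # ['B' 'M']
print(le.transform(['M', 'B']))  # [1 0]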
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
'''
Combine standardization and the classifier in a pipeline.
Pipeline takes a sequence of tuples as input: the first element of each tuple
is a string that addresses that step of the pipeline, and the second is a
scikit-learn transformer or estimator.
'''
pipe_svc = Pipeline([
('scl', StandardScaler()),
('clf', SVC(random_state=0))
])
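The step names are how the pipeline's elements are accessed later, and they also form the clf__-prefixed parameter names used in the search grid below:
print(pipe_svc.named_steps['scl'])  # StandardScaler()
print(pipe_svc.named_steps['clf'])  # SVC(random_state=0)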
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# define the hyperparameters to tune as a dictionary
param_dist = {'clf__C': param_range,
'clf__kernel': ['linear', 'rbf'],
'clf__gamma': param_range}
rs = RandomizedSearchCV(estimator=pipe_svc,
                        param_distributions=param_dist,
                        cv=10)  # samples n_iter=10 candidates by default; results vary without random_state
rs.fit(X_train, y_train)
print(rs.best_score_)
'''
Best k-fold cross-validation accuracy:
0.9802197802197802
'''
print(rs.best_params_)
'''
Best hyperparameters found:
{'clf__C': 10.0, 'clf__gamma': 0.01, 'clf__kernel': 'rbf'}
'''
clf = rs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))
'''
Test accuracy: 0.982
'''
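With plain lists, RandomizedSearchCV draws its candidates from a finite grid; passing scipy distributions instead makes the sampling genuinely continuous. A sketch with scipy.stats.loguniform, which scikit-learn accepts directly:
from scipy.stats import loguniform

param_dist = {'clf__C': loguniform(1e-4, 1e3),
              'clf__gamma': loguniform(1e-4, 1e3),
              'clf__kernel': ['linear', 'rbf']}
rs = RandomizedSearchCV(estimator=pipe_svc,
                        param_distributions=param_dist,
                        n_iter=20,        # number of sampled candidates
                        cv=10,
                        random_state=0)   # make the sampling reproducible
rs.fit(X_train, y_train)
print(rs.best_params_)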