# Demo: machine-learning model selection and evaluation, with hyperparameter search (GridSearchCV)

# 线性回归模型
# from sklearn.linear_model import LinearRegression
#
# lin_reg = LinearRegression()
# lin_reg.fit(housing_prepared,housing_labels)
#
# some_data = housing.iloc[:5]
# some_labels = housing_labels.iloc[:5]
# some_data_prepared = full_pipeline.transform(some_data)
#
# print("prediction:\t",lin_reg.predict(some_data_prepared))
# print("Labels:\t\t",list(some_labels))
#
# # 整个训练集上回归模型的RMSE
# from sklearn.metrics import  mean_squared_error
# housing_predictions = lin_reg.predict(housing_prepared)
# lin_mse = mean_squared_error(housing_labels,housing_predictions)
# lin_rmse = np.sqrt(lin_mse)
# print(lin_rmse)

# 决策树模型
# from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
#
# tree_reg = DecisionTreeRegressor()
# tree_reg = tree_reg.fit(housing_prepared,housing_labels)
#
# housing_predictions = tree_reg.predict(housing_prepared)
# tree_mse = mean_squared_error(housing_labels,housing_predictions)
# tree_rmse = np.sqrt(tree_mse)
# print(tree_rmse)

# Scikit-Learn交叉验证功能:倾向于效用函数(越大越好),而非成本函数(越小越好)
# 故计算分数的函数实际上是负的MSE函数
# from sklearn.model_selection import cross_val_score
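# # 10-fold cross-validation: the training set is randomly split into 10 distinct folds and the
# # decision-tree model is trained and evaluated 10 times, each time evaluating on one fold
# # and training on the other nine.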
# scores = cross_val_score(tree_reg,housing_prepared,housing_labels,scoring='neg_mean_squared_error',cv=10)
# rmse_scores = np.sqrt(-scores)  # 10个评估分数

def display_scores(scores):
    """Print a set of evaluation scores with their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods; the
            commented-out callers in this file pass ``np.sqrt(-scores)``.
    """
    # Each statistic is computed right before it is printed, so earlier
    # lines are still emitted if a later computation fails.
    print('Scores:', scores)
    average = scores.mean()
    print('Mean:', average)
    spread = scores.std()
    print('Standard deviation:', spread)

# display_scores(rmse_scores)

# 随机森林RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# forest_reg = RandomForestRegressor()
# forest_reg = forest_reg.fit(housing_prepared,housing_labels)
#
# # 10折交叉验证
# scores = cross_val_score(forest_reg,housing_prepared,housing_labels,scoring='neg_mean_squared_error',cv=10)
# forest_rmse = np.sqrt(-scores)  # 10个评估分数
#
# display_scores(forest_rmse)


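# GridSearchCV: uses cross-validation to evaluate every possible combination of the given
# hyperparameter values (suitable when the number of combinations is relatively small).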

from sklearn.model_selection import GridSearchCV

# Two hyperparameter grids to search: the first leaves `bootstrap` unset,
# the second explicitly turns bootstrapping off.
param_grid = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

# housing_prepared / housing_labels are not defined in this part of the
# script; presumably they are the prepared training features and labels
# built earlier -- verify against the preceding code.
grid_search.fit(housing_prepared, housing_labels)

print(grid_search.best_params_)
print(grid_search.best_estimator_)  # the best estimator found by the search

final_model = grid_search.best_estimator_

# Separate the target column from the remaining columns of the test set.
y_test = test_set['median_house_value'].copy()
X_test = test_set.drop('median_house_value', axis=1)

# Apply the existing pipeline's transform to the test features, then predict.
X_test_prepared = full_pipeline.transform(X_test)
final_prediction = final_model.predict(X_test_prepared)

# Report the root of the mean squared error on the test set.
final_mse = mean_squared_error(y_true=y_test, y_pred=final_prediction)
final_rmse = np.sqrt(final_mse)
print(final_rmse)


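# RandomizedSearchCV: instead of trying every combination, each iteration picks a random
# value for every hyperparameter and evaluates a fixed number of such random combinations.
# Separately, a fitted random forest (e.g. grid_search.best_estimator_.feature_importances_)
# can indicate the relative importance of each attribute.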

# Related topics: machine learning, Python