# 波士顿房价预测模型学习整理(GridSearchCV)

# coding: utf-8

import numpy as np
import pandas as pd

# Load the dataset and separate the target column from the feature columns.

data = pd.read_csv('housing.csv')
target_column = 'MEDV'
prices = data[target_column]
# Everything except the target column is used as model input.
features = data.drop(columns=target_column)

# Explore the data: descriptive statistics of the target prices.

# Minimum, maximum, mean, median and standard deviation of the prices.
minimum_price = np.min(prices)
maximum_price = np.max(prices)
mean_price = np.mean(prices)
median_price = np.median(prices)
std_price = np.std(prices)

# Report the computed statistics, one formatted line per statistic.
print("Statistics for Boston housing dataset:\n")
report_lines = (
    ("Minimum price: ${:,.2f}", minimum_price),
    ("Maximum price: ${:,.2f}", maximum_price),
    ("Mean price: ${:,.2f}", mean_price),
    ("Median price ${:,.2f}", median_price),
    ("Standard deviation of prices: ${:,.2f}", std_price),
)
for template, value in report_lines:
    print(template.format(value))

# Visualise the relationship between each feature and the target with
# scatter plots.

import matplotlib.pyplot as plt

medv = data['MEDV']
rm = data['RM']
lstat = data['LSTAT']
ptratio = data['PTRATIO']

# One figure per feature, each drawn in its own colour and shown before the
# next one is created.
for feature_values, colour in ((rm, 'b'), (lstat, 'c'), (ptratio, 'g')):
    plt.scatter(feature_values, medv, c=colour)
    plt.show()

#确定预测评分模型,选用R2方法

from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """Return the R^2 score of the predictions against the true values.

    Args:
        y_true: Ground-truth target values.
        y_predict: Predicted target values, aligned with ``y_true``.

    Returns:
        The score computed by ``sklearn.metrics.r2_score`` with its default
        sample weighting and multi-output averaging.
    """
    return r2_score(y_true, y_predict)

#建立预测模型,通过GridSearchCV找到最有决策树模型

from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
def fit_model(X, y):
    """Find the best decision tree regressor for [X, y] via grid search.

    The search tries ``max_depth`` values 1 through 10, scoring each
    candidate with ``performance_metric`` under 10-fold cross-validation
    without shuffling.

    Args:
        X: Training features.
        y: Training target values.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    cross_validator = KFold(n_splits=10, shuffle=False, random_state=None)
    regressor = DecisionTreeRegressor()
    params = {'max_depth': list(range(1, 11))}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(
        estimator=regressor,
        param_grid=params,
        scoring=scoring_fnc,
        cv=cross_validator,
    )

    # Run the grid search on the input data [X, y].
    grid = grid.fit(X, y)

    # Hand back the best model found by the search.
    return grid.best_estimator_

# Split the data into a training set and a test set using train_test_split.

from sklearn.model_selection import train_test_split

# test_size=0.20 and a fixed random_state are passed through unchanged.
split_parts = train_test_split(
    features,
    prices,
    test_size=0.20,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts
print("Train test split success!")

# Obtain the optimal model from the training data.

optimal_reg = fit_model(X_train, y_train)

# Print the 'max_depth' parameter of the optimal model.

print("Parameter 'max_depth' is {} for the optimal model.".format(optimal_reg.get_params()['max_depth']))

# Build the data for three clients and predict the corresponding prices.
# NOTE(review): each row presumably follows the training feature order
# (RM, LSTAT, PTRATIO) — confirm against the columns of housing.csv.

client_data = [[5, 17, 15],  # client 1
               [4, 32, 22],  # client 2
               [8, 3, 12]]   # client 3

# Run the prediction.

predicted_price = optimal_reg.predict(client_data)
print(predicted_price)
for i, price in enumerate(predicted_price):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))

# 你可能感兴趣的:(学习笔记)