【精品系列】【机器学习实战】【实践项目一】区域房价中位数预测(完整代码)

import os
import tarfile
import urllib.request

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from scipy import stats

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
	"""
	下载数据
	"""
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()


def load_housing_data(housing_path=HOUSING_PATH):
	"""
	读取数据
	"""
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)


# "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedroom_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
	"""
	自定义转换器
	"""
    def __init__(self, add_bedroom_per_room=True):
        self.add_bedroom_per_room = add_bedroom_per_room

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """
        X[:, rooms_ix]:X数组,":"所有行,取rooms_ix列的数据(即,取第rooms_ix列的全部数据,从0开始计数)
        ------------
        np.r_是按列连接两个矩阵,就是把两矩阵上下相加,要求列数相等。
        np.c_是按行连接两个矩阵,就是把两矩阵左右相加,要求行数相等。即添加计算后的新列数据
        例:
            a = np.array([[1, 2, 3],
                          [7,8,9]])
            b = np.array([[4,5,6],
                          [1,2,3]])
            c = np.c_[a,b]

            >>> print(c)
            array([[1, 2, 3, 4, 5, 6],
                   [7, 8, 9, 1, 2, 3]])
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedroom_per_room:
            bedrooms_per_room = X[:, bedroom_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


def display_scores(scores):
    print("分  数:", scores)
    print("平均值:", scores.mean())
    print("标准差:", scores.std())


if __name__ == '__main__':
    # 下载数据
    fetch_housing_data()
    # 读取数据
    housing = load_housing_data()
    # 按收入分组
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0, 1.5, 3, 4.5, 6, np.inf],
                                   labels=[1, 2, 3, 4, 5])
    # 按income_cat类别比例抽取 20% 的测试集 和 80% 的训练集
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    # 删除收入分组,恢复数据
    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)
    # 寻找数据相关性
    corr_matrix = housing.corr()
    corr_matrix["median_house_value"].sort_values(ascending=False)
    """
    数据处理:
        1、训练集拆分成 预测集(housing)和标签(housing_labels);
        2、预测集再拆分出 数字属性(housing_num)和 文本属性(ocean_proximity)
    """
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
    housing_num = housing.drop("ocean_proximity", axis=1)
    """
    数值属性流水线:
        1、SimpleImputer(中位数填充缺失值);
        2、CombinedAttributesAdder(自定义转化器);
        3、StandardScaler(特征缩放)
    """
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    num_attribs = list(housing_num)  # 获取数字集列头列表
    cat_attribs = ["ocean_proximity"]
    """
    所有属性的流水线:
        1、num_pipeline(数值属性流水线);
        2、OneHotEncoder(文本属性 转换为 独热向量)
    """
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)
    # 线性回归模型
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)
    # K-交叉验证 - 线性回归模型
    lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    display_scores(lin_rmse_scores)
    # 决策树模型
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)
    # K-交叉验证 - 决策树模型
    scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    display_scores(tree_rmse_scores)
    # 随机森林模型
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    housing_predictions = forest_reg.predict(housing_prepared)
    forest_mse = mean_squared_error(housing_labels, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    print(forest_rmse)
    # K-交叉验证 - 随机森林模型
    forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    display_scores(forest_rmse_scores)
    """
    模型参数微调:
        模型:随机森林模型
        方法:网格搜索
    """
    param_grid = [
        {
     "n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
        {
     "bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]},
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_mean_squared_error", return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)
    # 微调结果
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)
    """
    模型参数微调:
        模型:随机森林模型
        方法:随机搜索
    """
    param_distribs = {
     
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }
    forest_reg = RandomForestRegressor()
    rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                    n_iter=10, cv=5, scoring='neg_mean_squared_error')
    rnd_search.fit(housing_prepared, housing_labels)
    # 微调结果
    cvres = rnd_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)
    """
    选取调整后最好的模型,进行测试集评估系统
    """
    final_model = rnd_search.best_estimator_

    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print(final_rmse)
    # 评估的精确度,计算泛化误差的95%置信区间:
    confidence = 0.95
    squared_errors = (final_predictions - y_test) ** 2
    np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                             loc=squared_errors.mean(),
                             scale=stats.sem(squared_errors)))

你可能感兴趣的:(机器学习,python,sklearn)