机器学习项目流程:
数据预处理(清洗)阶段可能碰到以下情况:
1.缺少数据值
2.含有错误数据值
3.数据格式不一致
4.重复的记录值
训练集:训练模型
验证集:选择合适的参数
测试集:测试模型的泛化能力
import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
# 1.数据
def loaddata(filename):
    """Read an Excel sheet and split it into a feature matrix and a target.

    The sheet is read with its first row as the header and its first column
    as the index.  Of the remaining columns, the last one is returned as the
    target and all the others as the features.

    :param filename: path of the Excel file handed to ``pd.read_excel``
    :return: tuple ``(X, y)`` of NumPy arrays: every column but the last,
        and the last column
    """
    # Convert the whole frame to one array first, then slice it, so both
    # outputs share the dtype of the combined table.
    table = pd.read_excel(filename, header=0, index_col=0).values
    features, target = table[:, :-1], table[:, -1]
    return features, target
if __name__ == '__main__':
    filename = '../data/boston.xls'
    # Load the feature matrix and the target vector from the spreadsheet.
    # (The same data could instead be taken from load_boston():
    # X = boston.data, y = boston.target.)
    X, y = loaddata(filename)

    # Split into training and test sets. No random_state is given, so the
    # split -- and therefore every number printed below -- varies per run.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Candidate hyper-parameters for the ridge model.
    # NOTE: the `normalize` option only exists in older scikit-learn
    # releases (it was removed in 1.2), as does load_boston.
    ridge_model = Ridge()
    param_test = {'alpha': [0.01, 0.03, 0.07, 0.1, 0.2, 0.3, 0.5, 0.8, 1],
                  'normalize': [True, False]}

    # 5-fold cross-validated grid search, scored by negative MSE.
    gv_ridge = GridSearchCV(estimator=ridge_model, param_grid=param_test,
                            scoring='neg_mean_squared_error', cv=5)
    gv_ridge.fit(X_train, y_train)
    print("最好的参数模型:", gv_ridge.best_params_)
    print("最好的模型评分值:", gv_ridge.best_score_)

    # Evaluate the model the search actually selected. With the default
    # refit=True, best_estimator_ is already refitted on the whole training
    # set using best_params_, so the winning parameters are not hard-coded
    # from an earlier run and no second fit is needed.
    final_model = gv_ridge.best_estimator_

    # Predict on the held-out test set and report the mean squared error.
    y_pred = final_model.predict(X_test)
    print('MSE = ', mean_squared_error(y_test, y_pred))

    # Persist the fitted model to disk ...
    joblib.dump(final_model, 'house_train_model.m')
    # ... and load it back to demonstrate the save/load round trip.
    clf = joblib.load('house_train_model.m')
# 运行结果示例(数据集为随机划分,每次运行的结果会略有不同):
# 最好的参数模型: {'alpha': 0.03, 'normalize': True}
# 最好的模型评分值: -27.72935911641626
# MSE = 14.894224192732027
# 线性回归、岭回归案例-波士顿房价数据集分析
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.metrics import mean_squared_error
def mylinear():
    """Fit and compare several linear regressors on the Boston housing data.

    Loads the data with ``load_boston``, holds out 25% of the samples as a
    test set, standardizes the features and the training target, and then
    fits in turn: ``LinearRegression``, ``SGDRegressor``,
    ``Ridge(alpha=1.0)`` and a ``Ridge`` whose ``alpha`` is chosen by a
    5-fold ``GridSearchCV``.  Predictions are converted back to the original
    target scale before the test-set mean squared error is computed.

    :return: None; all results are printed
    """
    # --- 1. Load the data and separate features from the target ---------
    lb = load_boston()
    x = lb.data
    y = lb.target

    # --- 2. Hold out 25% of the samples for testing ----------------------
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # --- 3. Feature engineering: standardization -------------------------
    # Features: the scaler is fitted on the training split only and then
    # applied unchanged to the test split.
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Target: a separate scaler. StandardScaler works on 2-D input, hence
    # the reshape to a single column.
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    # The test target is only ever compared in original units, so it is not
    # scaled (scaling and un-scaling it again would be a no-op). It is shaped
    # as a column to line up with the un-scaled predictions below.
    y_test = y_test.reshape(-1, 1)

    # --- 4a. Ordinary least squares (normal equation) --------------------
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    print('正规方程的回归系数为:\n',lr.coef_)
    # Predict on the test set and undo the target scaling.
    y_predict = std_y.inverse_transform(lr.predict(x_test))
    print('正规方程预测测试集房子的价格:\n',y_predict)
    print('正规方程的回归性能评估为:',mean_squared_error(y_test, y_predict))

    # --- 4b. Stochastic gradient descent ---------------------------------
    sgd = SGDRegressor()
    # SGDRegressor expects a 1-D target; passing the column vector triggers
    # a conversion warning, so flatten it explicitly.
    sgd.fit(x_train, y_train.ravel())
    print('梯度下降法的回归系数为:\n', sgd.coef_)
    # predict() returns a 1-D array here; reshape it to a column because
    # newer scikit-learn releases reject 1-D input to inverse_transform.
    y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
    print('梯度下降法的预测测试集房子的价格:\n', y_sgd_predict)
    print('梯度下降法的回归性能评估为:', mean_squared_error(y_test, y_sgd_predict))

    # --- 4c. Ridge regression with a fixed alpha -------------------------
    rd = Ridge(alpha=1.0)
    rd.fit(x_train,y_train)
    print('岭回归的回归系数为:\n',rd.coef_)
    y_rd_predict = std_y.inverse_transform(rd.predict(x_test))
    print('测试集房子价格为:\n',y_rd_predict)
    print('岭回归的回归性能评估为:',mean_squared_error(y_test,y_rd_predict))

    # --- 4d. Ridge regression tuned by cross-validated grid search -------
    rd_1 = Ridge()
    # Candidate regularization strengths to search over.
    param = {'alpha': [0.5, 1.0, 2]}
    gc = GridSearchCV(rd_1, param_grid=param, cv=5)
    gc.fit(x_train, y_train)
    y_gc_predict = std_y.inverse_transform(gc.predict(x_test))
    print('岭回归网格搜索的回归性能评估为:',mean_squared_error(y_test,y_gc_predict))
    # No `scoring` is passed, so best_score_ uses the estimator's own
    # score method (not MSE) and is measured on the scaled target.
    print('在交叉验证中最好的结果是:\n', gc.best_score_)
    print('最好的参数选择是:\n', gc.best_params_)
if __name__ == "__main__":
    # Run the regression comparison only when executed as a script.
    mylinear()
# 岭回归的回归性能评估为: 16.978116222418087
# 岭回归网格搜索的回归性能评估为: 16.890653980333717