1. Predictive Modeling Project Template
- Machine learning automatically mines data for its underlying patterns and applies those patterns to make predictions on new data.
- A good way to practice machine learning is to start a project with a dataset obtained from the UCI Machine Learning Repository.
- A classification or regression machine learning project can be divided into the following six steps:
- 1. Define the problem
- 2. Understand the data
- 3. Prepare the data
- 4. Evaluate algorithms
- 5. Improve the model
- 6. Deploy the results
- Note: sometimes these steps are merged or further subdivided.
- Project template: the skeleton below sketches these six steps in code.
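- A minimal sketch of the template, assuming a generic scikit-learn regression workflow; the file name 'dataset.csv' and the LinearRegression placeholder are illustrative, not prescribed by the template:
# 1. Define the problem: load libraries and the dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
data = read_csv('dataset.csv')   # placeholder dataset, last column is the target
# 2. Understand the data: descriptive statistics (and visualization)
print(data.shape)
print(data.describe())
# 3. Prepare the data: separate features/target and hold out a validation set
array = data.values
X, Y = array[:, :-1], array[:, -1]
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=7)
# 4. Evaluate algorithms: cross-validate one or more candidate models
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LinearRegression()       # placeholder model
scores = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='neg_mean_squared_error')
print('CV MSE: %f (%f)' % (scores.mean(), scores.std()))
# 5. Improve the model: tuning and ensembles would go here
# 6. Deploy the results: fit the final model and check the validation set
model.fit(X_train, Y_train)
print('Validation MSE: %f' % mean_squared_error(Y_val, model.predict(X_val)))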
2. Regression Project Example
- Define the problem:
- This project analyzes the Boston House Price dataset. Each row describes housing in a Boston suburb or town; the data were collected in 1978 and contain 506 records with the following 14 attributes.
- CRIM: per-capita crime rate by town
- ZN: proportion of residential land
- INDUS: proportion of non-residential land in the town
- CHAS: Charles River dummy variable (1 if the tract bounds the river, 0 otherwise)
- NOX: nitric oxide concentration
- RM: number of rooms per dwelling
- AGE: proportion of owner-occupied units built before 1940
- DIS: weighted distance to five Boston employment centers
- RAD: index of accessibility to radial highways
- TAX: property-tax rate per $10,000
- PTRATIO: pupil-teacher ratio by town
- B: proportion of the Black population by town
- LSTAT: percentage of lower-status residents in the area
- MEDV: median value of owner-occupied homes
- Code:
import numpy as np
from numpy import arange
from matplotlib import pyplot
from pandas import read_csv
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
filename = '../housing.csv'
names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
# the file is whitespace-delimited and has no header row;
# sep=r'\s+' replaces the deprecated delim_whitespace=True
data = read_csv(filename, names=names, sep=r'\s+')
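# 'Understand the data' sketch (not in the original listing); it uses the
# set_option, scatter_matrix and pyplot imports that are otherwise unused.
set_option('display.width', 120)
print(data.shape)                    # expect (506, 14)
print(data.dtypes)                   # attribute types
print(data.describe())               # descriptive statistics
print(data.corr(method='pearson'))   # pairwise attribute correlations
scatter_matrix(data)                 # pairwise scatter plots
pyplot.show()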
'''
Ways to improve model accuracy through data preparation:
- feature selection, to remove most of the highly correlated features
- standardization, to reduce the influence of differing measurement scales
- normalizing the distributions, to reduce the influence of differing data distributions
'''
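# A minimal sketch of the first and third ideas above (not in the original
# listing); the 0.8 correlation cutoff is an arbitrary illustrative choice.
corr = data.corr().abs()
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)   # upper triangle only
pairs = corr.where(mask).stack()
print(pairs[pairs > 0.8])   # candidate attribute pairs for feature selection
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()     # Yeo-Johnson transform, to make data more Gaussian
X_gaussian = pt.fit_transform(data.values[:, 0:13])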
array = data.values
X = array[:, 0:13]   # the 13 input attributes
Y = array[:, 13]     # the target: MEDV
validation_size = 0.2
seed = 7
# hold out 20% of the data as a validation set
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'
# baseline: spot-check six algorithms with 10-fold cross-validation
models = {}
models['LR'] = LinearRegression()
models['LASSO'] = Lasso()
models['EN'] = ElasticNet()
models['KNN'] = KNeighborsRegressor()
models['CART'] = DecisionTreeRegressor()
models['SVM'] = SVR()
results = []
for key in models:
    # shuffle=True is required when random_state is set (newer scikit-learn)
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
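# Visual comparison of the baseline algorithms (a sketch, not in the original
# listing) using the imported pyplot.
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()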
# the same six algorithms, each preceded by standardization in a Pipeline
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])
pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])
results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(pipelines[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
# tune KNN's n_neighbors on the standardized training data
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_neighbors': [1,3,5,7,9,11,13,15,17,19,21]}
model = KNeighborsRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'], grid_result.cv_results_['std_test_score'], grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
# spot-check boosting and bagging ensembles, again with standardization
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])
# 'estimator' replaces the 'base_estimator' argument removed in scikit-learn 1.4
ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()), ('ABKNN', AdaBoostRegressor(estimator=KNeighborsRegressor(n_neighbors=3)))])
ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(estimator=LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
# tune GradientBoostingRegressor's n_estimators on the standardized data
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10,50,100,200,300,400,500,600,700,800,900]}
model = GradientBoostingRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
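# The final model below uses ExtraTreesRegressor(n_estimators=80); a grid
# search like this sketch (not in the original listing) is one way such a
# value could be chosen.
param_grid = {'n_estimators': [5,10,20,40,60,80,100]}
model = ExtraTreesRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))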
# train the final model (ExtraTreesRegressor) on the standardized training set
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = ExtraTreesRegressor(n_estimators=80)
model.fit(X=rescaledX, y=Y_train)
# evaluate on the held-out validation set, scaled with the same scaler
rescaledX_validation = scaler.transform(X_validation)
predictions = model.predict(rescaledX_validation)
print(mean_squared_error(Y_validation, predictions))
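# 'Deploy the results' sketch (not in the original listing): persist the
# trained model with pickle; 'final_model.sav' is a hypothetical file name.
from pickle import dump, load
with open('final_model.sav', 'wb') as f:
    dump(model, f)
with open('final_model.sav', 'rb') as f:
    loaded_model = load(f)
print(mean_squared_error(Y_validation, loaded_model.predict(rescaledX_validation)))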