A Summary of Practical Machine Learning Projects

1. A Project Template for Predictive Modeling

  • Machine learning automatically mines data to uncover its underlying patterns, then applies those patterns to make predictions on new data.
  • A good way to practice machine learning is to start a project with a dataset obtained from the UCI Machine Learning Repository.
  • A classification or regression project can be broken into the following six steps:
    • 1. Define the problem
    • 2. Understand the data
    • 3. Prepare the data
    • 4. Evaluate algorithms
    • 5. Improve the model
    • 6. Present and deploy the results
    • Note: in practice some of these steps may be merged or broken down further.
  • Project template:
# Template for a Python machine learning project

# 1. Define the problem
# a. Load libraries
# b. Load the dataset

# 2. Understand the data
# a. Descriptive statistics
# b. Data visualization

# 3. Prepare the data
# a. Data cleaning
# b. Feature selection
# c. Data transforms

# 4. Evaluate algorithms
# a. Split out a validation dataset
# b. Define the model evaluation metric
# c. Spot-check algorithms
# d. Compare algorithms

# 5. Improve the model
# a. Tune hyperparameters
# b. Use ensemble methods

# 6. Present results
# a. Make predictions on the validation dataset
# b. Build the final model on the entire dataset
# c. Serialize the model

2. A Regression Project Example

  • Define the problem:
    • This project analyzes the Boston House Price dataset. Each row describes housing in a Boston suburb or town; the data were collected in 1978 and comprise 506 records with the following 14 attributes (a sketch for fetching the raw data follows this list).
      • CRIM: per-capita crime rate by town
      • ZN: proportion of residential land zoned for large lots
      • INDUS: proportion of non-retail business land per town
      • CHAS: Charles River dummy variable (1 if the tract bounds the river, 0 otherwise)
      • NOX: nitric oxide concentration
      • RM: average number of rooms per dwelling
      • AGE: proportion of owner-occupied units built before 1940
      • DIS: weighted distances to five Boston employment centres
      • RAD: index of accessibility to radial highways
      • TAX: full-value property-tax rate per $10,000
      • PTRATIO: pupil-teacher ratio by town
      • B: proportion of Black residents by town
      • LSTAT: percentage of lower-status population
      • MEDV: median value of owner-occupied homes in $1000s (the prediction target)
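    • Note: this dataset was removed from scikit-learn itself (load_boston was dropped in scikit-learn 1.2), so the project code below reads a local housing.csv. If you don't have that file, here is a minimal fetching sketch; the two-lines-per-record handling follows the recipe from the scikit-learn documentation, and the variable names are illustrative assumptions:

# Sketch: fetch the raw Boston housing data from the CMU StatLib archive.
# Each record spans two physical lines in the raw file, hence the
# interleaved slicing below.
import numpy as np
import pandas as pd

data_url = 'http://lib.stat.cmu.edu/datasets/boston'
raw_df = pd.read_csv(data_url, sep=r'\s+', skiprows=22, header=None)
X_raw = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # 13 features
y_raw = raw_df.values[1::2, 2]                                       # MEDV target
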
  • Code:
# 1. Define the problem

import numpy as np
from numpy import arange
from matplotlib import pyplot
from pandas import read_csv
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error

# 2. Load the data
filename = '../housing.csv'
names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
data = read_csv(filename, names=names, sep=r'\s+')  # delim_whitespace was deprecated in recent pandas

# 3. Understand the data
# print(data.shape)
# print(data.dtypes)

# set_option('display.width', 120)    # widen the output so all columns fit on one line ('display.line_width' is an obsolete option name)

# print(data.head(30))

# set_option('display.precision', 1)    # recent pandas requires the full option name
# print(data.describe())

# set_option('display.precision', 2)
# print(data.corr(method='pearson'))

# Data visualization
# Histograms
# data.hist(sharex=False,sharey=False,xlabelsize=1,ylabelsize=1)
# pyplot.show()

# Density plots
# data.plot(kind='density',subplots=True,layout=(4,4),sharex=False,fontsize=1)
# pyplot.show()

# Box plots
# data.plot(kind='box',subplots=True,layout=(4,4),sharex=False,sharey=False,fontsize=4)
# pyplot.show()

# Multivariate plots
# Scatter-plot matrix
# scatter_matrix(data)
# pyplot.show()

# Correlation matrix plot
# fig = pyplot.figure()
# ax = fig.add_subplot(111)
# cax = ax.matshow(data.corr(),vmin=-1,vmax=1,interpolation='none')
# fig.colorbar(cax)
# ticks = np.arange(0,14,1)
# ax.set_xticks(ticks)
# ax.set_yticks(ticks)
# ax.set_xticklabels(names)
# ax.set_yticklabels(names)
# pyplot.show()

'''
- Prepare the data in the following ways to improve model accuracy:
    - Use feature selection to remove most of the highly correlated features
    - Standardize the data to reduce the impact of differing measurement units
    - Rescale the data toward a normal distribution to reduce the effect of differing distributions and improve algorithm accuracy
'''
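
# A minimal sketch illustrating the three ideas above; the 0.75 correlation
# threshold and the use of PowerTransformer are illustrative assumptions,
# not part of the original walkthrough.
from sklearn.preprocessing import PowerTransformer

corr = data.corr(method='pearson').abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))   # upper triangle only
to_drop = [col for col in upper.columns if (upper[col] > 0.75).any()]
print('Candidate highly correlated features to drop:', to_drop)

# Standardize and make each feature more Gaussian-like in one step
# (Yeo-Johnson power transform).
pt = PowerTransformer(method='yeo-johnson', standardize=True)
X_gaussian = pt.fit_transform(data.drop('MEDV', axis=1))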

# Split out a validation dataset (20% for validation, 80% for training)
array = data.values
X = array[:,0:13]
Y = array[:,13]
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=validation_size,random_state=seed)

# Evaluation setup: 10-fold cross-validation with mean squared error; scikit-learn's 'neg_mean_squared_error' is the negated MSE, so scores closer to 0 mean higher accuracy
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'

# Spot-check three linear and three nonlinear algorithms
models = {}
models['LR'] = LinearRegression()
models['LASSO'] = Lasso()
models['EN'] = ElasticNet()
models['KNN'] = KNeighborsRegressor()
models['CART'] = DecisionTreeRegressor()
models['SVM'] = SVR()

# Evaluate the algorithms
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # recent scikit-learn requires shuffle=True when random_state is set
    cv_result = cross_val_score(models[key],X_train,Y_train,cv=kfold,scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key,cv_result.mean(),cv_result.std()))

# Evaluate the algorithms - box plot
# fig = pyplot.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(models.keys())
# pyplot.show()

# Evaluate the algorithms on standardized data
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler',StandardScaler()),('LR',LinearRegression())])
pipelines['ScalerLASSO'] = Pipeline([('Scaler',StandardScaler()),('LASSO',Lasso())])
pipelines['ScalerEN'] = Pipeline([('Scaler',StandardScaler()),('EN',ElasticNet())])
pipelines['ScalerKNN'] = Pipeline([('Scaler',StandardScaler()),('KNN',KNeighborsRegressor())])
pipelines['ScalerCART'] = Pipeline([('Scaler',StandardScaler()),('CART',DecisionTreeRegressor())])
pipelines['ScalerSVM'] = Pipeline([('Scaler',StandardScaler()),('SVM',SVR())])
results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(pipelines[key],X_train,Y_train,cv=kfold,scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key,cv_result.mean(),cv_result.std()))
# Evaluate the algorithms - box plot
# fig = pyplot.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(pipelines.keys())
# pyplot.show()

# Tune the algorithm - KNN
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_neighbors':[1,3,5,7,9,11,13,15,17,19,21]}
model = KNeighborsRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model,param_grid=param_grid,scoring=scoring,cv=kfold)
grid_result = grid.fit(X=rescaledX,y=Y_train)

print('Best: %f using %s' % (grid_result.best_score_,grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],grid_result.cv_results_['std_test_score'],grid_result.cv_results_['params'])
for mean,std,param in cv_results:
    print('%f (%f) with %r' % (mean,std,param))

# Ensemble algorithms
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler',StandardScaler()),('AB',AdaBoostRegressor())])
ensembles['ScaledAB-KNN'] = Pipeline([('Scaler',StandardScaler()),('ABKNN',AdaBoostRegressor(estimator=KNeighborsRegressor(n_neighbors=3)))])  # 'base_estimator' was renamed to 'estimator' in recent scikit-learn
ensembles['ScaledAB-LR'] = Pipeline([('Scaler',StandardScaler()),('ABLR',AdaBoostRegressor(LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler',StandardScaler()),('RFR',RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler',StandardScaler()),('ETR',ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler',StandardScaler()),('GBR',GradientBoostingRegressor())])
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key],X_train,Y_train,cv=kfold,scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key,cv_result.mean(),cv_result.std()))

# Ensemble algorithms - box plot
# fig = pyplot.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(ensembles.keys())
# pyplot.show()

# Tune the ensemble - GBM
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators':[10,50,100,200,300,400,500,600,700,800,900]}
model = GradientBoostingRegressor()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model,param_grid=param_grid,scoring=scoring,cv=kfold)
grid_result = grid.fit(X=rescaledX,y=Y_train)
print('Best: %f using %s' % (grid_result.best_score_,grid_result.best_params_))

# Train the final model (the walkthrough settles on ExtraTreesRegressor with n_estimators=80)
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = ExtraTreesRegressor(n_estimators=80)
model.fit(X=rescaledX,y=Y_train)

# Evaluate the final model on the validation set
rescaledX_validation = scaler.transform(X_validation)
predictions = model.predict(rescaledX_validation)
print(mean_squared_error(Y_validation,predictions))
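
# The template's step 6c calls for serializing the final model, which the
# walkthrough stops short of. A minimal sketch using joblib; the file name
# 'final_model.joblib' is an illustrative assumption, and in practice the
# scaler should be persisted as well (or wrap scaler + model in a Pipeline).
from joblib import dump, load

dump(model, 'final_model.joblib')        # persist the fitted estimator
restored = load('final_model.joblib')    # later / in production: reload it
print(mean_squared_error(Y_validation, restored.predict(rescaledX_validation)))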

