Machine Learning I (Regression Prediction)

# -*- coding: utf-8 -*-
"""
Created on Mon Aug  6 08:48:58 2018

@author: wangxihe
"""
#%% First fit a traditional statistical regression, then try several machine-learning methods
# Data dictionary
#CRIM: per-capita crime rate by town.
#ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
#INDUS: proportion of non-retail business acres per town.
#CHAS: Charles River dummy variable (1 if the tract bounds the river; 0 otherwise).
#NOX: nitric oxide concentration.
#RM: average number of rooms per dwelling.
#AGE: proportion of owner-occupied units built before 1940.
#DIS: weighted distance to five Boston employment centers.
#RAD: index of accessibility to radial highways.
#TAX: full-value property tax rate per $10,000.
#PTRATIO: pupil-teacher ratio by town.
#B: 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town.
#LSTAT: percentage of the population of lower socioeconomic status.
#MEDV: median value of owner-occupied homes, in $1000s.
#%%
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

n_fold=10
seed=7
scoring='neg_mean_squared_error'

os.chdir(r'E:\spyderwork\wxh\数据科学\线性回归')  # raw string: avoid backslash escapes in the path

names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
hprice =pd.read_csv('housing.csv', names=names, delim_whitespace=True)

X=hprice.iloc[:,0:13].copy()  # .ix is deprecated; use .iloc
Y=hprice.iloc[:,13].copy()
train,test,train_y,test_y=train_test_split(X,Y,test_size=0.2,random_state=seed)
len(train)
len(train_y)
dfy=pd.DataFrame(train_y,columns=['MEDV'])
traindata=pd.concat([train,dfy], axis=1)

dftesty=pd.DataFrame(test_y,columns=['MEDV'])
testdata=pd.concat([test,dftesty], axis=1)

#%%
# Categorical variables
var_c=['CHAS','RAD']
# Correlation check among the continuous variables
var_d=['MEDV','CRIM','ZN','INDUS','NOX','RM','AGE','DIS','TAX','PTRATIO','B','LSTAT']
corr=hprice[var_d].corr()
sns.heatmap(corr)
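
#%% (optional) rank predictors by absolute correlation with MEDV
# A small convenience sketch, using only pandas built-ins, to read the
# heatmap above as a sorted list.
print(corr['MEDV'].drop('MEDV').abs().sort_values(ascending=False))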
#%%
# Use |mode - median| / IQR to flag variables that may contain outliers
abs((X[var_d[1:]].mode().iloc[0,] - X[var_d[1:]].median()) /
    (X[var_d[1:]].quantile(0.75) - X[var_d[1:]].quantile(0.25)))
#%% Categorical variables
CHAST=hprice.CHAS.value_counts()
CHAST.plot(kind='bar')
# Two-sample t-test for the binary variable CHAS
CHAS0=hprice[hprice['CHAS']==0]['MEDV']
CHAS1=hprice[hprice['CHAS']==1]['MEDV']
# Levene's test for homogeneity of variance
leveneTest=stats.levene(CHAS0,CHAS1,center='median')
print('w-value=%6.4f, p-value=%6.4f' %leveneTest)
stats.ttest_ind(CHAS0,CHAS1,equal_var=False) # significant (Welch's t-test, since variances differ)
# One-way ANOVA for the multi-level variable RAD
RADT=hprice.RAD.value_counts()
RADT.sort_values().plot(kind='barh')
sm.stats.anova_lm(ols('MEDV~C(RAD)',data=hprice).fit()) # significant
#%% Multicollinearity (VIF)
# Regress each variable on the others: VIF = 1 / (1 - R^2)
def vif(df, col_i):
    cols = list(df.columns)
    cols.remove(col_i)
    cols_noti = cols
    formula = col_i + '~' + '+'.join(cols_noti)
    r2 = ols(formula, df).fit().rsquared
    return 1. / (1. - r2)

#%%
exog = hprice[var_d[1:]]  # exclude the response MEDV from the collinearity check
for i in exog.columns:
    print(i, '\t', vif(df=exog, col_i=i))
# No serious multicollinearity
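
#%% (optional) cross-check with statsmodels' built-in VIF
# A minimal sketch assuming a statsmodels version that ships
# variance_inflation_factor; it expects a design matrix that already
# contains a constant column, and should agree with vif() above.
from statsmodels.stats.outliers_influence import variance_inflation_factor
exog_mat = sm.add_constant(hprice[var_d[1:]]).values
for idx, name in enumerate(var_d[1:], start=1):  # column 0 is the constant
    print(name, '\t', variance_inflation_factor(exog_mat, idx))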
#%% The checks above show the variables are significant; fit the full model
ls0=ols('MEDV~CRIM+ZN+INDUS+NOX+RM+AGE+DIS+TAX+PTRATIO+B+LSTAT+C(CHAS)+C(RAD)',data=traindata).fit()
ls0.summary()

predict0=ls0.predict(traindata)
np.sum((traindata['MEDV']-predict0)**2)/len(traindata)  # training MSE
resid0=ls0.resid
plt.scatter(x=resid0,y=traindata['MEDV'])
# AGE and INDUS are not significant, so drop them
#%%

ls1=ols('MEDV~CRIM+ZN+NOX+RM+DIS+TAX+PTRATIO+B+LSTAT+C(CHAS)+C(RAD)',data=traindata).fit()
ls1.summary()

predict1=ls1.predict(traindata)
np.sum((traindata['MEDV']-predict1)**2)/len(traindata)  # training MSE
resid1=ls1.resid
plt.scatter(x=resid1,y=traindata['MEDV'])
sm.qqplot(resid1,line='45',fit=True)
traindata['resid']=resid1
## Studentized residuals
#%%
# Influential points: flag |studentized residual| >= 2 for n < 1000, >= 3 for n >= 1000;
# the z-score below is a quick approximation to the studentized residual
traindata['resid_t']=(traindata['resid']-traindata['resid'].mean())/traindata['resid'].std()
traindata[traindata['resid_t'].abs()>=2]
traindata1=traindata[traindata['resid_t'].abs()<2].copy()
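
#%% (optional) properly studentized residuals
# The z-score above is only an approximation; this sketch uses statsmodels'
# OLSInfluence to get externally studentized residuals for the same model ls1.
from statsmodels.stats.outliers_influence import OLSInfluence
stud_resid = OLSInfluence(ls1).resid_studentized_external
print((np.abs(stud_resid) >= 2).sum(), 'points flagged by the studentized rule')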

ls2=ols('MEDV~CRIM+ZN+NOX+RM+DIS+TAX+PTRATIO+B+LSTAT+C(CHAS)+C(RAD)',data=traindata1).fit()
ls2.summary()

predict2=ls2.predict(traindata1)
np.sum((traindata1['MEDV']-predict2)**2)/len(traindata1)  # training MSE
resid2=ls2.resid
plt.scatter(x=resid2,y=traindata1['MEDV'])
qqplot=sm.qqplot(resid2,line='45',fit=True)
traindata1['resid']=resid2

#%% Variable selection: forward stepwise search by AIC
'''forward select'''
def forward_select(data, response):
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = float('inf'), float('inf')
    while remaining:
        aic_with_candidates=[]
        for candidate in remaining:
            formula = "{} ~ {}".format(
                response,' + '.join(selected + [candidate]))
            aic = ols(formula=formula, data=data).fit().aic
            aic_with_candidates.append((aic, candidate))
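        # sort descending so pop() returns the pair with the smallest AIC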
        aic_with_candidates.sort(reverse=True)
        best_new_score, best_candidate=aic_with_candidates.pop()
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            print ('AIC is {}, continuing!'.format(current_score))
        else:       
            print ('forward selection over!')
            break
           
    formula = "{} ~ {} ".format(response,' + '.join(selected))
    print('final formula is {}'.format(formula))
    model = ols(formula=formula, data=data).fit()
    return(model)
#%%

col_select=traindata1[['MEDV','CRIM','ZN','INDUS','NOX','RM','AGE','DIS','TAX','PTRATIO','B','LSTAT','CHAS','RAD']]
lms=forward_select(col_select,'MEDV')
# Forward selection picks the same model as ls2, so no refit is needed
#10.064137084511566
#MEDV ~ LSTAT + RM + PTRATIO + DIS + NOX + CHAS + B + ZN + CRIM + RAD + TAX
#%% Check the skewness of the data
skew_var_x = {}
for i in var_d:
    skew_var_x[i] = abs(traindata1[i].skew())
   
skew = pd.Series(skew_var_x).sort_values(ascending=False)
skew
# Plot histograms for the three most-skewed variables
for index in skew[0:3].index:
   traindata1[index].hist(bins=20)
   plt.show()

# Histograms for all variables
for index in skew.index:
   traindata1[index].hist(bins=20)
   plt.show()
#%% Predict on the test set
test_predict=ls2.predict(testdata)
test_resid=testdata['MEDV']-test_predict  # residuals on the test set
MSE=np.sum((testdata['MEDV']-test_predict)**2)/len(testdata)
#38.689864250465426
# What if the outlying test points are removed as well?
testdata['resid']=test_resid
testdata['resid_t']=(testdata['resid']-testdata['resid'].mean())/testdata['resid'].std()
testdata[testdata['resid_t'].abs()>=2]
testdata1=testdata[testdata['resid_t'].abs()<2].copy()
test_predict1=ls2.predict(testdata1)
MSE1=np.sum((testdata1['MEDV']-test_predict1)**2)/len(testdata1)
  
#%% Machine-learning prediction models

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
#%%
models={}
models['LR']=LinearRegression()
models['LASSO']=Lasso()
models['EN']=ElasticNet()
models['KNN']=KNeighborsRegressor()
models['CART']=DecisionTreeRegressor()
models['SVM']=SVR()


#%%
results=[]
for key in models:
    kfold=KFold(n_splits=n_fold,shuffle=True,random_state=seed)  # random_state requires shuffle=True
    cv_result=cross_val_score(models[key],train,train_y,scoring=scoring,cv=kfold)
    results.append(cv_result)
    print('model:%s,  mean:%f,  std:%f'%(key,cv_result.mean(),cv_result.std()))
   


#%%
fig=plt.figure(figsize=(10,10))
fig.suptitle('Algorithm Comparison')
ax=fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(models.keys())
plt.show()
#%%
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])
pipelines['ScalerLASSO'] = Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])
pipelines['ScalerEN'] = Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVR())])


results = []
for key in pipelines:
    kfold = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
    cv_result = cross_val_score(pipelines[key], train, train_y, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
#%%
# Compare algorithms with a boxplot
fig=plt.figure(figsize=(10,10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(pipelines.keys())
plt.show()
#%% The boxplot shows KNN's mean is closest to 0, so pick KNN and tune it with a grid search
scaler=StandardScaler().fit(train)
train_scaler=scaler.transform(train)
param_grid={'n_neighbors':[2,3,4,5,6,7,8,9,10]}
knnmodel=KNeighborsRegressor()
kfold=KFold(n_splits=n_fold,shuffle=True,random_state=seed)
grid=GridSearchCV(estimator=knnmodel,param_grid=param_grid,scoring=scoring,cv=kfold)
grid_result=grid.fit(X=train_scaler,y=train_y)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
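
#%% (optional) the same search done inside a Pipeline
# A sketch that refits the scaler within each CV fold rather than on the full
# training set, avoiding leakage; the step names 'scaler'/'knn' are illustrative.
pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsRegressor())])
pipe_grid = GridSearchCV(estimator=pipe,
                         param_grid={'knn__n_neighbors': list(range(2, 11))},
                         scoring=scoring, cv=kfold)
pipe_grid.fit(train, train_y)
print('Best: %s using %s' % (pipe_grid.best_score_, pipe_grid.best_params_))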
#%% Prediction with the tuned KNN
knnmodel=KNeighborsRegressor(n_neighbors=3).fit(train, train_y)
knn_pre=knnmodel.predict(test)
knn_resid=test_y-knn_pre  # KNeighborsRegressor has no .resid attribute; compute residuals directly
knnMSE_test=np.sum((test_y-knn_pre)**2)/len(test)
knn_pretrain=knnmodel.predict(train)
knn_residtrain=train_y-knn_pretrain
knnMSE_train=np.sum((train_y-knn_pretrain)**2)/len(train)
#olsMSE_test=38.689864250465426
#knnMSE_test=43.03427015250544
#olsMSE_train=10.029292188200227
#knnMSE_train=18.275123762376236
# Conclusion: traditional OLS clearly wins
#%% Ensemble methods
#%%
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostRegressor())])
ensembles['ScaledAB-KNN'] = Pipeline([('Scaler', StandardScaler()),
                                       ('ABKNN', AdaBoostRegressor(base_estimator=KNeighborsRegressor(n_neighbors=3)))])
ensembles['ScaledAB-LR'] = Pipeline([('Scaler', StandardScaler()), ('ABLR', AdaBoostRegressor(LinearRegression()))])
ensembles['ScaledRFR'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestRegressor())])
ensembles['ScaledETR'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesRegressor())])
ensembles['ScaledGBR'] = Pipeline([('Scaler', StandardScaler()), ('GBR', GradientBoostingRegressor())])

results = []
for key in ensembles:
    kfold = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], train, train_y, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
#%%
fig = plt.figure(figsize=(10,10))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(ensembles.keys(),rotation=45)
plt.show()
#%% Cross-validation results (mean negative MSE, std):
#ScaledAB: -14.912337 (6.126548)
#ScaledAB-KNN: -15.721078 (9.410225)
#ScaledAB-LR: -23.611420 (8.504075)
#ScaledRFR: -13.164676 (5.431761)
#ScaledETR: -9.632596 (5.667208)
#ScaledGBR: -10.077588 (4.513702)
#%%
# Ensemble tuning: GBM
#scaler = StandardScaler().fit(train)
#train_scaler = scaler.transform(train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingRegressor()
kfold = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=train, y=train_y)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
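
#%% (optional) widen the GBM search
# A hedged sketch that also searches learning_rate; these grid values are
# illustrative starting points, not tuned recommendations.
param_grid_gbm = {'n_estimators': [100, 300, 500],
                  'learning_rate': [0.05, 0.1, 0.2]}
grid_gbm = GridSearchCV(estimator=GradientBoostingRegressor(),
                        param_grid=param_grid_gbm, scoring=scoring, cv=kfold)
grid_gbm_result = grid_gbm.fit(X=train, y=train_y)
print('Best: %s using %s' % (grid_gbm_result.best_score_, grid_gbm_result.best_params_))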


#%% ScaledETR performs best; tune it
#scaler=StandardScaler().fit(train)
#train_scaler=scaler.transform(train)
param_grid={'n_estimators': [ 30, 40, 50, 60, 70, 80,85,90,95,100,110]}
kfold=KFold(n_splits=n_fold,shuffle=True,random_state=seed)
ETR=ExtraTreesRegressor()
grid=GridSearchCV(estimator=ETR,param_grid=param_grid,scoring=scoring,cv=kfold)
grid_result=grid.fit(X=train_scaler,y=train_y)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
#%% Build the final model with extra trees
# Train the model
etr = ExtraTreesRegressor(n_estimators=80)
etr.fit(X=train_scaler, y=train_y)
# Evaluate the model
test_scaler = scaler.transform(test)
test_predict = etr.predict(test_scaler)
print(mean_squared_error(test_y, test_predict))
sum((test_y-test_predict)**2)/len(test)

train_predict = etr.predict(train_scaler)
print(mean_squared_error(train_y, train_predict))
sum((train_y-train_predict)**2)/len(train)
#olsMSE_test=38.689864250465426
#knnMSE_test=43.03427015250544
#olsMSE_train=10.029292188200227
#knnMSE_train=18.275123762376236
#etrMSE_test 14.ij
