# -*- coding: utf-8 -*-
"""
Created on Thu Apr 05 19:52:39 2018
@author: Alvin AI
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations
from sklearn.feature_selection import RFE
# Load the data
def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y
# Build the model
# Recursive feature elimination (RFE) keeps only the no_features most important features
def build_model(x, y, no_features):
    model = LinearRegression(normalize=True, fit_intercept=True)
    rfe_model = RFE(estimator=model, n_features_to_select=no_features)
    rfe_model.fit(x, y)
    return rfe_model
# Inspect the model
def view_model(model):
    print "\nmodel coefficients"
    print "===================\n"
    # coef_ provides the coefficient array, intercept_ provides the regression constant
    for i, coef in enumerate(model.coef_):
        print "\t coefficient %d %0.3f" % (i + 1, coef)
    print "\n\tintercept %0.3f" % (model.intercept_)
# Compute the mean squared error to evaluate the model
def model_worth(true_y, predicted_y):
    mse = mean_squared_error(true_y, predicted_y)
    print "\t mean squared error = %0.2f" % mse
    return mse
# Plot the residuals
def plot_residual(y, predicted_y):
    plt.cla()
    plt.figure(1)
    plt.xlabel('predicted y')
    plt.ylabel('residual')
    plt.title('residual plot')
    diff = y - predicted_y
    plt.plot(predicted_y, diff, 'go')
    plt.show()
if __name__ == "__main__":
    x, y = get_data()
    # Split the data: 70% train, then split the remainder into dev and test sets
    x_train, x_test_all, y_train, y_test_all = train_test_split(x, y,
                                                    test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(x_test_all, y_test_all,
                                                    test_size=0.3, random_state=9)
    # Prepare some polynomial features
    # interaction_only=True keeps only interaction terms such as x1*x2;
    # pure powers such as x1^2 are excluded
    poly_features = PolynomialFeatures(interaction_only=True)
    x_train_poly = poly_features.fit_transform(x_train)
    x_dev_poly = poly_features.transform(x_dev)
    chosen_model = build_model(x_train_poly, y_train, 20)
    predicted_y = chosen_model.predict(x_train_poly)
    mse = model_worth(y_train, predicted_y)
    x_test_poly = poly_features.transform(x_test)
    predicted_y = chosen_model.predict(x_test_poly)
    model_worth(y_test, predicted_y)
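To see which of the interaction features RFE actually kept, the fitted selector exposes a boolean support_ mask and an integer ranking_ array. A minimal sketch, reusing the chosen_model returned above:

import numpy as np

# support_ is True for the features RFE kept; ranking_ is 1 for kept features
# and grows for features that were eliminated earlier
kept = np.where(chosen_model.support_)[0]
print("number of features kept: %d" % len(kept))
print("indices of kept features: %s" % kept)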
LASSO:
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 09:08:51 2018
@author: Alvin AI
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import numpy as np
# Load the data
def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y
# Build the models
def build_models(x, y):
    alpha_range = np.linspace(0, 0.5, 200)
    model = Lasso(normalize=True)  # normalize the input features
    coefficients = []
    # Fit a model for each alpha value
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(x, y)
        coefficients.append(model.coef_)  # keep the coefficients for plotting
    # print coefficients  # shape is 200 x 13
    # Plot how the coefficient weights change with the alpha values
    coeff_path(alpha_range, coefficients)
    # Inspect the coefficient values
    # view_model(model)
# Inspect the regression coefficients
def view_model(model):
    print "\n model coefficients"
    print "======================"
    for i, coef in enumerate(model.coef_):
        print "\t coefficient %d %0.3f" % (i + 1, coef)
    print "\n\t intercept %0.3f" % (model.intercept_)
# Evaluate the model
def model_worth(true_y, predicted_y):
    print "\t mean squared error = %0.2f\n" % \
        (mean_squared_error(true_y, predicted_y))
# Plot the coefficient weights for different alpha values
def coeff_path(alpha_range, coefficients):
    plt.close('all')
    plt.cla()
    plt.figure(1)
    plt.xlabel("Alpha Values")
    plt.ylabel("Coefficient weights for different alpha values")
    plt.plot(alpha_range, coefficients)
    plt.axis('tight')  # adjust the x/y limits so all the data is visible
    plt.show()
# Called from the main block to see which regression coefficients survive
def get_coef(x, y, alpha):
    model = Lasso(normalize=True, alpha=alpha)
    model.fit(x, y)
    coefs = model.coef_
    indices = [i for i, coef in enumerate(coefs) if abs(coef) > 0.0]
    return indices
# Run all the functions
if __name__ == "__main__":
    x, y = get_data()
    # Build models for different alpha values and plot the coefficient paths
    build_models(x, y)
    print "\npredicting using all the variables\n"
    full_model = LinearRegression(normalize=True)
    full_model.fit(x, y)
    predicted_y = full_model.predict(x)
    model_worth(y, predicted_y)
    print "\n models at different alpha values\n"
    alpha_values = [0.22, 0.08, 0.01]
    for alpha in alpha_values:
        indices = get_coef(x, y, alpha)
        # How many regression coefficients are retained at this alpha
        print "\t alpha = %0.2f number of variables selected = %d" \
            % (alpha, len(indices))
        # Which attributes are retained
        print "\t attributes include ", indices
        x_new = x[:, indices]
        model = LinearRegression(normalize=True)
        model.fit(x_new, y)
        predicted_y = model.predict(x_new)
        model_worth(y, predicted_y)
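The manual alpha sweep above can also be handed to scikit-learn's LassoCV, which cross-validates a list of alpha values and stores the winner in alpha_. A minimal sketch, assuming the same x and y; cv=5 and the alpha grid are illustrative choices:

import numpy as np
from sklearn.linear_model import LassoCV

# Fit a Lasso for every candidate alpha on each fold and keep the best alpha
lasso_cv = LassoCV(alphas=np.linspace(0.01, 0.5, 200), cv=5, normalize=True)
lasso_cv.fit(x, y)
print("best alpha: %0.3f" % lasso_cv.alpha_)
print("variables retained: %d" % sum(abs(c) > 0.0 for c in lasso_cv.coef_))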
Ridge regression + cross-validation iterators: when data is scarce, the training set is split into K folds; the model is trained on K-1 of them and tested on the remaining fold, so there is no need to carve out a separate dev set. This approach is known as K-fold cross-validation.
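A minimal sketch of this idea with a fixed alpha, using the newer sklearn.model_selection.KFold API (the script below uses the older sklearn.cross_validation module); the 5 folds and the alpha value are illustrative choices:

import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

data = load_boston()
x, y = data['data'], data['target']
fold_mse = []
# Each split trains on 4 folds and evaluates on the held-out fold
for train_idx, test_idx in KFold(n_splits=5).split(x):
    model = Ridge(alpha=0.01, normalize=True)
    model.fit(x[train_idx], y[train_idx])
    fold_mse.append(mean_squared_error(y[test_idx], model.predict(x[test_idx])))
print("average cross-validated MSE: %0.2f" % np.mean(fold_mse))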
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 14:30:10 2018
@author: Alvin AI
"""
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold,train_test_split
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# Load the data
def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y
# Build the model
def build_model(x, y):
    # K-fold cross-validation: split the training data into 5 folds
    # (each round trains on 4 folds and validates on the remaining one)
    kfold = KFold(y.shape[0], 5)
    model = Ridge(normalize=True)  # normalize the data and use a ridge regression model
    alpha_range = np.linspace(0.0015, 0.0017, 30)  # candidate alpha values
    grid_param = {"alpha": alpha_range}
    # GridSearchCV trains the model over a range of parameter values;
    # cv defines the cross-validation scheme we are interested in
    grid = GridSearchCV(estimator=model, param_grid=grid_param, cv=kfold,
                        scoring='mean_squared_error')
    grid.fit(x, y)
    display_param_results(grid.grid_scores_)  # show the mean squared error per candidate
    print grid.best_params_  # print the best parameters
    # Return the best estimator for later prediction and inspection
    return grid.best_estimator_
# Inspect the regression coefficients and intercept
def view_model(model):
    # print "\n estimated alpha = %0.3f" % model.alpha_  # alpha chosen by the model
    print "\n model coefficients"
    print "======================\n"
    for i, coef in enumerate(model.coef_):
        print "\t coefficient %d %0.3f" % (i + 1, coef)
    print "\n\t intercept %0.3f" % (model.intercept_)
# Evaluate the model
def model_worth(true_y, predicted_y):
    mse = mean_squared_error(true_y, predicted_y)
    print "\t Mean squared error = %0.2f" % mse
    return mse
# Display the grid-search results (one entry per candidate alpha value)
def display_param_results(param_results):
    for i, param_result in enumerate(param_results, start=1):
        # each entry holds (parameters, mean validation score, per-fold scores)
        print "candidate %d mean squared error %0.2f" % (i, abs(param_result[1])), \
            param_result[0]
if __name__ == "__main__":
    x, y = get_data()
    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,
                                                        random_state=9)
    # Prepare some polynomial features
    poly_features = PolynomialFeatures(interaction_only=True)
    x_train_poly = poly_features.fit_transform(x_train)
    x_test_poly = poly_features.transform(x_test)
    chosen_model = build_model(x_train_poly, y_train)
    predicted_y = chosen_model.predict(x_train_poly)
    model_worth(y_train, predicted_y)
    view_model(chosen_model)
    predicted_y = chosen_model.predict(x_test_poly)
    model_worth(y_test, predicted_y)
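As an alternative to the GridSearchCV loop, scikit-learn's RidgeCV cross-validates a list of alpha values directly. A minimal sketch, reusing x_train_poly, y_train, x_test_poly and y_test from the script above (cv=5 is an illustrative choice):

import numpy as np
from sklearn.linear_model import RidgeCV

# RidgeCV evaluates each alpha by cross-validation and stores the best one in alpha_
ridge_cv = RidgeCV(alphas=np.linspace(0.0015, 0.0017, 30), cv=5, normalize=True)
ridge_cv.fit(x_train_poly, y_train)
print("best alpha: %0.5f" % ridge_cv.alpha_)
model_worth(y_test, ridge_cv.predict(x_test_poly))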