Python - 回归(线性回归、RFE、LASSO 和 岭回归+K折交叉验证)

1. 普通线性回归:通过使模型的真实值和预测值的平均平方差尽可能小来求解(即最小二乘估计法),但容易陷入过度拟合(即低偏差、高方差),后续的回归方法会引入正则化来缩减系数。
2. 普通线性回归+RFE:RFE是recursive feature elimination(递归特征消除),在递归消除特征的过程中只保留no_features个最重要的特征,可以避免过度拟合,但RFE会舍弃一些变量,远没有下面几个给变量赋权重的方法来得好。
3. L2缩减回归 - 岭回归:正则化项采用L2范数,alpha越大,缩减幅度越大。岭回归比LASSO的预测能力好一点,但LASSO能完成变量选择。
4. L1缩减回归 - LASSO:Least absolute shrinkage and selection operator(最小绝对值缩减和选择算子),LASSO更偏向于稀疏的结果:如果一个结果的大多数系数被压缩为0,那么它被称为稀疏的。LASSO会把大多数系数变成0,对相互关联的变量,只选择保留其中一个。

RFE:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 05 19:52:39 2018

@author: Alvin AI
"""

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations
from sklearn.feature_selection import RFE

#Load the data
def get_data():
    """Load the Boston housing dataset.

    Returns:
        A ``(x, y)`` tuple holding the ``'data'`` and ``'target'`` entries
        of the object returned by ``load_boston()``.
    """
    boston = load_boston()
    return boston['data'], boston['target']

#Build the model
#Recursive feature elimination (RFE) keeps only the no_features most
#important features
def build_model(x,y,no_features):
    """Fit a linear regression wrapped in recursive feature elimination.

    Args:
        x: Training feature matrix.
        y: Training targets.
        no_features: Number of features RFE should retain.

    Returns:
        The fitted ``RFE`` selector.
    """
    base_regressor = LinearRegression(fit_intercept=True, normalize=True)
    selector = RFE(n_features_to_select=no_features, estimator=base_regressor)
    # RFE.fit returns the selector itself, so the fitted object is returned.
    return selector.fit(x, y)

#Inspect the model
def view_model(model):
    """Print the coefficients and the intercept of a fitted linear model.

    Args:
        model: A fitted estimator exposing ``coef_`` and ``intercept_``.
            Wrappers that expose the fitted estimator as ``estimator_``
            (such as the RFE selector returned by ``build_model``) are
            unwrapped first, because the wrapper itself has no ``coef_``.
    """
    fitted = getattr(model, "estimator_", model)
    print("\nmodel coefficients")
    print("===================\n")
    # coef_ holds the coefficients, intercept_ the regression constant.
    for i, coef in enumerate(fitted.coef_):
        # The original "%model" was parsed as the invalid conversion "%m".
        print("\t coefficient %d %0.3f" % (i + 1, coef))
    print("\n\tintercept %0.3f" % fitted.intercept_)

#Compute the mean squared error to assess the model's error
def model_worth(true_y,predicted_y):
    """Print and return the mean squared error of a set of predictions.

    Args:
        true_y: Observed target values.
        predicted_y: Target values predicted by the model.

    Returns:
        The value of ``mean_squared_error(true_y, predicted_y)``.
    """
    # Compute once and reuse for both the report and the return value.
    mse = mean_squared_error(true_y, predicted_y)
    print("\t mean squared error = %0.2f" % mse)
    return mse

#Draw the residual plot
def plot_residual(y,predicted_y):
    """Plot residuals (``y - predicted_y``) against the predicted values.

    Args:
        y: Observed target values.
        predicted_y: Target values predicted by the model.
    """
    # Select figure 1 first so the labels and the data land on the same
    # figure. The original called the non-existent ``plt.figure1``.
    plt.figure(1)
    plt.cla()
    plt.xlabel('predicted y')
    plt.ylabel('residual')
    plt.title('residual plot')
    diff = y - predicted_y
    plt.plot(predicted_y, diff, 'go')
    plt.show()
    
if __name__=="__main__":
    x, y = get_data()
    # Split the data: 30% is held out, and that held-out part is split again
    # (70/30) into a dev set and a test set.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)
    # Prepare polynomial features: interaction terms only (x1*x2), no pure
    # powers such as x1^2.
    poly_features = PolynomialFeatures(interaction_only=True)
    # Fit the transformer on the training data only; held-out sets are
    # merely transformed with the already-fitted transformer.
    x_train_poly = poly_features.fit_transform(x_train)
    # Dev features are prepared but not scored by this script.
    x_dev_poly = poly_features.transform(x_dev)
    choosen_model = build_model(x_train_poly, y_train, 20)
    predicted_y = choosen_model.predict(x_train_poly)
    mse = model_worth(y_train, predicted_y)

    x_test_poly = poly_features.transform(x_test)
    predicted_y = choosen_model.predict(x_test_poly)

    model_worth(y_test, predicted_y)
        
LASSO:
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 09:08:51 2018

@author: Alvin AI
"""

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import numpy as np

#Load the data
def get_data():
    """Return the Boston housing features and targets as ``(x, y)``.

    Both values are read from the object returned by ``load_boston()``
    under the keys ``'data'`` and ``'target'``.
    """
    dataset = load_boston()
    features, targets = dataset['data'], dataset['target']
    return features, targets

#Build the models
def build_models(x,y):
    """Fit a Lasso model for each alpha on a grid and plot the coefficients.

    Args:
        x: Feature matrix.
        y: Target values.

    The same ``Lasso`` instance is refitted for 200 evenly spaced alpha
    values in [0, 0.5]. The coefficient vector of every fit is collected and
    handed to ``coeff_path`` for plotting. Nothing is returned.
    """
    alphas = np.linspace(0, 0.5, 200)
    lasso = Lasso(normalize=True)
    # One coefficient vector per alpha, kept for the plot.
    coef_history = []
    for current_alpha in alphas:
        lasso.set_params(alpha=current_alpha)
        lasso.fit(x, y)
        coef_history.append(lasso.coef_)
    # Plot how the coefficient weights change with alpha.
    coeff_path(alphas, coef_history)
    
#Inspect the regression coefficients
def view_model(model):
    """Print the coefficients and the intercept of a fitted linear model.

    Args:
        model: A fitted estimator exposing ``coef_`` and ``intercept_``.
    """
    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("\n model coeffiecients")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\t coefficient %d %0.3f" % (i + 1, coef))
    print("\n\t intercept %0.3f" % model.intercept_)
    
#Evaluate the model
def model_worth(true_y,predicted_y):
    """Print the mean squared error of a set of predictions.

    Args:
        true_y: Observed target values.
        predicted_y: Target values predicted by the model.
    """
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("\t mean squared error = %0.2f\n"
          % mean_squared_error(true_y, predicted_y))

#Plot the coefficient weights for the different alpha values
def coeff_path(alpha_range,coeffiecients):
    """Plot the coefficient weights against the alpha values.

    Args:
        alpha_range: Alpha values, used as the x axis.
        coeffiecients: One coefficient vector per alpha value.
    """
    # Start from a clean slate, then draw on figure 1.
    plt.close('all')
    plt.cla()
    plt.figure(1)
    plt.plot(alpha_range, coeffiecients)
    plt.xlabel("Alpha Values")
    plt.ylabel("coeffiecient weights for different alpha values")
    # Adjust the x/y ranges so that all of the data is shown.
    plt.axis('tight')
    plt.show()
     
#Called from the main section to see which coefficients are retained
def get_coef(x,y,alpha):
    """Return the indices of the features a Lasso fit leaves non-zero.

    Args:
        x: Feature matrix.
        y: Target values.
        alpha: Lasso regularisation strength.

    Returns:
        A list of positions in ``coef_`` whose absolute value is above 0.
    """
    lasso = Lasso(alpha=alpha, normalize=True)
    lasso.fit(x, y)
    kept = []
    for position, weight in enumerate(lasso.coef_):
        if abs(weight) > 0.0:
            kept.append(position)
    return kept

#Call all of the functions
if __name__ == "__main__":
    x, y = get_data()
    # Build the model repeatedly with different alpha values and plot them.
    build_models(x, y)
    print("\npredicting using all the variables\n")
    full_model = LinearRegression(normalize=True)
    full_model.fit(x, y)
    predicted_y = full_model.predict(x)
    model_worth(y, predicted_y)

    print("\n models at different alpha values\n")
    alpha_values = [0.22, 0.08, 0.01]
    for alpha in alpha_values:
        indices = get_coef(x, y, alpha)
        # How many coefficients survive. The original continued the string
        # literal with a backslash, which embedded the next line's
        # indentation in the printed text.
        print("\t alpha = %0.2f number of variables selected = %d"
              % (alpha, len(indices)))
        # Which coefficients survive.
        print("\t attributes include  %s" % (indices,))
        # Refit an ordinary linear regression on the retained columns only.
        x_new = x[:, indices]
        model = LinearRegression(normalize=True)
        model.fit(x_new, y)
        predicted_y = model.predict(x_new)
        model_worth(y, predicted_y)
                

岭回归+交叉验证迭代器:针对数据较少的情况,把训练集划分为K份,模型在其中K-1份数据上进行训练,剩下的一份用作测试,这样就不需要单独划分dev集,这种方法也叫K折交叉验证法。

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 14:30:10 2018

@author: Alvin AI
"""

from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold,train_test_split
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

#Load the data
def get_data():
    """Fetch the Boston housing data and return it as an ``(x, y)`` pair.

    ``x`` is the ``'data'`` entry and ``y`` the ``'target'`` entry of the
    object returned by ``load_boston()``.
    """
    loaded = load_boston()
    return loaded['data'], loaded['target']

#Build the model
def build_model(x,y):
    """Grid-search the ridge penalty with 5-fold CV and return the best fit.

    Args:
        x: Training feature matrix.
        y: Training targets; ``y.shape[0]`` is used as the sample count.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.

    The per-parameter scores and the best parameters are printed as a side
    effect.
    """
    # Legacy sklearn.cross_validation.KFold signature: (n_samples, n_folds).
    kfold = KFold(y.shape[0], 5)
    # Ridge regression with normalize=True.
    model = Ridge(normalize=True)
    # Candidate alpha values to try.
    alpha_range = np.linspace(0.0015, 0.0017, 30)
    grid_param = {"alpha": alpha_range}
    # GridSearchCV trains the model for every parameter value in the range;
    # cv selects the cross-validation scheme.
    grid = GridSearchCV(estimator=model, param_grid=grid_param, cv=kfold,
                        scoring='mean_squared_error')
    grid.fit(x, y)
    display_param_results(grid.grid_scores_)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(grid.best_params_)
    return grid.best_estimator_
     
    
#Inspect the regression coefficients and the intercept
def view_model(model):
    """Print the coefficients and the intercept of a fitted linear model.

    Args:
        model: A fitted estimator exposing ``coef_`` and ``intercept_``.
    """
    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("\n model coeffiecients")
    print("======================\n")
    for i, coef in enumerate(model.coef_):
        print("\t coefficent %d %0.3f" % (i + 1, coef))
    print("\n\t intercept %0.3f" % model.intercept_)

#Evaluate the model
def model_worth(true_y,predicted_y):
    """Print and return the mean squared error of a set of predictions.

    Args:
        true_y: Observed target values.
        predicted_y: Target values predicted by the model.

    Returns:
        The value of ``mean_squared_error(true_y, predicted_y)``.
    """
    # Compute once and reuse for both the report and the return value.
    mse = mean_squared_error(true_y, predicted_y)
    print("\t Mean squared error = %0.2f" % mse)
    return mse

#Show the parameter results
def display_param_results(param_results):
    """Print one line per entry of a grid-search result list.

    Args:
        param_results: Iterable of indexable records; item ``[0]`` is printed
            as-is and the absolute value of item ``[1]`` is printed as the
            error. The caller passes ``grid.grid_scores_``.
    """
    # NOTE(review): the label says "fold", but the counter is only a running
    # index over the entries; the text is kept unchanged to preserve output.
    for fold, param_result in enumerate(param_results, 1):
        print("fold %d mean squared error %0.2f %s"
              % (fold, abs(param_result[1]), param_result[0]))
        
if __name__ == "__main__":
    x, y = get_data()

    # Split the data into a training set and a test set.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=9)
    # Prepare polynomial features (interaction terms only).
    poly_features = PolynomialFeatures(interaction_only=True)
    # Fit the transformer on the training data only; the test set is merely
    # transformed with the already-fitted transformer.
    x_train_poly = poly_features.fit_transform(x_train)
    x_test_poly = poly_features.transform(x_test)
    choosen_model = build_model(x_train_poly, y_train)
    predicted_y = choosen_model.predict(x_train_poly)
    model_worth(y_train, predicted_y)

    view_model(choosen_model)

    predicted_y = choosen_model.predict(x_test_poly)
    model_worth(y_test, predicted_y)
        


你可能感兴趣的:(机器学习)