# Fit a sine function with linear regression.
# Generate 200 points on the sine curve over [-2*pi, 2*pi] and add random noise.
import numpy as np

n_dots = 200

# Evenly spaced sample points and their noisy sine values;
# the noise is drawn uniformly from [-0.1, 0.1).
X = np.linspace(-2 * np.pi, 2 * np.pi, n_dots)
noise = 0.2 * np.random.rand(n_dots) - 0.1
Y = np.sin(X) + noise

# scikit-learn expects 2-D feature arrays of shape (n_samples, n_features),
# so turn the 1-D vectors into (200, 1) column vectors.
X = X[:, np.newaxis]
Y = Y[:, np.newaxis]
#reshape()函数的作用是把numpy的数组整形成符合scikit-learn输入格式的数组,否则scikit-learn会报错
from sklearn.linear_model import LinearRegression #线性回归模型
from sklearn.preprocessing import PolynomialFeatures #形成多项式
from sklearn.pipeline import Pipeline #流水线
from sklearn.metrics import mean_squared_error #均方根误差
#创建多项式拟合模型
def polynomial_model(degree=1):
    """Build a pipeline: polynomial feature expansion -> linear regression.

    degree: highest polynomial power to generate (PolynomialFeatures).
    include_bias=False drops the constant x^0 column, because
    LinearRegression fits its own intercept.
    Returns an unfitted sklearn Pipeline.
    """
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    # NOTE(review): the original passed normalize=True here; that parameter
    # was deprecated in scikit-learn 0.24 and removed in 1.2. Add a
    # StandardScaler step to the pipeline if feature scaling is needed.
    linear_regression = LinearRegression()
    pipeline = Pipeline(
        [("polynomial_features", polynomial_features),
         ("linear_regression", linear_regression)])
    return pipeline
# Fit the data with polynomial models of degree 2, 3, 5 and 10, and record
# the training score (R^2) and mean squared error for each.
degrees = [2, 3, 5, 10]
results = []
for d in degrees:
    model = polynomial_model(degree=d)
    model.fit(X, Y)  # train on the full data set
    train_score = model.score(X, Y)  # R^2 on the training data
    mse = mean_squared_error(Y, model.predict(X))
    results.append({"model": model, "degree": d, "score": train_score, "mse": mse})

for r in results:
    print("degree:{};train_score:{};mse:{}".format(r["degree"], r["score"], r["mse"]))
# Plot each fitted model against the noisy data to inspect fit quality.
import matplotlib.pyplot as plt
from matplotlib.figure import SubplotParams  # controls vertical spacing between subplots

plt.figure(figsize=(12, 6), dpi=200, subplotpars=SubplotParams(hspace=0.3))
for i, r in enumerate(results):
    fig = plt.subplot(2, 2, i + 1)
    plt.xlim(-8, 8)
    plt.title("Linear Regression degree={}".format(r["degree"]))
    plt.scatter(X, Y, s=5, c='b', alpha=0.5)  # noisy samples
    plt.plot(X, r["model"].predict(X), 'r-')  # fitted curve
# 波士顿房价数据集总共收集了 13 个特征,具体如下:
# • CRIM: 城镇人均犯罪率。
# • ZN: 城镇超过 25,000 平方英尺的住宅区域的占地比例。
# • INDUS: 城镇非零售业占地比例。
# • CHAS: 是否靠近河边,1 为靠近,0 为远离。
# • NOX: 一氧化氮浓度。
# • RM: 每套房产的平均房间个数。
# • AGE: 1940 年之前就盖好,且业主自住的房子的比例。
# • DIS: 与波士顿市中心的距离。
# • RAD: 周边高速公路的便利性指数。
# • TAX: 每 10,000 美元的财产税率。
# • PTRATIO: 城镇小学师生比例。
# • B: 城镇黑人的比例。
# • LSTAT: 地位较低的人口比例。
# Load the Boston housing data (506 samples, 13 features).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2; fall back to fetching the same data set from OpenML when the
# local loader is unavailable.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
    X = boston.data     # (506, 13); feature names in boston.feature_names
    Y = boston.target   # median house values
except ImportError:
    from sklearn.datasets import fetch_openml
    boston = fetch_openml(name="boston", version=1, as_frame=False)
    X = boston.data
    Y = boston.target.astype(float)

import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

linear_reg = LinearRegression()
start = time.perf_counter()  # wall-clock timer (perf_counter is not CPU time)
linear_reg.fit(X_train, Y_train)
train_score = linear_reg.score(X_train, Y_train)  # R^2 on the training split
test_score = linear_reg.score(X_test, Y_test)     # R^2 on the held-out split
print("模型的训练时间:{0:.6f};train_score:{1:.6f};test_score:{2:.6f}".format(
    time.perf_counter() - start, train_score, test_score))
# 分析思路:特征数据的取值范围相差较大,可以做数据归一化处理(数据归一化只会加快
# 算法的收敛速度、优化训练效率,并不能提升算法的准确性)。
# 优化准确性:train_score 约 0.72,训练样本评分较低,是典型的欠拟合现象。
# 优化欠拟合模型的方法:1、加入更多的特征;2、增加多项式特征。总体上都是为了增加模型的复杂度。
import time
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
# Switch to polynomial input features to increase model complexity.
def polynomial_model(degree=1):
    """Pipeline: PolynomialFeatures (no x^0 bias column) -> LinearRegression."""
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    # NOTE(review): normalize=True was removed from LinearRegression in
    # scikit-learn 1.2; insert a StandardScaler step if scaling is required.
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline
# Compare 2nd- and 3rd-degree polynomial models on the Boston data.
degrees = [2, 3]
for i in degrees:
    poly_model = polynomial_model(degree=i)
    poly_start = time.perf_counter()
    poly_model.fit(X_train, Y_train)
    poly_train_score = poly_model.score(X_train, Y_train)  # R^2, training split
    poly_test_score = poly_model.score(X_test, Y_test)     # R^2, test split
    mse = mean_squared_error(Y_train, poly_model.predict(X_train))
    print("{0:}阶,模型训练时间:{1:.6f};poly_train_score:{2:.6f};poly_test_score:{3:.6f};mse:{4:.6f}".format(
        i, time.perf_counter() - poly_start, poly_train_score, poly_test_score, mse))
# 2 阶、3 阶模型的训练样本分数都有提高,但 3 阶模型的测试样本分数是负数,
# 说明 3 阶模型已经过拟合。下面画出学习曲线加以验证。
# Plot learning curves.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font so Chinese titles render
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores versus training-set size.

    estimator: unfitted model (or pipeline) to evaluate.
    title: figure title.
    X, y: full data set; learning_curve performs the CV splitting.
    ylim: optional (ymin, ymax) limits for the score axis.
    cv: cross-validation strategy passed through to learning_curve.
    n_jobs: number of parallel workers for the CV fits.
    train_sizes: fractions of the data used as increasing training sizes.
    Returns the matplotlib.pyplot module so callers can chain show()/savefig().
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean and spread of the scores over the CV folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()  # draw a background grid
    # Shade +/- one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")  # add the legend
    return plt
from sklearn.model_selection import ShuffleSplit

# 10 random 80/20 splits drive the cross-validated learning curves.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = "多项式线性回归学习曲线(degree={0})"
degrees = [1, 2, 3]

start = time.perf_counter()
for d in degrees:
    # One learning-curve figure per polynomial degree, over the whole data set.
    plot_learning_curve(polynomial_model(d), title.format(d),
                        X, Y, ylim=(0.01, 1.01), cv=cv)
# Fixed typo in the original output label ("elaspe" -> "elapse").
print("elapse:{0:.6f}".format(time.perf_counter() - start))