The errors are independent and identically distributed, following a Gaussian distribution with mean 0 and variance σ²;
Derivation: https://download.csdn.net/download/Little_mosquito_/86746462
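A condensed sketch of the standard derivation under the Gaussian error assumption above: maximizing the likelihood of the observations is equivalent to minimizing the squared error, and setting its gradient to zero gives the closed-form solution used in the code below.
$$y^{(i)} = \theta^{T}x^{(i)} + \varepsilon^{(i)},\ \varepsilon^{(i)} \sim N(0,\sigma^{2}) \;\Rightarrow\; \max_{\theta}\prod_{i=1}^{m}\frac{1}{\sqrt{2\pi}\,\sigma}\exp\Big(-\frac{(y^{(i)}-\theta^{T}x^{(i)})^{2}}{2\sigma^{2}}\Big) \;\Longleftrightarrow\; \min_{\theta}\ J(\theta)=\frac{1}{2}\sum_{i=1}^{m}\big(y^{(i)}-\theta^{T}x^{(i)}\big)^{2}$$
$$\nabla_{\theta}J(\theta)=0 \;\Rightarrow\; \theta=(X^{T}X)^{-1}X^{T}y$$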
import warnings
warnings.filterwarnings("ignore") # suppress warnings
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# read the data file
data=pd.read_csv(r"C:\Users\Administrator\Desktop\jqxx\data_line.txt",sep=' ',names=['x1','y'])
data['x0']=1 # add a constant column for the intercept term
feature=data[['x0','x1']] # features
label=data['y'] # labels
# plot the original data
plt.figure(figsize=(8,4),dpi=100) # size in inches; dpi: resolution
plt.scatter(data['x1'],label,color='black') # original data
plt.title("original_data")
# solve for the coefficients with ordinary least squares
feature_matrix=np.mat(feature) # convert to matrix X
label_matrix=np.mat(label).T # transpose into a column vector y
theta = (feature_matrix.T * feature_matrix).I * feature_matrix.T * label_matrix
y_predict=feature_matrix * theta
plt.figure(figsize=(8,4),dpi=100)
plt.scatter(data['x1'],label,color='black') # original data
plt.plot(data['x1'],y_predict,color ='blue',linewidth=3) # fitted line
plt.title("least_square")
Gradient descent is one of the classic methods for driving the cost function to its minimum;
For a function z = f(x, y), the direction at a point P in which the function changes fastest is the direction of the gradient;
The gradient of z = f(x, y) at a point P(x, y) has the same direction as the normal vector of the level curve f(x, y) = c through P, and it points from lower-valued level curves toward higher-valued ones;
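A small worked example of the two statements above:
$$f(x,y)=x^{2}+y^{2},\quad \nabla f=(2x,\,2y);\quad \text{at } P(1,2):\ \nabla f\big|_{P}=(2,4)$$
This vector is normal to the level curve $x^{2}+y^{2}=5$ passing through P and points from lower-valued level curves toward higher-valued ones, i.e. it is the direction in which f increases fastest.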
4.1 Understanding gradient descent:
import numpy as np
%matplotlib inline
# define the function
x = np.linspace(-5,9)
y = x**2-4*x
# find the minimum with gradient descent
def f_prime(x_old): # derivative of f(x)
    return 2 * x_old - 4
x_old = -200 # initial value
alpha = 0.001 # step size (learning rate), controls how large each update is
for i in range(10000): # iterate the parameter update
    x_new = x_old - alpha * f_prime(x_old)
    x_old = x_new
print(x_new)
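A hedged variant of the loop above (the tolerance value 1e-8 is an illustrative assumption): instead of always running 10000 iterations, stop as soon as the update becomes negligible.
x_old = -200
for i in range(10000):
    x_new = x_old - alpha * f_prime(x_old)
    if abs(x_new - x_old) < 1e-8:  # stop once consecutive iterates barely change
        break
    x_old = x_new
print(x_new, 'after', i + 1, 'iterations')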
4.2 Finding the minimum with gradient descent:
# gradient descent
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
# y = kx + b, k is slope, b is y-intercept
# one update (single iteration)
def step_gradient(b_current, k_current, points, learningRate):
    b_gradient = 0
    k_gradient = 0
    for i in range(0, len(points)):
        x = points[i, 0]
        y = points[i, 1]
        k_gradient += (((k_current * x) + b_current)-y)*x
        b_gradient += (((k_current * x) + b_current)-y)*1
    new_b = b_current - (learningRate * b_gradient)
    new_k = k_current - (learningRate * k_gradient)
    return [new_b, new_k]
# error function: mean squared error of the current line, computed after each iteration
def compute_error_for_line_given_points(b, k, points):
    totalError = 0
    for i in range(0, len(points)):
        x = points[i, 0]
        y = points[i, 1]
        totalError += (y - (k * x + b)) ** 2
    return totalError/len(points)
# run multiple iterations
def gradient_descent_runner(points, starting_b, starting_k, learning_rate, num_iterations):
    b = starting_b
    k = starting_k
    error=[]
    for i in range(num_iterations): # number of iterations
        b, k = step_gradient(b, k, np.array(points), learning_rate) # one update
        error.append(compute_error_for_line_given_points(b, k, points))
    return [b, k, error]
# plot the original data
points = np.array(pd.read_csv(r"C:\Users\Administrator\Desktop\jqxx\data.csv", delimiter=",",names=['x1','y']))
plt.figure(figsize=(8,4),dpi=100)
plt.scatter(points[:,0],points[:,1],color='black')
plt.title("original_data")
# fit the line with gradient descent
plt.figure(figsize=(8,4),dpi=100)
plt.scatter(points[:,0],points[:,1],color='black')
learning_rate = 0.000001 # learning rate
initial_b = 0 # initial y-intercept guess
initial_k = 0 # initial slope guess
num_iterations = 10000
[b, k, error1] = gradient_descent_runner(points, initial_b, initial_k, 0.0000001, num_iterations) # note: a smaller rate of 1e-7 is hard-coded here
y_predict=points[:,0]*k+b # prediction
plt.plot(points[:,0],y_predict,color='blue')
plt.title("GradientDescent")
# error vs. number of iterations
plt.figure(figsize=(8,4),dpi=100)
plt.plot(range(num_iterations)[0:1000],np.array(error1)[0:1000],color='blue')
plt.xlabel("iteration number") #添加x轴的名称
plt.ylabel("error")
Choosing the step size (learning rate):
[b, k, error1] = gradient_descent_runner(points, initial_b, initial_k, 0.000001, num_iterations)
[b, k, error2] = gradient_descent_runner(points, initial_b, initial_k, 0.0000001, num_iterations)
[b, k, error3] = gradient_descent_runner(points, initial_b, initial_k, 0.000008, num_iterations)
# error vs. number of iterations for the three step sizes
plt.figure(figsize=(8,4),dpi=100)
plt.plot(range(num_iterations)[0:100],np.array(error1)[0:100],color='red')
plt.plot(range(num_iterations)[0:100],np.array(error2)[0:100],color='blue')
plt.plot(range(num_iterations)[0:100],np.array(error3)[0:100],color='green')
plt.xlabel("iteration number")
plt.ylabel("error")
Choosing the initial parameter values:
A single starting point may only reach a local optimum; start the iteration from several initial values and keep the parameters that give the lowest error (see the multi-start sketch below);
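A minimal multi-start sketch, reusing gradient_descent_runner and compute_error_for_line_given_points from above (the candidate starting points are illustrative assumptions; for this convex squared-error cost every start reaches the same optimum, but the pattern matters for non-convex objectives):
starts = [(0, 0), (10, -5), (-10, 5)]  # several (b, k) initial guesses
runs = [gradient_descent_runner(points, b0, k0, 0.0000001, 1000) for b0, k0 in starts]
# keep the run whose final parameters give the lowest error
best_b, best_k, _ = min(runs, key=lambda r: compute_error_for_line_given_points(r[0], r[1], points))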
Normalization:
Distance-based algorithms (KNN, KMeans) use normalization to remove the influence of differing feature scales;
For gradient descent, normalization mainly reduces the number of iterations needed and makes the optimization more efficient (see the sketch below);
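A minimal sketch of z-score normalization before gradient descent (assumptions: it reuses the points array and gradient_descent_runner defined above, and the learning rate 0.001 is illustrative):
points_std = (points - points.mean(axis=0)) / points.std(axis=0)  # zero mean, unit variance per column
b_s, k_s, err_s = gradient_descent_runner(points_std, 0, 0, 0.001, 1000)
# with features on a unit scale, a learning rate around 1e-3 typically converges in far fewer
# iterations than the ~1e-7 needed on the raw data above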
Batch gradient descent: each update uses all of the samples;
Stochastic gradient descent: each update uses a single randomly chosen sample;
Mini-batch gradient descent: each update uses a small random subset of the samples; this is the variant most commonly used in practice (see the sketch below);
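A hedged sketch of how the three variants differ in the samples they use per update, reusing the squared-error gradient from step_gradient above (the function names here are illustrative assumptions, not part of the original code):
def gradient(b, k, batch):  # gradient of the squared error on a batch of samples
    err = (k * batch[:, 0] + b) - batch[:, 1]
    return err.sum(), (err * batch[:, 0]).sum()  # (db, dk)

def sgd_step(b, k, points, lr, batch_size=None):
    n = len(points)
    if batch_size is None:       # batch gradient descent: use all n samples
        batch = points
    elif batch_size == 1:        # stochastic gradient descent: one random sample
        batch = points[np.random.randint(n)].reshape(1, -1)
    else:                        # mini-batch gradient descent: a small random subset
        idx = np.random.choice(n, batch_size, replace=False)
        batch = points[idx]
    db, dk = gradient(b, k, batch)
    return b - lr * db, k - lr * dk

# usage: b, k = sgd_step(b, k, points, 0.0000001, batch_size=16)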
Further reading: Taylor's formula (Baidu Baike); Gradient descent and Newton's method (CSDN blog, thatway1989); Common optimization methods: gradient descent, Newton's method, quasi-Newton methods, conjugate gradient, etc. (cnblogs, 蓝鲸王子).
Basic idea (Newton's method): at the current iterate x_k, build a quadratic approximation of the objective from its first derivative (gradient) and second derivative, then take the minimizer of that quadratic as the next iterate.
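A minimal 1D sketch of that idea on the same f(x) = x**2 - 4x used in section 4.1 (illustrative code, not from the original post):
def df(x):    # first derivative of f(x) = x**2 - 4x
    return 2 * x - 4
def d2f(x):   # second derivative
    return 2.0
x = -200.0
for _ in range(5):
    x = x - df(x) / d2f(x)  # the minimizer of the local quadratic approximation becomes the next iterate
print(x)  # reaches 2 in a single step, because f itself is quadratic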
Bias: keeping bias low avoids underfitting;
Variance: keeping variance low avoids overfitting;
Understanding the regularization term:
# correlated (nearly collinear) features
x=np.mat([[1,2],[2,4.1]])
(x.T*x).I # the inverse has very large entries, so the coefficients become large
# a tiny change in the features causes a large change in the coefficients: the model is unstable and overfits
x=np.mat([[1,2],[2,4.2]])
(x.T*x).I
# uncorrelated features
x=np.mat([[1.1,2],[2,4.2]])
(x.T*x).I # the inverse has small entries, so the coefficients stay small
# a tiny change in the features causes only a small change in the coefficients: the model is stable
x=np.mat([[1.1,2],[2,4.3]])
(x.T*x).I
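A hedged follow-up to the demo above: adding lam * I to x.T * x before inverting (exactly what ridge regression does below) keeps the inverse, and hence the coefficients, small and stable even for the nearly collinear features; the value lam = 0.5 is illustrative.
lam = 0.5
x = np.mat([[1, 2], [2, 4.1]])  # the nearly collinear features from above
(x.T * x + lam * np.mat(np.eye(2))).I  # entries are orders of magnitude smaller than without lam * I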
Ridge regression implementation:
# ridge regression
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def load_data():
    data=pd.read_table('data_ridge.txt',sep=' ',names=['x1','y'])
    data['x0']=1
    feature=data[['x0','x1']]
    label=data['y']
    return np.mat(feature), np.mat(label).T
def ridge_regression(feature, label, lam):
    n = np.shape(feature)[1]
    theta = (feature.T * feature + lam * np.mat(np.eye(n))).I * feature.T * label
    return theta
# load the data
feature, label = load_data()
plt.figure(figsize = (8,4),dpi = 100)
feature1=feature[:,1].getA()
label1 = label.getA() # extract the underlying ndarray from the matrix
plt.scatter(feature1,label1,color = 'black')
# train the model
theta = ridge_regression(feature, label, 0.0001)
y_fit = feature * theta
plt.plot(feature1,y_fit,color ='blue',linewidth=3)
plt.title("ridge_regression")
Plotting the ridge trace:
# choosing the ridge coefficient via the ridge trace
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from math import exp
def load_data():
    data=pd.read_table('abalone.txt',sep=' ',names=['x1','x2','x3','x4','x5','x6','x7','x8','y'])
    feature=data.loc[:,'x1':'x8']
    label=data['y']
    return np.mat(feature), np.mat(label).T
def standarize(X):
    # standardize with the overall mean and standard deviation of the matrix
    std_deviation = np.std(X)
    mean = np.mean(X)
    return (X - mean)/std_deviation
def ridge_regression(X, y, lambd=0.2):
    m,n = X.shape
    I = np.matrix(np.eye(n))
    w = (X.T*X + lambd*I).I*X.T*y
    return w
def ridge_traj(X, y, k=20):
    # compute the ridge coefficients for k values of lambda, with lambda = e^(i-10)
    m, n = X.shape
    ws = np.zeros((k, n))
    for i in range(k):
        w = ridge_regression(X, y, lambd=exp(i-10))
        ws[i, :] = w.T
    return ws
# load the data
X, y = load_data()
# standardize
X, y = standarize(X), standarize(y)
# plot the ridge trace
k = 30
ws = ridge_traj(X, y, k)
lambdas = [exp(i-10) for i in range(k)]
plt.semilogx(lambdas, ws)
plt.show()
import itertools
from math import exp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
def load_data():
    data=pd.read_table('abalone.txt',sep=' ',names=['x1','x2','x3','x4','x5','x6','x7','x8','y'])
    feature=data.loc[:,'x1':'x8']
    label=data['y']
    return np.mat(feature), np.mat(label).T
def standarize(X):
    ''' Standardize the data (zero mean, unit standard deviation)
    '''
    std_deviation = np.std(X)
    mean = np.mean(X)
    return (X - mean)/std_deviation
def lasso_regression(X, y, lambd=0.2, threshold=0.1):
    ''' Obtain the LASSO regression coefficients via coordinate descent
    '''
    # residual sum of squares
    rss = lambda X, y, w: (y - X*w).T*(y - X*w)
    # initialize the coefficient vector w
    m, n = X.shape
    w = np.matrix(np.zeros((n, 1)))
    r = rss(X, y, w)
    # optimize w with coordinate descent
    niter = itertools.count(1)
    for it in niter:
        for k in range(n):
            # compute the constants z_k and p_k
            z_k = (X[:, k].T*X[:, k])[0, 0]
            p_k = 0
            for i in range(m):
                p_k += X[i, k]*(y[i, 0] - sum([X[i, j]*w[j, 0] for j in range(n) if j != k]))
            if p_k < -lambd/2:
                w_k = (p_k + lambd/2)/z_k
            elif p_k > lambd/2:
                w_k = (p_k - lambd/2)/z_k
            else:
                w_k = 0
            w[k, 0] = w_k
        r_prime = rss(X, y, w)
        delta = abs(r_prime - r)[0, 0]
        r = r_prime
        #print('Iteration: {}, delta = {}'.format(it, delta))
        if delta < threshold:
            break
    return w
def lasso_traj(X, y, k=20):
    ''' Compute the coefficient trajectory matrix (one row per lambda)
    '''
    m, n = X.shape
    ws = np.zeros((k, n))
    for i in range(k):
        w = lasso_regression(X, y, lambd=exp(i-10))
        ws[i, :] = w.T
        #print('lambda = e^({}), w = {}'.format(i-10, w.T[0, :]))
    return ws
X, y = load_data()
X, y = standarize(X), standarize(y)
w = lasso_regression(X, y, lambd=10)
# plot the coefficient trajectory
k = 20
ws = lasso_traj(X, y, k)
fig = plt.figure()
lambdas = [exp(i-10) for i in range(k)]
plt.semilogx(lambdas, ws)
plt.show()
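The per-coordinate update inside lasso_regression above is the standard soft-thresholding rule of coordinate descent. Written out, with $z_k=\sum_i x_{ik}^2$ and $p_k=\sum_i x_{ik}\big(y_i-\sum_{j\ne k}x_{ij}w_j\big)$:
$$w_k=\begin{cases}(p_k+\lambda/2)/z_k, & p_k<-\lambda/2\\ 0, & |p_k|\le\lambda/2\\ (p_k-\lambda/2)/z_k, & p_k>\lambda/2\end{cases}$$
Coordinates whose p_k falls inside the threshold are set exactly to zero, which is why LASSO yields sparse coefficients; this is visible in the trajectory plot, where coefficients drop to zero one by one as lambda grows.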
Algorithm library:
https://scikit-learn.org/stable/
# ordinary (standard) linear regression
from sklearn import linear_model
reg = linear_model.LinearRegression() # create the model; default parameters are fine
reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) # train / fit
reg.predict([[4,4],[5,6]]) # predict
reg.coef_ # coefficients
reg.intercept_ # intercept
from sklearn import linear_model
reg = linear_model.Ridge(alpha=0.5) # the alpha parameter needs tuning
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
reg.predict([[4,4],[3,4]]) # predict
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit([[0, 0], [1, 1]], [0, 1])
reg.predict([[1, 1]])
from sklearn.linear_model import ElasticNet
reg = ElasticNet(alpha=1.0,l1_ratio=0.5) # both alpha and l1_ratio need tuning
reg.fit([[0, 0], [1, 1]], [0, 1])
reg.predict([[1, 1]])
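For reference, scikit-learn documents the ElasticNet objective as minimizing (with $\rho$ = l1_ratio):
$$\min_{w}\ \frac{1}{2n}\lVert y-Xw\rVert_{2}^{2}+\alpha\rho\lVert w\rVert_{1}+\frac{\alpha(1-\rho)}{2}\lVert w\rVert_{2}^{2}$$
so $\rho=1$ recovers the Lasso penalty and $\rho=0$ leaves a pure L2 (ridge-like) penalty; that is why both alpha and l1_ratio are tuned in the grid search below.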
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
def load_data():
    data=pd.read_csv(r'D:\CDA\File\abalone.txt',sep=' ',names=['x1','x2','x3','x4','x5','x6','x7','x8','y'])
    feature=data.loc[:,'x1':'x8']
    label=data['y']
    return np.mat(feature), np.mat(label).T
X, y = load_data()
X, y = StandardScaler().fit_transform(X), StandardScaler().fit_transform(y) # standardize
params={'alpha':[0.001,0.01,0.1,1,10,100]} # coarse search: find the right order of magnitude
ridge = linear_model.Ridge()
grid_search=GridSearchCV(ridge,param_grid=params,cv=10,verbose=2,n_jobs=-1) # verbose: log output
grid_search.fit(X, y)
grid_search.best_params_
params={'alpha':np.arange(1,25)} # fine search
ridge = linear_model.Ridge()
grid_search=GridSearchCV(ridge,params,cv=10,verbose=2,n_jobs=-1)
grid_search.fit(X, y)
grid_search.best_params_
from sklearn.linear_model import ElasticNet
params = {'alpha':[0.00001,0.001,0.01,0.1,1,10,100],'l1_ratio':[0.1,0.2,0.3,0.4,0.5]}
elasticNet = ElasticNet()
grid_search = GridSearchCV(elasticNet,param_grid=params,cv=10,verbose=2,n_jobs=-1)
grid_search.fit(X, y)
grid_search.best_params_
params = {'alpha':np.linspace(0.001,0.1,10),'l1_ratio':np.linspace(0.1,0.3,10)}
elasticNet = ElasticNet()
grid_search = GridSearchCV(elasticNet,param_grid=params,cv=10,verbose=2,n_jobs=-1)
grid_search.fit(X, y)
grid_search.best_params_
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
ld = load_boston()
# split into training and test sets
x_train,x_test,y_train,y_test = train_test_split(ld.data,ld.target,test_size=0.25)
# standardize the features
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)
lr = LinearRegression().fit(x_train,y_train)
y_lr_predict = lr.predict(x_test)
print("lr的均方误差为:",mean_squared_error(y_test,y_lr_predict))
plt.figure(figsize=(8,4),dpi=100)
plt.title("LinearRegression")
plt.plot(y_test,y_lr_predict, 'rx')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-.', lw=4) # y=x
plt.ylabel("Predieted Price")
plt.xlabel("Real Price")
plt.show()
params={'alpha':[0.001,0.01,0.1,1,10,100]} # coarse search
ridge = Ridge()
grid_search=GridSearchCV(ridge,param_grid=params,cv=10)
grid_search.fit(x_train,y_train)
grid_search.best_params_
y_rd_predict = grid_search.predict(x_test)
print("Ridge的均方误差为:",mean_squared_error(y_test,y_rd_predict))
plt.figure(figsize=(8,4),dpi=100)
plt.title("RidgeRegression")
plt.plot(y_test,y_lr_predict, 'rx')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-.', lw=4)
plt.ylabel("Predieted Price")
plt.xlabel("Real Price")
plt.show()
params={'alpha':[0.001,0.01,0.1,1,10,100]} # coarse search
lasso = Lasso()
grid_search=GridSearchCV(lasso,param_grid=params,cv=10)
grid_search.fit(x_train,y_train)
grid_search.best_params_
y_lasso_predict = grid_search.predict(x_test)
print("MSE of Lasso:",mean_squared_error(y_test,y_lasso_predict))
plt.figure(figsize=(8,4),dpi=100)
plt.title("LassoRegression")
plt.plot(y_test,y_lasso_predict, 'rx')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'b-.', lw=4)
plt.ylabel("Predicted Price")
plt.xlabel("Real Price")
plt.show()