$$\begin{bmatrix}2&3\\1&4\end{bmatrix}\begin{bmatrix}1&1\\1&1\end{bmatrix}$$

$$\begin{bmatrix}2&3\\1&4\end{bmatrix}\begin{bmatrix}1&1\\1&1\end{bmatrix}=\begin{bmatrix}2\cdot1+3\cdot1&2\cdot1+3\cdot1\\1\cdot1+4\cdot1&1\cdot1+4\cdot1\end{bmatrix}=\begin{bmatrix}5&5\\5&5\end{bmatrix}$$
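A quick NumPy check of this product (NumPy is not used in the original text, so this is just a verification sketch):

```python
import numpy as np

A = np.array([[2, 3],
              [1, 4]])
B = np.array([[1, 1],
              [1, 1]])

# Entry (i, j) of A @ B is the dot product of row i of A with column j of B,
# e.g. the top-left entry is 2*1 + 3*1 = 5.
print(A @ B)
# [[5 5]
#  [5 5]]
```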
(1) $$\begin{bmatrix}2&3\end{bmatrix}\begin{bmatrix}2\\3\end{bmatrix}$$

(2) $$\begin{bmatrix}2\\3\end{bmatrix}\begin{bmatrix}2&3\end{bmatrix}$$

(1) $$\begin{bmatrix}2&3\end{bmatrix}\begin{bmatrix}2\\3\end{bmatrix}=2\cdot2+3\cdot3=13$$

(2) $$\begin{bmatrix}2\\3\end{bmatrix}\begin{bmatrix}2&3\end{bmatrix}=\begin{bmatrix}2\cdot2&2\cdot3\\3\cdot2&3\cdot3\end{bmatrix}=\begin{bmatrix}4&6\\6&9\end{bmatrix}$$
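The same two products checked with NumPy (again only a verification sketch, not part of the original derivation):

```python
import numpy as np

a = np.array([2, 3])

# (1) Row vector times column vector: the inner (dot) product, a scalar.
print(np.dot(a, a))    # 2*2 + 3*3 = 13

# (2) Column vector times row vector: the outer product, a 2x2 matrix.
print(np.outer(a, a))
# [[4 6]
#  [6 9]]
```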
$$\mathrm{tr}\left(\begin{bmatrix}3&1&4\\2&1&3\\1&4&5\end{bmatrix}\right)$$

$$\mathrm{tr}(A)=\sum_{i=1}^{3}a_{ii}=3+1+5=9$$
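And the trace, checked with NumPy:

```python
import numpy as np

A = np.array([[3, 1, 4],
              [2, 1, 3],
              [1, 4, 5]])

# The trace is the sum of the diagonal entries: 3 + 1 + 5 = 9.
print(np.trace(A))  # 9
```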
Suppose we decide to approximate $y$ as a linear function of $x$: $h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2$

To simplify the notation, we also set $x_0 = 1$, so that $h(x) = \sum_{i=0}^{n}\theta_i x_i = \theta^T x$.

Define the cost function: $J(\theta) = \frac{1}{2}\sum_{i=1}^{m}\left(h(x^{(i)}) - y^{(i)}\right)^2$
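A minimal sketch of $h(x)$ and $J(\theta)$ in vectorized form (the function names and the toy data below are assumptions for illustration, not from the original):

```python
import numpy as np

def h(theta, x):
    # h_theta(x) = sum_i theta_i * x_i = theta^T x  (x already contains x_0 = 1)
    return theta @ x

def J(theta, X, y):
    # J(theta) = 1/2 * sum_i (h(x^(i)) - y^(i))^2
    return 0.5 * np.sum((X @ theta - y) ** 2)

# Toy data: three samples with two features each, x_0 = 1 column prepended.
X = np.array([[1.0, 2.0, 3.0],
              [1.0, 4.0, 5.0],
              [1.0, 6.0, 7.0]])
y = np.array([7.0, 8.0, 9.0])
theta = np.zeros(3)
print(J(theta, X, y))  # 0.5 * (49 + 64 + 81) = 97.0
```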
Concretely, let's consider the gradient descent algorithm, which starts with some initial $\theta$ and repeatedly performs the update $\theta_j := \theta_j - \alpha\frac{\partial}{\partial\theta_j}J(\theta)$ ($\alpha$ is called the learning rate; the update is performed simultaneously for all values of $j = 0,\dots,n$). The algorithm repeatedly takes a step in the direction of steepest descent of $J$.

Let's first work with a training set consisting of a single example $(x, y)$, so that we can ignore the sum in the definition of $J$. We have:
$$\begin{aligned}
\nabla J(\theta) = \frac{\partial}{\partial\theta_j}J(\theta) &= \frac{\partial}{\partial\theta_j}\,\frac{1}{2}\left(h_\theta(x)-y\right)^2\\
&= 2\cdot\frac{1}{2}\left(h_\theta(x)-y\right)\cdot\frac{\partial}{\partial\theta_j}\left(h_\theta(x)-y\right)\\
&= \left(h_\theta(x)-y\right)\cdot\frac{\partial}{\partial\theta_j}\left(\sum_{i=0}^{n}\theta_i x_i-y\right)\\
&= \left(h_\theta(x)-y\right)x_j
\end{aligned}$$
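The final step can also be verified symbolically; a small SymPy sketch, assuming two features plus the $x_0 = 1$ term (SymPy is used here only for the check):

```python
import sympy as sp

theta0, theta1, theta2, x0, x1, x2, y = sp.symbols('theta0 theta1 theta2 x0 x1 x2 y')
theta = [theta0, theta1, theta2]
x = [x0, x1, x2]

h = sum(t * xi for t, xi in zip(theta, x))   # h_theta(x) = sum_i theta_i * x_i
J = sp.Rational(1, 2) * (h - y) ** 2         # single-example cost

for j, theta_j in enumerate(theta):
    # dJ/dtheta_j should simplify to (h_theta(x) - y) * x_j
    assert sp.simplify(sp.diff(J, theta_j) - (h - y) * x[j]) == 0
print("dJ/dtheta_j == (h(x) - y) * x_j for every j")
```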
For a single training example, this gives the update rule:
$$\nabla J(\theta) = \left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)}$$

$$\theta_j := \theta_j - \alpha\nabla J(\theta)$$
For a training set containing more than one example, there are two ways to modify this method. The first is to replace it with the following algorithm:
$$\nabla J(\theta) = \sum_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)x_j^{(i)} \quad \text{(for every } j\text{)}$$

Repeat until convergence (i.e. until $\nabla J(\theta)$ approaches 0): $\theta_j := \theta_j - \alpha\nabla J(\theta)$ (for every $j$)
Note: we use the notation "$a := b$" to denote the operation of overwriting $a$ with the value of $b$. In contrast, when we assert a statement of fact we write $a = b$, meaning that the value of $a$ equals the value of $b$.
class LinerRegression:
    x: list             # independent variables (one list of features per sample)
    y: list             # dependent variable
    num_sample: int     # number of samples
    num_varialble: int  # number of variables (features)
    num_theta: int      # number of parameters: num_varialble + 1 (bias term included)
    m: int              # alias for num_theta
    n: int              # alias for num_sample
    lr: float           # learning rate (step size)
    loss: float         # loss
    theta: list         # parameters
    grandient: list     # gradient

    def __init__(self, x, y, lr=0.005):
        # Store the samples
        self.x = x
        self.y = y
        # Count the samples and variables; with np.array these two values could be
        # read from x.shape (plain lists are still used here, not converted yet)
        # self.num_sample, self.num_varialble = x.shape
        self.num_sample = len(y)        # number of samples
        self.num_varialble = len(x[0])  # number of variables
        # Number of parameters derived from the number of variables
        self.num_theta = self.num_varialble + 1
        # Initialize the parameters and the gradient
        self.theta = [0] * self.num_theta
        self.grandient = [0] * self.num_theta
        # Initialize the learning rate
        self.lr = lr
        # Reset the loss to 0 (it is only computed at the very end)
        self.loss = 0
        self.m = self.num_theta
        self.n = self.num_sample
        self.add_theta0_for_x()
        print(self.x, self.y)

    def add_theta0_for_x(self):
        # Prepend x_0 = 1 to every sample so that theta_0 acts as the intercept
        for i in range(self.n):
            self.x[i].insert(0, 1)

    def mathFunction_hypotheses(self, theta, x):  # hypothesis function
        # $$h(x) = \sum_{i=0}^{n}\theta_i x_i$$
        h = 0
        for i in range(self.m):
            h += theta[i] * x[i]
        return h

    def mathFunction_loss(self):  # loss function
        # $$J(\theta) = \frac{1}{2}\sum_{i=1}^{m}\left(h(x^{(i)})-y^{(i)}\right)^2$$
        loss = 0
        for i in range(self.n):
            loss += pow(self.mathFunction_hypotheses(self.theta, self.x[i]) - self.y[i], 2)
        loss = loss / 2  # some articles use loss/m or loss*2/m instead
        return loss

    def grandient_of_loss_update(self):  # compute and update the gradient over the whole batch
        # $$\frac{\partial}{\partial\theta_j}J(\theta)$$
        for j in range(self.m):
            grandient = 0
            for i in range(self.n):
                grandient += (self.mathFunction_hypotheses(self.theta, self.x[i]) - self.y[i]) * self.x[i][j]
            self.grandient[j] = grandient
            # self.grandient[j] = grandient * 2 / self.n  # some tutorials add this scaling

    def theta_update(self):  # compute and update the parameters
        for j in range(self.m):
            self.theta[j] = self.theta[j] - self.lr * self.grandient[j]

    def get_best_theta(self):  # search for the best parameters
        # while(1):
        for i in range(100):
            self.grandient_of_loss_update()
            self.theta_update()
        self.loss = self.mathFunction_loss()  # compute the final loss
        print(self.loss)
        return self.theta


if __name__ == "__main__":  # program entry point
    x = [[1, 2, 3], [3, 4, 5], [5, 6, 7]]
    y = [7, 8, 9]
    gd = LinerRegression(x, y)
    print(gd.get_best_theta())
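The comment in `__init__` mentions that the plain lists could be replaced by `np.array`; a minimal vectorized sketch of the same batch gradient descent (the function and variable names here are assumptions, not part of the class above):

```python
import numpy as np

def batch_gradient_descent(x, y, lr=0.005, iters=100):
    X = np.hstack([np.ones((len(y), 1)), np.array(x, dtype=float)])  # prepend x_0 = 1
    y = np.array(y, dtype=float)
    theta = np.zeros(X.shape[1])
    for _ in range(iters):
        grad = X.T @ (X @ theta - y)   # same unscaled gradient as grandient_of_loss_update
        theta -= lr * grad             # theta_j := theta_j - alpha * gradient_j
    loss = 0.5 * np.sum((X @ theta - y) ** 2)
    return theta, loss

theta, loss = batch_gradient_descent([[1, 2, 3], [3, 4, 5], [5, 6, 7]], [7, 8, 9])
print(theta, loss)
```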
print("11111")