Based on the data in the table on page 4 of Lecture 4 (input features: living area and #bedrooms; target: price), build a linear regression model to predict house prices. (The instructor took part of the data from a Boston house-price estimation model.)
The method of least squares is a mathematical optimization technique that finds the best functional fit to the data by minimizing the sum of squared errors. With least squares, the unknown parameters can be obtained in a simple way, such that the sum of squared errors between the fitted values and the actual data is as small as possible.
h(x) = \sum_{j=0}^{n} \theta_j x_j = \theta^T x
The expression above is the hyperplane we ultimately want to obtain. The loss function is defined as:
J(\theta) = \frac{1}{2} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right)^2
When this function attains its minimum, we obtain the best-fitting linear regression model.
Gradient descent sets a learning rate α and uses its product with the gradient of the current loss function as the descent step; repeating this update many times yields the optimal solution. The drawback is that while convergence is fast far from the optimum, gradient descent converges very slowly once it gets close to the optimum.
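Concretely, each iteration moves the parameters along the negative gradient of J(θ); this is the batch update rule that the LR and gradL functions in the code below implement:

\theta := \theta - \alpha \nabla_\theta J(\theta) = \theta - \alpha \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x^{(i)}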
Since the least-squares loss is a quadratic function whose Hessian matrix is positive definite (provided the design matrix has full column rank), the optimum of the loss function can be obtained directly in a single step:
\theta = (X^T X)^{-1} X^T y
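This is the normal equation; it follows from setting the gradient of J(θ) to zero, where X is the design matrix whose rows are the samples x^{(i)} and y is the vector of targets:

\nabla_\theta J(\theta) = X^T (X\theta - y) = 0 \;\Rightarrow\; X^T X \theta = X^T y \;\Rightarrow\; \theta = (X^T X)^{-1} X^T y

The following script fits the model by gradient descent on the training data, predicts prices for the test points, and plots the fitted plane together with the learning curve.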
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# load data from file
def load_data():
    data_mat = []
    lable_mat = []
    testing_data_mat = []
    # training file: living area, #bedrooms, price (comma-separated)
    with open('linear_regression_training_data.txt', 'r') as file:
        for line in file.readlines():
            cur_line = line.strip().split(',')
            data_mat.append([1, float(cur_line[0]), float(cur_line[1])])
            lable_mat.append(float(cur_line[2]))
    # testing file: only the two feature columns are used
    with open('linear_regression_testing_data.txt', 'r') as file:
        for line in file.readlines():
            cur_line = line.strip().split(',')
            testing_data_mat.append([1, float(cur_line[0]), float(cur_line[1])])
    data_mat = np.asarray(data_mat)
    lable_mat = np.asarray(lable_mat)
    testing_data_mat = np.asarray(testing_data_mat)
    return data_mat, lable_mat, testing_data_mat
# Computes the squared-error loss J(theta) for the current weights
# (proportional to the negative log likelihood under Gaussian noise)
def loss(data_mat, lable_mat, weights):
    loss = 0
    for i in range(data_mat.shape[0]):
        loss += np.square(np.inner(weights, data_mat[i]) - lable_mat[i]) * 0.5
    return loss
# linear regression via batch gradient descent
def LR(data_mat, lable_mat, tol, max_cycles):
    weights = np.array([1.0, 1.0, 1.0])
    alpha = 0.000000001   # learning rate (very small because the features are not scaled)
    cost = []
    for k in range(max_cycles):
        l1 = loss(data_mat, lable_mat, weights)
        cost.append(l1)
        deltaW = gradL(data_mat, lable_mat, weights)
        weights += -alpha * deltaW
        l2 = loss(data_mat, lable_mat, weights)
        # stop when the decrease in the loss falls below the tolerance
        if np.abs(l2 - l1) < tol:
            break
    return weights, k, cost
# Computes the gradient of the loss: X^T (X w - y)
def gradL(data_mat, lable_mat, weights):
    Y = np.squeeze(lable_mat)
    D = np.inner(weights, data_mat) - Y   # residuals h(x) - y for every sample
    G = np.inner(data_mat.T, D)           # X^T * residuals
    return G
# plot the fitted plane, the training/testing points, and the learning curve
def plot(data_mat, lable_mat, testing_data_mat, weights, cost, steps):
    xcord1 = []
    ycord1 = []
    zcord1 = []
    xcord2 = []
    ycord2 = []
    zcord2 = []
    n = data_mat.shape[0]
    m = testing_data_mat.shape[0]
    data_mat = np.array(data_mat)
    testing_z_mat = np.inner(weights, testing_data_mat)   # predicted prices for the test points
    testing_data_mat = np.array(testing_data_mat)
    testing_z_mat = np.array(testing_z_mat)
    lable_mat = np.array(lable_mat)
    for i in range(n):
        xcord1.append(data_mat[i, 1])
        ycord1.append(data_mat[i, 2])
        zcord1.append(lable_mat[i])
    for j in range(m):
        xcord2.append(testing_data_mat[j, 1])
        ycord2.append(testing_data_mat[j, 2])
        zcord2.append(testing_z_mat[j])
    fig1 = plt.figure()
    ax = fig1.add_subplot(111, projection='3d')
    Training = ax.scatter(xcord1, ycord1, zcord1, c='red')
    Testing = ax.scatter(xcord2, ycord2, zcord2, c='green')
    x = np.arange(1000.0, 3500.0, 10)
    y = np.arange(1.0, 5.0, 0.016)
    x, y = np.meshgrid(x, y)
    z = weights[0] + weights[1] * x + weights[2] * y   # fitted plane
    ax.plot_surface(x, y, z)
    ax.set_xlabel('area')
    ax.set_ylabel('bedrooms')
    ax.set_zlabel('price')
    plt.legend([Training, Testing], ["Training", "Testing"], loc='best')
    fig2 = plt.figure()
    iteration = np.arange(steps + 1)
    plt.plot(iteration, cost, linestyle="--", marker="*", linewidth=1.0)
    plt.xlabel('iteration')
    plt.ylabel('cost')
    plt.show()
def main():
    data_mat, lable_mat, testing_data_mat = load_data()
    weights, steps, cost = LR(data_mat, lable_mat, 10**(-8), 100000)
    print("\nLinear Regression\n", weights, " Steps: ", steps)
    plot(data_mat, lable_mat, testing_data_mat, weights, cost, steps)

main()
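The two .txt files are not included here. From the parsing in load_data, each line of the training file is expected to hold living area, number of bedrooms, and price separated by commas, and the testing file only needs the first two columns. A minimal sketch that writes such files, using the sample values from the lecture table that also appear in the normal-equation script below (the two test points are chosen arbitrarily for illustration):

# Hypothetical helper: writes sample data files in the format load_data expects.
samples = [(2104, 3, 400), (1600, 3, 330), (2400, 3, 369), (1416, 2, 232), (3000, 4, 540)]
with open('linear_regression_training_data.txt', 'w') as f:
    for area, bedrooms, price in samples:
        f.write("%d,%d,%d\n" % (area, bedrooms, price))
with open('linear_regression_testing_data.txt', 'w') as f:
    for area, bedrooms in [(1800, 3), (2600, 4)]:   # arbitrary test points
        f.write("%d,%d\n" % (area, bedrooms))

Alternatively, the normal-equation solution can be computed directly on the five lecture samples: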
# Closed-form (normal equation) solution on the five samples from the lecture table.
# Here x holds one feature per row (area, #bedrooms, bias term), i.e. x = X^T.
import numpy as np

x = [[2104, 1600, 2400, 1416, 3000], [3, 3, 3, 2, 4], [1, 1, 1, 1, 1]]
y = [400, 330, 369, 232, 540]
x = np.matrix(x)
y = np.matrix(y)
theta = (x * x.T).I * x * y.T    # theta = (X^T X)^{-1} X^T y
print(theta)
cost = y - theta.T * x           # residuals y - h(x)
lost = cost * cost.T / 2         # value of the loss J(theta) at the optimum
print(cost)
print(lost)
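As a cross-check (a minimal sketch, not part of the original script), the same θ can be obtained with np.linalg.lstsq applied to the conventional design matrix whose rows are the samples; the column order below matches the row order of x above (area, bedrooms, bias):

import numpy as np

# One row per house: [living area, #bedrooms, 1]; same ordering as theta above.
X = np.array([[2104, 3, 1],
              [1600, 3, 1],
              [2400, 3, 1],
              [1416, 2, 1],
              [3000, 4, 1]], dtype=float)
y = np.array([400, 330, 369, 232, 540], dtype=float)

theta, residuals, rank, _ = np.linalg.lstsq(X, y, rcond=None)
print(theta)           # should agree with the normal-equation result above
print(residuals / 2)   # sum of squared residuals / 2, i.e. the loss at the optimum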