在单变量线性回归这一部分中,我们将使用一个变量实现线性回归,以预测餐车的利润。
假设你是一家餐饮连锁店的老板,正在考虑在不同的城市开设新的门店。该连锁店已经在多个城市开设了分店,并且你有这些城市的利润和人口数据。你想利用这些数据来帮助你选择下一个要扩展到的城市。
文件 ex1data1.txt 包含我们线性回归问题的数据集。第一列是城市的人口,第二列是该城市餐车的利润,利润为负值表示亏损。
Python 实现:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the single-feature dataset. The file has no header row, so column
# names are supplied here: city population and food-truck profit.
path = 'ex1data1.txt'
data = pd.read_csv(path, names=['Population', 'Profit'], header=None)

# Optional exploration (disabled):
# print(data.head())      # preview the first rows
# print(data.describe())  # summary statistics
# data.plot(kind='scatter', x='Population', y='Profit', figsize=(12, 8))
# plt.show()

# Prepend a constant column of ones so the intercept is learned as theta[0].
data.insert(0, 'Ones', 1)
# print(data.shape)  # (97, 3) according to the original notes

# Every column except the last is a feature; the last column is the target.
# iloc slices are half-open: [start, stop).
cols = data.shape[1]
Y = data.iloc[:, cols - 1:cols]                  # target, kept as a DataFrame
X = np.matrix(data.iloc[:, 0:cols - 1].values)   # features as a matrix (97 x 2 per the notes)
y = np.matrix(Y.values)                          # target as a matrix (97 x 1 per the notes)
theta = np.matrix(np.array([0, 0]))              # initial parameters, 1 x 2

# An ndarray-based variant was tried and left disabled:
# X = np.array(X)
# Y = np.array(Y)
# theta = np.array([0, 0]).reshape(1, 2)
# print(X.shape)
# print(Y.shape)
# print(theta.shape)
# 计算代价函数
def computCost(X, y, theta):
    """Return the halved mean squared error of a linear model.

    Args:
        X: (m, n) ``np.matrix`` of features (``*`` must be a matrix product).
        y: (m, 1) ``np.matrix`` of targets.
        theta: (1, n) ``np.matrix`` of model parameters.

    Returns:
        ``sum((X * theta.T - y) ** 2) / (2 * m)``, with ``m = len(X)``.
    """
    sample_count = len(X)
    residuals = X * theta.T - y
    # np.power is element-wise; ``**`` on an np.matrix would be a matrix power.
    squared_residuals = np.power(residuals, 2)
    return np.sum(squared_residuals) / (2 * sample_count)
# j = computCost(X,Y,theta)
# print(j)
# 梯度下降
def gradientDescent(X, y, theta, alpha, iters):
    """Fit linear-regression parameters with batch gradient descent.

    The inputs must be ``np.matrix`` objects, because ``*`` is used as the
    matrix product.

    Args:
        X: (m, n) matrix of features, including the constant bias column.
        y: (m, 1) matrix of targets.
        theta: (1, n) matrix with the starting parameters; it is not
            modified in place.
        alpha: Learning rate.
        iters: Number of update steps to run.

    Returns:
        A tuple ``(theta, cost)``: the (1, n) parameter matrix after
        ``iters`` steps, and a length-``iters`` array holding the cost
        recorded after each step.
    """
    sample_count = len(X)
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * theta.T) - y
        # error.T * X is the (1, n) row of per-parameter gradient sums, so
        # all parameters are updated simultaneously from the same error.
        # Rebinding theta (instead of writing into a shared buffer) keeps
        # the caller's matrix untouched.
        theta = theta - (alpha / sample_count) * (error.T * X)
        cost[i] = computCost(X, y, theta)
    return theta, cost
# Hyper-parameters for batch gradient descent.
alpha = 0.01   # learning rate
iters = 1000   # number of update steps

# Earlier experiments (disabled):
# theta, cost = gradientDescent(X, y, theta, alpha, iters)
# print(theta)
# print(cost)
# j = computCost(X, Y, theta)
# print(j)

# Train: g receives the fitted parameters, cost the per-iteration history.
g, cost = gradientDescent(X, y, theta, alpha, iters)

# Optional plot of the fitted line over the training data (disabled):
# x = np.linspace(data.Population.min(), data.Population.max(), 100)
# f = g[0, 0] + (g[0, 1] * x)
# fig, ax = plt.subplots(figsize=(12, 8))
# ax.plot(x, f, 'r', label='Prediction')
# ax.scatter(data.Population, data.Profit, label='Training Data')
# ax.legend(loc=4)
# ax.set_xlabel('Population')
# ax.set_ylabel('Profit')
# plt.show()

# Plot the cost against the iteration number.
fig, ax = plt.subplots(figsize=(12, 8))
iteration_axis = np.arange(iters)
ax.plot(iteration_axis, cost, 'r')
ax.set_title('Error vs.Training Epoch')
ax.set_ylabel('Cost')
ax.set_xlabel('Iterations')
plt.show()
在这一部分中,我们将使用多个变量实现线性回归以预测房屋价格。
文件ex1data2.txt包含某地区房屋价格的训练集。第一列是房屋的大小(以平方英尺为单位),第二列是卧室的数量,第三列是房屋的价格。
Python 代码实现:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the two-feature housing dataset. The file has no header row, so the
# column names are supplied here.
path = 'ex1data2.txt'
data2 = pd.read_csv(path, names=['Size', 'Bedrooms', 'Price'], header=None)

# Standardize every column (features and target alike): subtract the column
# mean and divide by the column standard deviation.
data2 = data2.sub(data2.mean()).div(data2.std())

# Prepend the constant bias column after scaling so it stays equal to 1.
data2.insert(0, 'Ones', 1)
# print(data2.shape)

# Every column except the last is a feature; the last column is the target.
# iloc slices are half-open: [start, stop).
cols = data2.shape[1]
Y = data2.iloc[:, cols - 1:cols]                  # target, kept as a DataFrame
X = np.matrix(data2.iloc[:, 0:cols - 1].values)   # features as a matrix (47 x 3 per the notes)
y = np.matrix(Y.values)                           # target as a matrix (47 x 1 per the notes)
theta = np.matrix(np.array([0, 0, 0]))            # initial parameters, 1 x 3
def computCost(X, y, theta):
    """Compute the squared-error cost J(theta) for linear regression.

    Args:
        X: (m, n) ``np.matrix`` of features (``*`` must be a matrix product).
        y: (m, 1) ``np.matrix`` of targets.
        theta: (1, n) ``np.matrix`` of model parameters.

    Returns:
        The sum of squared residuals divided by ``2 * len(X)``.
    """
    prediction_error = X * theta.T - y
    # Element-wise square; ``**`` on an np.matrix would be a matrix power.
    total_squared_error = np.sum(np.power(prediction_error, 2))
    normalizer = 2 * len(X)
    return total_squared_error / normalizer
def gradientDescent(X, y, theta, alpha, iters):
    """Run batch gradient descent for multi-variable linear regression.

    The inputs must be ``np.matrix`` objects, because ``*`` is used as the
    matrix product.

    Args:
        X: (m, n) matrix of features, including the constant bias column.
        y: (m, 1) matrix of targets.
        theta: (1, n) matrix with the starting parameters; it is not
            modified in place.
        alpha: Learning rate.
        iters: Number of update steps to run.

    Returns:
        A tuple ``(theta, cost)``: the (1, n) parameter matrix after
        ``iters`` steps, and a length-``iters`` array holding the cost
        recorded after each step.
    """
    step_scale = alpha / len(X)
    cost = np.zeros(iters)
    for i in range(iters):
        error = (X * theta.T) - y
        # One matrix product yields the gradient sums for all n parameters
        # at once, replacing the per-parameter loop. A fresh theta is bound
        # each step, so no buffer is shared between iterations.
        theta = theta - step_scale * (error.T * X)
        cost[i] = computCost(X, y, theta)
    return theta, cost
# Hyper-parameters for batch gradient descent.
alpha = 0.01   # learning rate
iters = 1000   # number of update steps

# Train on the normalized data; cost holds the per-iteration cost history.
theta, cost = gradientDescent(X, y, theta, alpha, iters)

# Cost of the fitted parameters. Pass the matrix target `y`, the same object
# used for training, rather than the DataFrame `Y`: mixing an np.matrix with
# a DataFrame goes through pandas' operator dispatch and is not guaranteed
# to reduce to a scalar.
j = computCost(X, y, theta)
# print(j)

# Plot the cost against the iteration number.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs.Training Epoch')
plt.show()
np.linalg.inv() 用于求矩阵的逆;正规方程无需迭代,直接给出最小二乘解 $\theta = (X^T X)^{-1} X^T y$。
.dot() 既可以作为 NumPy 库函数调用(np.dot(a, b)),也可以作为数组实例对象的方法调用(a.dot(b))。
a.dot(b) 和 np.dot(a, b) 的效果相同。
import numpy as np
import pandas as pd
# Reload the single-feature dataset for the closed-form solution. The file
# has no header row, so the column names are supplied here.
path = 'ex1data1.txt'
data = pd.read_csv(path, names=['Population', 'Profit'], header=None)

# Prepend a constant column of ones so the intercept is part of theta.
data.insert(0, 'Ones', 1)

# Every column except the last is a feature; the last column is the target.
# iloc slices are half-open: [start, stop).
cols = data.shape[1]
feature_frame = data.iloc[:, 0:cols - 1]
target_frame = data.iloc[:, cols - 1:cols]
X = np.matrix(feature_frame.values)   # 97 x 2 per the original notes
Y = np.matrix(target_frame.values)    # 97 x 1 per the original notes
def normalequ(X, Y):
    """Solve linear regression in closed form (the normal equation).

    For a full-rank design this equals ``inv(X.T X) X.T Y``. The
    pseudo-inverse is used instead of an explicit inverse, so a singular or
    collinear ``X.T X`` yields the minimum-norm least-squares solution
    rather than raising ``LinAlgError``. It also avoids forming ``X.T X``,
    which squares the condition number.

    Args:
        X: (m, n) design matrix (``np.matrix`` or 2-D array), including the
            bias column.
        Y: (m, 1) targets, same container type as ``X``.

    Returns:
        The (n, 1) parameter vector, as a column (not the 1 x n row layout
        used by the gradient-descent code).
    """
    theta = np.linalg.pinv(X).dot(Y)
    return theta
# Solve for the parameters in closed form and print the result.
theta = normalequ(X,Y)
print(theta)