题目:假设你是一家连锁餐馆的CEO,正在考虑在不同的城市开设一家新的分店。该连锁店已经在各个城市拥有卡车,而且你拥有这些城市的利润和人口数据。
数据集:ex1data1.txt
数据集样例
| 人口(x) | 利润(y) |
| --- | --- |
| 6.1101 | 17.592 |
| 5.5277 | 9.1302 |
| … | … |
共有97行数据。
常用库
import numpy as np # 科学计算库,处理多维数组,进行数据分析
import pandas as pd #解决数据分析任务而创建
import matplotlib.pyplot as plt # 提供一个类似matlab的绘图框架
# Load the data set, supplying the column names explicitly, and preview the
# first five rows.
data = pd.read_csv("ex1data1.txt", sep=",", names=["population", "profit"])
print(data.head())

# Visualise the raw data as a scatter plot of profit against population.
data.plot.scatter("population", "profit", label="population")
plt.show()
# Insert a constant column of ones in front of "population".
data.insert(0, "ones", 1)
# Split the frame: every column except the last is a feature (all rows),
# the last column is the target.
X = data.iloc[:, 0:-1]
y = data.iloc[:, -1]
# The slices are still pandas objects; convert them to NumPy arrays.
X = X.values
print(X.shape)  # (97, 2) for ex1data1.txt
y = y.values
print(y.shape)  # (97,) for ex1data1.txt
# Turn y into a column vector; -1 lets NumPy infer the row count instead of
# hard-coding 97, so a data file of a different length still works.
y = y.reshape(-1, 1)
def costFunction(X, y, theta):
    """Return the halved mean squared error of the linear model ``X @ theta``.

    Computes ``sum((X @ theta - y) ** 2) / (2 * m)`` with ``m = len(X)``.

    Args:
        X: Design matrix with one row per sample.
        y: Target values, one per sample; may be flat or a column vector.
        theta: Model parameters, flat or as a column vector.

    Returns:
        The scalar cost.

    Raises:
        ValueError: If ``y`` does not hold exactly one value per prediction.
    """
    predictions = X @ theta
    # Align y with the predictions before subtracting: mixing a flat (m,)
    # vector with an (m, 1) column would otherwise broadcast to an (m, m)
    # matrix and silently inflate the cost.
    residuals = predictions - np.reshape(y, predictions.shape)
    return np.sum(np.power(residuals, 2)) / (2 * len(X))
# Start from all-zero parameters: one row per column of X, stored as a column
# vector so that X @ theta has the same (m, 1) shape as y.
theta = np.zeros((X.shape[1], 1))
cost_init = costFunction(X, y, theta)
print(cost_init)  # about 32.07 on ex1data1.txt
def gradientDescent(X, y, theta, alpha, iters):
    """Fit ``theta`` with batch gradient descent on the squared-error cost.

    Args:
        X: Design matrix with one row per sample.
        y: Target values, shaped to match ``X @ theta``.
        theta: Initial parameters; the caller's array is not modified.
        alpha: Learning rate (step size).
        iters: Number of update steps to run.

    Returns:
        A ``(theta, costs)`` tuple: the final parameters and a list holding
        the cost measured after every update step.
    """
    m = len(X)
    costs = []
    for i in range(iters):
        theta = theta - (X.T @ (X @ theta - y)) * alpha / m
        cost = costFunction(X, y, theta)
        costs.append(cost)
        # Report only every 100th cost so the output stays short; print the
        # current value rather than the whole history collected so far.
        if i % 100 == 0:
            print(cost)
    return theta, costs
# Hyper-parameters: step size and number of gradient-descent updates.
alpha, iters = 0.02, 2000
theta, costs = gradientDescent(X, y, theta, alpha=alpha, iters=iters)
7. 绘制损失函数随迭代次数变化的图像
# Plot the cost recorded after each update against the iteration index.
# The figure is drawn once; the x-axis length comes from the recorded costs
# so the two plotted sequences always have matching lengths.
fig, ax = plt.subplots()
ax.plot(np.arange(len(costs)), costs)
ax.set(xlabel="iters", ylabel="costs", title="cost vs iters")
plt.show()
# Evaluate the fitted line over the observed population range. Column 1 of X
# holds the population (column 0 is the constant ones column), so the grid
# must come from X[:, 1], not from the profit values in y.
x = np.linspace(X[:, 1].min(), X[:, 1].max(), 100)
y_ = theta[0, 0] + theta[1, 0] * x

fig, ax = plt.subplots()
ax.scatter(X[:, 1], y, label="traindata")  # training samples
ax.plot(x, y_, "r", label="predict")  # fitted regression line
ax.legend()
ax.set(xlabel="population", ylabel="profit")
plt.show()