1. Univariate Linear Regression
"""
单变量线性回归
案例:假设你是一家餐厅的CEO,正在考虑开一家分店,根据该城市的人口数据预测其利润。
"""
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
def get_X(df):
    """
    Read the features.
    Uses concat to prepend an intercept column without mutating df;
    not efficient for big datasets, though.
    """
    ones = pd.DataFrame({'ones': np.ones(len(df))})  # m x 1 column of ones: the intercept term
    data = pd.concat([ones, df], axis=1)  # prepend the ones column to df
    # equivalently: data.insert(0, 'ones', 1), though insert mutates in place
    return data.iloc[:, :-1]  # every column except the target; iloc is positional, loc is label-based
def get_y(df):
    """Read the label; assume the last column is the target."""
    return np.array(df.iloc[:, -1])  # last column of df as a 1-D ndarray
def cost_function(theta, X, y):
    m = X.shape[0]  # number of samples
    inner = X @ theta - y  # residual vector
    square_sum = inner.T @ inner  # sum of squared residuals
    cost = square_sum / (2 * m)
    return cost
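# The vectorized cost above is the standard least-squares objective,
#   J(theta) = (1 / (2m)) * (X @ theta - y).T @ (X @ theta - y),
# i.e. half the mean squared error of the linear hypothesis h(x) = x @ theta.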
def batch_gradient_descent(theta, X, y, epoch, alpha=0.01):
    """
    Fit linear regression; return the final parameters and the cost history.
    epoch: number of full-batch passes
    """
    m = X.shape[0]
    cost_data = [cost_function(theta, X, y)]  # record the cost at the initial theta
    for _ in range(epoch):
        theta = theta - (X.T @ (X @ theta - y)) * alpha / m
        cost_data.append(cost_function(theta, X, y))
    return theta, cost_data  # final theta plus the cost after every iteration
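# Each pass updates every parameter simultaneously with the batch gradient:
#   theta := theta - (alpha / m) * X.T @ (X @ theta - y)
# which is gradient descent on J, since dJ/dtheta = (1/m) * X.T @ (X @ theta - y).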
if __name__ == "__main__":
data = pd.read_csv('ex1data1.txt', names=['population', 'profit'])
sns.set(context='notebook', style='whitegrid', palette='dark')
# print('info:',data.info(),'\n','head:','\n',data.head(),'\n','tail:','\n',data.tail(),'\n','describe:','\n',data.describe())
# sns.lmplot('population','profit',data,height=6,fit_reg=False)
# plt.show()
# data.plot.scatter('population','profit',label='population')
# # plt.show()
X = get_X(data) # (97, 2)
# print(X.shape,type(X))
# print("*"*20,"\n",X)
y = get_y(data) # 97×1向量(列表)
theta = np.zeros(X.shape[1]) # X.shape=(97,2),代表特征数量,theta初始值赋0
cost_num = cost_function(theta, X, y)
# print(cost_num)
epoch = 2000
alpha = 0.02
final_theta, cost_data = batch_gradient_descent(theta, X, y, epoch)
print("theta值:\n",final_theta)
print("每次迭代的最小代价函数值:",cost_data)
final_cost = cost_function(final_theta, X, y) # 计算最终代价函数值
print("最终代价函数值:",final_cost)
# 最小代价数据可视化
plt.figure(figsize=(12, 4)) # 表示figure 的大小为宽、长(单位为inch)
plt.subplot(1, 2, 1)
plt.plot(np.arange(epoch+1),cost_data)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.subplot(1, 2, 2)
plt.scatter(data.population, data.profit, label="Training data")
plt.plot(data.population, data.population*final_theta[1] + final_theta[0], label="prediction",color="#FF0000")
plt.legend(loc=2)
plt.show()
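A closed-form cross-check: np.polyfit fits the same degree-1 least-squares line directly, so its coefficients should land very close to final_theta.
import numpy as np
import pandas as pd

data = pd.read_csv('ex1data1.txt', names=['population', 'profit'])
# np.polyfit returns coefficients highest degree first: [slope, intercept]
slope, intercept = np.polyfit(data.population, data.profit, 1)
print(intercept, slope)  # should roughly match final_theta[0], final_theta[1]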
2. Multivariate Linear Regression
"""
多变量线性回归
案例:假设你现在打算卖房子,想知道房子能卖多少钱?
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def normalize_feature(df):
    """Feature scaling: z-score every column."""
    return (df - df.mean()) / df.std()

# scatter plots of the scaled data:
# data_normalized.plot.scatter('size', 'price', label='size')
# data_normalized.plot.scatter('bedrooms', 'price', label='bedrooms')
# plt.show()
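# Why scale? In raw units 'size' (hundreds to thousands of square feet) dwarfs
# 'bedrooms' (roughly 1 to 5), so the cost surface is a long narrow valley and
# gradient descent zig-zags; after z-scoring, one learning rate suits all features.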
def get_X(df):
    """
    Read the features.
    Uses concat to prepend an intercept column without mutating df;
    not efficient for big datasets, though.
    """
    ones = pd.DataFrame({'ones': np.ones(len(df))})  # m x 1 column of ones: the intercept term
    data = pd.concat([ones, df], axis=1)  # prepend the ones column to df
    # equivalently: data.insert(0, 'ones', 1), though insert mutates in place
    return data.iloc[:, :-1]  # every feature column (here: ones, size, bedrooms); the target stays out
def get_y(df):
    """Read the label; assume the last column is the target."""
    return np.array(df.iloc[:, -1])  # last column of df as a 1-D ndarray
def cost_function(theta, X, y):
    m = X.shape[0]  # number of samples
    inner = X @ theta - y  # residual vector
    square_sum = inner.T @ inner  # sum of squared residuals
    cost = square_sum / (2 * m)
    return cost
def batch_gradient_descent(theta, X, y, epoch, alpha=0.01):
    """
    Fit linear regression; return the final parameters and the cost history.
    epoch: number of full-batch passes
    """
    m = X.shape[0]
    cost_data = [cost_function(theta, X, y)]  # record the cost at the initial theta
    for _ in range(epoch):
        theta = theta - (X.T @ (X @ theta - y)) * alpha / m
        cost_data.append(cost_function(theta, X, y))
    return theta, cost_data  # final theta plus the cost after every iteration
if __name__ == "__main__":
data = pd.read_csv('ex1data2.txt', names=['size', 'bedrooms', 'price'])
data_nomalized = normalize_feature(data)
# print(data_nomalized.head())
X = get_X(data_nomalized)
y = get_y(data_nomalized)
alpha = 0.01 # 学习率
theta = np.zeros(X.shape[1]) # X.shape[1]:特征数n
epoch = 500 # 轮数
final_theta, cost_data = batch_gradient_descent(theta, X, y, epoch)
# 最小代价数据可视化
fig = plt.figure(figsize=(12, 4)) # 表示figure 的大小为宽、长(单位为inch)
plt.subplot(1, 2, 1)
plt.plot(np.arange(len(cost_data)), cost_data)
plt.xlabel('epoch')
plt.ylabel('loss')
# 不同学习速率alpha的效果
base = np.logspace(-1, -5, 4) # np.logspace(start=开始值,stop=结束值,num=元素个数,base=指定对数的底(默认底数10), endpoint=是否包含结束值)
candidate = np.sort(
np.concatenate((base, base * 3), axis=0)) # sort axis=0按列排序(对每一列排序),默认按行(axis=1)排序。concatenate默认axis=0(沿列方向),沿轴连接
print(candidate)
epoch = 50
ax = plt.subplot(1, 2, 2)
for alpha in candidate:
_, cost_data = batch_gradient_descent(theta, X, y, epoch, alpha=alpha)
ax.plot(np.arange(len(cost_data)), cost_data, label=alpha)
ax.set_xlabel('epoch', fontsize=10)
ax.set_ylabel('cost', fontsize=10)
ax.legend(bbox_to_anchor=(0.5, 1), loc=2, borderaxespad=0) # bbox_to_anchor调节图例位置
ax.set_title('learning rate', fontsize=10)
plt.show()
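Because every column, price included, was z-scored before training, final_theta predicts in normalized units. A minimal sketch of mapping one prediction back to dollars, meant to be appended inside the __main__ block above (the 1650 square-foot, 3-bedroom inputs are hypothetical):
    mu, sigma = data.mean(), data.std()  # per-column training statistics
    x = np.array([1.0,                                        # intercept term
                  (1650 - mu['size']) / sigma['size'],        # scale inputs exactly
                  (3 - mu['bedrooms']) / sigma['bedrooms']])  # as in training
    price = (x @ final_theta) * sigma['price'] + mu['price']  # undo the target scaling
    print(price)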
3. Normal Equation
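The same least-squares problem has a closed-form solution, theta = (X^T X)^(-1) X^T y, which needs no learning rate, no iteration, and no feature scaling. A minimal self-contained sketch on the same housing data (using np.linalg.pinv rather than a plain inverse as a guard against a singular X^T X):
"""
Normal equation
Case: the same house-price data, solved in closed form.
"""
import numpy as np
import pandas as pd

def normal_equation(X, y):
    # theta = (X^T X)^-1 X^T y; pinv also copes with a singular X^T X
    return np.linalg.pinv(X.T @ X) @ X.T @ y

if __name__ == "__main__":
    data = pd.read_csv('ex1data2.txt', names=['size', 'bedrooms', 'price'])
    X = np.column_stack([np.ones(len(data)), data['size'], data['bedrooms']])  # intercept + raw features
    y = data['price'].to_numpy()
    final_theta = normal_equation(X, y)
    print("theta:\n", final_theta)  # parameters in the original, unscaled units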
"""
多变量线性回归
案例:假设你现在打算卖房子,想知道房子能卖多少钱?
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def normalize_feature(df):
'''标准化数据(特征缩放)'''
return (df - df.mean()) / df.std()
# 散点图
# data_nomalized.plot.scatter('size','price',label='size')
# data_nomalized.plot.scatter('bedrooms','price',label='bedrooms')
# plt.show()
def get_X(df):
"""
读取特征
use concat to add intersect feature to avoid side effect
not efficient for big dataset though
"""
ones = pd.DataFrame({
'ones': np.ones(len(df))}) # len(data)行的长度,插入标签ones一列,赋值为1 创建m×1的矩阵(dataframe)
data = pd.concat([ones, df], axis=1) # 将创建的ones合并入df中
# 以上两句代码或者直接用data.insert('位置索引(如0)',"标签名(ones)",'赋值(1)')插入
# print(data)
return data.iloc[:, 0:2] # loc是自定义索引标签,iloc是默认索引标签(0,1,2...) iloc[行,列]
def get_y(df):
""" 读取标签
assume the last column is the target
"""
return np.array(df.iloc[:, -1]) # 返回df最后一列
def cost_function(theta, X, y):
m = X.shape[0] # 样本数量
inner = X @ theta - y
square_num = inner.T @ inner
cost = square_num / (2 * m)
return cost
def batch_gradient_descent(theta, X, y, epoch, alpha=0.01):
"""
拟合线性回归,返回参数和代价
epoch: 批处理的轮数
"""
m = X.shape[0]
cost_data = [cost_function(theta, X, y)] # 将theta为0时,最小代价函数加入到列表
for _ in range(epoch):
theta = theta - (X.T @ (X @ theta - y)) * alpha / m
cost_data.append(cost_function(theta, X, y))
return theta, cost_data # 返回最后的theta和保存每次迭代最小化代价函数值列表
if __name__ == "__main__":
data = pd.read_csv('ex1data2.txt', names=['size', 'bedrooms', 'price'])
data_nomalized = normalize_feature(data)
# print(data_nomalized.head())
X = get_X(data_nomalized)
y = get_y(data_nomalized)
alpha = 0.01 # 学习率
theta = np.zeros(X.shape[1]) # X.shape[1]:特征数n
epoch = 500 # 轮数
final_theta, cost_data = batch_gradient_descent(theta, X, y, epoch)
# 最小代价数据可视化
fig = plt.figure(figsize=(12, 4)) # 表示figure 的大小为宽、长(单位为inch)
plt.subplot(1, 2, 1)
plt.plot(np.arange(len(cost_data)), cost_data)
plt.xlabel('epoch')
plt.ylabel('loss')
# 不同学习速率alpha的效果
base = np.logspace(-1, -5, 4) # np.logspace(start=开始值,stop=结束值,num=元素个数,base=指定对数的底(默认底数10), endpoint=是否包含结束值)
candidate = np.sort(
np.concatenate((base, base * 3), axis=0)) # sort axis=0按列排序(对每一列排序),默认按行(axis=1)排序。concatenate默认axis=0(沿列方向),沿轴连接
print(candidate)
epoch = 50
ax = plt.subplot(1, 2, 2)
for alpha in candidate:
_, cost_data = batch_gradient_descent(theta, X, y, epoch, alpha=alpha)
ax.plot(np.arange(len(cost_data)), cost_data, label=alpha)
ax.set_xlabel('epoch', fontsize=10)
ax.set_ylabel('cost', fontsize=10)
ax.legend(bbox_to_anchor=(0.5, 1), loc=2, borderaxespad=0) # bbox_to_anchor调节图例位置
ax.set_title('learning rate', fontsize=10)
plt.show()
4. Linear Regression with TensorFlow
"""
基于tensorflow的线性回归
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
def get_X(df):
    """
    Read the features.
    Uses concat to prepend an intercept column without mutating df;
    not efficient for big datasets, though.
    """
    ones = pd.DataFrame({'ones': np.ones(len(df))})  # m x 1 column of ones: the intercept term
    data = pd.concat([ones, df], axis=1)  # prepend the ones column to df
    # equivalently: data.insert(0, 'ones', 1), though insert mutates in place
    return data.iloc[:, :-1]  # every column except the target; iloc is positional, loc is label-based
def get_y(df):
    """Read the label; assume the last column is the target."""
    return np.array(df.iloc[:, -1])  # last column of df as a 1-D ndarray
def linear_regression(X_data, y_data, alpha, epoch,
                      optimizer=tf.train.GradientDescentOptimizer):  # this function was written by Lucas Shen
    # placeholders for the graph input
    X = tf.placeholder(tf.float32, shape=X_data.shape)
    y = tf.placeholder(tf.float32, shape=y_data.shape)
    # construct the graph
    with tf.variable_scope('linear-regression'):
        W = tf.get_variable("weights",
                            (X_data.shape[1], 1),
                            initializer=tf.constant_initializer())  # n*1, initialized to 0
        y_pred = tf.matmul(X, W)  # m*n @ n*1 -> m*1
        loss = 1 / (2 * len(X_data)) * tf.matmul((y_pred - y), (y_pred - y),
                                                 transpose_a=True)  # (m*1).T @ m*1 = 1*1
        opt = optimizer(learning_rate=alpha)
        opt_operation = opt.minimize(loss)
    # run the session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        loss_data = []
        for i in range(epoch):
            _, loss_val, W_val = sess.run([opt_operation, loss, W],
                                          feed_dict={X: X_data, y: y_data})
            loss_data.append(loss_val[0, 0])  # every loss_val is a 1*1 ndarray
            if len(loss_data) > 1 and np.abs(
                    loss_data[-1] - loss_data[-2]) < 10 ** -9:  # early stop once converged
                # print('Converged at epoch {}'.format(i))
                break
    # clear the graph so repeated calls don't collide on variable names
    tf.reset_default_graph()
    return {'loss': loss_data, 'parameters': W_val}  # W_val is an (n, 1) ndarray
if __name__ == "__main__":
data = pd.read_csv('ex1data1.txt', names=['population', 'profit'])
X_data = get_X(data)
print(X_data.shape, type(X_data))
y_data = get_y(data).reshape(len(X_data), 1) # special treatment for tensorflow input data
print(y_data.shape, type(y_data))
epoch = 2000
alpha = 0.01
optimizer_dict = {
'GD': tf.train.GradientDescentOptimizer,
'Adagrad': tf.train.AdagradOptimizer,
'Adam': tf.train.AdamOptimizer,
'Ftrl': tf.train.FtrlOptimizer,
'RMS': tf.train.RMSPropOptimizer
}
results = []
for name in optimizer_dict:
res = linear_regression(X_data, y_data, alpha, epoch, optimizer=optimizer_dict[name])
res['name'] = name
results.append(res)
print(res["parameters"])
fig, ax = plt.subplots(figsize=(12, 6))
for res in results:
loss_data = res['loss']
# print('for optimizer {}'.format(res['name']))
# print('final parameters\n', res['parameters'])
# print('final loss={}\n'.format(loss_data[-1]))
ax.plot(np.arange(len(loss_data)), loss_data, label=res['name'])
ax.set_xlabel('epoch', fontsize=18)
ax.set_ylabel('cost', fontsize=18)
ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
ax.set_title('different optimizer', fontsize=18)
plt.show()
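For reference, under TensorFlow 2 the same fit is a single Dense layer trained with Keras. A rough sketch, not a drop-in replacement: Keras minimizes plain MSE rather than the half-MSE above, which rescales the loss curve but not the fitted line.
import pandas as pd
import tensorflow as tf  # 2.x

data = pd.read_csv('ex1data1.txt', names=['population', 'profit'])
x = data[['population']].to_numpy(dtype='float32')
y = data[['profit']].to_numpy(dtype='float32')

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])  # y_pred = w * x + b
model.compile(optimizer=tf.keras.optimizers.SGD(0.01), loss='mse')
model.fit(x, y, epochs=2000, verbose=0)
w, b = model.layers[0].get_weights()  # kernel, then bias
print(b, w)  # roughly the intercept and slope found above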