The linear model is one of the most fundamental models in machine learning, which is why Andrew Ng uses it to open the CS229 course. Linear models fall into two broad families: linear regression (Linear Regression) and logistic regression (Logistic Regression; the common Chinese rendering 逻辑回归 is often criticized, and a more accurate translation is 对数几率回归, i.e. log-odds regression).
This post focuses on the former, linear regression.
There are already plenty of study notes on linear regression online, so I will not repeat them here; I will just list the relevant formulas and the code.
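For reference, the code below implements the following (m is the number of samples, x(i) and y(i) are the i-th sample and target, and the learning rate is `lr` in the code):

    J(theta) = (1/(2m)) * sum_{i=1..m} (theta^T x(i) - y(i))^2      (least-squares cost)
    theta := theta - (lr/m) * X^T (X*theta - y)                     (batch gradient-descent update)
    theta  = (X^T X)^(-1) X^T y                                     (normal equation, closed form)

Ridge regression adds an L2 penalty alpha * ||theta||^2 to J(theta), and Lasso adds an L1 penalty alpha * ||theta||_1 (alpha is `alpha` in the code).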
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 3 10:53:02 2018
@author: PhoenixMY
Linear Regression Notes:
Ordinary Least Squares
Gradient Descent : Batch GD / Stochastic GD / Mini-Batch GD
Ridge Regression
Lasso Regression
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn import linear_model
'''
' Ordinary Least Squares
' J(theta) = ∑(x(i)·theta - y(i)) ^ 2 / (2*m) ;
' 1 <= i <= m, m is the size of the data set;
'''
# import the Boston house-price dataset; for details, see load_boston().DESCR
# note: load_boston was removed in scikit-learn 1.2, so this script needs an older version
X, y = load_boston(return_X_y=True)
def featureNormalize(X):
    # z-score normalization: subtract the per-column mean and divide by the per-column std
    mu = np.zeros((1, X.shape[1]))
    sigma = np.zeros((1, X.shape[1]))
    for i in range(X.shape[1]):
        mu[0, i] = np.mean(X[:, i])     # mean
        sigma[0, i] = np.std(X[:, i])   # standard deviation
        if (sigma[0, i] == 0):
            sigma[0, i] = 1             # avoid division by zero for constant features
    X_norm = (X - mu) / sigma
    return X_norm
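# Optional sanity check (a sketch, assuming scikit-learn's StandardScaler is available):
# featureNormalize should agree with StandardScaler, which also uses the population
# standard deviation (ddof=0) per column.
from sklearn.preprocessing import StandardScaler
assert np.allclose(featureNormalize(X), StandardScaler().fit_transform(X))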
def J_theta(X_input, y_input, theta):
    # least-squares cost: J(theta) = ||X*theta - y||^2 / (2m)
    t = X_input@theta - y_input
    return (t.T @ t)[0, 0] / (y_input.size * 2)
#########################################################
# Name   : batch_gradient
# Descr  : batch gradient descent on the least-squares cost
# Params : X_input, y_input - design matrix / target column vector
#          theta_init       - initial parameter vector
#          iter_max, lr     - iteration cap and learning rate
#          J_theta_limit    - stop when the change in cost is below this
#          cost             - stop when the cost itself drops below this
#########################################################
def batch_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.000001, cost=12):
    print("***********************")
    print("==>batch gradient descent started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    for i in range(iter_max):
        J_theta_init = J_theta(X_input, y_input, theta)
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        if (np.abs(J_theta_init) < cost):
            break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient: use all samples to update theta in each loop
        ############################################################
        delta = X_input@theta - y_input
        # warning point 2: divide by m (the number of samples)
        theta = theta - lr * (X_input.T@delta) / (y_input.size)
    print("iteration times:", i)
    if (i == iter_max - 1):
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    print(theta)
    return theta
def stochastic_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.00001, cost=12):
    print("***********************")
    print("==>stochastic gradient descent started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    m = X_input.shape[0]
    #n = X_input.shape[1]
    for i in range(iter_max):
        k = np.random.randint(low=0, high=m)   # pick one random sample per update
        J_theta_init = J_theta(X_input, y_input, theta)
        '''
        # no matter which gradient method is used,
        # the cost function should be the same ~~~
        t = X_input[k, :]@theta - y_input[k]
        J_theta_init = t[0] * t[0] / 2
        '''
        if ((i % 50000) == 0):
            print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        if (np.abs(J_theta_init) < cost):
            break
        J_theta_old = J_theta_init
        ############################################################
        # stochastic gradient: use a single random sample to update theta in each loop
        ############################################################
        delta = X_input[k, :]@theta - y_input[k]
        # warning 2: in theory the single-sample update is not divided by m;
        # dividing by m shrinks the effective step by a factor of m, so the cost
        # stops improving unless the learning rate is scaled up accordingly
        # (e.g. ~0.5 for this dataset).
        # https://blog.csdn.net/lw_ghy/article/details/71054965
        # For this algorithm the learning rate must be tuned carefully: if it is set
        # too large the objective diverges; if it is set too small, convergence is very slow.
        #theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1))) / (y_input.size)
        theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1)))
    print("iteration times:", i)
    if (i == (iter_max - 1)):
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    return theta
def mini_batch_gradient(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.000001, cost=12):
    print("***********************")
    print("==>mini batch gradient descent started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    for i in range(iter_max):
        J_theta_init = J_theta(X_input, y_input, theta)
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        if (np.abs(J_theta_init) < cost):
            break
        J_theta_old = J_theta_init
        ############################################################
        # mini-batch gradient: use a small group of samples to update theta in each loop
        ############################################################
        # select 10 samples (vectors) from the dataset, without replacement;
        # 10 is a batch size suggested from experience (by Andrew Ng)
        batch_num = 10
        mini_batch = np.random.choice(y_input.size, batch_num, replace=False)
        # stochastic gradient is a special case of mini-batch gradient
        # with batch_num = 1
        for k in mini_batch:
            delta = X_input[k, :]@theta - y_input[k]
            # warning point 2: divide by the batch size
            theta = theta - lr * delta * (X_input.T[:, k].reshape((X_input.shape[1], 1))) / batch_num
    print("iteration times:", i)
    if (i == iter_max - 1):
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    return theta
def batch_gradient_Ridge(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.0000001, cost=12, alpha=0.00001):
    print("***********************")
    print("==>batch gradient descent with Ridge started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    m = X_input.shape[0]
    n = X_input.shape[1]
    for i in range(iter_max):
        ###
        # Ridge Regression: add an L2 penalty (alpha * ||theta||^2) to the cost function
        ###
        t = X_input@theta - y_input
        J_theta_init = (t.T @ t)[0, 0] / (y_input.size * 2) + alpha * theta.reshape((1, n)).dot(theta)[0, 0]
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        #if (np.abs(J_theta_init) < cost):
        #    break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient: use all samples to update theta in each loop
        ############################################################
        delta = X_input@theta - y_input
        # warning point 2: divide by m (the number of samples);
        # the gradient of the alpha * ||theta||^2 penalty is 2 * alpha * theta,
        # and it is scaled by the learning rate just like the data term
        theta = theta - lr * ((X_input.T@delta) / (y_input.size) + 2 * alpha * theta)
    print("iteration times:", i)
    if (i == iter_max - 1):
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    print(theta)
    return theta
def batch_gradient_Lasso(X_input, y_input, theta_init, iter_max, lr=0.001, J_theta_limit=0.0000001, cost=12, alpha=0.00001):
    print("***********************")
    print("==>batch gradient descent with Lasso started!")
    theta = theta_init
    J_theta_init = 0
    J_theta_old = 0
    m = X_input.shape[0]
    n = X_input.shape[1]
    for i in range(iter_max):
        ###
        # Lasso Regression: add an L1 penalty (alpha * ||theta||_1) to the cost function
        ###
        t = X_input@theta - y_input
        J_theta_init = (t.T @ t)[0, 0] / (y_input.size * 2) + alpha * np.sum(np.abs(theta))
        #print("J of theta is ", J_theta_init)
        if (np.abs(J_theta_init - J_theta_old) <= J_theta_limit):
            break
        #if (np.abs(J_theta_init) < cost):
        #    break
        J_theta_old = J_theta_init
        ############################################################
        # batch gradient: use all samples to update theta in each loop
        ############################################################
        delta = X_input@theta - y_input
        # warning point 2: divide by m (the number of samples);
        # the subgradient of the alpha * ||theta||_1 penalty is alpha * sign(theta)
        theta = theta - lr * ((X_input.T@delta) / (y_input.size) + alpha * np.sign(theta))
    print("iteration times:", i)
    if (i == iter_max - 1):
        print("!!!Reached the max number of iterations (iter_max)")
    print("J of theta is ", J_theta_init)
    print("**end")
    print(theta)
    return theta
def NormalEquation(X_input, y_input):
    # closed-form solution: theta = (X^T X)^(-1) X^T y  (pinv also handles a singular X^T X)
    t = np.linalg.pinv(X_input.T@X_input)
    theta = t @ X_input.T @ y_input
    print(theta.reshape(X_input.shape[1], 1))
    return theta
# warning point 1: normalization
# without feature normalization, the gradient-descent cost function will not converge
X = featureNormalize(X)
# prepend a column of ones for the intercept term and reshape y into a column vector
X = np.insert(X, 0, values=np.ones(y.size), axis=1)
y = y.reshape(y.size, 1)
# closed-form solution, for comparison with the iterative solvers below
NormalEquation(X, y)
ITER_MAX = 50000
theta_init = np.zeros((X.shape[1], 1))
batch_gradient(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
#stochastic_gradient(X, y, theta_init, ITER_MAX, lr=0.001, J_theta_limit=0.00000001, cost=11)
#mini_batch_gradient(X, y, theta_init, ITER_MAX, lr=0.001, J_theta_limit=0.00000001, cost=11)
batch_gradient_Ridge(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
batch_gradient_Lasso(X, y, theta_init, ITER_MAX, lr=0.01, J_theta_limit=0.00000001, cost=10)
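# Cross-check (a sketch, not part of the original notes): fit scikit-learn's
# LinearRegression / Ridge / Lasso on the same data. X already contains the bias
# column, so fit_intercept=False; the alpha values below are illustrative only,
# and sklearn scales its penalties differently, so the coefficients will be close
# to the hand-rolled results but not identical.
reg = linear_model.LinearRegression(fit_intercept=False).fit(X, y.ravel())
print("sklearn LinearRegression coef:", reg.coef_)
ridge = linear_model.Ridge(alpha=1.0, fit_intercept=False).fit(X, y.ravel())
print("sklearn Ridge coef:", ridge.coef_)
lasso = linear_model.Lasso(alpha=0.001, fit_intercept=False).fit(X, y.ravel())
print("sklearn Lasso coef:", lasso.coef_)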
#
#                        _oo0oo_
#                       o8888888o
#                       88" . "88
#                       (| -_- |)
#                       0\  =  /0
#                     ___/`---'\___
#                   .' \\|     |// '.
#                  / \\|||  :  |||// \
#                 / _||||| -:- |||||- \
#                |   | \\\  -  /// |   |
#                | \_|  ''\---/''  |_/ |
#                \  .-\__  '-'  ___/-. /
#              ___'. .'  /--.--\  `. .'___
#           ."" '<  `.___\_<|>_/___.' >' "".
#          | | :  `- \`.;`\ _ /`;.`/ - ` : | |
#          \  \ `_.   \_ __\ /__ _/   .-` /  /
#      =====`-.____`.___ \_____/___.-`___.-'=====
#                        `=---='
#
#
#      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
#           佛祖保佑   永无BUG   (Buddha bless: no bugs, ever)
#
#
#