机器学习(machine learning)的回归分析基本思路是通过收集训练集的特征(x),采用某种函数去拟合训练集目标值(y),希望通过拟合的方式预测/评估尚未遇到的样本(x’)的目标值。回归分析更严谨的数学定义[2]:
- 给定一些未知/待求解变量(向量)$\beta$
- 基于一些相互独立的变量(向量)$X$
- 希望找到应变量 $Y$ 与未知变量 $\beta$ 和独立变量 $X$ 的函数关系:
$$Y \approx f(X, \beta)$$
具体的应用场景如广告系统的在线流量预估(进行在线流量分配以确保合约式保量展现),通过流量的预估可以最优化广告系统收益。在大部分的回归分析中,未知变量的求解大多通过优化目标函数来完成,例如最小二乘形式的目标函数:
$$\min_{\beta} \sum_{i} \left(y_i - f(x_i, \beta)\right)^2$$
以上数学表达式中 x_i 表示第 i 个样本的特征值(向量),y_i 表示对应的目标值,f 表示函数(这部分一般由先验知识得来,如多项式)。如果样本空间里所有点能最小化以上数学等式(如何定义最小化? 一般由工程师人为设定阈值),则工程上我们认为函数 f 以及 \beta 很好地描述了样本空间与目标值的关系。对于求解得到的目标函数是否能很好地描述未知样本,不在本文的讨论范围,感兴趣的读者可以搜索回归分析中的过拟合问题(overfitting)。本篇博客将重点讨论如何通过梯度下降(gradient descent)算法解决回归分析的优化问题以及相关扩展算法。关于回归分析的讨论已超出本篇的讨论范围,这里将简单跳过,感兴趣的读者可以看[2]。
首先我们先来看一下梯度下降的基本思想,这里我们重新引用目标函数
# -*- encoding: utf-8 -*-
import re
import sys
import numpy as np
import copy
import time
def timeConsumption(func):
    """Decorator that prints the wall-clock duration of each call to ``func``.

    The wrapper forwards all positional and keyword arguments, returns
    whatever ``func`` returns, and keeps ``func``'s name and docstring.
    The timing line is printed only when ``func`` returns normally; if it
    raises, the exception propagates and nothing is printed.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same call signature as ``func``.
    """
    # Function-scope import so the decorator does not depend on the
    # file-level import block.
    import functools

    @functools.wraps(func)
    def func_wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("[Function-Time-Consumption]  %s" % (end_time - start_time))
        return result

    return func_wrapper
def initialize(length=300):
    """Build a noisy synthetic data set for a two-variable quadratic model.

    ``length`` values ``a`` are drawn uniformly from ``[0, 100)``; for each
    of them a fresh set of ``length`` values ``b`` is drawn the same way.
    Every ``(a, b)`` pair becomes one feature row
    ``[a**2, b**2, a, b, 1, 1]`` and its target is the dot product of that
    row with the fixed weights ``[3.0, -1.5, 3.5, -2, 4, 1.0]`` plus
    Gaussian noise (mean 0, standard deviation 0.1).

    Args:
        length: number of draws per variable; the result has
            ``length * length`` samples.

    Returns:
        ``(X, Y)``: a list of feature rows and the list of noisy targets.
    """
    noise_mean, noise_std = 0, 0.1
    scale = 100
    # Ground-truth weights, in the order of the feature row layout.
    weights = [3.0, -1.5, 3.5, -2, 4, 1.0]

    X = []
    for u in np.random.random(length):
        a = u * scale
        # The inner draw happens once per outer value, right here.
        X.extend([
            [a ** 2, (v * scale) ** 2, a, v * scale, 1, 1]
            for v in np.random.random(length)
        ])

    # White noise, one value per sample, drawn after all features.
    noise = np.random.normal(noise_mean, noise_std, size=length * length)
    Y = [np.dot(weights, row) + eps for row, eps in zip(X, noise)]
    return X, Y
class GradientDescent(object):
    """Full-batch iterative fit of the linear model ``y ~ np.dot(a, x)``.

    Each step moves the weight vector ``a`` along the per-feature mean of
    ``X``, scaled by the mean residual over the whole data set. Iteration
    stops once the absolute mean residual falls below ``eplison`` or the
    step budget is used up.

    NOTE(review): for the linear model this step is the gradient of
    ``0.5 * mean(residual) ** 2``, not of the mean squared error, so it
    drives the average error to zero rather than fitting each sample.
    Confirm that this is the intended objective.
    """

    def __init__(self, X, Y, eplison=0.000001, gama=0.01, iter_num=10000):
        """Store the training data and draw random starting weights.

        Args:
            X: 2-D ``numpy`` array with one sample per row; ``X.shape[-1]``
                is the number of weights.
            Y: target values, one per row of ``X``.
            eplison: convergence threshold on the absolute mean residual.
            gama: accepted for interface compatibility but not used; the
                step size is always derived from the data.
            iter_num: maximum number of update steps taken by ``run``.
        """
        _d = X.shape[-1]
        # parameter initialization: standard-normal starting weights
        self.a = np.random.normal(0, 1, size=_d)
        self.X = X
        self.Y = Y
        self.eplison = eplison
        # Data-dependent step size 1 / (max(Y) * len(X)). The ``gama``
        # argument is deliberately left unused to keep existing behaviour.
        # A positive step requires max(Y) > 0.
        self.gama = (1.0 / max(Y)) * 1.0 / len(X)
        self.iter_num = iter_num

    def function(self, a, x):
        """Evaluate the model for weights ``a``.

        ``x`` is either one sample of shape ``(d,)`` or, as ``run`` passes
        it, the transposed data matrix of shape ``(d, n)`` so that all
        samples are scored in one call. Only the linear form is
        implemented; the update rule in ``run`` assumes it.
        """
        return np.dot(a, x)

    @timeConsumption
    def run(self):
        """Iterate until the mean residual converges or the budget ends.

        Prints the mean residual after every step and the final weights at
        the end. ``self.a`` is replaced by the updated weights;
        ``self.iter_num`` is left untouched so the instance can be run
        again with the same budget.
        """
        X = np.asarray(self.X)
        Y = np.asarray(self.Y)
        # Per-feature mean over all samples; constant across iterations.
        derative = X.mean(axis=0)
        diff = np.mean(self.function(self.a, X.T) - Y)
        for _ in range(self.iter_num):
            self.a = self.a - self.gama * derative * diff
            diff = np.mean(self.function(self.a, X.T) - Y)
            print(diff)
            # Convergence is only tested after a step has been taken.
            if abs(diff) < self.eplison:
                break
        print(self.a)
def main():
    """Generate the synthetic data set and fit it with the selected solver."""
    features, targets = initialize()
    # Alternative solvers, with the author's recorded timings for length 300
    # (results depend on the random initial weights):
    #   GradientDescent(np.array(X), np.array(Y)).run()           -> 298.651947021
    #   StochasticGradientDescent(...).run()  (the one used here) -> 290.654798031
    #   MiniBatchGradientDescent(np.array(X), np.array(Y)).run()  -> 287.527049065
    solver = StochasticGradientDescent(np.array(features), np.array(targets))
    solver.run()
if __name__ == "__main__":
    # NOTE(review): in this listing the solver classes used by main() appear
    # after this guard; in a single runnable file they must be defined
    # before main() is called.
    if sys.version_info[0] == 2:
        # Python 2 only: force UTF-8 as the default string encoding.
        # Neither ``reload`` as a builtin nor ``sys.setdefaultencoding``
        # exists on Python 3.
        reload(sys)
        sys.setdefaultencoding("utf-8")
    main()
在具体工程实现中,可能会遇到训练数据过大无法全部加载到内存的情况,或者训练量大导致训练缓慢(每次变量更新都需要过一遍全数据),这时候我们可以采用梯度下降的几种变形[2]:
- 批量梯度下降 (Batch Gradient Descent)
- 随机梯度下降 (Stochastic Gradient Descent)
- 小批量梯度下降 (Mini-Batch Gradient Descent)
以上算法的基本思想和梯度下降相似,但在更新变量时,所用的训练数据量有所不同:批量梯度下降每次更新使用全部训练数据,小批量梯度下降会预先设置每次更新所依赖的训练集的量,随机梯度下降则根据每个训练数据更新一次变量。
class StochasticGradientDescent(GradientDescent):
    """Variant that scales each step by the residual of one random sample.

    The step direction is still the per-feature mean of the whole data
    set; only the scalar residual comes from a single randomly drawn
    sample. The step size decays linearly to zero over the budget. After
    the budget is spent, full-batch steps continue until the mean residual
    over all samples is within ``eplison``.

    NOTE(review): a textbook SGD step would use the drawn sample's own
    features as the direction; this keeps the original update rule.
    """

    def __init__(self, X, Y, eplison=0.000001, gama=0.01, iter_num=300):
        """Take the same arguments as ``GradientDescent``.

        Only the default ``iter_num`` differs (300 single-sample steps).
        """
        super(StochasticGradientDescent, self).__init__(X, Y, eplison, gama, iter_num)

    @timeConsumption
    def run(self):
        """Run the single-sample phase, then full-batch steps to converge.

        Prints the full-data mean residual after every step and the final
        weights at the end. ``self.a`` is replaced by the updated weights;
        ``self.iter_num`` is not modified.
        """
        X = np.asarray(self.X)
        Y = np.asarray(self.Y)
        # Per-feature mean over all samples; constant across iterations.
        derative = X.mean(axis=0)
        LENGTH = len(X)
        den = self.iter_num

        def mean_residual(a):
            # Mean signed error over the whole data set (stopping criterion).
            return np.mean(self.function(a, X.T) - Y)

        def sample_residual(a):
            # A scalar index gives a scalar residual, so the update never
            # relies on implicit size-1-array-to-scalar conversion.
            k = np.random.randint(0, LENGTH)
            return self.function(a, X[k]) - Y[k]

        diff = sample_residual(self.a)
        # Defined up front so the fallback loop below also works when the
        # step budget is zero.
        _diff = mean_residual(self.a)
        # https://en.wikipedia.org/wiki/Stochastic_gradient_descent
        learning_rate = self.gama
        for step in range(1, den + 1):
            self.a = self.a - learning_rate * derative * diff
            diff = sample_residual(self.a)
            # Linear decay: gama * (remaining steps) / (total steps).
            learning_rate = self.gama * (den - step) / den
            _diff = mean_residual(self.a)
            print(_diff)
            if abs(_diff) < self.eplison:
                break
        # make sure result converge: full-batch steps at the undecayed step
        # size; this loop has no iteration cap.
        while abs(_diff) > self.eplison:
            self.a = self.a - self.gama * derative * _diff
            _diff = mean_residual(self.a)
            print(_diff)
        print(self.a)
class MiniBatchGradientDescent(GradientDescent):
    """Variant that scales each step by the mean residual of a random batch.

    The step direction is the per-feature mean of the whole data set; the
    scalar residual is averaged over a batch of rows sampled with
    replacement. The step size decays linearly to zero over the budget.
    After the budget is spent, full-batch steps continue until the mean
    residual over all samples is within ``eplison``.
    """

    def __init__(self, X, Y, eplison=0.000001, gama=0.01, iter_num=300, batch=100):
        """Take the ``GradientDescent`` arguments plus ``batch``.

        Args:
            batch: accepted for interface compatibility but not used; the
                batch size is always 10% of the number of samples.
        """
        super(MiniBatchGradientDescent, self).__init__(X, Y, eplison, gama, iter_num)
        # At least one row per batch: with fewer than 10 samples the 10%
        # rule would give an empty batch, whose mean residual is NaN.
        self.batch = max(1, int(0.1 * self.X.shape[0]))

    @timeConsumption
    def run(self):
        """Run the mini-batch phase, then full-batch steps to converge.

        Prints the full-data mean residual after every step and the final
        weights at the end. ``self.a`` is replaced by the updated weights;
        ``self.iter_num`` is not modified.
        """
        X = np.asarray(self.X)
        Y = np.asarray(self.Y)
        # Per-feature mean over all samples; constant across iterations.
        derative = X.mean(axis=0)
        LENGTH = len(X)
        den = self.iter_num

        def mean_residual(a):
            # Mean signed error over the whole data set (stopping criterion).
            return np.mean(self.function(a, X.T) - Y)

        learning_rate = self.gama
        # Starts at 0 so a zero step budget skips the fallback loop too.
        _diff = 0
        for step in range(1, den + 1):
            # Row indices drawn with replacement.
            index = np.random.randint(0, LENGTH, self.batch)
            diff = np.mean(self.function(self.a, X[index].T) - Y[index])
            self.a = self.a - learning_rate * derative * diff
            # Linear decay: gama * (remaining steps) / (total steps).
            learning_rate = self.gama * (den - step) / den
            _diff = mean_residual(self.a)
            print(_diff)
            if abs(_diff) < self.eplison:
                break
        # without follow code, diff do not converge to minimize
        # make sure result converge: full-batch steps at the undecayed step
        # size; this loop has no iteration cap.
        while abs(_diff) > self.eplison:
            self.a = self.a - self.gama * derative * _diff
            _diff = mean_residual(self.a)
            print(_diff)
        print(self.a)