最优化问题-梯度下降(Gradient Descent)算法&样例代码以及相关扩展

机器学习(machine learning)中回归分析的基本思路是:通过收集训练集的特征(x),采用某种函数去拟合训练集的目标值(y),希望通过拟合的方式预测/评估尚未遇到的样本(x')的目标值。回归分析更严谨的数学定义见[2]:

  • 给定一些未知/待求解变量(向量)\beta
  • 基于一些相互独立的变量(向量)x
  • 希望找到因变量 y 与未知变量 \beta 和自变量 x 的函数关系 y \approx f(x, \beta)



$$\min_{\beta} \sum_{i} \bigl( y_i - f(x_i, \beta) \bigr)^2$$

以上数学表达式中 x_i 表示第 i 个样本的特征值(向量),y_i 表示对应的目标值,f 表示函数(这部分一般由先验知识得来,如多项式)。如果样本空间里所有点能最小化以上数学表达式(如何定义最小化?一般由工程师人为设定阈值),则工程上我们认为函数 f 以及 \beta 很好地描述了样本空间与目标值的关系。对于求解得到的目标函数是否能很好地描述未知样本,不在本文的讨论范围,感兴趣的读者可以搜索回归分析中的过拟合问题(overfitting)。本篇博客将重点讨论如何通过梯度下降(gradient descent)算法解决回归分析的优化问题以及相关扩展算法。关于回归分析的更多讨论已超出本篇的范围,这里将简单跳过,感兴趣的读者可以看[2]。




其中 \beta 为未知变量,我们假设 F(凸函数,convex function)在指定域内可导,关于不可导函数的最优化解法见次梯度法(subgradient method)。我们知道,如果我们将 \beta 沿着 F(\beta) 一阶导数的负方向变化,我们能确保

$$F(\beta_{n+1}) \le F(\beta_n)$$

为此我们可以初始化一个未知变量 \beta_0,每次更新/探索未知变量 \beta 时,我们沿一阶导数的负方向改变一定的步长(这里我们先讨论 r 为定常量的情况,并假设 r 足够小):

$$\beta_{n+1} = \beta_n - r \nabla F(\beta_n)$$

在代码实现中我们需要预先计算得目标函数的一阶导数。



# -*- encoding: utf-8 -*-
import re
import sys
import numpy as np
import copy
import time

def timeConsumption(func):
    """Decorate ``func`` so that every call prints its wall-clock duration.

    The returned wrapper forwards all positional and keyword arguments to
    ``func``, prints the elapsed ``time.time()`` difference, and returns
    whatever ``func`` returned.

    Args:
        func: the callable to time.

    Returns:
        A wrapper with the same name/docstring as ``func``.
    """
    import functools  # local import so this block does not rely on new file-level imports

    @functools.wraps(func)
    def func_wrapper(*args, **kwargs):
        start_time = time.time()
        try:
            # The wrapped function has to be invoked between the two
            # timestamps, otherwise the measured duration is meaningless.
            return func(*args, **kwargs)
        finally:
            # Report the duration even when ``func`` raises.
            end_time = time.time()
            # Single-argument print() produces the same text on Python 2 and 3.
            print("[Function-Time-Consumption]  %s" % (end_time - start_time,))
    return func_wrapper

def initialize(length=300):
    """Build a synthetic regression data set with ``length * length`` samples.

    For every value ``a`` of an outer uniform draw (scaled by 100) a fresh
    inner uniform draw (also scaled by 100) supplies the values ``b``; each
    pair yields the feature row ``[a**2, b**2, a, b, 1, 1]``.  The target of a
    row is its dot product with a fixed weight vector plus Gaussian noise
    (mean 0, standard deviation 0.1).

    Args:
        length: number of values drawn per axis.

    Returns:
        A tuple ``(X, Y)`` of two Python lists: the feature rows and the
        corresponding noisy targets.
    """
    noise_mean, noise_std = 0, 0.1
    scale = 100
    weights = [3.0, -1.5, 3.5, -2, 4, 1.0]

    # Feature rows; the inner sample is re-drawn for every outer value.
    rows = []
    for outer in np.random.random(length):
        a = outer * scale
        inner = np.random.random(length) * scale
        rows.extend([a**2, b**2, a, b, 1, 1] for b in inner)

    # White noise, one value per row.
    noise = np.random.normal(noise_mean, noise_std, size=length * length)

    # Linear combination of the features plus the noise term.
    targets = [np.dot(weights, row) + eps for row, eps in zip(rows, noise)]
    return rows, targets

class GradientDescent(object):
    """Batch gradient descent for the linear model ``y ~ np.dot(a, x)``.

    Every update moves each parameter by
    ``gama * (mean of its feature column) * (mean residual over all samples)``
    and stops once the absolute mean residual drops below ``eplison`` or the
    iteration budget is used up.
    """

    def __init__(self, X, Y, eplison=0.000001, gama=0.01, iter_num=10000):
        """Store the data set and draw random initial parameters.

        Args:
            X: numpy array with one sample per row; ``X.shape[-1]`` is the
                number of parameters.
            Y: target values, one per row of ``X``.
            eplison: convergence threshold on the absolute mean residual.
            gama: accepted for interface compatibility only; the step size
                actually used is derived from the data (see below).
            iter_num: maximum number of parameter updates.
        """
        _d = X.shape[-1]

        # parameter initialization
        self.a = np.random.normal(0, 1, size=_d)

        self.X = X
        self.Y = Y
        self.eplison = eplison
        # NOTE: the ``gama`` argument is intentionally not used here; the step
        # size is scaled by the largest target and the sample count instead.
        self.gama = (1.0 / max(Y)) * 1.0 / len(X)
        self.iter_num = iter_num

    def function(self, a, x):
        """Evaluate the model for parameters ``a`` on one sample ``x``.

        Do we have prior knowledge about the function?
            - quadratic ?
            - exponential ?
            - linear ?

        The implementation here is linear: the dot product of ``a`` and ``x``.
        """
        return np.dot(a, x)

    def _mean_residual(self, a):
        """Return the mean of ``function(a, x) - y`` over the whole data set."""
        predictions = [self.function(a, x) for x in self.X]
        return np.mean(np.subtract(predictions, self.Y))

    def run(self):
        """Iterate until convergence or until ``self.iter_num`` reaches zero.

        Prints the mean residual after every update and the final parameters
        at the end.  ``self.a`` holds the result; ``self.iter_num`` is
        decremented once per update.
        """
        # Mean of every feature column, i.e. one entry per parameter.
        derative = np.mean(self.X, axis=0)

        diff = self._mean_residual(self.a)
        while self.iter_num > 0:
            local_a = self.a - self.gama * derative * diff
            diff = self._mean_residual(local_a)
            # Single-argument print() behaves the same on Python 2 and 3.
            print(diff)
            self.a = local_a
            self.iter_num -= 1
            if abs(diff) < self.eplison:
                break
        print(self.a)

def main():
    """Generate the synthetic data set and fit it with one descent variant.

    The stochastic variant is the active one.  The batch and mini-batch
    variants take the same constructor arguments and can be substituted for
    it; the timings noted below are the original author's measurements.
    """
    X, Y = initialize()
    X, Y = np.array(X), np.array(Y)

    # Batch variant: GradientDescent(X, Y)
    #   time-consumption for length of 300 is 298.651947021,
    #   dependent on the initial values.
    # Mini-batch variant: MiniBatchGradientDescent(X, Y)
    #   time-consumption for length of 300 is 287.527049065,
    #   dependent on the initial values.

    # Stochastic variant: time-consumption for length of 300 is 290.654798031
    instance = StochasticGradientDescent(X, Y)
    # The optimizer only does work once run() is called.
    instance.run()

# Script entry point.  main() references StochasticGradientDescent, so that
# class must already be defined when this guard executes.
if __name__ == "__main__":
    main()



- 批量梯度下降 (Batch Gradient Descent)
- 随机梯度下降 (Stochastic Gradient Descent)
- 小批量梯度下降 (Mini-Batch Gradient Descent)


class StochasticGradientDescent(GradientDescent):
    """Descent variant whose update uses the residual of one random sample.

    Each update moves every parameter by
    ``learning_rate * (mean of its feature column) * (residual of one
    randomly drawn sample)``, with a learning rate that decays linearly to
    zero over the iteration budget.  Afterwards full-data updates are applied
    until the absolute mean residual is no larger than ``eplison``.
    """

    def __init__(self, X, Y, eplison=0.000001, gama=0.01, iter_num=300):
        """Forward all arguments to ``GradientDescent.__init__``."""
        super(StochasticGradientDescent, self).__init__(X, Y, eplison, gama, iter_num)

    def run(self):
        """Run the stochastic phase, then refine on the full data set.

        Prints the full-data mean residual after every update and the final
        parameters at the end.  ``self.a`` holds the result;
        ``self.iter_num`` is decremented once per stochastic update.
        """
        # Mean of every feature column, i.e. one entry per parameter.
        derative = np.mean(self.X, axis=0)

        LENGTH = len(self.X)
        # Draw a scalar index (not a length-1 array) so the residual is a
        # scalar as well; assigning length-1 arrays into scalar slots is
        # deprecated in recent NumPy releases.
        sample = np.random.randint(0, LENGTH)
        diff = self.function(self.a, self.X[sample]) - self.Y[sample]

        # https://en.wikipedia.org/wiki/Stochastic_gradient_descent
        learning_rate = self.gama
        den = self.iter_num
        # Defined up front so the refinement loop below is well-defined even
        # when the iteration budget is zero (same convention as the
        # mini-batch variant).
        _diff = 0
        while self.iter_num > 0:
            local_a = self.a - learning_rate * derative * diff
            sample = np.random.randint(0, LENGTH)
            diff = self.function(local_a, self.X[sample]) - self.Y[sample]
            self.a = local_a
            self.iter_num -= 1
            # Linear decay: reaches zero when the budget is exhausted.
            learning_rate = self.gama * self.iter_num / den

            # Monitor convergence on the whole data set.
            _temp = [self.function(local_a, x) for x in self.X]
            _diff = np.mean(np.subtract(_temp, self.Y))
            # Single-argument print() behaves the same on Python 2 and 3.
            print(_diff)
            if abs(_diff) < self.eplison:
                break

        # make sure result converge
        # NOTE: this loop has no iteration cap; it ends only when the
        # full-data mean residual is within eplison.
        while abs(_diff) > self.eplison:
            local_a = self.a - self.gama * derative * _diff
            local_y = [self.function(local_a, x) for x in self.X]
            _diff = np.mean(np.subtract(local_y, self.Y))
            print(_diff)
            self.a = local_a
        print(self.a)

class MiniBatchGradientDescent(GradientDescent):
    """Descent variant whose update uses the mean residual of a random batch.

    Each update moves every parameter by
    ``learning_rate * (mean of its feature column) * (mean residual of a
    randomly drawn batch)``, with a learning rate that decays linearly to
    zero over the iteration budget.  Afterwards full-data updates are applied
    until the absolute mean residual is no larger than ``eplison``.
    """

    def __init__(self, X, Y, eplison=0.000001, gama=0.01, iter_num=300, batch=100):
        """Forward the shared arguments to the base class and size the batch.

        ``batch`` is accepted for interface compatibility only: the batch
        size actually used is 10% of the number of samples.
        """
        super(MiniBatchGradientDescent, self).__init__(X, Y, eplison, gama, iter_num)
        # At least one sample per batch: with fewer than 10 samples the 10%
        # rule would give an empty batch, whose mean residual is NaN.
        self.batch = max(1, int(0.1 * self.X.shape[0]))

    def run(self):
        """Run the mini-batch phase, then refine on the full data set.

        Prints the full-data mean residual after every update and the final
        parameters at the end.  ``self.a`` holds the result;
        ``self.iter_num`` is decremented once per mini-batch update.
        """
        # Mean of every feature column, i.e. one entry per parameter.
        derative = np.mean(self.X, axis=0)

        LENGTH = len(self.X)
        learning_rate = self.gama
        den = self.iter_num
        _diff = 0
        while self.iter_num > 0:
            # Sample the batch indices with replacement.
            index = np.random.randint(0, LENGTH, self.batch)
            data = self.X[index]
            local_y = [self.function(self.a, x) for x in data]
            diff = np.mean(np.subtract(local_y, self.Y[index]))
            local_a = self.a - learning_rate * derative * diff
            self.a = local_a
            self.iter_num -= 1
            # Linear decay: reaches zero when the budget is exhausted.
            learning_rate = self.gama * self.iter_num / den

            # Monitor convergence on the whole data set.
            _temp = [self.function(local_a, x) for x in self.X]
            _diff = np.mean(np.subtract(_temp, self.Y))
            # Single-argument print() behaves the same on Python 2 and 3.
            print(_diff)
            if abs(_diff) < self.eplison:
                break

        # Without the following loop the residual does not converge to the
        # minimum, so keep applying full-data updates until it does.
        # NOTE: this loop has no iteration cap.
        while abs(_diff) > self.eplison:
            local_a = self.a - self.gama * derative * _diff
            local_y = [self.function(local_a, x) for x in self.X]
            _diff = np.mean(np.subtract(local_y, self.Y))
            print(_diff)
            self.a = local_a
        print(self.a)
