Chapter 6 Gradient Descent

6-1 What Is Gradient Descent

(Slide images 1-12 for this section are not reproduced.)
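The slides are omitted, so here is a one-line summary (added for context, not taken from the slides). The core idea, which the code in 6-2 implements directly, is to start from some initial parameter θ and repeatedly step against the derivative of the loss J with a learning rate η (eta), stopping once J barely changes:

\theta \leftarrow \theta - \eta \cdot \frac{dJ}{d\theta}

A smaller η converges more slowly but more safely; an η that is too large makes the iteration diverge, which 6-2 demonstrates with eta = 1.1.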

6-2 Simulating Gradient Descent

Notebook example

(Notebook screenshot omitted.)

Notebook source code

[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
plot_x = np.linspace(-1,6,141)
plot_x
array([-1.  , -0.95, -0.9 , -0.85, -0.8 , -0.75, -0.7 , -0.65, -0.6 ,
       -0.55, -0.5 , -0.45, -0.4 , -0.35, -0.3 , -0.25, -0.2 , -0.15,
       -0.1 , -0.05,  0.  ,  0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,
        0.35,  0.4 ,  0.45,  0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,
        0.8 ,  0.85,  0.9 ,  0.95,  1.  ,  1.05,  1.1 ,  1.15,  1.2 ,
        1.25,  1.3 ,  1.35,  1.4 ,  1.45,  1.5 ,  1.55,  1.6 ,  1.65,
        1.7 ,  1.75,  1.8 ,  1.85,  1.9 ,  1.95,  2.  ,  2.05,  2.1 ,
        2.15,  2.2 ,  2.25,  2.3 ,  2.35,  2.4 ,  2.45,  2.5 ,  2.55,
        2.6 ,  2.65,  2.7 ,  2.75,  2.8 ,  2.85,  2.9 ,  2.95,  3.  ,
        3.05,  3.1 ,  3.15,  3.2 ,  3.25,  3.3 ,  3.35,  3.4 ,  3.45,
        3.5 ,  3.55,  3.6 ,  3.65,  3.7 ,  3.75,  3.8 ,  3.85,  3.9 ,
        3.95,  4.  ,  4.05,  4.1 ,  4.15,  4.2 ,  4.25,  4.3 ,  4.35,
        4.4 ,  4.45,  4.5 ,  4.55,  4.6 ,  4.65,  4.7 ,  4.75,  4.8 ,
        4.85,  4.9 ,  4.95,  5.  ,  5.05,  5.1 ,  5.15,  5.2 ,  5.25,
        5.3 ,  5.35,  5.4 ,  5.45,  5.5 ,  5.55,  5.6 ,  5.65,  5.7 ,
        5.75,  5.8 ,  5.85,  5.9 ,  5.95,  6.  ])
[3]
plot_y = (plot_x - 2.5) ** 2 - 1
[4]
plt.plot(plot_x,plot_y)

[5]
def dJ(theta):
    return 2 * (theta - 2.5)
[6]
def J(theta):
    return (theta - 2.5) ** 2 - 1
[7]
eta = 0.1
epsilon = 1e-8

theta = 0.0
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    
    if(abs(dJ(theta)-dJ(last_theta)) < epsilon ):
        break
        
print(theta)
print(J(theta))
2.4999999819074863
-0.9999999999999997

[8]
theta = 0.0
theta_history = [theta]
while True:
    gradient = dJ(theta)
    last_theta = theta
    theta = theta - eta * gradient
    theta_history.append(theta)
    
    if(abs(dJ(theta)-dJ(last_theta)) < epsilon ):
        break
        
plt.plot(plot_x,J(plot_x))
plt.plot(np.array(theta_history),J(np.array(theta_history)),color = 'r',marker = '+')

[9]
theta
2.4999999819074863
[10]
len(theta_history) # not the 46 reported in the course: this loop stops on the change in dJ rather than in J, so it needs more steps to satisfy epsilon
85
[11]
def gradient_descent(initial_theta, eta, epsilon = 1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)
    
    while True:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)
    
        if(abs(dJ(theta)-dJ(last_theta)) < epsilon ):
            break
        
def plot_theta_history():
    plt.plot(plot_x,J(plot_x))
    plt.plot(np.array(theta_history),J(np.array(theta_history)),color = 'r',marker = '+')
[12]
eta = 0.01
theta_history = [] # note: writing theta_history[] = [] is a syntax error
gradient_descent(0.0,eta)
plot_theta_history()

[13]
len(theta_history) # not 424, for the same reason as noted at cell [10]
800
[14]
eta = 0.001
theta_history = [] 
gradient_descent(0.0,eta)
plot_theta_history()

[15]
theta
2.4999999819074863
[16]
len(theta_history) # the course reports 3682
6903
[17]
eta = 0.8
theta_history = [] 
gradient_descent(0.0,eta)
plot_theta_history()

[18]
len(theta_history)
43
[19]
theta
2.4999999819074863
eta = 1.1
theta_history = []
gradient_descent(0.0,eta)
plot_theta_history()
# with eta = 1.1 every step moves farther away from the minimum, so this cell diverges
# instead of finishing; J and gradient_descent are redefined below to cope with that

[20]
def J(theta):
    try:
        return (theta - 2.5) ** 2 - 1
    except:
        return float('inf')
[21]
def gradient_descent(initial_theta, eta,n_iters = 1e3, epsilon = 1e-8):
    theta = initial_theta
    theta_history.append(initial_theta)
    i_iters = 0
    
    while i_iters < n_iters:
        gradient = dJ(theta)
        last_theta = theta
        theta = theta - eta * gradient
        theta_history.append(theta)
    
        if(abs(dJ(theta)-dJ(last_theta)) < epsilon ):
            break
        i_iters +=1
        
def plot_theta_history():
    plt.plot(plot_x,J(plot_x))
    plt.plot(np.array(theta_history),J(np.array(theta_history)),color = 'r',marker = '+')
[22]
eta = 1.1 
theta_history = []
gradient_descent(0.0,eta) 
plot_theta_history()

[23]
theta
2.4999999819074863
[24]
len(theta_history)
1001
[25]
dJ(theta_history[-1])
-7.58955044586262e+79
[26]
theta_history[-1]
-3.79477522293131e+79
[27]
np.argsort(theta)
array([0], dtype=int64)
[28]
eta = 1.1 
theta_history = []
gradient_descent(0.0,eta,n_iters=10) 
plot_theta_history()

[29]
theta
2.4999999819074863

6-3 Gradient Descent for Linear Regression

(Slide images 14-20 for this section are not reproduced.)
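With the slides omitted, here for reference are the quantities the notebook below implements: the MSE loss over the m training samples and its gradient with respect to θ, where X_b denotes X with a prepended column of ones, matching the code:

J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left(y^{(i)} - X_b^{(i)}\theta\right)^2, \qquad \nabla J(\theta) = \frac{2}{m}\, X_b^T (X_b\theta - y)

The component-wise loop in cell [8] of 6-4 computes this gradient entry by entry, and the closed-form X_b^T(X_bθ − y) expression is the vectorization used from 6-5 onward.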

6-4 Implementing Gradient Descent for Linear Regression

Notebook example

(Notebook screenshot omitted.)

Notebook source code

Implementing gradient descent for the linear regression model
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
np.random.seed(666)
x = 2 * np.random.random(size = 100)
y = x * 3.0 + 4.0 + np.random.normal(size = 100)
[3]
X = x.reshape(-1,1)
[4]
X.shape
(100, 1)
[5]
y.shape
(100,)
[6]
plt.scatter(X,y)


Training with gradient descent

(Formula image 梯度下降.png omitted.)

[7]
def J(theta, X_b, y):
    try:
        return np.sum( ( y - X_b.dot(theta) ) ** 2 ) / len(X_b)
    except:
        return float('inf')
[8]
def dJ(theta, X_b, y):
    res = np.empty(len(theta))
    res[0] = np.sum( X_b.dot(theta) - y )
    for i in range(1,len(theta)):
        res[i] = (X_b.dot(theta) - y).dot(X_b[:,i])
    return res * 2 / len(X_b)   # divide by the sample count m; the original divided by len(theta), which only rescales the gradient
[9]
def gradient_descent(X_b, y, initial_theta, eta,n_iters = 1e5, epsilon = 1e-8):
    theta = initial_theta
    # theta_history.append(initial_theta)
    i_iters = 0
    
    while i_iters < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        # theta_history.append(theta)
    
        if(abs(J(theta, X_b, y)-J(last_theta,X_b,y)) < epsilon ):
            break
        i_iters +=1
        
    return theta
        
[10]
X_b = np.hstack( [ np.ones((len(X),1)) ,X ] )
initial_theta = np.zeros(X_b.shape[1])

eta = 0.001 # original note: correct values were only obtained with n_iters = 1e5 and eta = 0.0001
theta = gradient_descent(X_b,y,initial_theta,eta)
[11]
theta
array([4.02271672, 3.006001  ])
[12]
X_b.shape[0]
100
Encapsulating our linear regression algorithm
[14]
from playML.LinearRegression import LinearRegression
[15]
lin_reg = LinearRegression()
lin_reg.fit_gd(X,y)
LinearRegression()
[16]
lin_reg.coef_
array([3.0111697])
[18]
lin_reg.interception_ # note: playML spells this attribute interception_, unlike scikit-learn's intercept_
4.01658859640915
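playML's fit_gd is not listed in these notes. Below is a minimal sketch of what such a method could look like, reusing the J, dJ and gradient_descent functions from cells [7]-[9] above; the internal structure is an assumption (only the names fit_gd, coef_, interception_ and _theta appear in the notebook), and the real playML code may differ:

import numpy as np

class LinearRegression:
    def __init__(self):
        self.coef_ = None          # feature coefficients, theta[1:]
        self.interception_ = None  # intercept term, theta[0] (playML's spelling)
        self._theta = None

    def fit_gd(self, X_train, y_train, eta=0.01, n_iters=1e4):
        # build X_b with a leading column of ones and start the search from theta = 0
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self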

6-5 Vectorizing Gradient Descent and Data Standardization

(Slide images 22-25 for this section are not reproduced.)

Notebook example

(Notebook screenshot omitted.)

Notebook source code

Vectorizing gradient descent
[1]
import numpy as np
from sklearn import datasets
[2]
boston = datasets.load_boston()
X = boston.data
y = boston.target


X = X[ y < 50.0]
y = y[ y < 50.0]
F:\anaconda\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: `load_boston` is deprecated in 1.0 and will be removed in 1.2. (The rest of the warning, about the dataset's ethical problem and the recommended alternatives fetch_california_housing and the Ames housing data, is omitted here.)

[3]
from playML.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, seed=222)
[4]
from playML.LinearRegression import LinearRegression

lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train,y_train)
lin_reg1.score(X_test,y_test)
CPU times: total: 0 ns
Wall time: 770 ms

0.8129794056212779
[5]
lin_reg2 = LinearRegression()
lin_reg2.fit_gd(X_train,y_train)
C:\Users\Administrator\PycharmProjects\pythonProject\anaconda\第4章 最基础的分类算法-k近邻算法\playML\LinearRegression.py:33: RuntimeWarning: overflow encountered in square
  return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
C:\Users\Administrator\PycharmProjects\pythonProject\anaconda\第4章 最基础的分类算法-k近邻算法\playML\LinearRegression.py:57: RuntimeWarning: invalid value encountered in double_scalars
  if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):
C:\Users\Administrator\PycharmProjects\pythonProject\anaconda\第4章 最基础的分类算法-k近邻算法\playML\LinearRegression.py:54: RuntimeWarning: invalid value encountered in subtract
  theta = theta - eta * gradient

LinearRegression()
[6]
lin_reg2.coef_
array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])
[7]
X_train[:10,:]
array([[1.42362e+01, 0.00000e+00, 1.81000e+01, 0.00000e+00, 6.93000e-01,
        6.34300e+00, 1.00000e+02, 1.57410e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 3.96900e+02, 2.03200e+01],
       [3.67822e+00, 0.00000e+00, 1.81000e+01, 0.00000e+00, 7.70000e-01,
        5.36200e+00, 9.62000e+01, 2.10360e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 3.80790e+02, 1.01900e+01],
       [1.04690e-01, 4.00000e+01, 6.41000e+00, 1.00000e+00, 4.47000e-01,
        7.26700e+00, 4.90000e+01, 4.78720e+00, 4.00000e+00, 2.54000e+02,
        1.76000e+01, 3.89250e+02, 6.05000e+00],
       [1.15172e+00, 0.00000e+00, 8.14000e+00, 0.00000e+00, 5.38000e-01,
        5.70100e+00, 9.50000e+01, 3.78720e+00, 4.00000e+00, 3.07000e+02,
        2.10000e+01, 3.58770e+02, 1.83500e+01],
       [6.58800e-02, 0.00000e+00, 2.46000e+00, 0.00000e+00, 4.88000e-01,
        7.76500e+00, 8.33000e+01, 2.74100e+00, 3.00000e+00, 1.93000e+02,
        1.78000e+01, 3.95560e+02, 7.56000e+00],
       [2.49800e-02, 0.00000e+00, 1.89000e+00, 0.00000e+00, 5.18000e-01,
        6.54000e+00, 5.97000e+01, 6.26690e+00, 1.00000e+00, 4.22000e+02,
        1.59000e+01, 3.89960e+02, 8.65000e+00],
       [7.75223e+00, 0.00000e+00, 1.81000e+01, 0.00000e+00, 7.13000e-01,
        6.30100e+00, 8.37000e+01, 2.78310e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 2.72210e+02, 1.62300e+01],
       [9.88430e-01, 0.00000e+00, 8.14000e+00, 0.00000e+00, 5.38000e-01,
        5.81300e+00, 1.00000e+02, 4.09520e+00, 4.00000e+00, 3.07000e+02,
        2.10000e+01, 3.94540e+02, 1.98800e+01],
       [1.14320e-01, 0.00000e+00, 8.56000e+00, 0.00000e+00, 5.20000e-01,
        6.78100e+00, 7.13000e+01, 2.85610e+00, 5.00000e+00, 3.84000e+02,
        2.09000e+01, 3.95580e+02, 7.67000e+00],
       [5.69175e+00, 0.00000e+00, 1.81000e+01, 0.00000e+00, 5.83000e-01,
        6.11400e+00, 7.98000e+01, 3.54590e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 3.92680e+02, 1.49800e+01]])
[8]
lin_reg2.fit_gd(X_train,y_train,eta = 0.000001)
LinearRegression()
[9]
lin_reg2.score(X_test,y_test)
0.5183037995455362
[10]
%time lin_reg2.fit_gd(X_train,y_train,eta = 0.0000031001,n_iters=1e12)
CPU times: total: 14.9 s
Wall time: 8.04 s

LinearRegression()
[11]
lin_reg2.score(X_test,y_test)
0.6180704373142486
Normalize the data before using gradient descent
[12]
from sklearn.preprocessing import StandardScaler
[13]
standardScaler = StandardScaler()
standardScaler.fit(X_train)
StandardScaler()
[14]
X_train_standard = standardScaler.transform(X_train)
[15]
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard,y_train)
CPU times: total: 5.5 s
Wall time: 2.82 s

LinearRegression()
[16]
X_test_standard = standardScaler.transform(X_test)
[17]
lin_reg3.score(X_test_standard,y_test)
0.8130040900692703
The advantage of gradient descent
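For context (this remark is not in the original notes): fit_normal computes the closed-form solution of the normal equation below, which requires forming and solving an (n+1)x(n+1) system, so its cost grows quickly with the number of features n, while each gradient-descent step only needs matrix-vector products. That is why, in the timing comparison that follows with n = 5000 features, fit_gd comes out much faster than fit_normal.

\theta = (X_b^T X_b)^{-1} X_b^T y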
[18]
m = 2000
n = 5000

big_X = np.random.normal(size=(m,n))
true_theta = np.random.uniform(0.0,100.0,size=n+1)
big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0.0,10.0,size = m)
[19]
big_reg1 = LinearRegression()
%time big_reg1.fit_normal(big_X,big_y)
CPU times: total: 1min 19s
Wall time: 46.6 s

LinearRegression()
[29]
big_reg2 = LinearRegression()
%time big_reg2.fit_gd(big_X,big_y,eta=0.1,n_iters=1e3)
# original note: with eta = 0.001 and n_iters = 1e4 this cell took CPU 8min 8s / wall 5min 41s,
# so the running time depends heavily on the chosen hyperparameters
CPU times: total: 5.75 s
Wall time: 3.29 s

LinearRegression()
[30]
big_reg2.coef_
array([14.82820315, 43.97009426,  6.49921157, ...,  7.31030043,
       51.42885153, 20.76248914])
[31]
big_reg2.interception_
102.55282983264055

6-6 Stochastic Gradient Descent

(Slide images 27-30 for this section are not reproduced.)

Notebook example

(Notebook screenshot omitted.)

Notebook source code

Stochastic gradient descent

[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
m = 100000

x = np.random.normal(size = m)
X = x.reshape(-1,1)

y = 4.0*x + 3. + np.random.normal(0,3,size=m)
[3]
y.shape
(100000,)
[4]
def J(theta, X_b, y):
    try:
        return np.sum( ( y - X_b.dot(theta) ) ** 2 ) / len(y)
    except:
        return float('inf')
    
def dJ(theta, X_b, y):
     return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)
    
def gradient_descent(X_b, y, initial_theta, eta, n_iters = 1e5, epsilon = 1e-8):
    theta = initial_theta
    i_iters = 0
    
    while i_iters < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        
        if(abs(J(theta, X_b, y)-J(last_theta,X_b,y)) < epsilon ):
            break
        i_iters +=1
        
    return theta
[5]
%%time
X_b = np.hstack( [ np.ones((len(X),1)) ,X ] )
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01
theta = gradient_descent(X_b,y,initial_theta,eta)
CPU times: total: 1.81 s
Wall time: 946 ms

[6]
theta
array([2.97998979, 3.99960807])
Stochastic gradient descent
[7]
def dJ_sgd(theta, X_b_i, y_i):
     return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2
[8]
def sgd(X_b, y, initial_theta, n_iters):
    
    t0 = 5
    t1 = 50
    
    def learning_rate(t):
        return t0 / (t + t1)
    
    theta = initial_theta
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i],y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient
    return theta
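The learning_rate closure above implements a decaying step size (a simulated-annealing-style schedule), which in formula form, with t0 = 5 and t1 = 50, is:

\eta_t = \frac{t_0}{t + t_1}

Early iterations take large steps, and as the step counter t grows the step size shrinks, which keeps the noisy single-sample updates from bouncing around the minimum forever.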
[9]
%%time
X_b = np.hstack( [ np.ones((len(X),1)) ,X ] )
initial_theta = np.zeros(X_b.shape[1])
theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)  # a third of m iterations; the original had len(X_b//3), which equals the full m
CPU times: total: 1.84 s
Wall time: 1.79 s

[10]
theta
array([2.99897113, 4.02392746])

6-7 Stochastic Gradient Descent in scikit-learn

Notebook example

(Notebook screenshot omitted.)

Notebook source code

Using our own SGD
[1]
import numpy as np 
import matplotlib.pyplot as plt
[2]
m = 100000

x = np.random.normal(size = m)
X = x.reshape(-1,1)

y = 4.0*x + 3. + np.random.normal(0,3,size=m)
[3]
from playML.LinearRegression import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit_sgd(X, y, n_iters=2)
LinearRegression()
[4]
lin_reg._theta
array([3.01202028, 3.97799884])
[5]
lin_reg.coef_
array([3.97799884])
[6]
lin_reg.interception_
3.012020275313304
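playML's fit_sgd is also not listed here. Judging from the calls above, its n_iters argument counts passes (epochs) over the whole training set rather than single-sample steps, which is why n_iters=2 already gives a reasonable fit on 100000 samples. A hypothetical standalone sketch along those lines (sgd_epochs and its parameter names are made up for illustration; the real fit_sgd may differ):

import numpy as np

def sgd_epochs(X_train, y_train, n_iters=5, t0=5, t1=50):
    # n_iters counts full passes over the shuffled training set
    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
    theta = np.zeros(X_b.shape[1])
    m = len(X_b)
    cur = 0                                  # global step counter for the learning-rate schedule
    for _ in range(int(n_iters)):
        indexes = np.random.permutation(m)   # shuffle so every sample is visited once per epoch
        for i in indexes:
            gradient = 2 * X_b[i] * (X_b[i].dot(theta) - y_train[i])
            theta = theta - (t0 / (cur + t1)) * gradient
            cur += 1
    return theta                             # theta[0] is the intercept, theta[1:] the coefficients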
Using our own SGD on real data
[7]
from sklearn import datasets

boston = datasets.load_boston()
X = boston.data
y = boston.target


X = X[ y < 50.0]
y = y[ y < 50.0]
F:\anaconda\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: `load_boston` is deprecated in 1.0 and will be removed in 1.2. (Same warning as in 6-5, omitted here.)

[8]
from playML.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666) # seed 666 gives noticeably better scores here than 222
[9]
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)
[10]
from playML.LinearRegression import LinearRegression

lin_reg2 = LinearRegression()
%time lin_reg2.fit_sgd(X_train_standard, y_train, n_iters=2)
lin_reg2.score(X_test_standard,y_test)
CPU times: total: 15.6 ms
Wall time: 8.98 ms

0.7911189802097699
[11]
%time lin_reg2.fit_sgd(X_train_standard, y_train, n_iters=50)
lin_reg2.score(X_test_standard,y_test)
CPU times: total: 219 ms
Wall time: 226 ms

0.8132588958621522
[13]
%time lin_reg2.fit_sgd(X_train_standard, y_train, n_iters=500)
lin_reg2.score(X_test_standard,y_test)
CPU times: total: 2.09 s
Wall time: 2.93 s

0.8129564757875579
SGD in scikit-learn
[14]
from sklearn.linear_model import SGDRegressor
[15]
sgd_reg = SGDRegressor()
%time sgd_reg.fit(X_train_standard,y_train)
sgd_reg.score(X_test_standard,y_test)
CPU times: total: 0 ns
Wall time: 116 ms

0.8129895569490898
[18]
sgd_reg = SGDRegressor(n_iter_no_change=100) # the old n_iter argument raises an error in newer scikit-learn versions
%time sgd_reg.fit(X_train_standard,y_train)
sgd_reg.score(X_test_standard,y_test)
CPU times: total: 15.6 ms
Wall time: 22.9 ms

0.8131520202077357
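In current scikit-learn the number of passes over the training data is controlled by max_iter (the old n_iter argument is gone), while n_iter_no_change is an early-stopping patience. A minimal usage sketch with the variables from the cells above; the value 100 is arbitrary:

from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=100)   # at most 100 epochs over the training set
sgd_reg.fit(X_train_standard, y_train)
print(sgd_reg.score(X_test_standard, y_test))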

6-8 How to Verify the Gradient Computation: Debugging Gradient Descent

(Slide images 33-35 for this section are not reproduced.)
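The slides are omitted; the idea, implemented by dJ_debug below, is to approximate each partial derivative of J with a symmetric (two-sided) difference quotient and compare the result against the analytic gradient:

\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \epsilon e_i) - J(\theta - \epsilon e_i)}{2\epsilon}

Here e_i is the i-th unit vector and ε is a small constant (0.01 in the code). The check is slow, as the timings below show, so it is only used to verify dJ_math, not for training.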

Notebook example

(Notebook screenshot omitted.)

Notebook source code

How to debug the gradient
[1]
import numpy as np
import matplotlib.pyplot as plt
[2]
np.random.seed(666)
X = np.random.random(size=(1000,10))
[3]
true_theta = np.arange(1,12,dtype = float)
[4]
X_b = np.hstack([np.ones((len(X),1)),X])
y = X_b.dot(true_theta) + np.random.normal(size=1000)
[5]
X.shape
(1000, 10)
[6]
y.shape
(1000,)
[9]
true_theta.shape
(11,)
[7]
true_theta
array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])
[10]
def J(theta, X_b, y):
    try:
        return np.sum( ( y - X_b.dot(theta) ) ** 2 ) / len(y)
    except:
        return float('inf')
[11]
def dJ_math(theta,X_b,y):
    return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(y)
[12]
def dJ_debug(theta, X_b, y, epsilon=0.01):
    res = np.empty(len(theta))
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon
        theta_2 = theta.copy()
        theta_2[i] -= epsilon
        res[i] = (J(theta_1,X_b,y) - J(theta_2,X_b,y)) / (2 * epsilon)
    return res
[13]
def gradient_descent(dJ,X_b, y, initial_theta, eta, n_iters = 1e5, epsilon = 1e-8):
    theta = initial_theta
    i_iters = 0
    
    while i_iters < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        
        if(abs(J(theta, X_b, y)-J(last_theta,X_b,y)) < epsilon ):
            break
        i_iters +=1
        
    return theta
[14]
X_b = np.hstack( [ np.ones((len(X),1)) ,X ] )
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01 

%time theta = gradient_descent(dJ_debug,X_b,y,initial_theta,eta)
CPU times: total: 21.6 s
Wall time: 16.3 s

[15]
theta
array([ 1.07964823,  2.05912453,  2.92524399,  4.12967602,  5.05886967,
        5.91270186,  6.98378845,  8.0081538 ,  8.87263904,  9.99409247,
       10.91497018])
[16]
X_b = np.hstack( [ np.ones((len(X),1)) ,X ] )
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01 

%time theta = gradient_descent(dJ_math,X_b,y,initial_theta,eta)
theta
CPU times: total: 2.66 s
Wall time: 1.63 s

array([ 1.07964823,  2.05912453,  2.92524399,  4.12967602,  5.05886967,
        5.91270186,  6.98378845,  8.0081538 ,  8.87263904,  9.99409247,
       10.91497018])

6-9 More In-Depth Discussion of Gradient Descent

(Slide images 37-42 for this section are not reproduced.)
