深度学习入门

Python Basic


$ python3

>>> 1 + 2
3

>>> 1 - 2
-1

>>> 4 * 5
20

>>> 7 / 5
1.4

>>> 3 ** 2
9

>>> type(10)
<class 'int'>

>>> type(2.718)
<class 'float'>

>>> type("hello")
<class 'str'>

>>> x = 10
>>> print(x)
10

>>> x = 100
>>> print(x)
100

>>> y = 3.14
>>> x * y
314.0

>>> type(x * y)
<class 'float'>

>>> a = [1, 2, 3, 4, 5]
>>> print(a)
[1, 2, 3, 4, 5]

>>> len(a)
5

>>> a[0]
1

>>> a[4]
5

>>> a[4] = 99
>>> print(a)
[1, 2, 3, 4, 99]

>>> a[0:2]
[1, 2]

>>> a[1:]
[2, 3, 4, 99]

>>> a[:3]
[1, 2, 3]

>>> a[:-1]
[1, 2, 3, 4]

>>> a[:-2]
[1, 2, 3]

>>> me = {'height' : 100}
>>> me['height']
100

>>> me['height'] = 70
>>> print(me)
{'height': 70}

>>> me['weight'] = 80
>>> print(me)
{'height': 70, 'weight': 80}

>>> hungry = True
>>> sleepy = False
>>> type(hungry)
<class 'bool'>

>>> not hungry
False

>>> hungry and sleepy

False

>>> hungry or sleepy
True

>>> hungry = True
>>> if hungry:
...     print("I'm hungry")
...

I'm hungry

>>> hungry = False
>>> if hungry:
...     print("I'm hungry")
... else:
...     print("I'm not hungry")
...     print("I'm sleepy")
...

I'm not hungry
I'm sleepy

>>> for i in [1, 2, 3]:
...     print(i)
...

1
2
3

>>> def hello():
...     print("Hello world")
...

>>> hello()
Hello world

>>> def hello(object):
...     print("Hello " + object + "!")
...

>>> hello("cat")
Hello cat!

>>>

Class


class Man:
    """A named person that can print a greeting and a farewell.

    Constructing an instance stores the name and prints ``Initialized!``.
    """

    def __init__(self, name):
        """Remember ``name`` on the instance and announce the construction."""
        self.name = name
        print("Initialized!")

    def hello(self):
        """Print ``Hello <name>!``."""
        greeting = "Hello " + self.name
        print(greeting + "!")

    def goodbye(self):
        """Print ``Good-bye <name>!``."""
        farewell = "Good-bye " + self.name
        print(farewell + "!")


# Demo: create one instance and exercise both methods.
m = Man("David")
m.hello()
m.goodbye()

Numpy


>>> import numpy as np
>>> x = np.array([1.0, 2.0, 3.0])
>>> print(x)
[1. 2. 3.]

>>> type(x)
<class 'numpy.ndarray'>

>>> x = np.array([1.0, 2.0, 3.0])
>>> y = np.array([2.0, 4.0, 6.0])
>>> x + y
array([3., 6., 9.])

>>> x - y
array([-1., -2., -3.])

>>> x * y
array([ 2., 8., 18.])

>>> x / y
array([0.5, 0.5, 0.5])

>>> x = np.array([1.0, 2.0, 3.0])
>>> x / 2.0
array([0.5, 1. , 1.5])

>>> A = np.array([[1,2], [3,4]])
>>> print(A)
[[1 2]
[3 4]]

>>> A.shape
(2, 2)

>>> A.dtype
dtype('int64')

>>> B = np.array([[3,0], [0,6]])
>>> A + B
array([[ 4, 2],
[ 3, 10]])

>>> A * B
array([[ 3, 0],
[ 0, 24]])

>>> print(A)
[[1 2]
[3 4]]

>>> A * 10
array([[10, 20],
[30, 40]])

>>> A = np.array([[1,2], [3,4]])
>>> B = np.array([10, 20])
>>> A * B
array([[10, 40],
[30, 80]])

>>> X = np.array([[51,55], [14, 19], [0, 4]])
>>> print(X)
[[51 55]
[14 19]
[ 0 4]]

>>> X[0] # get the first row of this array
array([51, 55])

>>> X[0][1] # get the element at position (0, 1)
55

>>> for row in X:
...     print(row)
...

[51 55]
[14 19]
[0 4]

>>> X = X.flatten()
>>> print(X)
[51 55 14 19 0 4]

>>> X[np.array([0, 2, 4])] # get the elements whose indices are 0, 2 and 4
array([51, 14, 0])

>>> X > 15 # element-wise test: which elements are greater than 15
array([ True, True, False, True, False, False])

>>> X[X > 15]
array([51, 55, 19])

>>>

Matplotlib

Plot the sin function

import numpy as np
import matplotlib.pyplot as plt

# Sample points from 0 up to (but not including) 6 in steps of 0.1.
x = np.arange(0, 6, 0.1)
# Evaluate sin at every sample point.
y = np.sin(x)

# Draw the curve, then display the figure.
plt.plot(x, y)
plt.show()

Plot the image of function sin and cos

import numpy as np
import matplotlib.pyplot as plt

  

# Sample points from 0 up to (but not including) 6 in steps of 0.1.
x = np.arange(0, 6, 0.1)
y1 = np.sin(x)
y2 = np.cos(x)

# Two curves on the same axes; cos is drawn dashed so the two can be told apart.
plt.plot(x, y1, label="sin")
plt.plot(x, y2, linestyle = "--", label="cos")

# Axis labels and figure title.
plt.xlabel("x")
plt.ylabel("y")
plt.title('sin & cos')

# Show the legend built from the label= arguments, then display the figure.
plt.legend()
plt.show()

Display the image

import matplotlib.pyplot as plt
from matplotlib.image import imread

# Load the image file; replace 'lena.png' with the path to your own image.
img = imread('lena.png')
# Render the loaded image and display the figure.
plt.imshow(img)
plt.show()

Perceptron

感知机的运行原理

The equation below can represent the behavior of the perceptron

$$y = \begin{cases} 0 & (w_1 x_1 + w_2 x_2 \le \theta) \\ 1 & (w_1 x_1 + w_2 x_2 > \theta) \end{cases}$$

Simple implementation


def AND(x1, x2):
    """Perceptron AND gate using fixed weights and a threshold.

    The weighted sum ``0.5 * x1 + 0.5 * x2`` is compared with the threshold
    ``theta = 0.7``.

    Args:
        x1: First input signal (0 or 1 in the demo below).
        x2: Second input signal (0 or 1 in the demo below).

    Returns:
        0 when the weighted sum is at most ``theta``, otherwise 1.
    """
    w1, w2, theta = 0.5, 0.5, 0.7
    tmp = w1 * x1 + w2 * x2

    # A plain if/else makes the two outcomes exhaustive (no implicit ``None``
    # return path) and matches the other gate implementations in this file.
    if tmp <= theta:
        return 0
    else:
        return 1


# Print the full truth table.
print(AND(0, 0))
print(AND(0, 1))
print(AND(1, 0))
print(AND(1, 1))

导入权重和偏置

$b$ is called the bias; $w_1$ and $w_2$ are called the weights.

$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 > 0) \end{cases}$$

use numpy to complete a simple neuron

import numpy as np
x = np.array([0, 1]) # input signals
w = np.array([0.5, 0.5]) # weights
b = -0.7 # bias

# Element-wise products of weights and inputs.
print(w * x)
# Weighted sum of the inputs.
print(np.sum(w * x))
# Weighted sum plus bias: the value the perceptron compares with 0.
print(np.sum(w * x) + b)

use bias and weight to complete an AND gate

def AND(x1, x2):
    """Perceptron AND gate expressed with weights and a bias.

    Computes ``sum(w * x) + b`` with ``w = [0.5, 0.5]`` and ``b = -0.7``.

    Returns:
        0 when that value is at most 0, otherwise 1.
    """
    inputs = np.array([x1, x2])
    weights = np.array([0.5, 0.5])
    bias = -0.7
    activation = np.sum(weights * inputs) + bias
    return 0 if activation <= 0 else 1

NAND gate, OR gate

def NAND(x1, x2):
    """Perceptron NAND gate.

    Computes ``sum(w * x) + b`` with ``w = [-0.5, -0.5]`` and ``b = 0.7``.

    Returns:
        0 when that value is at most 0, otherwise 1.
    """
    inputs = np.array([x1, x2])
    weights = np.array([-0.5, -0.5])  # negative weights
    bias = 0.7                        # positive bias
    activation = np.sum(weights * inputs) + bias
    return 0 if activation <= 0 else 1


def OR(x1, x2):
    """Perceptron OR gate.

    Computes ``sum(w * x) + b`` with ``w = [0.5, 0.5]`` and ``b = -0.2``.

    Returns:
        0 when that value is at most 0, otherwise 1.
    """
    inputs = np.array([x1, x2])
    weights = np.array([0.5, 0.5])
    bias = -0.2  # small enough in magnitude that a single active input fires the gate
    activation = np.sum(weights * inputs) + bias
    return 0 if activation <= 0 else 1

Use the AND, NAND and OR gates to build an XOR gate

def XOR(x1, x2):
    """XOR gate composed from the NAND, OR and AND gates.

    The inputs are fed to NAND and to OR; the two intermediate results are
    then combined with AND.
    """
    # First layer: NAND and OR of the raw inputs. Second layer: AND of both.
    return AND(NAND(x1, x2), OR(x1, x2))

Neural Network

$$y = \begin{cases} 0 & (b + w_1 x_1 + w_2 x_2 \le 0) \\ 1 & (b + w_1 x_1 + w_2 x_2 > 0) \end{cases}$$

引入 $h(x)$

$$y = h(b + w_1 x_1 + w_2 x_2)$$

$$h(x) = \begin{cases} 0 & (x \le 0) \\ 1 & (x > 0) \end{cases}$$

激活函数

activation function

$$a = b + w_1 x_1 + w_2 x_2$$
$$y = h(a)$$

sigmoid 函数

Sigmoid function

$$h(x) = \frac{1}{1 + \exp(-x)}$$ where $\exp(-x)$ denotes $e^{-x}$.

def sigmoid(x):
    """Sigmoid activation: ``1 / (1 + exp(-x))``, applied element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

Function of step


import numpy as np
import matplotlib.pylab as plt

# def step_function(x):
# if x > 0:
#     return 1
# else:
#     return 0


# def step_function(x):
# y = x > 0
# return y.astype(np.int)

def step_function(x):
    """Step activation: 1 where ``x`` is strictly positive, 0 elsewhere.

    The boolean result of ``x > 0`` is converted to an int64 NumPy array.
    """
    is_positive = x > 0
    return np.array(is_positive, dtype=np.int64)


# Sample points from -5.0 up to (but not including) 5.0 in steps of 0.1.
x = np.arange(-5.0, 5.0, 0.1)
y = step_function(x)
plt.plot(x, y)
plt.ylim(-0.1, 1.1) # limit the visible y-axis range to [-0.1, 1.1]
plt.show()

Sigmoid function and step function:


import numpy as np
import matplotlib.pylab as plt


# def step_function(x):
#     if x > 0:
#         return 1
#     else:
#         return 0


# def step_function(x):
#     y = x > 0
#     return y.astype(np.int)


def step_function(x):
    """Return 1 for strictly positive entries of ``x`` and 0 otherwise (int64 array)."""
    fired = x > 0  # boolean mask of the positions where the step is "on"
    return np.array(fired, dtype=np.int64)


def sigmoid(x):
    """Sigmoid activation ``1 / (1 + exp(-x))``; NumPy applies it element-wise to arrays."""
    decay = np.exp(-x)
    return 1 / (1 + decay)


# Sample points from -5.0 up to (but not including) 5.0 in steps of 0.1.
x = np.arange(-5.0, 5.0, 0.1)
y1 = step_function(x)
y2 = sigmoid(x)
# The step function is drawn dashed so it can be told apart from the sigmoid.
plt.plot(x, y1, linestyle = "--")
plt.plot(x, y2)
plt.ylim(-0.1, 1.1) # limit the visible y-axis range to [-0.1, 1.1]
plt.show()

ReLU函数

ReLU function

$$h(x) = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases}$$

def relu(x):
	"""ReLU activation: the element-wise maximum of 0 and ``x``."""
	return np.maximum(0, x)

多维数组

>>> import numpy as np
>>> A = np.array([1, 2, 3, 4])
>>> print(A)
[1 2 3 4]
>>> np.ndim(A)
1
>>> A.shape
(4,)
>>> A.shape[0]
4

>>> B = np.array([[1,2], [3,4], [5,6]])
>>> print(B)
[[1 2]
 [3 4]
 [5 6]]
>>> np.ndim(B)
2
>>> B.shape
(3, 2)

矩阵乘法

>>> A = np.array([[1,2], [3,4]])
>>> A.shape
(2, 2)
>>> B = np.array([[5,6], [7,8]])
>>> B.shape
(2, 2)
>>> np.dot(A, B)
array([[19, 22],
       [43, 50]])
>>> A = np.array([[1,2,3], [4,5,6]])
>>> A.shape
(2, 3)

>>> B = np.array([[1,2], [3,4], [5,6]])
>>> B.shape
(3, 2)

>>> np.dot(A, B)
array([[22, 28],
       [49, 64]])
>>> C = np.array([[1,2], [3,4]])  
>>> C.shape
(2, 2)  
>>> A.shape  
(2, 3)  
>>> np.dot(A, C)  
Traceback (most recent call last):  
	File "", line 1, in <module>  
ValueError: shapes (2,3) and (2,2) not aligned: 3 (dim 1) != 2 (dim 0)

>>> A = np.array([[1,2], [3, 4], [5,6]])  
>>> A.shape  
(3, 2)  
  
>>> B = np.array([7,8])    
>>> B.shape  
(2,)  
  
>>> np.dot(A, B)  
array([23, 53, 83])
>>> A = np.array([[1,2], [3, 4], [5,6]])  
>>> A.shape  
(3, 2)  
  
>>> B = np.array([7,8])    
>>> B.shape  
(2,)  

>>> np.dot(A, B)  
array([23, 53, 83])

神经网络内积

>>> X = np.array([1, 2])  
>>> X.shape  
(2,)  
  
>>> W = np.array([[1, 3, 5], [2, 4, 6]])  
>>> print(W)  
[[1 3 5]  
[2 4 6]]  
  
>>> W.shape
(2, 3)  
>>> Y = np.dot(X, W)  
>>> print(Y)
[ 5 11 17]

多层神经网络

$w^{(1)}_{12}$ 中,$(1)$ 表示第1层的权重,1 表示后一层的第1个神经元,2 表示前一层的第2个神经元
权重右下角按照“后一层的索引号、前一层的索引号”的顺序排列

各层间信号传递的实现

例如:

$$a^{(1)}_1 = w^{(1)}_{11} x_1 + w^{(1)}_{12} x_2 + b^{(1)}_1$$

如果使用矩阵的乘法运算,则可以将第1层的加权和表示成下面的式
$$A^{(1)} = XW^{(1)} + B^{(1)}$$

其中,$A^{(1)}$、$X$、$W^{(1)}$、$B^{(1)}$ 如下所示:

$A^{(1)} = \begin{pmatrix} a^{(1)}_1 & a^{(1)}_2 & a^{(1)}_3 \end{pmatrix}$, $X = \begin{pmatrix} x_1 & x_2 \end{pmatrix}$, $B^{(1)} = \begin{pmatrix} b^{(1)}_1 & b^{(1)}_2 & b^{(1)}_3 \end{pmatrix}$, $W^{(1)} = \begin{pmatrix} w^{(1)}_{11} & w^{(1)}_{21} & w^{(1)}_{31} \\ w^{(1)}_{12} & w^{(1)}_{22} & w^{(1)}_{32} \end{pmatrix}$

# Input signal, first-layer weights and first-layer biases.
X = np.array([1.0, 0.5])
W1 = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]])
B1 = np.array([0.1, 0.2, 0.3])

print(W1.shape) # (2, 3)
print(X.shape) # (2,)
print(B1.shape) # (3,)  

# Weighted sum of the first layer.
A1 = np.dot(X, W1) + B1 # (2,) * (2, 3) + (3,) = (3,)

$W1$ 是 $2 \times 3$ 的数组,$X$ 是元素个数为 2 的一维数组,这里使用 sigmoid 作为激活函数

# Apply the sigmoid activation to the first layer's weighted sum.
Z1 = sigmoid(A1)  
  
print(A1) # [0.3, 0.7, 1.1]  
print(Z1) # [0.57444252, 0.66818777, 0.75026011]

第一层的输出变成第二层的输入

# Second-layer weights and biases.
W2 = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]])
B2 = np.array([0.1, 0.2])

print(Z1.shape) # (3,)
print(W2.shape) # (3, 2)
print(B2.shape) # (2,)

# The first layer's output Z1 is the input of the second layer.
A2 = np.dot(Z1, W2) + B2 # (3,) * (3, 2) + (2,) = (2,)
Z2 = sigmoid(A2) # (2,)

激活函数

def identity_function(x):  
	"""Identity activation: return the input unchanged."""
	return x  

# Output-layer weights and biases.
W3 = np.array([[0.1, 0.3], [0.2, 0.4]])  
B3 = np.array([0.1, 0.2])  

# Weighted sum of the output layer, fed from the second layer's output Z2.
A3 = np.dot(Z2, W3) + B3  
Y = identity_function(A3) # equivalent to Y = A3

代码实现

import numpy as np

def sigmoid(x):
    """Sigmoid activation ``1 / (1 + exp(-x))``, element-wise for NumPy arrays."""
    one_plus_exp = 1 + np.exp(-x)
    return 1 / one_plus_exp


def identity_function(x):
	"""Identity activation used for the output layer: return the input unchanged."""
	return x


def init_network():
    """Build the parameter dictionary of the 3-layer example network.

    Returns:
        dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``, ``'W3'``,
        ``'b3'`` (in that order), each mapped to a NumPy array holding the
        fixed example weights / biases of the corresponding layer.
    """
    return {
        'W1': np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]]),
        'b1': np.array([0.1, 0.2, 0.3]),
        'W2': np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]]),
        'b2': np.array([0.1, 0.2]),
        'W3': np.array([[0.1, 0.3], [0.2, 0.4]]),
        'b3': np.array([0.1, 0.2]),
    }


def forward(network, x):
    """Run the forward pass of the 3-layer example network.

    Args:
        network: Parameter dict with keys ``'W1'``..``'W3'`` and
            ``'b1'``..``'b3'``.
        x: Input array.

    Returns:
        Output of the last layer: two sigmoid-activated hidden layers
        followed by an affine layer passed through ``identity_function``.
    """
    # Look up every parameter first so a missing key fails before any math runs.
    weights = [network[name] for name in ('W1', 'W2', 'W3')]
    biases = [network[name] for name in ('b1', 'b2', 'b3')]

    # Hidden layers: affine transform followed by sigmoid.
    signal = x
    for weight, bias in zip(weights[:2], biases[:2]):
        signal = sigmoid(np.dot(signal, weight) + bias)

    # Output layer: affine transform followed by the identity activation.
    return identity_function(np.dot(signal, weights[2]) + biases[2])


# Build the example network, push one input through it and print the output.
network = init_network()
x = np.array([1.0, 0.5])
y = forward(network, x)
print(y)

softmax 函数

$$y_k = \frac{\exp(a_k)}{\sum_{i=1}^{n} \exp(a_i)}$$

>>> import numpy as np
>>> a = np.array([0.3, 2.9, 4.0])
>>> exp_a = np.exp(a)
>>> print(exp_a)
[ 1.34985881 18.17414537 54.59815003]
>>> sum_exp_a = np.sum(exp_a)
>>> print(sum_exp_a)
74.1221542101633
>>> y = exp_a / sum_exp_a
>>> print(y)
[0.01821127 0.24519181 0.73659691]

Softmax function

def softmax(a):
    """Naive softmax: exponentiate ``a`` and divide by the sum of the exponentials.

    No overflow protection is applied, so very large inputs produce
    ``inf``/``nan``.
    """
    exponentials = np.exp(a)
    return exponentials / np.sum(exponentials)

$$\begin{aligned} y_k &= \frac{\exp(a_k)}{\sum_{i=1}^{n} \exp(a_i)} = \frac{C \exp(a_k)}{C \sum_{i=1}^{n} \exp(a_i)} \\ &= \frac{\exp(a_k + \log C)}{\sum_{i=1}^{n} \exp(a_i + \log C)} \\ &= \frac{\exp(a_k + C')}{\sum_{i=1}^{n} \exp(a_i + C')} \end{aligned}$$

>>> a = np.array([1010, 1000, 990])
>>> np.exp(a) / np.sum(np.exp(a))
<stdin>:1: RuntimeWarning: invalid value encountered in divide
array([nan, nan, nan])
>>> c = np.max(a)
>>> a - c
array([  0, -10, -20])
>>> np.exp(a - c) / np.sum(np.exp(a - c))
array([9.99954600e-01, 4.53978686e-05, 2.06106005e-09])
def softmax(a):
    """Numerically stable softmax.

    The maximum of ``a`` is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``exp`` from overflowing
    for large inputs.

    Args:
        a: NumPy array of scores.

    Returns:
        Array of the same shape as ``a`` whose entries sum to 1.
    """
    c = np.max(a)
    exp_a = np.exp(a - c)  # overflow countermeasure: the largest exponent is 0
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y

Deep Learning

均方误差

mean squared error

$$E = \frac{1}{2} \sum_k (y_k - t_k)^2$$

import numpy as np

# Example network output and its one-hot label (the correct class is index 2).
y = [0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0]
t = [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


def mean_squared_error(y, t):
    """Return half the sum of squared differences between ``y`` and ``t``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays).
    """
    residual = y - t
    return 0.5 * np.sum(residual ** 2)


print(mean_squared_error(np.array(y), np.array(t)))

交叉熵误差

cross entropy error

$$E = -\sum_k t_k \log y_k$$

def cross_entropy_error(y, t):
    """Cross-entropy error ``-sum(t * log(y + 1e-7))``.

    The small constant added to ``y`` keeps ``log`` away from ``log(0)``.
    """
    epsilon = 1e-7
    log_prob = np.log(y + epsilon)
    return -np.sum(t * log_prob)

mini-batch 学习

$$E = -\frac{1}{N} \sum_n \sum_k t_{nk} \log y_{nk}$$

mini-batch 版交叉熵误差

def cross_entropy_error(y, t):
    """Mini-batch cross-entropy error, averaged over the batch.

    A 1-D ``y`` is treated as a batch holding a single sample. ``t`` is
    multiplied element-wise with ``log(y + 1e-7)``, i.e. it is used as a
    weight per class (one-hot labels in the surrounding text).

    Returns:
        ``-sum(t * log(y + 1e-7))`` divided by the number of rows of ``y``.
    """
    if y.ndim == 1:
        # Promote a single sample to a batch of one row.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    n_samples = y.shape[0]
    total = -np.sum(t * np.log(y + 1e-7))
    return total / n_samples
    # Variant for integer class labels instead of one-hot rows:
    #   -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size

导数

numerical differentiation

$$\frac{df(x)}{dx} = \lim_{h \to 0} \frac{f(x + h) - f(x)}{h}$$

def numerical_diff(f, x):
    """Approximate the derivative of ``f`` at ``x`` with a central difference.

    Uses ``(f(x + h) - f(x - h)) / (2 * h)`` with ``h = 1e-4``.
    """
    step = 1e-4
    ahead = f(x + step)
    behind = f(x - step)
    return (ahead - behind) / (2 * step)

偏导数

$$\frac{\partial f}{\partial x_0}, \quad \frac{\partial f}{\partial x_1}$$

梯度

$$\left( \frac{\partial f}{\partial x_0}, \frac{\partial f}{\partial x_1} \right)$$

def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h``
    (``h = 1e-4``), ``f(x)`` is evaluated at both points, and the element is
    restored afterwards. Elements are addressed through ``.flat``, so arrays
    with any number of dimensions are handled, not only 1-D vectors.

    Args:
        f: Callable that takes the array ``x`` and returns a scalar.
        x: NumPy array at which to differentiate. It is modified temporarily
            and restored before returning. It should have a floating dtype,
            since an integer array cannot store ``value +/- h``.

    Returns:
        Array shaped like ``x`` holding the estimated partial derivatives.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)  # same shape as x

    for idx in range(x.size):
        tmp_val = x.flat[idx]

        # Evaluate f(x + h) for this element.
        x.flat[idx] = tmp_val + h
        fxh1 = f(x)

        # Evaluate f(x - h) for this element.
        x.flat[idx] = tmp_val - h
        fxh2 = f(x)

        grad.flat[idx] = (fxh1 - fxh2) / (2 * h)
        x.flat[idx] = tmp_val  # restore the original value

    return grad

梯度法

$$x_0 = x_0 - \eta \frac{\partial f}{\partial x_0}$$

$$x_1 = x_1 - \eta \frac{\partial f}{\partial x_1}$$

梯度下降法:

def gradient_descent(f, init_x, lr=0.01, step_num=100):
    """Minimise ``f`` by plain gradient descent from ``init_x``.

    Args:
        f: Function to minimise; it is passed to ``numerical_gradient``.
        init_x: Starting point. It is copied, so the caller's object is left
            untouched; non-floating input (e.g. an integer array or a list
            of ints) is converted to float so the updates are not truncated.
        lr: Learning rate applied to every gradient step.
        step_num: Number of update steps to run.

    Returns:
        NumPy array holding the position after ``step_num`` updates.
    """
    x = np.array(init_x)  # np.array copies by default
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)

    for _ in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad

    return x

神经网络的梯度

以权重 $W$ 的形状为 $2 \times 3$ 的神经网络为例,$L$ 表示损失函数

$$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}$$

梯度用 $\frac{\partial L}{\partial W}$ 表示

$$\frac{\partial L}{\partial W} = \begin{pmatrix} \frac{\partial L}{\partial w_{11}} & \frac{\partial L}{\partial w_{12}} & \frac{\partial L}{\partial w_{13}} \\ \frac{\partial L}{\partial w_{21}} & \frac{\partial L}{\partial w_{22}} & \frac{\partial L}{\partial w_{23}} \end{pmatrix}$$

简单层的实现

乘法层

class MulLayer:
    """Multiplication node of a computational graph.

    ``forward`` remembers both operands because ``backward`` needs them:
    the gradient with respect to one operand is the upstream gradient times
    the other operand.
    """

    def __init__(self):
        # Operands of the most recent forward pass.
        self.x = None
        self.y = None

    def forward(self, x, y):
        """Store both operands and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``."""
        grad_x = dout * self.y  # operands are swapped on the way back
        grad_y = dout * self.x
        return grad_x, grad_y

加法层

class AddLayer:
    """Addition node of a computational graph; it keeps no state."""

    def __init__(self):
        pass

    def forward(self, x, y):
        """Return ``x + y``."""
        return x + y

    def backward(self, dout):
        """Pass the upstream gradient on to both inputs as ``dout * 1``."""
        grad_x = dout * 1
        grad_y = dout * 1
        return grad_x, grad_y

ReLU 层

class Relu:
    """ReLU layer with forward and backward passes."""

    def __init__(self):
        # Boolean array, True where the forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every entry that is <= 0 set to 0.

        The positions that were zeroed are remembered in ``self.mask`` for
        the backward pass. ``x`` itself is not modified.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return the gradient with respect to the forward input.

        The upstream gradient is blocked (set to 0) wherever the forward
        input was <= 0. The result is a new array, so the caller's ``dout``
        is left unmodified.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

误差反向传播法

$$\frac{\partial z}{\partial x} = \frac{\partial z}{\partial t} \frac{\partial t}{\partial x}$$

$$y = \begin{cases} x & (x > 0) \\ 0 & (x \le 0) \end{cases}, \qquad \frac{\partial y}{\partial x} = \begin{cases} 1 & (x > 0) \\ 0 & (x \le 0) \end{cases}$$

$y = \frac{1}{1 + \exp(-x)}$;其中除法节点 $y = \frac{1}{x}$ 的导数为 $\frac{\partial y}{\partial x} = -\frac{1}{x^2} = -y^2$

$$\begin{matrix} x & \overset{\times(-1)}{\rightarrow} & -x & \overset{\exp}{\rightarrow} & \exp(-x) & \overset{+1}{\rightarrow} & 1 + \exp(-x) & \overset{/}{\rightarrow} & y \\ \frac{\partial L}{\partial y} y^2 \exp(-x) & \overset{\times(-1)}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 \exp(-x) & \overset{\exp}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 & \overset{+1}{\leftarrow} & -\frac{\partial L}{\partial y} y^2 & \overset{/}{\leftarrow} & \frac{\partial L}{\partial y} \end{matrix}$$

$$\begin{aligned} \frac{\partial L}{\partial y} y^2 \exp(-x) &= \frac{\partial L}{\partial y} \frac{1}{(1 + \exp(-x))^2} \exp(-x) \\ &= \frac{\partial L}{\partial y} \frac{1}{1 + \exp(-x)} \frac{\exp(-x)}{1 + \exp(-x)} \\ &= \frac{\partial L}{\partial y} y (1 - y) \end{aligned}$$

class Sigmoid:
    """Sigmoid layer; the forward output is cached for the backward pass."""

    def __init__(self):
        # Output of the most recent forward pass.
        self.out = None

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` and remember it in ``self.out``."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * (1 - y) * y`` where ``y`` is the cached forward output."""
        complement = 1.0 - self.out
        return dout * complement * self.out

$$\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} \cdot W^T, \qquad \frac{\partial L}{\partial W} = X^T \cdot \frac{\partial L}{\partial Y}$$

$$W = \begin{pmatrix} w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \end{pmatrix}, \qquad W^T = \begin{pmatrix} w_{11} & w_{21} \\ w_{12} & w_{22} \\ w_{13} & w_{23} \end{pmatrix}$$

$$X = (x_0, x_1, \ldots, x_n), \qquad \frac{\partial L}{\partial X} = \left( \frac{\partial L}{\partial x_0}, \frac{\partial L}{\partial x_1}, \ldots, \frac{\partial L}{\partial x_n} \right)$$

Affine 层

class Affine:
    """Affine (fully connected) layer computing ``x . W + b``."""

    def __init__(self, W, b):
        """Keep references to the weight ``W`` and bias ``b``; gradients start unset."""
        self.W, self.b = W, b
        self.x = None   # input of the latest forward pass
        self.dW = None  # gradient with respect to W, set by backward()
        self.db = None  # gradient with respect to b, set by backward()

    def forward(self, x):
        """Remember ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW``/``db`` on the layer and return the gradient w.r.t. the input.

        ``db`` is ``dout`` summed over axis 0.
        """
        grad_input = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return grad_input

Softmax-with-Loss 层

class SoftmaxWithLoss:
    """Softmax followed by cross-entropy loss, treated as a single layer."""

    def __init__(self):
        self.loss = None  # loss value of the latest forward pass
        self.y = None     # softmax output
        self.t = None     # supervision data (one-hot vector)

    def forward(self, x, t):
        """Apply ``softmax`` to ``x``, then return the cross-entropy loss against ``t``."""
        self.t = t
        self.y = softmax(x)
        loss = cross_entropy_error(self.y, self.t)
        self.loss = loss
        return loss

    def backward(self, dout=1):
        """Return ``(y - t) / batch_size``; ``dout`` is accepted but not used."""
        n_samples = self.t.shape[0]
        return (self.y - self.t) / n_samples

Trick

SGD

$$W \leftarrow W - \eta \frac{\partial L}{\partial W}$$

class SGD:
    """Stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, lr=0.01):
        self.lr = lr  # learning rate

    def update(self, params, grads):
        """Update every entry of ``params`` in place using the matching entry of ``grads``."""
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step

Momentum

$$v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W}$$
$$W \leftarrow W + v$$

class Momentum:
    """SGD with momentum: ``v = momentum * v - lr * grad``, then ``param += v``."""

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # per-parameter velocity, created on the first update

    def update(self, params, grads):
        """Update ``params`` in place, creating zero velocities on first use."""
        if self.v is None:
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

AdaGrad

$$h \leftarrow h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}$$

$$W \leftarrow W - \eta \frac{1}{\sqrt{h}} \frac{\partial L}{\partial W}$$

class AdaGrad:
    """AdaGrad optimizer: scales each step by the accumulated squared gradients."""

    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None  # per-parameter sum of squared gradients, created on first update

    def update(self, params, grads):
        """Update ``params`` in place.

        Args:
            params: dict mapping parameter names to NumPy arrays.
            grads: dict with the same keys holding the matching gradients.
        """
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        # Iterate over the keys themselves: ``params.items()`` would yield
        # (key, value) tuples, which are not valid keys of ``self.h``.
        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            # 1e-7 guards against division by zero while h is still 0.
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)

Batch Normalization

$$\mu_B \leftarrow \frac{1}{m} \sum_{i=1}^{m} x_i$$

$$\sigma_B^2 \leftarrow \frac{1}{m} \sum_{i=1}^{m} (x_i - \mu_B)^2$$

$$\hat{x}_i \leftarrow \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \varepsilon}}$$

$$y_i \leftarrow \gamma \hat{x}_i + \beta$$

Dropout

class Dropout:
    """Dropout layer.

    During training a fresh random mask is drawn on every forward pass.
    Outside training the input is scaled by ``1 - dropout_ratio`` instead.
    """

    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None  # boolean keep-mask of the latest training forward pass

    def forward(self, x, train_flg=True):
        """Apply dropout to ``x`` (training) or scale it (inference)."""
        if not train_flg:
            return x * (1.0 - self.dropout_ratio)

        # Keep a unit when its uniform sample in [0, 1) exceeds the ratio.
        samples = np.random.rand(*x.shape)
        self.mask = samples > self.dropout_ratio
        return x * self.mask

    def backward(self, dout):
        """Propagate ``dout`` only through the units kept in the forward pass."""
        return dout * self.mask

卷积神经网络

假设输入大小为 $(H, W)$,滤波器大小为 $(FH, FW)$,输出大小为 $(OH, OW)$,填充为 $P$,步幅为 $S$。此时,输出大小可通过下式进行计算

$$OH = \frac{H + 2P - FH}{S} + 1, \qquad OW = \frac{W + 2P - FW}{S} + 1$$

通道数为 $C$、高度为 $H$、长度为 $W$ 的数据的形状可以写成 $(C, H, W)$。滤波器也一样,要按 (channel, height, width) 的顺序书写。比如,通道数为 $C$、滤波器高度为 $FH$ (Filter Height)、长度为 $FW$ (Filter Width) 时,可以写成 $(C, FH, FW)$

$$(C, H, W) \ast (C, FH, FW) \rightarrow (1, OH, OW)$$

基于多个滤波器的卷积运算
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW)$$

卷积运算的处理流
$$(C, H, W) \ast (FN, C, FH, FW) \rightarrow (FN, OH, OW) + (FN, 1, 1) \rightarrow (FN, OH, OW)$$

卷积运算的处理流(批处理)
$$(N, C, H, W) \ast (FN, C, FH, FW) \rightarrow (N, FN, OH, OW) + (FN, 1, 1) \rightarrow (N, FN, OH, OW)$$

class Convolution:
    """Convolution layer (forward pass only) built on ``im2col``."""

    def __init__(self, W, b, stride=1, pad=0):
        """Store the filter weights ``W``, bias ``b``, stride and padding."""
        self.W, self.b = W, b
        self.stride, self.pad = stride, pad

    def forward(self, x):
        """Convolve ``x`` with the stored filters.

        ``self.W.shape`` is unpacked as ``(FN, C, FH, FW)`` and ``x.shape``
        as ``(N, C, H, W)``. The input is unrolled by ``im2col`` (defined
        outside this class), multiplied with the flattened filters, and the
        result is reshaped to ``(N, FN, out_h, out_w)``.
        """
        FN, _, FH, FW = self.W.shape
        N, _, H, W = x.shape
        out_h = int(1 + (H + 2*self.pad - FH) / self.stride)
        out_w = int(1 + (W + 2*self.pad - FW) / self.stride)

        # Unroll the input and the filters so the convolution becomes one matrix product.
        unrolled_input = im2col(x, FH, FW, self.stride, self.pad)
        unrolled_filters = self.W.reshape(FN, -1).T
        product = np.dot(unrolled_input, unrolled_filters) + self.b

        # Restore the spatial layout, then move the filter axis next to the batch axis.
        return product.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)

池化层

class Pooling:
    """Max-pooling layer (forward pass only) built on ``im2col``."""

    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        """Store the pooling window size, stride and padding."""
        self.pool_h, self.pool_w = pool_h, pool_w
        self.stride, self.pad = stride, pad

    def forward(self, x):
        """Return the maximum of every pooling window of ``x``.

        ``x.shape`` is unpacked as ``(N, C, H, W)``; the result is reshaped
        to ``(N, C, out_h, out_w)``.
        """
        N, C, H, W = x.shape
        # NOTE(review): the output size ignores self.pad although im2col below
        # receives it - confirm that pad is always 0 for this layer.
        out_h = int(1 + (H - self.pool_h) / self.stride)
        out_w = int(1 + (W - self.pool_w) / self.stride)

        # (1) Unroll: one row per pooling window.
        windows = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        windows = windows.reshape(-1, self.pool_h*self.pool_w)

        # (2) Maximum of each window.
        maxima = np.max(windows, axis=1)

        # (3) Restore the layout and move the channel axis next to the batch axis.
        return maxima.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)

你可能感兴趣的:(深度学习,python,numpy)