Study notes from getting started with deep learning.
I keep forgetting things, so I write them down.
Only the source code of each part and a small amount of theory are recorded.
Reference: 《深度学习入门:基于Python的理论与实现》 (Deep Learning from Scratch), by Koki Saito (斋藤康毅).
An activation function switches the output signal once the input crosses a threshold.
Feature: the sigmoid function is smoother than the step function.
Expression:
$h(x) = \frac{1}{1 + e^{-x}}$
Source:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
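For comparison, a minimal step-function sketch (my own addition, not from the notes above); unlike sigmoid it jumps abruptly from 0 to 1 at the threshold:

def step_function(x):
    # Output 1 where x > 0, otherwise 0; works element-wise on NumPy arrays
    return np.array(x > 0, dtype=np.int64)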
Feature: ReLU (Rectified Linear Unit) outputs zero when the input is less than or equal to zero, and outputs the input unchanged otherwise.
Source:
def relu(x):
    return np.maximum(0, x)
# Identity function, used as the output-layer activation for regression
def identity_function(x):
    return x
init_network(): initialize the weights and biases of a 3-layer network.
Source:
def init_network():
    network = {}  # dictionary holding the weights and biases
    network['w1'] = np.array([[0.1, 0.2, 0.5], [0.2, 0.4, 0.6]])
    network['b1'] = np.array([0.1, 0.2, 0.3])
    network['w2'] = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]])
    network['b2'] = np.array([0.1, 0.2])
    network['w3'] = np.array([[0.1, 0.3], [0.2, 0.4]])
    network['b3'] = np.array([0.1, 0.2])
    return network
forward(network, x): forward propagation through the network.
Source:
def forward(network, x):
    w1, w2, w3 = network['w1'], network['w2'], network['w3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, w2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, w3) + b3
    y = identity_function(a3)
    return y

network = init_network()
x = np.array([1.0, 0.5])
y = forward(network, x)
print(y)
Use the identity function for regression and softmax for classification.
Expression:
$y_k = \frac{e^{a_k}}{\sum_{i=1}^{n} e^{a_i}} = \frac{e^{a_k + C'}}{\sum_{i=1}^{n} e^{a_i + C'}}$
Source:
def softmax(a):
    c = np.max(a)
    exp_a = np.exp(a - c)  # subtract the maximum to avoid overflow
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    return y
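A quick check with example values (my own choice) that the outputs can be read as probabilities: each element lies in (0, 1) and they sum to 1.

a = np.array([0.3, 2.9, 4.0])
y = softmax(a)
print(y)          # three values between 0 and 1
print(np.sum(y))  # 1.0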
Source:
import sys, os
sys.path.append(os.pardir)
from dataset.mnist import load_mnist
import numpy as np
from PIL import Image

def img_show(img):
    pil_img = Image.fromarray(np.uint8(img))
    pil_img.show()

# Load the dataset
(x_train, t_train), (x_test, t_test) = load_mnist(flatten=True, normalize=False)
img = x_train[0]
label = t_train[0]
print(img.shape)           # (784,)
img = img.reshape(28, 28)  # restore the original image shape
img_show(img)
Load a pre-trained network (weights).
Source:
import pickle

def get_data():
    (x_train, t_train), (x_test, t_test) = \
        load_mnist(flatten=True, normalize=False, one_hot_label=False)
    return x_test, t_test

def init_network():
    with open("sample_weight.pkl", 'rb') as f:
        network = pickle.load(f)
    return network

def predict(network, x):
    w1, w2, w3 = network['w1'], network['w2'], network['w3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, w2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, w3) + b3
    y = softmax(a3)
    return y
Source:
x, t = get_data()
network = init_network()
accuracy_cnt = 0
for i in range(len(x)):
    y = predict(network, x[i])
    p = np.argmax(y)  # index of the element with the highest probability
    if p == t[i]:
        accuracy_cnt += 1
print("Accuracy:" + str(float(accuracy_cnt) / len(x)))
Data pre-processing methods include normalization (restricting the data to a fixed range) and whitening (making the overall shape of the data distribution more uniform), among others.
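A minimal normalization sketch (my own illustration): scale the MNIST pixel values from 0-255 into [0, 1]. load_mnist does the same thing when called with normalize=True.

x, _ = get_data()                      # pixel values are 0-255 because normalize=False above
x_norm = x.astype(np.float32) / 255.0  # normalize to the range [0, 1]
print(x.min(), x.max())                # 0 255
print(x_norm.min(), x_norm.max())      # 0.0 1.0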
x, t = get_data()
network = init_network()
batch_size = 100
accuracy_cnt = 0
for i in range(0, len(x), batch_size):
    x_batch = x[i:i + batch_size]
    y_batch = predict(network, x_batch)
    p = np.argmax(y_batch, axis=1)  # per-row index of the highest probability
    accuracy_cnt += np.sum(p == t[i:i + batch_size])
print("Accuracy:" + str(float(accuracy_cnt) / len(x)))
Use the loss function as a guide when searching for the optimal weights.
Expression:
$E = \frac{1}{2} \sum_{k} (y_k - t_k)^2$
Source:
def mean_squared_error(y, t):
    return 0.5 * np.sum((y - t) ** 2)
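A small usage check (example values of my own choosing; t is the one-hot label for class 2 and y puts the highest score on that class):

t = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
y = np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0])
print(mean_squared_error(y, t))  # approximately 0.0975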
Expression:
$E = -\sum_{k} t_k \ln y_k$
Source:
def cross_entropy_error(y, t):
    delta = 1e-7  # small constant to avoid log(0)
    return -np.sum(t * np.log(y + delta))
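The same example values as above (my own check); the cross-entropy error is small when the predicted probability of the correct class is high.

t = np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
y = np.array([0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0])
print(cross_entropy_error(y, t))  # approximately 0.51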
Mini-batch learning: randomly sample a small batch from the training data.
Source:
train_size = x_train.shape[0]
batch_size = 10
batch_mask = np.random.choice(train_size, batch_size)  # randomly pick batch_size indices
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]
When the labels t are one-hot (0-1) encoded:
Source:
def cross_entropy_error_t(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + 1e-7)) / batch_size
When the labels t are given as class indices (one_hot_label = False):
Source:
def cross_entropy_error_f(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
Numerical Differentiation
Source:
def numerical_diff(f, x):
    h = 1e-4  # small step; the central difference reduces the error of the forward difference
    return (f(x + h) - f(x - h)) / (2 * h)
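Usage sketch (the quadratic below is my own example): differentiating f(x) = x^2 at x = 3 should give a value close to the analytic derivative 2x = 6.

def f(x):
    return x ** 2

print(numerical_diff(f, 3.0))  # approximately 6.0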
Gradient
Source:
def numerical_gradient(f, x):
    h = 1e-4
    grad = np.zeros_like(x)  # gradient array with the same shape as x
    for idx in range(x.size):
        tmp_val = x[idx]
        # f(x + h)
        x[idx] = tmp_val + h
        fxh1 = f(x)
        # f(x - h)
        x[idx] = tmp_val - h
        fxh2 = f(x)
        grad[idx] = (fxh1 - fxh2) / (2 * h)
        x[idx] = tmp_val  # restore the original value
    return grad
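Usage sketch (my own example function): the gradient of f(x0, x1) = x0^2 + x1^2 at (3, 4) should be close to (6, 8).

def function_2(x):
    return x[0] ** 2 + x[1] ** 2

print(numerical_gradient(function_2, np.array([3.0, 4.0])))  # approximately [6. 8.]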
Methods: gradient descent (for minimization) and gradient ascent (for maximization).
$x = x - \eta \frac{\partial f}{\partial x}$ (η is the learning rate)
Python implementation of gradient descent
Source:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    x = init_x
    for i in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad  # step against the gradient, scaled by the learning rate lr
    return x
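Usage sketch, minimizing the same f(x0, x1) = x0^2 + x1^2 (my own example; with lr = 0.1, 100 steps land very close to the minimum at the origin):

init_x = np.array([-3.0, 4.0])
print(gradient_descent(function_2, init_x=init_x, lr=0.1, step_num=100))
# result is very close to [0. 0.]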
Define a class: simpleNet
Source:
class simpleNet:
    def __init__(self):
        self.w = np.random.randn(2, 3)  # initialize the 2x3 weight matrix with a Gaussian distribution

    def predict(self, x):
        return np.dot(x, self.w)

    def loss(self, x, t):
        z = self.predict(x)
        y = softmax(z)
        loss = cross_entropy_error(y, t)
        return loss
# Try out simpleNet
net = simpleNet()
x = np.array([0.6, 0.9])
p = net.predict(x)
np.argmax(p)              # index of the largest score
t = np.array([0, 0, 1])   # correct label (one-hot)
net.loss(x, t)
Method 1: wrap the loss in a function of the weights so that numerical_gradient can be applied
def f(W):
    return net.loss(x, t)
Method 2: the same thing written as a lambda
f = lambda w: net.loss(x, t)
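With f defined, the gradient of the loss with respect to the weights can be computed (this assumes a numerical_gradient that also iterates over multi-dimensional arrays, such as the one in the book's common/gradient.py; the simple loop above only handles 1-D arrays):

dW = numerical_gradient(f, net.w)
print(dW)  # 2x3 array of partial derivatives of the loss with respect to each weight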
Stochastic gradient descent (SGD)
TwoLayerNet
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.functions import *
from common.gradient import numerical_gradient
class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Initialize weights
        self.params = {}
        self.params['w1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['w2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        w1, w2 = self.params['w1'], self.params['w2']
        b1, b2 = self.params['b1'], self.params['b2']
        a1 = np.dot(x, w1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, w2) + b2
        y = softmax(a2)
        return y

    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        loss_w = lambda w: self.loss(x, t)
        grads = {}
        grads['w1'] = numerical_gradient(loss_w, self.params['w1'])
        grads['b1'] = numerical_gradient(loss_w, self.params['b1'])
        grads['w2'] = numerical_gradient(loss_w, self.params['w2'])
        grads['b2'] = numerical_gradient(loss_w, self.params['b2'])
        return grads
Run
net = TwoLayerNet(input_size=784, hidden_size=100, output_size=10)
x = np.random.rand(100, 784)  # dummy input data (100 samples)
y = net.predict(x)
x = np.random.rand(100, 784)  # dummy input data
t = np.random.rand(100, 10)   # dummy labels
grads = net.numerical_gradient(x, t)
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
train_loss_list = []

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    # Get a mini-batch
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    grad = network.numerical_gradient(x_batch, t_batch)
    # grad = network.gradient(x_batch, t_batch)  # faster backpropagation version
    # Update the parameters
    for key in ('w1', 'b1', 'w2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    # Record the learning progress
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
iter_per_epoch = max(train_size / batch_size, 1)  # number of iterations per epoch

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

for i in range(iters_num):
    # Get a mini-batch
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    grad = network.numerical_gradient(x_batch, t_batch)
    # grad = network.gradient(x_batch, t_batch)  # faster backpropagation version
    # Update the parameters
    for key in ('w1', 'b1', 'w2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    # Record the learning progress
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    # Compute the recognition accuracy once per epoch
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train_acc, test_acc | " + str(train_acc) + " , " + str(test_acc))
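To see whether the network is overfitting, the per-epoch accuracies can be plotted (a minimal sketch of my own; it assumes matplotlib is installed):

import matplotlib.pyplot as plt

epochs = np.arange(len(train_acc_list))
plt.plot(epochs, train_acc_list, label='train acc')
plt.plot(epochs, test_acc_list, label='test acc', linestyle='--')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.show()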
By replacing the activation function with sigmoid, ReLU, and similar functions, we move from the perceptron to the neural network. The learning process of a neural network can roughly be divided into four steps: random sampling (mini-batch) >>> compute the gradient >>> update the parameters >>> repeat. The dataset is split into a training set and a test set; the training set is used to determine the weight and bias parameters, and the test set is used to check for overfitting.
As for the parts of a neural network, there are the input layer >>> hidden layer(s) >>> output layer, along with the activation function (transforms the output), the loss function (decides whether to keep adjusting), the gradient (finds the direction of steepest descent/ascent), and the error/accuracy (which cannot replace the loss function as the optimization target).
Among these, the central-difference method used to compute the gradient numerically is slow; the error backpropagation method described in the next section will address this.
Other issues will be filled in gradually through practice.