本文公式较多,由于简书不支持公式渲染,公式完整版请移步个人博客
import numpy as np
使用numpy实现多层感知机的正向和反向传播
正向传播的公式为:$Y = f(W \times X + b)$,其中,Y为输出,W为权值,b为偏置
对于反向传播,已知上一层传回的梯度为dY,对应的反向传播公式为:
$$dX = (W^{T} \times dY) \cdot f'(Y)$$
$$dW = \cfrac{1}{m} dY \times X^{T}$$
$$db = \cfrac{1}{m} \sum dY$$
class numpy_fc(object):
    """Fully connected (affine) layer computing ``weight @ data + bias``.

    Samples are stored column-wise: the input has shape
    ``(in_channel, batch)`` and the output ``(out_channel, batch)``.
    """

    def __init__(self, in_channel, out_channel, optim):
        """Allocate the parameters and the per-pass caches.

        Args:
            in_channel: Number of input features.
            out_channel: Number of output features.
            optim: Callable that maps a gradient array to the update which
                ``step`` adds to the matching parameter.
        """
        self.optimizer = optim

        # Parameters: small Gaussian weights, zero column-vector bias.
        initial_weight = np.random.randn(out_channel, in_channel) * 0.1
        self.weight = np.float64(initial_weight)
        self.bias = np.zeros((out_channel, 1), dtype=np.float64)

        # Gradients are only available after backward() has run.
        self.weight_grad = None
        self.bias_grad = None

        # Caches of the most recent forward pass.
        self.in_data = np.zeros((1, in_channel))
        self.out_data = None

    def forward(self, data):
        """Apply the affine map to ``data`` and remember input and output."""
        # The input is needed later to form the weight gradient.
        self.in_data = data
        affine = np.dot(self.weight, data) + self.bias
        self.out_data = affine
        return affine

    def backward(self, grad):
        """Store the parameter gradients and return the input gradient.

        Args:
            grad: Gradient of the loss w.r.t. this layer's output, shaped
                like the forward output.

        Returns:
            Gradient of the loss w.r.t. the layer input.
        """
        self.weight_grad = np.dot(grad, self.in_data.T)
        # Sum over the batch axis and restore the (out_channel, 1) layout.
        summed_over_batch = np.sum(grad, axis=1)
        self.bias_grad = summed_over_batch.reshape((-1, 1))
        return np.dot(self.weight.T, grad)

    def step(self):
        """Add the optimizer's update to the weight, then to the bias."""
        weight_update = self.optimizer(self.weight_grad)
        self.weight += weight_update
        bias_update = self.optimizer(self.bias_grad)
        self.bias += bias_update
# Smoke test for numpy_fc: send a random batch (16 features x 10 samples)
# through a 16 -> 8 layer and compare gradient shapes with parameter shapes.
# No optimizer is passed because step() is never called here.
test_fc = numpy_fc(16,8,None)
test_fc_forward = test_fc.forward(np.random.rand(16,10))
print(test_fc_forward.shape)
# The forward output doubles as a stand-in upstream gradient (same shape).
test_fc_back = test_fc.backward(test_fc_forward)
print(test_fc_back.shape)
# Each gradient must have the same shape as the parameter it updates.
print(test_fc.weight_grad.shape,test_fc.weight.shape)
print(test_fc.bias_grad.shape,test_fc.bias.shape)
(8, 10)
(16, 10)
(8, 16) (8, 16)
(8, 1) (8, 1)
sigmoid函数是常用的二分类问题输出层激活函数,前向传播和反向传播分别如下所示:
$$ sigmoid(x) = \cfrac{1}{1 + e^{-x}}$$
$$ sigmoid'(x) = sigmoid(x) \cdot (1 - sigmoid(x))$$
class numpy_sigmoid(object):
    """Element-wise logistic sigmoid activation layer."""

    def __init__(self):
        # Output of the latest forward pass; backward() derives the slope from it.
        self.result = None

    def forward(self, data):
        """Return ``1 / (1 + exp(-data))`` and cache it for backward()."""
        denominator = 1 + np.exp(-data)
        self.result = 1 / denominator
        return self.result

    def backward(self, grad):
        """Scale ``grad`` by the sigmoid derivative ``s * (1 - s)``."""
        activation = self.result
        scaled = grad * activation
        return scaled * (1 - activation)

    def step(self):
        """Do nothing: the layer has no trainable parameters."""
relu是现阶段最常用的隐层激活函数,前向传播和反向传播如下所示
$$relu(x) = \max\{0, x\}$$
$$
relu'(x)=
\begin{cases}
0 &\mbox{$relu(x) \leq 0$}\\
1 &\mbox{$relu(x) > 0$ }
\end{cases}
$$
class numpy_relu(object):
    """Element-wise ReLU activation layer.

    Neither ``forward`` nor ``backward`` modifies the arrays it is given or
    the cached activation, so callers and neighbouring layers that hold
    references to those arrays are unaffected.
    """

    def __init__(self):
        # Output of the latest forward pass; backward() derives the mask from it.
        self.result = None

    def forward(self, data):
        """Return ``max(data, 0)`` as a new array and cache it.

        Args:
            data: Input array; it is left untouched.

        Returns:
            A new array with every negative entry replaced by 0.
        """
        # np.maximum allocates a fresh array instead of writing into `data`.
        self.result = np.maximum(data, 0)
        return self.result

    def backward(self, grad):
        """Pass ``grad`` through where the cached activation is positive.

        Args:
            grad: Gradient of the loss w.r.t. this layer's output.

        Returns:
            ``grad`` where the activation was > 0 and 0 elsewhere.
        """
        # Build the 0/1 mask separately so the cached activation stays intact
        # and backward() can be called more than once.
        active = self.result > 0
        return grad * active

    def step(self):
        """Do nothing: the layer has no trainable parameters."""
MSE(均方误差,代码中记为 MES)代价函数的前向传播和反向传播为:
$$MES(y_{pre},y) = \cfrac{1}{m} \sum ( y_{pre} - y )^2$$
$$\cfrac{dMES}{dy_{pre}} = \cfrac{2}{m} (y_{pre} - y)$$
def MES_loss(y_pre, y):
    """Return the summed squared error and its gradient w.r.t. ``y_pre``.

    Args:
        y_pre: Predicted values (array-like).
        y: Target values, broadcast-compatible with ``y_pre``.

    Returns:
        Tuple ``(loss, loss_back)`` where ``loss`` is
        ``sum((y_pre - y) ** 2)`` and ``loss_back`` is ``2 * (y_pre - y)``.
    """
    residual = np.asarray(y_pre) - np.asarray(y)
    loss = np.sum(residual ** 2)
    # The derivative keeps the sign of the residual; taking the absolute value
    # would push every prediction in the same direction regardless of whether
    # it is too high or too low.
    loss_back = 2 * residual
    return loss, loss_back
交叉熵的前向传播和反向传播分别为:
$$cross(y_{pre},y) = - \cfrac{1}{m} \sum^m_{i = 1}(y\log(y_{pre}) + (1-y)\log(1-y_{pre}))$$
$$\cfrac{dcross}{dy_{pre}} = -\cfrac{1}{m}(\cfrac{y}{y_{pre}} - \cfrac{1-y}{1-y_{pre}})$$
def Cross_loss(y_pre, y):
    """Return the summed binary cross-entropy and its gradient w.r.t. ``y_pre``.

    Args:
        y_pre: Predicted probabilities, strictly between 0 and 1.
        y: Binary targets, broadcast-compatible with ``y_pre``.

    Returns:
        Tuple ``(loss, loss_back)`` where ``loss`` is
        ``-sum(y*log(y_pre) + (1-y)*log(1-y_pre))`` and ``loss_back`` is
        ``-(y/y_pre - (1-y)/(1-y_pre))``.
    """
    loss = -np.sum(y * np.log(y_pre) + (1 - y) * np.log(1 - y_pre))
    # d/dp log(1 - p) = -1 / (1 - p), so the second term enters with a minus
    # sign inside the parentheses.
    loss_back = -(y / y_pre - (1 - y) / (1 - y_pre))
    return loss, loss_back
softmax函数是多分类问题常用的输出激活函数,一般与交叉熵代价函数结合使用,组合函数(softmax+交叉熵)的前向传播如下:
$$J(y_{pre},y) = - \sum_i y_i \cdot \log(softmax_i(y_{pre}))$$
$$softmax_i(x) = \cfrac{e^{x_i}}{\sum_j e^{x_j}}$$
反向传播如下:
$$ \cfrac{dJ(y_{pre},y)}{dy_{pre}} = softmax(y_{pre}) - y$$
详细推导可参见这里
def Softmax_cross_loss(y_pre, y, axis=1):
    """Return the softmax cross-entropy loss and its gradient w.r.t. the logits.

    Args:
        y_pre: Raw class scores (logits). With the default ``axis=1`` the
            layout is ``(batch, classes)``.
        y: Target distribution (e.g. one-hot rows) with the same shape as
            ``y_pre``.
        axis: Axis that enumerates the classes, i.e. the axis along which
            the softmax is normalised. Pass ``axis=0`` for a
            ``(classes, batch)`` layout.

    Returns:
        Tuple ``(loss, loss_back)`` where ``loss`` is
        ``-sum(y * log(softmax(y_pre)))`` and ``loss_back`` is
        ``softmax(y_pre) - y``.
    """
    # Subtracting the per-sample maximum leaves the softmax unchanged but
    # keeps exp() from overflowing on large logits.
    shifted = y_pre - np.max(y_pre, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    # keepdims=True makes the normaliser broadcast along the class axis.
    normaliser = np.sum(exp_shifted, axis=axis, keepdims=True)
    softmax = exp_shifted / normaliser
    # Take the log analytically instead of log(softmax) so that an
    # underflowed probability does not produce -inf.
    log_softmax = shifted - np.log(normaliser)
    loss = -np.sum(y * log_softmax)
    loss_back = softmax - y
    return loss, loss_back
# Quick call on random 2x4 inputs to confirm the function runs. The random
# targets are not a valid label distribution, so the loss value itself is not
# meaningful (it can even be negative).
Softmax_cross_loss(np.random.randn(2,4),np.random.randn(2,4))
(-4.9084963417988003,
array([[-0.09065384, 0.07506358, 0.32789286, 1.26735185],
[ 1.93958915, 0.01316283, 1.20922904, 2.87550082]]))
随机梯度下降优化器是一种比较简单的优化方法,优化公式如下:
$$W_{new} = W_{old} - learning\_rate \times \cfrac{dJ}{dW_{old}}$$
class optim_sgd(object):
    """Plain stochastic gradient descent update rule.

    Calling the instance with a gradient returns the parameter update
    ``-learning_rate * grad``; the caller adds it to the parameter.
    """

    def __init__(self, learning_rate):
        """Store the step size used for every update."""
        super().__init__()
        self.learning_rate = learning_rate

    def __call__(self, grad):
        """Return the update for ``grad`` (a step against the gradient)."""
        negative_rate = -self.learning_rate
        return negative_rate * grad
import re
import pandas as pd
# Location of the UCI "breast-cancer-wisconsin" data file (plain CSV, no header row).
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
# Attribute list for the data set; the column names are extracted from this
# text by the regular expressions below.
data_label = """ 1. Sample code number 1id number
2. Clump Thickness 1 - 10
3. Uniformity of Cell Size 1 - 10
4. Uniformity of Cell Shape 1 - 10
5. Marginal Adhesion 1 - 10
6. Single Epithelial Cell Size 1 - 10
7. Bare Nuclei 1 - 10
8. Bland Chromatin 1 - 10
9. Normal Nucleoli 1 - 10
10. Mitoses 1 - 10
11. Class 2 for benign, 4 for malignant)
"""
# findall captures each ". <name> <digit>" span; x[2:] drops the leading ". "
# and the substitution removes every "<whitespace><digit>" pair, leaving the
# attribute name. The first entry spans two numbered items and therefore comes
# out as "Sample code numberid number".
data_label = [re.sub(r"\s+\d","",x[2:]) for x in re.findall(r"\. [\w\s]+\d",data_label)]
# print(data_label)
# The file has no header, so the extracted names are supplied as column names.
data = pd.read_csv(data_url,names=data_label)
# data["Bare Nuclei"] = data["Bare Nuclei"].map(int)
print(data.info())
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
Sample code numberid number 699 non-null int64
Clump Thickness 699 non-null int64
Uniformity of Cell Size 699 non-null int64
Uniformity of Cell Shape 699 non-null int64
Marginal Adhesion 699 non-null int64
Single Epithelial Cell Size 699 non-null int64
Bare Nuclei 699 non-null object
Bland Chromatin 699 non-null int64
Normal Nucleoli 699 non-null int64
Mitoses 699 non-null int64
Class 699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB
None
# Turn the "?" placeholders into NaN so that dropna can remove those rows.
data = data.replace(to_replace="?",value=np.nan)
data = data.dropna(how='any')
# With the "?" rows gone, every remaining "Bare Nuclei" value converts to int.
data["Bare Nuclei"] = data["Bare Nuclei"].map(int)
print(data.info())
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
Sample code numberid number 683 non-null int64
Clump Thickness 683 non-null int64
Uniformity of Cell Size 683 non-null int64
Uniformity of Cell Shape 683 non-null int64
Marginal Adhesion 683 non-null int64
Single Epithelial Cell Size 683 non-null int64
Bare Nuclei 683 non-null int64
Bland Chromatin 683 non-null int64
Normal Nucleoli 683 non-null int64
Mitoses 683 non-null int64
Class 683 non-null int64
dtypes: int64(11)
memory usage: 64.0 KB
None
from sklearn.model_selection import train_test_split
# Columns 1..9 of data_label are the features (the sample id in column 0 is
# skipped); column 10 ("Class") is the target. 25% of the rows are held out.
x_train,x_test,y_train,y_test = train_test_split(data[data_label[1:10]],data[data_label[10]],test_size=0.25,random_state=1)
print(x_train.shape,x_test.shape)
print(y_train.shape)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print(y_train.value_counts())
(512, 9) (171, 9)
(512,)
2 333
4 179
Name: Class, dtype: int64
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only and reuse the fitted scaler to
# transform the test split.
ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.transform(x_test)
# x_train_ss = x_train.values
# x_test_ss = x_test.values
print(type(x_train_ss))
# print(x_train[:5]/,x_train_ss[:5])
def y_standard(data):
    """Rescale labels with ``data / 2 - 1`` (class codes 2 and 4 become 0.0 and 1.0)."""
    halved = data / 2
    return halved - 1
# Convert the class codes to 0/1 and keep plain NumPy arrays for training.
y_train_ss = y_standard(y_train).values
y_test_ss = y_standard(y_test).values
# The arrays are wrapped in a Series because the top-level pd.value_counts()
# is deprecated; Series.value_counts() yields the same counts.
print(pd.Series(y_train_ss).value_counts())
print(pd.Series(y_test_ss).value_counts())
0.0 333
1.0 179
dtype: int64
0.0 111
1.0 60
dtype: int64
import random
def dataset(data, lable, batch_size=100, epoch=10):
    """Yield ``epoch`` random mini-batches drawn with replacement.

    Args:
        data: Sample array indexed along axis 0.
        lable: Label array aligned with ``data`` along axis 0.
        batch_size: Number of rows in every yielded batch.
        epoch: Number of batches to yield (not full passes over the data).

    Yields:
        Tuples ``(data[index], lable[index])`` for a fresh random ``index``.
    """
    for _batch in range(epoch):
        index = []
        for _draw in range(batch_size):
            # randint is inclusive on both ends, hence the "- 1".
            index.append(random.randint(0, data.shape[0] - 1))
        batch_data = data[index]
        batch_lable = lable[index]
        yield batch_data, batch_lable
# print(x_train_ss,type(y_train_ss))
# Draw a single batch to check the shapes the generator produces, then stop.
for i in dataset(x_train_ss,y_train_ss,batch_size=100):
    print(i[0].shape,i[1].shape)
    break
(100, 9) (100,)
def onehot(data, tp_num):
    """One-hot encode a 1-D sequence of class indices.

    Args:
        data: Array-like of class indices; values are truncated to int.
        tp_num: Number of classes, i.e. the width of the encoding.

    Returns:
        Float array of shape ``(len(data), tp_num)`` with a single 1 per row
        in the column given by the corresponding entry of ``data``.
    """
    labels = np.asarray(data).astype(int).ravel()
    encoded = np.zeros((labels.size, tp_num))
    # Pair row i with column labels[i] in one vectorised assignment instead
    # of looping over the rows in Python.
    encoded[np.arange(labels.size), labels] = 1
    return encoded
# Encoding the labels [0, 1] with two classes yields a 2x2 identity matrix.
test_onehot = np.arange(2)
onehot(test_onehot,2)
array([[ 1., 0.],
[ 0., 1.]])
class numpy_network_base(object):
    """Sequential container that chains layers.

    Every layer must provide ``forward(x)``, ``backward(grad)`` and
    ``step()``.
    """

    def __init__(self, network_list):
        """Keep the layers in the order in which forward() applies them."""
        self.network = network_list

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final output."""
        activation = x
        for layer in self.network:
            activation = layer.forward(activation)
        return activation

    def backward(self, grad):
        """Propagate ``grad`` from the last layer back to the first.

        A copy of ``grad`` is propagated, so the caller's array is not the
        object handed to the layers. Returns the gradient w.r.t. the
        network input.
        """
        upstream = grad.copy()
        for layer in reversed(self.network):
            upstream = layer.backward(upstream)
        return upstream

    def step(self):
        """Ask every layer, in order, to apply its parameter update."""
        for layer in self.network:
            layer.step()
def accuracy(y_pre, lable):
    """Return the fraction of samples whose arg-max class equals ``lable``.

    Args:
        y_pre: Class scores with the classes along axis 0.
        lable: Expected class index per sample.
    """
    predicted = np.argmax(y_pre, axis=0)
    hits = np.int8(predicted == lable)
    return np.mean(hits)
# Every column of `a` has its maximum in row 3, and every label in `b` is 3,
# so the accuracy is 1.0.
a = np.arange(4*8).reshape((4,8))
b = np.ones((1,8)) * 3
accuracy(a,b)
1.0
# Two-layer classifier: 9 features -> 20 hidden units (ReLU) -> 2 class scores.
# Each fully connected layer gets its own SGD optimizer with learning rate 0.001.
network = numpy_network_base([numpy_fc(9,20,optim_sgd(0.001)),numpy_relu(),numpy_fc(20,2,optim_sgd(0.001))])
# dataset() yields `epoch` random batches, so this loop performs 10 updates.
for i,(din,lable) in enumerate(dataset(x_train_ss,y_train_ss,epoch=10,batch_size=100)):
    # print(din)
    # The layers expect (features, batch), hence the transpose of the
    # (batch, features) slice.
    result = network.forward(din.T)
    # print(result)
    # print(np.argmax(result,axis=0),lable)
    # The loss receives (batch, classes) scores and one-hot targets.
    loss,grad = Softmax_cross_loss(result.T,onehot(lable,2))
    # print(loss)
    # print(pd.get_dummies(lable))
    # print(grad.shape)
    # Accuracy on the current batch, measured before the parameter update.
    print(accuracy(result,lable))
    # Transpose the gradient back to the (classes, batch) layout of the layers.
    network.backward(grad.T)
    network.step()
0.19
0.32
0.86
0.96
0.94
0.93
0.9
0.96
0.98
0.95
# Evaluate on the held-out split; the transpose gives the (features, batch)
# layout the layers expect.
result = network.forward(x_test_ss.T)
print(accuracy(result,y_test_ss))
0.982456140351