For the theory, see my earlier post: (Hung-yi Lee, 2020) Machine Learning - Backpropagation
A single neuron
A three-layer neural network (input layer + hidden layer + output layer)
Loss function (MSE loss):
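Written out, the per-sample squared-error loss that the code below accumulates (error += tmp * tmp / 2) is

E = \frac{1}{2} \sum_{k} (t_k - o_k)^2

where t_k is the target and o_k the network output of output node k.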
P.S.: for the derivative of the sigmoid function, see my post on deriving the Sigmoid activation function.
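The identity that the code relies on is

\sigma'(x) = \sigma(x)\,(1 - \sigma(x))

which is why the backpropagation section below repeatedly multiplies a node's value by (1.0 - value).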
To really understand backpropagation you need to implement it in code; otherwise the theory stays a castle in the air. C++ is a fairly low-level language, which makes the essence of backpropagation easier to see, so I implement it in C++:
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
#include <random>
#include <algorithm>
#define INNODE 2      // number of input-layer nodes
#define HIDENODE 3    // number of hidden-layer nodes
#define OUTNODE 1     // number of output-layer nodes
double learning_rate = 0.8;   // learning rate
double threshold = 1e-4;      // maximum tolerated error
size_t mosttimes = 1e6;       // maximum number of training iterations
/* one sample: input values and target output(s) */
struct Sample {
std::vector<double> in, out;
};
/* a neuron node: activation value, bias and accumulated bias gradient, plus outgoing weights and their accumulated gradients */
struct Node {
double value{}, bias{}, bias_delta{};
std::vector<double> weight, weight_delta;
};
namespace utils {
/* sigmoid activation function */
inline double sigmoid(double x) {
double res = 1.0 / (1.0 + std::exp(-x));
return res;
}
/* read all whitespace-separated numbers from a file */
std::vector<double> getFileData(std::string filename) {
std::vector<double> res;
std::ifstream in(filename);
if (in.is_open()) {
// opened successfully: read numbers until extraction fails (checking eof() before reading would push the last value twice)
double buffer;
while (in >> buffer) {
res.push_back(buffer);
}
in.close();
} else {
// failed to open the file
std::cout << "Error in reading " << filename << std::endl;
}
return res;
}
/* load the training set: every INNODE + OUTNODE numbers form one sample */
std::vector<Sample> getTrainData(std::string filename) {
std::vector<Sample> res;
std::vector<double> buffer = getFileData(filename);
for (size_t i = 0; i < buffer.size(); i += INNODE + OUTNODE) {
Sample tmp;
for (size_t t = 0; t < INNODE; t++) {
tmp.in.push_back(buffer[i + t]);
}
for (size_t t = 0; t < OUTNODE; t++) {
tmp.out.push_back(buffer[i + INNODE + t]);
}
res.push_back(tmp);
}
return res;
}
/* load the test set: every INNODE numbers form one (unlabelled) sample */
std::vector<Sample> getTestData(std::string filename) {
std::vector<Sample> res;
std::vector<double> buffer = getFileData(filename);
for (size_t i = 0; i < buffer.size(); i += INNODE) {
Sample tmp;
for (size_t t = 0; t < INNODE; t++) {
tmp.in.push_back(buffer[i + t]);
}
res.push_back(tmp);
}
return res;
}
}
Node *inputLayer[INNODE], *hideLayer[HIDENODE], *outLayer[OUTNODE];
/* initialisation: allocate the nodes and draw the initial weights and biases uniformly from [-1, 1] */
inline void init() {
std::mt19937 rd;
rd.seed(std::random_device()());
std::uniform_real_distribution<double> distribution(-1, 1);
for (size_t i = 0; i < INNODE; i++) {
::inputLayer[i] = new Node();
for (size_t j = 0; j < HIDENODE; j++) {
::inputLayer[i]->weight.push_back(distribution(rd));
::inputLayer[i]->weight_delta.push_back(0.f);
}
}
for (size_t i = 0; i < HIDENODE; i++) {
::hideLayer[i] = new Node();
::hideLayer[i]->bias = distribution(rd);
for (size_t j = 0; j < OUTNODE; j++) {
::hideLayer[i]->weight.push_back(distribution(rd));
::hideLayer[i]->weight_delta.push_back(0.f);
}
}
for (size_t i = 0; i < OUTNODE; i++) {
::outLayer[i] = new Node();
::outLayer[i]->bias = distribution(rd);
}
}
/* reset the accumulated gradients to zero */
inline void reset_delta() {
for (size_t i = 0; i < INNODE; i++) {
::inputLayer[i]->weight_delta.assign(::inputLayer[i]->weight_delta.size(), 0.f);
}
for (size_t i = 0; i < HIDENODE; i++) {
::hideLayer[i]->bias_delta = 0.f;
::hideLayer[i]->weight_delta.assign(::hideLayer[i]->weight_delta.size(), 0.f);
}
for (size_t i = 0; i < OUTNODE; i++) {
::outLayer[i]->bias_delta = 0.f;
}
}
int main() {
init();
std::vector<Sample> train_data = utils::getTrainData("traindata.txt");
/* training */
for (size_t times = 0; times < mosttimes; times++) {
reset_delta(); // at the start of every epoch clear the accumulated gradients; the weights and biases themselves are kept and continue to be refined
double error_max = 0.f;
for (auto &idx : train_data) {
for (size_t i = 0; i < INNODE; i++) {
::inputLayer[i]->value = idx.in[i];
}
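// Forward-pass equations (annotation added for clarity): each hidden node computes
//   h_j = sigmoid( sum_i( x_i * w_ij ) - b_j )
// and each output node computes
//   o_k = sigmoid( sum_j( h_j * w_jk ) - b_k ).
// Note that the bias is subtracted rather than added; the sign of dz/db used in
// the backpropagation step below follows from this convention.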
// forward pass: input layer -> hidden layer
for (size_t j = 0; j < HIDENODE; j++) {
double sum = 0;
for (size_t i = 0; i < INNODE; i++) {
sum += ::inputLayer[i]->value * ::inputLayer[i]->weight[j];
}
sum -= ::hideLayer[j]->bias;
::hideLayer[j]->value = utils::sigmoid(sum);
}
// forward pass: hidden layer -> output layer
for (size_t j = 0; j < OUTNODE; j++) {
double sum = 0;
for (size_t i = 0; i < HIDENODE; i++) {
sum += ::hideLayer[i]->value * ::hideLayer[i]->weight[j];
}
sum -= ::outLayer[j]->bias;
::outLayer[j]->value = utils::sigmoid(sum);
}
// compute the squared error of this sample
double error = 0.f;
for (size_t i = 0; i < OUTNODE; i++) {
double tmp = std::fabs(::outLayer[i]->value - idx.out[i]);
error += tmp * tmp / 2;
}
error_max = std::max(error_max, error);
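// Backpropagation sketch (annotation added for clarity): with
//   z = sum(w * h) - b,   o = sigmoid(z),   E = (t - o)^2 / 2,
// the chain rule gives, for an output node,
//   dE/db   = +(t - o) * o * (1 - o)          (since dz/db = -1)
//   dE/dw_i = -(t - o) * o * (1 - o) * h_i    (since dz/dw_i = h_i)
// and the error reaching hidden node i is sum_j( -(t_j - o_j) * o_j * (1 - o_j) * w_ij ).
// The deltas accumulated below are the negatives of these gradients, so the
// "+= learning_rate * delta / N" updates in the gradient-descent step further
// down perform ordinary (batch) gradient descent.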
/* backpropagation: accumulate the (negative) gradients for this sample */
// output-layer biases
for (size_t i = 0; i < OUTNODE; i++) {
double bias_delta = -(idx.out[i] - ::outLayer[i]->value) * ::outLayer[i]->value * (1.0 - ::outLayer[i]->value);
::outLayer[i]->bias_delta += bias_delta;
}
// hidden-to-output weights
for (size_t i = 0; i < HIDENODE; i++) {
for (size_t j = 0; j < OUTNODE; j++) {
double weight_delta = (idx.out[j] - ::outLayer[j]->value) * ::outLayer[j]->value * (1.0 - ::outLayer[j]->value) * ::hideLayer[i]->value;
::hideLayer[i]->weight_delta[j] += weight_delta;
}
}
// hidden-layer biases (error propagated back through the output weights)
for (size_t i = 0; i < HIDENODE; i++) {
double sum = 0;
for (size_t j = 0; j < OUTNODE; j++) {
sum += -(idx.out[j] - ::outLayer[j]->value) * ::outLayer[j]->value * (1.0 - ::outLayer[j]->value) * ::hideLayer[i]->weight[j];
}
::hideLayer[i]->bias_delta += sum * ::hideLayer[i]->value * (1.0 - ::hideLayer[i]->value);
}
// input-to-hidden weights
for (size_t i = 0; i < INNODE; i++) {
for (size_t j = 0; j < HIDENODE; j++) {
double sum = 0.f;
for (size_t k = 0; k < OUTNODE; k++) {
sum += (idx.out[k] - ::outLayer[k]->value) * ::outLayer[k]->value * (1.0 - ::outLayer[k]->value) * ::hideLayer[j]->weight[k];
}
::inputLayer[i]->weight_delta[j] += sum * ::hideLayer[j]->value * (1.0 - ::hideLayer[j]->value) * ::inputLayer[i]->value;
}
}
}
/* if the largest per-sample error falls below the threshold, the network is good enough: stop training */
if (error_max < ::threshold) {
std::cout << "Success with " << times + 1 << " times of training." << std::endl;
std::cout << "Maximum error: " << error_max << std::endl;
break;
}
/* gradient descent: update the parameters with the deltas averaged over the training set */
auto train_data_size = double(train_data.size());
for (size_t i = 0; i < INNODE; i++) {
for (size_t j = 0; j < HIDENODE; j++) {
::inputLayer[i]->weight[j] += learning_rate * ::inputLayer[i]->weight_delta[j] / train_data_size;
}
}
for (size_t i = 0; i < HIDENODE; i++) {
::hideLayer[i]->bias += learning_rate * ::hideLayer[i]->bias_delta / train_data_size;
for (size_t j = 0; j < OUTNODE; j++) {
::hideLayer[i]->weight[j] += learning_rate * ::hideLayer[i]->weight_delta[j] / train_data_size;
}
}
for (size_t i = 0; i < OUTNODE; i++) {
::outLayer[i]->bias += learning_rate * ::outLayer[i]->bias_delta / train_data_size;
}
/* print the maximum per-sample error of this epoch */
std::cout << error_max << std::endl;
}
/* testing */
std::vector<Sample> test_data = utils::getTestData("testdata.txt");
for (auto &idx : test_data) {
/* load the sample into the input layer */
for (size_t i = 0; i < INNODE; i++) {
::inputLayer[i]->value = idx.in[i];
}
/* forward pass */
for (size_t j = 0; j < HIDENODE; j++) {
double sum = 0;
for (size_t i = 0; i < INNODE; i++) {
sum += ::inputLayer[i]->value * ::inputLayer[i]->weight[j];
}
sum -= ::hideLayer[j]->bias;
::hideLayer[j]->value = utils::sigmoid(sum);
}
for (size_t j = 0; j < OUTNODE; j++) {
double sum = 0;
for (size_t i = 0; i < HIDENODE; i++) {
sum += ::hideLayer[i]->value * ::hideLayer[i]->weight[j];
}
sum -= ::outLayer[j]->bias;
::outLayer[j]->value = utils::sigmoid(sum);
idx.out.push_back(::outLayer[j]->value);
}
// print the inputs followed by the predicted outputs
for (auto &tmp : idx.in) {
std::cout << tmp << " ";
}
for (auto &tmp : idx.out) {
std::cout << tmp << " ";
}
std::cout << std::endl;
}
return 0;
}
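The program expects traindata.txt and testdata.txt in the working directory (the file names are hardcoded in main). Assuming the source is saved as bp.cpp (that file name is only an example), a typical build and run would be:

g++ -std=c++11 -O2 bp.cpp -o bp
./bp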
traindata.txt
0 0 0
0 1 1
1 0 1
1 1 0
0.8 0.8 0
0.6 0.6 0
0.4 0.4 0
0.2 0.2 0
1.0 0.8 1
1.0 0.6 1
1.0 0.4 1
1.0 0.2 1
0.8 0.6 1
0.6 0.4 1
0.4 0.2 1
0.2 0 1
0.999 0.666 1
0.666 0.333 1
0.333 0 1
0.8 0.4 1
0.4 0 1
0 0.123 1
0.12 0.23 1
0.23 0.34 1
0.34 0.45 1
0.45 0.56 1
0.56 0.67 1
0.67 0.78 1
0.78 0.89 1
0.89 0.99 1
testdata.txt
0.111 0.112
0.001 0.999
0.123 0.345
0.123 0.456
0.123 0.789
0.234 0.567
0.234 0.678
0.387 0.401
0.616 0.717
0.701 0.919
The goal of training is for the network to output 0 when the two input numbers are equal and 1 when they differ; each line of traindata.txt has the format: x y output.