[Backprop by Hand] Backpropagation: Derivation and Code Implementation


Contents

  • Theory
  • Hand Derivation
  • Code Implementation

Theory


For the theory, see my earlier post: (2020 Hung-yi Lee) Machine Learning - Backpropagation

Hand Derivation


A single neuron:
[Figure: a single neuron]
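In symbols, and matching the convention used in the code below (the bias is subtracted rather than added), the single neuron computes

$$a = \sigma\Big(\sum_i w_i x_i - b\Big)$$

where $\sigma$ is the sigmoid activation function.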
A three-layer neural network (input layer + hidden layer + output layer):
[Figure: three-layer network]
Loss function (MSE): [Figure: MSE loss definition]
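Written out, the per-sample loss the code computes is

$$E = \frac{1}{2}\sum_{k}(\hat{y}_k - y_k)^2$$

where $\hat{y}_k$ is the network's k-th output and $y_k$ the corresponding target.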

Gradients via the chain rule:
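For instance, for a weight $w$ feeding an output neuron with pre-activation $z$ and output $\hat{y} = \sigma(z)$, the gradient factors into three local pieces:

$$\frac{\partial E}{\partial w} = \frac{\partial E}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial z} \cdot \frac{\partial z}{\partial w}$$

Stacking such factors layer by layer is all backpropagation does.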
The hand derivation goes as follows:
[Figures: hand-written derivation of the output-layer and hidden-layer gradients]
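In case the images do not load, here is a summary of the result, in notation I introduce to line up with the code below: $x_i$ is an input, $h_j = \sigma(\sum_i w_{ij} x_i - b_j)$ a hidden activation, and $\hat{y}_k = \sigma(\sum_j v_{jk} h_j - c_k)$ an output (the subtracted biases flip a sign in their gradients):

$$\frac{\partial E}{\partial v_{jk}} = -(y_k - \hat{y}_k)\,\hat{y}_k(1-\hat{y}_k)\,h_j, \qquad \frac{\partial E}{\partial c_k} = (y_k - \hat{y}_k)\,\hat{y}_k(1-\hat{y}_k)$$

$$\frac{\partial E}{\partial w_{ij}} = -\Big[\sum_k (y_k - \hat{y}_k)\,\hat{y}_k(1-\hat{y}_k)\,v_{jk}\Big]\,h_j(1-h_j)\,x_i, \qquad \frac{\partial E}{\partial b_j} = \Big[\sum_k (y_k - \hat{y}_k)\,\hat{y}_k(1-\hat{y}_k)\,v_{jk}\Big]\,h_j(1-h_j)$$

The code accumulates the negative of each gradient into the *_delta fields and then applies param += learning_rate * delta / N, which is exactly gradient descent.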

P.S.: For the derivation of the sigmoid function's derivative, see my post "Derivative of the Sigmoid Activation Function".
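The identity that makes all of these expressions compact is

$$\sigma'(x) = \sigma(x)\,(1 - \sigma(x))$$

which is why the pattern value * (1.0 - value) recurs throughout the code.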


Code Implementation


To understand backpropagation deeply, you have to implement it; otherwise the theory remains a castle in the air. C++ is relatively low-level, which makes the essence of backpropagation easier to see, so I implemented it in C++:

#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
#include <random>
#include <algorithm>

#define INNODE 2   // number of input nodes
#define HIDENODE 3 // number of hidden nodes
#define OUTNODE 1  // number of output nodes


double learning_rate = 0.8; // learning rate
double threshold = 1e-4;    // maximum tolerated error
size_t mosttimes = 1e6;     // maximum number of training epochs

/* One sample: input values and (for training data) target outputs */
struct Sample {
    std::vector<double> in, out;
};

/* One neuron: activation value, bias, and per-epoch accumulated deltas */
struct Node {
    double value{}, bias{}, bias_delta{};
    std::vector<double> weight, weight_delta;
};


namespace utils {

    /* Activation function */
    inline double sigmoid(double x) {
        return 1.0 / (1.0 + std::exp(-x));
    }

    /* Read all whitespace-separated numbers from a file */
    std::vector<double> getFileData(std::string filename) {
        std::vector<double> res;

        std::ifstream in(filename);
        if (in.is_open()) { // opened successfully
            double buffer;
            // Loop on the extraction itself: testing eof() instead would
            // push one spurious trailing value after the last read fails.
            while (in >> buffer) {
                res.push_back(buffer);
            }
            in.close();
        } else { // failed to open
            std::cout << "Error in reading " << filename << std::endl;
        }

        return res;
    }

    /* Read the training set: each sample is INNODE inputs followed by OUTNODE targets */
    std::vector<Sample> getTrainData(std::string filename) {
        std::vector<Sample> res;

        std::vector<double> buffer = getFileData(filename);

        for (size_t i = 0; i < buffer.size(); i += INNODE + OUTNODE) {
            Sample tmp;
            for (size_t t = 0; t < INNODE; t++) {
                tmp.in.push_back(buffer[i + t]);
            }
            for (size_t t = 0; t < OUTNODE; t++) {
                tmp.out.push_back(buffer[i + INNODE + t]);
            }
            res.push_back(tmp);
        }

        return res;
    }

    /* Read the test set: each sample is INNODE inputs only */
    std::vector<Sample> getTestData(std::string filename) {
        std::vector<Sample> res;

        std::vector<double> buffer = getFileData(filename);

        for (size_t i = 0; i < buffer.size(); i += INNODE) {
            Sample tmp;
            for (size_t t = 0; t < INNODE; t++) {
                tmp.in.push_back(buffer[i + t]);
            }
            res.push_back(tmp);
        }

        return res;
    }
}


Node *inputLayer[INNODE], *hideLayer[HIDENODE], *outLayer[OUTNODE];

/* Initialize: allocate the nodes and draw weights/biases uniformly from [-1, 1]
   (input nodes carry no bias) */
inline void init() {
    std::mt19937 rd;
    rd.seed(std::random_device()());

    std::uniform_real_distribution<double> distribution(-1, 1);

    for (size_t i = 0; i < INNODE; i++) {
        ::inputLayer[i] = new Node();
        for (size_t j = 0; j < HIDENODE; j++) {
            ::inputLayer[i]->weight.push_back(distribution(rd));
            ::inputLayer[i]->weight_delta.push_back(0.0);
        }
    }

    for (size_t i = 0; i < HIDENODE; i++) {
        ::hideLayer[i] = new Node();
        ::hideLayer[i]->bias = distribution(rd);
        for (size_t j = 0; j < OUTNODE; j++) {
            ::hideLayer[i]->weight.push_back(distribution(rd));
            ::hideLayer[i]->weight_delta.push_back(0.0);
        }
    }

    for (size_t i = 0; i < OUTNODE; i++) {
        ::outLayer[i] = new Node();
        ::outLayer[i]->bias = distribution(rd);
    }
}

/* Zero the accumulated deltas at the start of each epoch */
inline void reset_delta() {
    for (size_t i = 0; i < INNODE; i++) {
        ::inputLayer[i]->weight_delta.assign(::inputLayer[i]->weight_delta.size(), 0.0);
    }

    for (size_t i = 0; i < HIDENODE; i++) {
        ::hideLayer[i]->bias_delta = 0.0;
        ::hideLayer[i]->weight_delta.assign(::hideLayer[i]->weight_delta.size(), 0.0);
    }

    for (size_t i = 0; i < OUTNODE; i++) {
        ::outLayer[i]->bias_delta = 0.0;
    }
}

int main() {
    init();

    std::vector<Sample> train_data = utils::getTrainData("traindata.txt");

    /* Training */
    for (size_t times = 0; times < mosttimes; times++) {
        // Zero the deltas for this epoch; the weights and biases themselves
        // carry over and keep being refined.
        reset_delta();

        double error_max = 0.0;

        for (auto &idx : train_data) {

            // Load the sample into the input layer
            for (size_t i = 0; i < INNODE; i++) {
                ::inputLayer[i]->value = idx.in[i];
            }

            // Forward pass: input -> hidden (the bias is subtracted by convention)
            for (size_t j = 0; j < HIDENODE; j++) {
                double sum = 0;
                for (size_t i = 0; i < INNODE; i++) {
                    sum += ::inputLayer[i]->value * ::inputLayer[i]->weight[j];
                }
                sum -= ::hideLayer[j]->bias;
                ::hideLayer[j]->value = utils::sigmoid(sum);
            }

            // Forward pass: hidden -> output
            for (size_t j = 0; j < OUTNODE; j++) {
                double sum = 0;
                for (size_t i = 0; i < HIDENODE; i++) {
                    sum += ::hideLayer[i]->value * ::hideLayer[i]->weight[j];
                }
                sum -= ::outLayer[j]->bias;
                ::outLayer[j]->value = utils::sigmoid(sum);
            }

            // Per-sample squared error: E = (1/2) * sum_k (out_k - target_k)^2
            double error = 0.0;
            for (size_t i = 0; i < OUTNODE; i++) {
                double tmp = std::fabs(::outLayer[i]->value - idx.out[i]);
                error += tmp * tmp / 2;
            }

            error_max = std::max(error_max, error);

            /* Backpropagation: accumulate delta = -dE/dparam for every parameter */

            // Output-layer biases
            for (size_t i = 0; i < OUTNODE; i++) {
                double bias_delta = -(idx.out[i] - ::outLayer[i]->value)
                        * ::outLayer[i]->value * (1.0 - ::outLayer[i]->value);
                ::outLayer[i]->bias_delta += bias_delta;
            }

            // Hidden-to-output weights
            for (size_t i = 0; i < HIDENODE; i++) {
                for (size_t j = 0; j < OUTNODE; j++) {
                    double weight_delta = (idx.out[j] - ::outLayer[j]->value)
                            * ::outLayer[j]->value * (1.0 - ::outLayer[j]->value)
                            * ::hideLayer[i]->value;
                    ::hideLayer[i]->weight_delta[j] += weight_delta;
                }
            }

            // Hidden-layer biases
            for (size_t i = 0; i < HIDENODE; i++) {
                double sum = 0;
                for (size_t j = 0; j < OUTNODE; j++) {
                    sum += -(idx.out[j] - ::outLayer[j]->value)
                            * ::outLayer[j]->value * (1.0 - ::outLayer[j]->value)
                            * ::hideLayer[i]->weight[j];
                }
                ::hideLayer[i]->bias_delta += sum * ::hideLayer[i]->value * (1.0 - ::hideLayer[i]->value);
            }

            // Input-to-hidden weights
            for (size_t i = 0; i < INNODE; i++) {
                for (size_t j = 0; j < HIDENODE; j++) {
                    double sum = 0.0;
                    for (size_t k = 0; k < OUTNODE; k++) {
                        sum += (idx.out[k] - ::outLayer[k]->value)
                                * ::outLayer[k]->value * (1.0 - ::outLayer[k]->value)
                                * ::hideLayer[j]->weight[k];
                    }
                    ::inputLayer[i]->weight_delta[j] += sum * ::hideLayer[j]->value
                            * (1.0 - ::hideLayer[j]->value) * ::inputLayer[i]->value;
                }
            }
        }



        /* Stop once the largest per-sample error is below the tolerance */
        if (error_max < ::threshold) {
            std::cout << "Success with " << times + 1 << " times training." << std::endl;
            std::cout << "Maximum error: " << error_max << std::endl;
            break;
        }

        /* Gradient descent: apply the averaged deltas (delta = -gradient, hence +=) */
        auto train_data_size = double(train_data.size());

        for (size_t i = 0; i < INNODE; i++) {
            for (size_t j = 0; j < HIDENODE; j++) {
                ::inputLayer[i]->weight[j] += learning_rate * ::inputLayer[i]->weight_delta[j] / train_data_size;
            }
        }

        for (size_t i = 0; i < HIDENODE; i++) {
            ::hideLayer[i]->bias += learning_rate * ::hideLayer[i]->bias_delta / train_data_size;
            for (size_t j = 0; j < OUTNODE; j++) {
                ::hideLayer[i]->weight[j] += learning_rate * ::hideLayer[i]->weight_delta[j] / train_data_size;
            }
        }

        for (size_t i = 0; i < OUTNODE; i++) {
            ::outLayer[i]->bias += learning_rate * ::outLayer[i]->bias_delta / train_data_size;
        }

        /* Print the current maximum error */
        std::cout << error_max << std::endl;
    }
    /* Testing */
    std::vector<Sample> test_data = utils::getTestData("testdata.txt");

    for (auto &idx : test_data) {
        /* Load the sample */
        for (size_t i = 0; i < INNODE; i++) {
            ::inputLayer[i]->value = idx.in[i];
        }

        /* Forward pass */
        for (size_t j = 0; j < HIDENODE; j++) {
            double sum = 0;
            for (size_t i = 0; i < INNODE; i++) {
                sum += ::inputLayer[i]->value * ::inputLayer[i]->weight[j];
            }
            sum -= ::hideLayer[j]->bias;
            ::hideLayer[j]->value = utils::sigmoid(sum);
        }
        for (size_t j = 0; j < OUTNODE; j++) {
            double sum = 0;
            for (size_t i = 0; i < HIDENODE; i++) {
                sum += ::hideLayer[i]->value * ::hideLayer[i]->weight[j];
            }
            sum -= ::outLayer[j]->bias;
            ::outLayer[j]->value = utils::sigmoid(sum);
            idx.out.push_back(::outLayer[j]->value);
        }

        /* Print the inputs followed by the predicted outputs
           (moved out of the output loop so each sample prints once,
           even when OUTNODE > 1) */
        for (auto &tmp : idx.in) {
            std::cout << tmp << " ";
        }
        for (auto &tmp : idx.out) {
            std::cout << tmp << " ";
        }
        std::cout << std::endl;
    }

    return 0;
}
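A note on building: this is plain standard C++ (C++11 or later, needed for the brace initializers in Node). Assuming the source file is saved as, say, bp.cpp, a command like g++ -std=c++11 -O2 bp.cpp -o bp should compile it, and the program expects traindata.txt and testdata.txt in the working directory.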

Run result:
[Figure: program output on the test set]
The traindata.txt used by the code:

0 0 0
0 1 1
1 0 1
1 1 0
0.8 0.8 0
0.6 0.6 0
0.4 0.4 0
0.2 0.2 0
1.0 0.8 1
1.0 0.6 1
1.0 0.4 1
1.0 0.2 1
0.8 0.6 1
0.6 0.4 1
0.4 0.2 1
0.2 0 1
0.999 0.666 1
0.666 0.333 1
0.333 0 1
0.8 0.4 1
0.4 0 1
0 0.123 1
0.12 0.23 1
0.23 0.34 1
0.34 0.45 1
0.45 0.56 1
0.56 0.67 1
0.67 0.78 1
0.78 0.89 1
0.89 0.99 1

And testdata.txt:

0.111 0.112
0.001 0.999
0.123 0.345
0.123 0.456
0.123 0.789
0.234 0.567
0.234 0.678
0.387 0.401
0.616 0.717
0.701 0.919

The network is trained so that it outputs 0 when the two input numbers are equal and 1 when they differ. Each line of traindata.txt has the format x y output; for example, the line 0 1 1 means inputs x = 0 and y = 1 with target output 1.
