在网络的整体架构搭建好之后,剩下的最重要的部分就是该怎么训练网络,即如何更新各层网络的weight和bias参数。LeNet网络采用的optimization方式是Stochastic Diagonal Levenberg-Marquardt。下面先介绍一下Stochastic Diagonal Levenberg-Marquardt的基本思想,然后再看tiny_cnn中的实现过程。
1,Stochastic Diagonal Levenberg-Marquardt的原理:Stochastic Diagonal Levenberg-Marquardt主要做的就是利用二阶偏导数对训练进行加速,具体细节在《Neural Networks: Tricks of the Trade》里面有介绍,而stochastic指的是把Levenberg-Marquardt的方法用于在线学习,至于在线学习,在另一篇文章里有介绍。大体上来说,这个方法就是利用二阶偏导数修正每个网络参数的学习速率,从而加快训练速度。
2,tiny_cnn中的实现过程:首先在main函数中执行了network对象的训练接口:
// Start training from main(): passes the training images and labels, the
// mini-batch size, the literal 20 (presumably the number of epochs — confirm
// against network::train), and two callbacks: on_enumerate_minibatch and
// on_enumerate_epoch.
nn.train(train_images, train_labels, minibatch_size, 20, on_enumerate_minibatch, on_enumerate_epoch);
紧接着,程序就会进入network对象中。在network的train函数实现中,主要做了3件事:第一,初始化权值,权值的初始化对于最后的训练结果影响很大,但是对权值初始化的方法这里不深入讨论;第二,计算hessian矩阵,hessian矩阵用于后面对权值的更新,至于为何要计算hessian矩阵,在optimization的原理中,我们有提到;第三,开始训练,包括计算梯度和更新网络参数。这里网络是进行迭代20次的训练,就是说程序会对输入样本做20次相同的训练过程,只不过每次初始化的网络参数都不同。最后通过对网络参数的平均得出最终训练的网络。程序如下:
// Excerpt of the training loop. The enclosing function header and the closing
// brace of the outer loop are outside this excerpt.
for (int iter = 0; iter < epoch; iter++) { // one iteration per epoch
// When the optimizer asks for it, compute the Hessian information first.
if (optimizer_.requires_hessian())
calc_hessian(in, t_cost); // calculate derivative for every layer
// Walk the inputs in steps of batch_size; the final batch may be shorter,
// which is why the size passed below is min(batch_size, remaining).
for (size_t i = 0; i < in.size(); i+=batch_size) {
train_once(&in[i], &t[i],
// NOTE(review): the template argument of this static_cast appears to have
// been stripped from the listing — likely static_cast<int>; confirm.
static_cast(std::min(batch_size, in.size() - i)),
n_threads,
get_target_cost_sample_pointer(t_cost, i));
// Notify the per-batch callback.
on_batch_enumerate();
// Whenever i is a multiple of 100, ask the layers whether a weight has
// "exploded"; if so, print a warning and abort by returning false.
if (i % 100 == 0 && layers_.is_exploded()) {
std::cout << "[Warning]Detected infinite value in weight. stop learning." << std::endl;
return false;
}
}
这里我们主要介绍一下如何计算权值和更新网络参数。更新网络参数首先需要计算梯度,即最终的cost相对于每层网络的每一个权值和偏置的梯度,这个计算的过程很漫长,分为下面几步:
/**
 * Trains the network on one mini-batch.
 *
 * The samples of the batch are split into contiguous slices, one slice per
 * worker. Every sample is run through fprop() and then bprop(); afterwards
 * the layers are asked to update their weights once for the whole batch.
 *
 * @param in          pointer to the first input sample of the batch
 * @param t           pointer to the first target of the batch
 * @param batch_size  number of samples in this batch
 * @param num_tasks   upper bound on the number of workers to use
 * @param t_cost      optional per-sample cost array; nullptr when unused
 *
 * If batch_size or num_tasks is not positive there is nothing that can be
 * processed, so the function returns without touching the weights.
 */
void train_onebatch(const vec_t* in, const vec_t* t, int batch_size, const int num_tasks, const vec_t* t_cost) {
    const int num_threads = std::min(batch_size, num_tasks);

    // num_threads is used as a divisor below; a non-positive value would
    // divide by zero (or produce a meaningless partition), so bail out early.
    if (num_threads <= 0) return;

    // number of data points to use in each thread (rounded up)
    const int data_per_thread = (batch_size + num_threads - 1) / num_threads;

    // i is the thread / worker index
    for_i(num_threads, [&](int i) {
        const int start_index = i * data_per_thread;
        const int end_index = std::min(batch_size, start_index + data_per_thread);

        // loop over data points in this batch assigned to thread i
        for (int j = start_index; j < end_index; ++j)
            bprop(fprop(in[j], i), t[j], i, t_cost ? &(t_cost[j]) : nullptr);
    }, 1);

    // merge all dW and update W by optimizer
    layers_.update_weights(&optimizer_, num_threads, batch_size);
}
第一步:前向计算最终的输出结果,在整个network中,每层网络对象里面都有一个前向计算该层网络输出结果的函数,forward_propagation()函数,当当前层的forward_propagation计算完成之后,先把数据保存在当前对象中,然后就进入下一层的forward_propagation函数继续进行计算。程序如下:
/**
 * Forward pass of the convolutional layer for one worker.
 *
 * Pads/copies the raw input into this worker's storage, accumulates the
 * weighted sums into `a`, adds the bias when one is present, applies the
 * activation function h_ to obtain `out`, and then hands `out` to the next
 * layer if there is one.
 *
 * @param in_raw        output of the previous layer (unpadded)
 * @param worker_index  index selecting the per-worker storage to use
 * @return the result of next_->forward_propagation() when a next layer
 *         exists, otherwise this layer's own output vector
 */
virtual const vec_t& forward_propagation(const vec_t& in_raw, size_t worker_index) override
{
    // The cast needs an explicit target type; worker_index is narrowed to int.
    copy_and_pad_input(in_raw, static_cast<int>(worker_index));

    auto& ws = this->get_worker_storage(worker_index);
    vec_t &a = ws.a_;        // w*x
    vec_t &out = ws.output_; // output
    const vec_t &in = *(conv_layer_worker_storage_[worker_index].prev_out_padded_); // input

    std::fill(a.begin(), a.end(), float_t(0));

    // one task per output channel o
    for_i(parallelize_, out_.depth_, [&](int o) {
        for (cnn_size_t inc = 0; inc < in_.depth_; inc++) {
            // skip input/output channel pairs that the connection table disables
            if (!tbl_.is_connected(o, inc)) continue;

            const float_t *pw = &this->W_[weight_.get_index(0, 0, in_.depth_ * o + inc)];
            const float_t *pi = &in[in_padded_.get_index(0, 0, inc)];
            float_t *pa = &a[out_.get_index(0, 0, o)];

            for (cnn_size_t y = 0; y < out_.height_; y++) {
                for (cnn_size_t x = 0; x < out_.width_; x++) {
                    const float_t * ppw = pw;
                    // top-left corner of the input window for output (x, y)
                    const float_t * ppi = pi + (y * h_stride_) * in_padded_.width_ + x * w_stride_;
                    float_t sum = float_t(0);

                    // should be optimized for small kernel(3x3,5x5)
                    for (cnn_size_t wy = 0; wy < weight_.height_; wy++) {
                        for (cnn_size_t wx = 0; wx < weight_.width_; wx++) {
                            sum += *ppw++ * ppi[wy * in_padded_.width_ + wx];
                        }
                    }
                    pa[y * out_.width_ + x] += sum;
                }
            }
        }

        // add the channel's bias to every element of its output plane
        if (!this->b_.empty()) {
            float_t *pa = &a[out_.get_index(0, 0, o)];
            float_t b = this->b_[o];
            std::for_each(pa, pa + out_.width_ * out_.height_, [&](float_t& f) { f += b; });
        }
    });

    // apply the activation function element-wise
    for_i(parallelize_, out_size_, [&](int i) {
        out[i] = h_.f(a, i);
    });

    CNN_LOG_VECTOR(in_raw, "[pc]in");
    CNN_LOG_VECTOR(W_, "[pc]w");
    CNN_LOG_VECTOR(a, "[pc]a");
    CNN_LOG_VECTOR(out, "[pc]forward");

    return next_ ? next_->forward_propagation(out, worker_index) : out;
}
/**
 * Backward pass of the convolutional layer for one worker.
 *
 * Steps, in order:
 *   1. scatter curr_delta through the kernel weights into the delta of the
 *      previous layer, then scale it by the derivative of the previous
 *      layer's activation function;
 *   2. accumulate the weight gradient into this worker's dW;
 *   3. accumulate the bias gradient into this worker's db (when present);
 *   4. for padding::same, strip the padding from the computed delta;
 *   5. continue back-propagation in the previous layer.
 *
 * The weight gradient honours h_stride_/w_stride_ so that it matches the
 * window addressing used by the forward pass and by step 1.
 *
 * @param curr_delta  delta of this layer's output
 * @param index       index selecting the per-worker storage to use
 * @return whatever prev_->back_propagation() returns
 */
const vec_t& back_propagation(const vec_t& curr_delta, size_t index) override {
    auto& ws = this->get_worker_storage(index);
    conv_layer_worker_specific_storage& cws = conv_layer_worker_storage_[index];

    const vec_t& prev_out = *(cws.prev_out_padded_);
    const activation::function& prev_h = prev_->activation_function();
    // with "same" padding the delta is first built in the padded buffer
    vec_t* prev_delta = (pad_type_ == padding::same) ? &cws.prev_delta_padded_ : &ws.prev_delta_;
    vec_t& dW = ws.dW_;
    vec_t& db = ws.db_;

    std::fill(prev_delta->begin(), prev_delta->end(), float_t(0));

    // propagate delta to previous layer
    for_i(in_.depth_, [&](int inc) {
        for (cnn_size_t outc = 0; outc < out_.depth_; outc++) {
            if (!tbl_.is_connected(outc, inc)) continue;

            const float_t *pw = &this->W_[weight_.get_index(0, 0, in_.depth_ * outc + inc)];
            const float_t *pdelta_src = &curr_delta[out_.get_index(0, 0, outc)];
            float_t *pdelta_dst = &(*prev_delta)[in_padded_.get_index(0, 0, inc)];

            for (cnn_size_t y = 0; y < out_.height_; y++) {
                for (cnn_size_t x = 0; x < out_.width_; x++) {
                    const float_t * ppw = pw;
                    const float_t ppdelta_src = pdelta_src[y * out_.width_ + x];
                    float_t * ppdelta_dst = pdelta_dst + y * h_stride_ * in_padded_.width_ + x * w_stride_;

                    for (cnn_size_t wy = 0; wy < weight_.height_; wy++) {
                        for (cnn_size_t wx = 0; wx < weight_.width_; wx++) {
                            ppdelta_dst[wy * in_padded_.width_ + wx] += *ppw++ * ppdelta_src;
                        }
                    }
                }
            }
        }
    });

    // multiply by the derivative of the previous layer's activation
    for_i(parallelize_, in_padded_.size(), [&](int i) {
        (*prev_delta)[i] *= prev_h.df(prev_out[i]);
    });

    // accumulate dw
    for_i(in_.depth_, [&](int inc) {
        for (cnn_size_t outc = 0; outc < out_.depth_; outc++) {
            if (!tbl_.is_connected(outc, inc)) continue;

            for (cnn_size_t wy = 0; wy < weight_.height_; wy++) {
                for (cnn_size_t wx = 0; wx < weight_.width_; wx++) {
                    float_t dst = float_t(0);
                    const float_t * prevo = &prev_out[in_padded_.get_index(wx, wy, inc)];
                    const float_t * delta = &curr_delta[out_.get_index(0, 0, outc)];

                    // Output (x, y) reads input (x * w_stride_ + wx, y * h_stride_ + wy),
                    // so the input row for output row y starts h_stride_ rows apart.
                    if (w_stride_ == 1) {
                        // contiguous along x: a plain dot product per row
                        for (cnn_size_t y = 0; y < out_.height_; y++) {
                            dst += vectorize::dot(prevo + y * h_stride_ * in_padded_.width_,
                                                  delta + y * out_.width_,
                                                  out_.width_);
                        }
                    } else {
                        // strided along x: step through the input explicitly
                        for (cnn_size_t y = 0; y < out_.height_; y++) {
                            const float_t * prow = prevo + y * h_stride_ * in_padded_.width_;
                            const float_t * drow = delta + y * out_.width_;
                            for (cnn_size_t x = 0; x < out_.width_; x++) {
                                dst += prow[x * w_stride_] * drow[x];
                            }
                        }
                    }
                    dW[weight_.get_index(wx, wy, in_.depth_ * outc + inc)] += dst;
                }
            }
        }
    });

    // accumulate db
    if (!db.empty()) {
        for (cnn_size_t outc = 0; outc < out_.depth_; outc++) {
            const float_t *delta = &curr_delta[out_.get_index(0, 0, outc)];
            db[outc] += std::accumulate(delta, delta + out_.width_ * out_.height_, float_t(0));
        }
    }

    if (pad_type_ == padding::same)
        copy_and_unpad_delta(cws.prev_delta_padded_, ws.prev_delta_);

    CNN_LOG_VECTOR(curr_delta, "[pc]curr_delta");
    CNN_LOG_VECTOR(prev_delta_[index], "[pc]prev_delta");
    CNN_LOG_VECTOR(dW, "[pc]dW");
    CNN_LOG_VECTOR(db, "[pc]db");

    return prev_->back_propagation(ws.prev_delta_, index);
}
template
void update_weight(Optimizer *o, cnn_size_t worker_size, cnn_size_t batch_size) {
if (W_.empty()) return;
merge(worker_size, batch_size);
CNN_LOG_VECTOR(W_, "[W-before]");
CNN_LOG_VECTOR(b_, "[db-before]");
o->update(worker_storage_[0].dW_, Whessian_, W_);
o->update(worker_storage_[0].db_, bhessian_, b_);
CNN_LOG_VECTOR(W_, "[W-updated]");
CNN_LOG_VECTOR(b_, "[db-updated]");
clear_diff();
post_update();
}