
    在网络的整体架构搭建好之后,剩下的最重要的部分就是该怎么训练网络,即如何更新各层网络的weight和bias参数。LeNet网络采用的optimization方式是Stochastic Diagonal Levenberg-Marquardt。下面就先介绍一下Stochastic Diagonal Levenberg-Marquardt的基本思想,然后再看tiny_cnn中的实现过程。

    1,Stochastic Diagonal Levenberg-Marquardt的原理:Stochastic Diagonal Levenberg-Marquardt主要做的就是利用二阶偏导数对训练进行加速,具体细节在《Neural Networks:Tricks of the Trade》里面有介绍;而stochastic指的是把Levenberg-Marquardt的方法用于在线学习,至于在线学习,在另一篇文章里有介绍。大体上来说,这个方法就是利用二阶偏导数修正每个网络参数的学习速率,从而加快训练速度。



// Entry point of training: passes the training images and labels, the minibatch
// size, the literal 20 (presumably the number of epochs -- confirm against the
// signature of train()), and two callbacks (judging by their names, invoked after
// each minibatch and after each epoch -- TODO confirm).
nn.train(train_images, train_labels, minibatch_size, 20, on_enumerate_minibatch, on_enumerate_epoch);



// Excerpt of the training loop; the closing braces are not part of this excerpt.
// Repeats the whole pass over the data `epoch` times.
for (int iter = 0; iter < epoch; iter++) { // one iteration per epoch
            // Only done when the optimizer reports that it needs Hessian information.
            if (optimizer_.requires_hessian())
                calc_hessian(in, t_cost); // calculate derivatives for every layer (presumably second-order -- confirm in calc_hessian)
            // Step through the inputs batch_size samples at a time.
            for (size_t i = 0; i < in.size(); i+=batch_size) {
                // The third argument clamps the batch length to the number of samples
                // remaining, so the last batch may be shorter than batch_size.
                // NOTE(review): the target type of static_cast appears to be missing
                // in this excerpt -- confirm against the upstream source.
                train_once(&in[i], &t[i],
                           static_cast(std::min(batch_size, in.size() - i)),
                           get_target_cost_sample_pointer(t_cost, i));

                // Evaluated only when i is a multiple of 100: stop training with
                // `false` if layers_ reports exploded values.
                if (i % 100 == 0 && layers_.is_exploded()) {
                    std::cout << "[Warning]Detected infinite value in weight. stop learning." << std::endl;
                    return false;


// Trains on one minibatch: splits the batch_size samples into contiguous slices,
// runs fprop followed by bprop for every sample, then updates the weights.
//   in, t      - arrays indexed 0..batch_size-1 here; in[j] goes to fprop and t[j]
//                goes to bprop (presumably the training target -- confirm)
//   batch_size - number of samples in this minibatch
//   num_tasks  - upper bound on the number of workers
//   t_cost     - optional array; when null, nullptr is passed to bprop instead of &t_cost[j]
// NOTE(review): if batch_size or num_tasks is 0, num_threads is 0 and the
// division below divides by zero -- confirm callers never pass 0.
void train_onebatch(const vec_t* in, const vec_t* t, int batch_size, const int num_tasks, const vec_t* t_cost) {
        // Never use more workers than there are samples.
        int num_threads = std::min(batch_size, num_tasks);

        // number of data points to use in each thread (rounded up)
        int data_per_thread = (batch_size + num_threads - 1) / num_threads;

        // i is the thread / worker index
        // NOTE(review): the meaning of the trailing argument `1` to for_i is not
        // visible in this excerpt -- confirm against for_i's definition.
        for_i(num_threads, [&](int i) {
            int start_index = i * data_per_thread;
            // Clamp so the last slice does not run past the end of the batch.
            int end_index = std::min(batch_size, start_index + data_per_thread);

            // loop over data points in this batch assigned to thread i
            for (int j = start_index; j < end_index; ++j)
                bprop(fprop(in[j], i), t[j], i, t_cost ? &(t_cost[j]) : nullptr);
        }, 1);
        // merge all dW and update W by optimizer
        layers_.update_weights(&optimizer_, num_threads, batch_size);



// Forward pass of the convolutional layer for one worker (excerpt; the braces
// of the original definition are not part of this excerpt).
//   in_raw       - input vector handed to copy_and_pad_input
//   worker_index - selects the per-worker storage used for all buffers below
// Returns the result of next_->forward_propagation(out, worker_index) when next_
// is set, otherwise this layer's own output buffer.
// NOTE(review): the target type of static_cast appears to be missing in this
// excerpt -- confirm against the upstream source.
virtual const vec_t& forward_propagation(const vec_t& in_raw, size_t worker_index) override
        copy_and_pad_input(in_raw, static_cast(worker_index));

        auto& ws = this->get_worker_storage(worker_index);
        vec_t &a = ws.a_; // accumulator for w*x (bias is added to it further down)
        vec_t &out = ws.output_; // output after the activation function
        const vec_t &in = *(conv_layer_worker_storage_[worker_index].prev_out_padded_); // input read by the convolution below
        // Start from zero because the loops below accumulate with +=.
        std::fill(a.begin(), a.end(), float_t(0));

        // One iteration per output channel o.
        for_i(parallelize_, out_.depth_, [&](int o) {
            for (cnn_size_t inc = 0; inc < in_.depth_; inc++) {
                // Skip input channels that the connection table does not link to o.
                if (!tbl_.is_connected(o, inc)) continue;

                // Base pointers: kernel for the (o, inc) pair, input plane inc, output plane o.
                const float_t *pw = &this->W_[weight_.get_index(0, 0, in_.depth_ * o + inc)];
                const float_t *pi = &in[in_padded_.get_index(0, 0, inc)];
                float_t *pa = &a[out_.get_index(0, 0, o)];

                for (cnn_size_t y = 0; y < out_.height_; y++) {
                    for (cnn_size_t x = 0; x < out_.width_; x++) {
                        const float_t * ppw = pw;
                        // Top-left corner of the input window for output position (x, y).
                        const float_t * ppi = pi + (y * h_stride_) * in_padded_.width_ + x * w_stride_;
                        float_t sum = float_t(0);

                        // should be optimized for small kernel(3x3,5x5)
                        for (cnn_size_t wy = 0; wy < weight_.height_; wy++) {
                            for (cnn_size_t wx = 0; wx < weight_.width_; wx++) {
                                // Weights are consumed sequentially; inputs are indexed within the window.
                                sum += *ppw++ * ppi[wy * in_padded_.width_ + wx];
                        // Add this input channel's contribution to the output position.
                        pa[y * out_.width_ + x] += sum;

            // When a bias vector exists, add b_[o] to every element of output plane o.
            if (!this->b_.empty()) {
                float_t *pa = &a[out_.get_index(0, 0, o)];
                float_t b = this->b_[o];
                std::for_each(pa, pa + out_.width_ * out_.height_, [&](float_t& f) { f += b; });

        // Apply the activation h_ to each of the out_size_ elements.
        for_i(parallelize_, out_size_, [&](int i) {
            out[i] = h_.f(a, i);

        CNN_LOG_VECTOR(in_raw, "[pc]in");
        CNN_LOG_VECTOR(W_, "[pc]w");
        CNN_LOG_VECTOR(a, "[pc]a");
        CNN_LOG_VECTOR(out, "[pc]forward");

        // Chain into the next layer if there is one; otherwise this output is the result.
        return next_ ? next_->forward_propagation(out, worker_index) : out;


// Backward pass of the convolutional layer for one worker (excerpt; the inner
// closing braces are not part of this excerpt).
//   curr_delta - delta values indexed by this layer's output layout (out_)
//   index      - selects the per-worker storage
// Steps: (1) scatter curr_delta through the weights into prev_delta,
// (2) scale prev_delta by the previous layer's activation derivative,
// (3) accumulate dW and db, (4) un-pad the delta when padding is `same`,
// (5) return the result of prev_->back_propagation(ws.prev_delta_, index).
const vec_t& back_propagation(const vec_t& curr_delta, size_t index) override {
        auto& ws = this->get_worker_storage(index);
        conv_layer_worker_specific_storage& cws = conv_layer_worker_storage_[index];

        const vec_t& prev_out = *(cws.prev_out_padded_);
        const activation::function& prev_h = prev_->activation_function();
        // With `same` padding the delta is written to the padded buffer first and
        // un-padded near the end; otherwise it is written straight into ws.prev_delta_.
        vec_t* prev_delta = (pad_type_ == padding::same) ? &cws.prev_delta_padded_ : &ws.prev_delta_;
        vec_t& dW = ws.dW_;
        vec_t& db = ws.db_;

        // Start from zero because the loops below accumulate with +=.
        std::fill(prev_delta->begin(), prev_delta->end(), float_t(0));

        // propagate delta to previous layer
        for_i(in_.depth_, [&](int inc) {
            for (cnn_size_t outc = 0; outc < out_.depth_; outc++) {
                // Skip channel pairs that the connection table does not link.
                if (!tbl_.is_connected(outc, inc)) continue;

                // Base pointers: kernel for (outc, inc), delta plane outc, destination plane inc.
                const float_t *pw = &this->W_[weight_.get_index(0, 0, in_.depth_ * outc + inc)];
                const float_t *pdelta_src = &curr_delta[out_.get_index(0, 0, outc)];
                float_t *pdelta_dst = &(*prev_delta)[in_padded_.get_index(0, 0, inc)];

                for (cnn_size_t y = 0; y < out_.height_; y++) {
                    for (cnn_size_t x = 0; x < out_.width_; x++) {
                        const float_t * ppw = pw;
                        const float_t ppdelta_src = pdelta_src[y * out_.width_ + x];
                        // Top-left corner of the window that output position (x, y) covers.
                        float_t * ppdelta_dst = pdelta_dst + y * h_stride_ * in_padded_.width_ + x * w_stride_;

                        for (cnn_size_t wy = 0; wy < weight_.height_; wy++) {
                            for (cnn_size_t wx = 0; wx < weight_.width_; wx++) {
                                // Each position in the window receives weight * delta.
                                ppdelta_dst[wy * in_padded_.width_ + wx] += *ppw++ * ppdelta_src;

        // Multiply each element by the previous layer's activation derivative,
        // evaluated at the matching element of prev_out.
        for_i(parallelize_, in_padded_.size(), [&](int i) {
            (*prev_delta)[i] *= prev_h.df(prev_out[i]);

        // accumulate dw
        for_i(in_.depth_, [&](int inc) {
            for (cnn_size_t outc = 0; outc < out_.depth_; outc++) {

                if (!tbl_.is_connected(outc, inc)) continue;

                for (cnn_size_t wy = 0; wy < weight_.height_; wy++) {
                    for (cnn_size_t wx = 0; wx < weight_.width_; wx++) {
                        float_t dst = float_t(0);
                        // Input plane inc, offset by the kernel position (wx, wy).
                        const float_t * prevo = &prev_out[in_padded_.get_index(wx, wy, inc)];
                        const float_t * delta = &curr_delta[out_.get_index(0, 0, outc)];

                        // Sum, row by row, the dot product of an input row and the matching delta row.
                        // NOTE(review): h_stride_ / w_stride_ are not applied here, unlike in the
                        // delta propagation above -- confirm whether strides other than 1 are supported.
                        for (cnn_size_t y = 0; y < out_.height_; y++) {
                            dst += vectorize::dot(prevo + y * in_padded_.width_, delta + y * out_.width_, out_.width_);
                        dW[weight_.get_index(wx, wy, in_.depth_ * outc + inc)] += dst;

        // accumulate db
        if (!db.empty()) {
            for (cnn_size_t outc = 0; outc < out_.depth_; outc++) {
                // The bias gradient for channel outc is the sum of that channel's deltas.
                const float_t *delta = &curr_delta[out_.get_index(0, 0, outc)];
                db[outc] += std::accumulate(delta, delta + out_.width_ * out_.height_, float_t(0));

        // With `same` padding, copy the padded delta into ws.prev_delta_ without the padding.
        if (pad_type_ == padding::same)
            copy_and_unpad_delta(cws.prev_delta_padded_, ws.prev_delta_);

        CNN_LOG_VECTOR(curr_delta, "[pc]curr_delta");
        // NOTE(review): this logs prev_delta_[index], not the local prev_delta pointer
        // or ws.prev_delta_ used above -- confirm that this member exists and is the intended buffer.
        CNN_LOG_VECTOR(prev_delta_[index], "[pc]prev_delta");
        CNN_LOG_VECTOR(dW, "[pc]dW");
        CNN_LOG_VECTOR(db, "[pc]db");

        // Continue the backward pass in the previous layer.
        return prev_->back_propagation(ws.prev_delta_, index);


    // Applies one optimizer step to this layer's weights and biases (excerpt; the
    // closing brace is not part of this excerpt).
    //   o           - optimizer whose update() is called for W_ and then for b_
    //   worker_size - forwarded to merge()
    //   batch_size  - forwarded to merge()
    void update_weight(Optimizer *o, cnn_size_t worker_size, cnn_size_t batch_size) {
        // Nothing to update when W_ is empty.
        if (W_.empty()) return;

        // presumably combines the per-worker gradients into worker_storage_[0],
        // which is the only slot read below -- confirm in merge()
        merge(worker_size, batch_size);

        CNN_LOG_VECTOR(W_, "[W-before]");
        CNN_LOG_VECTOR(b_, "[db-before]");

        // Each call receives the gradient, the matching Hessian member, and the parameters to update.
        o->update(worker_storage_[0].dW_, Whessian_, W_);
        o->update(worker_storage_[0].db_, bhessian_, b_);

        CNN_LOG_VECTOR(W_, "[W-updated]");
        CNN_LOG_VECTOR(b_, "[db-updated]");


