In the previous post we went through the code for a single RBM; a DBN is simply a stack of several RBMs. Its training procedure is: first train the RBMs greedily, one layer at a time; then use the learned weights and biases to initialize a corresponding BP (feed-forward) neural network; finally fine-tune the whole network with labelled data.
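Before diving in, here is a minimal sketch of how those three stages are invoked. It is not part of the original code: the wrapper function dbn_workflow and its flat-array arguments are only illustrative, and the constants simply mirror the test_dbn() example at the end of the post.

void dbn_workflow(int *train_X, int *train_Y, int *test_x, double *y) {
    // train_X: flat 6x6 array of binary inputs; train_Y: flat 6x2 array of one-hot labels
    // test_x: one 6-dimensional binary input; y: 2-dimensional output buffer
    int hidden_layer_sizes[] = {3, 3};           // two hidden layers with 3 units each
    DBN dbn(6, 6, hidden_layer_sizes, 2, 2);     // 6 samples, 6 inputs, 2 outputs, 2 hidden layers
    dbn.pretrain(train_X, 0.1, 1, 1000);         // stage 1: greedy CD-1 pre-training of each RBM
    dbn.finetune(train_X, train_Y, 0.1, 500);    // stage 2: supervised fine-tuning of the whole stack
    dbn.predict(test_x, y);                      // stage 3: forward pass + softmax on a test sample
}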
Now let us look at the complete DBN code, starting with the definitions of the classes involved.

DBN class definition:
class DBN {
public:
    int N;
    int n_ins;
    int *hidden_layer_sizes;
    int n_outs;
    int n_layers;
    HiddenLayer **sigmoid_layers;
    RBM **rbm_layers;
    LogisticRegression *log_layer;
    DBN(int, int, int*, int, int);
    ~DBN();
    void pretrain(int*, double, int, int);
    void finetune(int*, int*, double, int);
    void predict(int*, double*);
};
HiddenLayer class definition:

class HiddenLayer {
public:
    int N;
    int n_in;
    int n_out;
    double **W;
    double *b;
    HiddenLayer(int, int, int, double**, double*);
    ~HiddenLayer();
    double output(int*, double*, double);
    void sample_h_given_v(int*, int*);
};
LogisticRegression class definition:

class LogisticRegression {
public:
    int N;          // number of training samples
    int n_in;
    int n_out;
    double **W;
    double *b;
    LogisticRegression(int, int, int);
    ~LogisticRegression();
    void train(int*, int*, double);
    void softmax(double*);
    void predict(int*, double*);
};
RBM class definition:

class RBM {
public:
    int N;
    int n_visible;
    int n_hidden;
    double **W;
    double *hbias;
    double *vbias;
    RBM(int, int, int, double**, double*, double*);
    ~RBM();
    void contrastive_divergence(int*, double, int);
    void sample_h_given_v(int*, double*, int*);
    void sample_v_given_h(int*, double*, int*);
    double propup(int*, double*, double);
    double propdown(int*, int, double);
    void gibbs_hvh(int*, double*, int*, double*, int*);
    void reconstruct(int*, double*);
};
#include <iostream>
#include <math.h>
#include <stdlib.h>   // for rand()/srand(); not in the original includes
#include "HiddenLayer.h"
#include "RBM.h"
#include "LogisticRegression.h"
#include "DBN.h"
using namespace std;

// draw a random number uniformly between min and max
double uniform(double min, double max) {
    return rand() / (RAND_MAX + 1.0) * (max - min) + min;
}

// binarization: sample from a binomial distribution B(n, p)
int binomial(int n, double p) {
    if(p < 0 || p > 1) return 0;
    int c = 0;
    double r;
    for(int i=0; i<n; i++) {
        r = rand() / (RAND_MAX + 1.0);
        if (r < p) c++;
    }
    return c;
}

double sigmoid(double x) {
    return 1.0 / (1.0 + exp(-x));
}

// DBN
DBN::DBN(int size, int n_i, int *hls, int n_o, int n_l) {
    int input_size;                    // number of input units of each RBM
    N = size;                          // number of training samples
    n_ins = n_i;                       // number of input units of the whole network
    hidden_layer_sizes = hls;          // number of units in each hidden layer
    n_outs = n_o;                      // number of output units
    n_layers = n_l;                    // number of hidden layers

    sigmoid_layers = new HiddenLayer*[n_layers];   // n_layers HiddenLayer pointers
    rbm_layers = new RBM*[n_layers];               // n_layers RBM pointers

    // construct multi-layer
    for(int i=0; i<n_layers; i++) {
        if(i == 0) {
            input_size = n_ins;                    // the first RBM reads the raw input
        } else {
            input_size = hidden_layer_sizes[i-1];  // later RBMs read the previous hidden layer
        }

        // construct sigmoid_layer
        sigmoid_layers[i] = new HiddenLayer(N, input_size, hidden_layer_sizes[i], NULL, NULL);

        // construct rbm_layer; each RBM shares W and hbias with its hidden layer,
        // which is how the pre-trained weights initialize the BP network
        rbm_layers[i] = new RBM(N, input_size, hidden_layer_sizes[i],
                                sigmoid_layers[i]->W, sigmoid_layers[i]->b, NULL);
    }

    // layer for output using LogisticRegression
    log_layer = new LogisticRegression(N, hidden_layer_sizes[n_layers-1], n_outs);
}

DBN::~DBN() {
    delete log_layer;
    for(int i=0; i<n_layers; i++) {
        delete sigmoid_layers[i];
        delete rbm_layers[i];
    }
    delete[] sigmoid_layers;
    delete[] rbm_layers;
}

// Pre-training: input is the training data, lr the learning rate, k the k of CD-k,
// epochs the number of training epochs
void DBN::pretrain(int *input, double lr, int k, int epochs) {
    int *layer_input = NULL;   // must be initialized to NULL; the original source left it (and the ones below) uninitialized
    int prev_layer_input_size;
    int *prev_layer_input;
    int *train_X = new int[n_ins];

    for(int i=0; i<n_layers; i++) {                 // train layer by layer
        for(int epoch=0; epoch<epochs; epoch++) {   // training epochs
            for(int n=0; n<N; n++) {                // input x1...xN
                // initial input
                for(int m=0; m<n_ins; m++) train_X[m] = input[n * n_ins + m];

                // layer input
                for(int l=0; l<=i; l++) {
                    if(l == 0) {
                        // the input of layer 0 is the sample itself: layer_input[j] = train_X[j]
                        layer_input = new int[n_ins];
                        for(int j=0; j<n_ins; j++) layer_input[j] = train_X[j];
                    } else {
                        if(l == 1) prev_layer_input_size = n_ins;              // the first hidden layer has n_ins inputs
                        else prev_layer_input_size = hidden_layer_sizes[l-2];  // later layers take the previous hidden layer's size;
                                                                               // hidden_layer_sizes is 0-indexed, hence l-2

                        prev_layer_input = new int[prev_layer_input_size];     // input array of this layer
                        for(int j=0; j<prev_layer_input_size; j++) prev_layer_input[j] = layer_input[j];
                        delete[] layer_input;

                        layer_input = new int[hidden_layer_sizes[l-1]];
                        // propagate up through the already-trained layers via HiddenLayer::sample_h_given_v,
                        // so layer_input is binary
                        sigmoid_layers[l-1]->sample_h_given_v(prev_layer_input, layer_input);
                        delete[] prev_layer_input;
                    }
                }

                rbm_layers[i]->contrastive_divergence(layer_input, lr, k);  // train the i-th RBM with CD-k
            }
        }
    }

    delete[] train_X;
    delete[] layer_input;
}

// Fine-tune the whole network with labelled data
void DBN::finetune(int *input, int *label, double lr, int epochs) {
    int *layer_input = NULL;
    // int prev_layer_input_size;
    int *prev_layer_input;
    int *train_X = new int[n_ins];
    int *train_Y = new int[n_outs];

    for(int epoch=0; epoch<epochs; epoch++) {
        for(int n=0; n<N; n++) {    // input x1...xN
            // initial input
            for(int m=0; m<n_ins; m++)  train_X[m] = input[n * n_ins + m];
            for(int m=0; m<n_outs; m++) train_Y[m] = label[n * n_outs + m];

            // layer input
            for(int i=0; i<n_layers; i++) {
                if(i == 0) {
                    prev_layer_input = new int[n_ins];
                    for(int j=0; j<n_ins; j++) prev_layer_input[j] = train_X[j];
                } else {
                    prev_layer_input = new int[hidden_layer_sizes[i-1]];
                    for(int j=0; j<hidden_layer_sizes[i-1]; j++) prev_layer_input[j] = layer_input[j];
                    delete[] layer_input;
                }

                layer_input = new int[hidden_layer_sizes[i]];
                sigmoid_layers[i]->sample_h_given_v(prev_layer_input, layer_input);
                delete[] prev_layer_input;
            }

            log_layer->train(layer_input, train_Y, lr);
        }
        // lr *= 0.95;
    }

    delete[] layer_input;
    delete[] train_X;
    delete[] train_Y;
}

void DBN::predict(int *x, double *y) {
    double *layer_input = NULL;
    // int prev_layer_input_size;
    double *prev_layer_input;
    double linear_output;

    prev_layer_input = new double[n_ins];
    for(int j=0; j<n_ins; j++) prev_layer_input[j] = x[j];

    // layer activation
    for(int i=0; i<n_layers; i++) {
        layer_input = new double[sigmoid_layers[i]->n_out];

        for(int k=0; k<sigmoid_layers[i]->n_out; k++) {
            linear_output = 0.0;
            for(int j=0; j<sigmoid_layers[i]->n_in; j++) {
                linear_output += sigmoid_layers[i]->W[k][j] * prev_layer_input[j];
            }
            linear_output += sigmoid_layers[i]->b[k];
            layer_input[k] = sigmoid(linear_output);
        }
        delete[] prev_layer_input;

        if(i < n_layers-1) {
            prev_layer_input = new double[sigmoid_layers[i]->n_out];
            for(int j=0; j<sigmoid_layers[i]->n_out; j++) prev_layer_input[j] = layer_input[j];
            delete[] layer_input;
        }
    }

    for(int i=0; i<log_layer->n_out; i++) {
        y[i] = 0;
        for(int j=0; j<log_layer->n_in; j++) {
            y[i] += log_layer->W[i][j] * layer_input[j];
        }
        y[i] += log_layer->b[i];
    }
    log_layer->softmax(y);

    delete[] layer_input;
}

// HiddenLayer
// Construct a hidden layer with W[n_out][n_in] and b[n_out]
HiddenLayer::HiddenLayer(int size, int in, int out, double **w, double *bp) {
    N = size;       // number of samples
    n_in = in;      // number of input units
    n_out = out;    // number of units in this hidden layer

    if(w == NULL) {
        W = new double*[n_out];
        for(int i=0; i<n_out; i++) W[i] = new double[n_in];
        double a = 1.0 / n_in;

        for(int i=0; i<n_out; i++) {
            for(int j=0; j<n_in; j++) {
                W[i][j] = uniform(-a, a);
            }
        }
    } else {
        W = w;
    }

    if(bp == NULL) {
        b = new double[n_out];
        for(int i=0; i<n_out; i++) b[i] = 0;   // zero-initialize; the original left the biases uninitialized
    } else {
        b = bp;
    }
}

HiddenLayer::~HiddenLayer() {
    for(int i=0; i<n_out; i++) delete[] W[i];
    delete[] W;
    delete[] b;
}

// Compute the activation of one hidden unit
double HiddenLayer::output(int *input, double *w, double b) {
    double linear_output = 0.0;
    for(int j=0; j<n_in; j++) {
        linear_output += w[j] * input[j];
    }
    linear_output += b;
    return sigmoid(linear_output);
}

// Binarize the hidden layer's output
void HiddenLayer::sample_h_given_v(int *input, int *sample) {
    for(int i=0; i<n_out; i++) {
        sample[i] = binomial(1, output(input, W[i], b[i]));
    }
}

// RBM
RBM::RBM(int size, int n_v, int n_h, double **w, double *hb, double *vb) {
    N = size;
    n_visible = n_v;
    n_hidden = n_h;

    if(w == NULL) {
        W = new double*[n_hidden];
        for(int i=0; i<n_hidden; i++) W[i] = new double[n_visible];
        double a = 1.0 / n_visible;

        for(int i=0; i<n_hidden; i++) {
            for(int j=0; j<n_visible; j++) {
                W[i][j] = uniform(-a, a);
            }
        }
    } else {
        W = w;
    }

    if(hb == NULL) {
        hbias = new double[n_hidden];
        for(int i=0; i<n_hidden; i++) hbias[i] = 0;
    } else {
        hbias = hb;
    }

    if(vb == NULL) {
        vbias = new double[n_visible];
        for(int i=0; i<n_visible; i++) vbias[i] = 0;
    } else {
        vbias = vb;
    }
}

RBM::~RBM() {
    // W and hbias are shared with the corresponding HiddenLayer, so they are not freed here
    // for(int i=0; i<n_hidden; i++) delete[] W[i];
    // delete[] W;
    // delete[] hbias;
    delete[] vbias;
}

void RBM::contrastive_divergence(int *input, double lr, int k) {
    double *ph_mean = new double[n_hidden];
    int *ph_sample = new int[n_hidden];
    double *nv_means = new double[n_visible];
    int *nv_samples = new int[n_visible];
    double *nh_means = new double[n_hidden];
    int *nh_samples = new int[n_hidden];

    /* CD-k */
    sample_h_given_v(input, ph_mean, ph_sample);

    for(int step=0; step<k; step++) {
        if(step == 0) {
            gibbs_hvh(ph_sample, nv_means, nv_samples, nh_means, nh_samples);
        } else {
            gibbs_hvh(nh_samples, nv_means, nv_samples, nh_means, nh_samples);
        }
    }

    // parameter updates: positive-phase statistics from the data, negative phase from the k-step reconstruction
    for(int i=0; i<n_hidden; i++) {
        for(int j=0; j<n_visible; j++) {
            // W[i][j] += lr * (ph_sample[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
            W[i][j] += lr * (ph_mean[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
        }
        hbias[i] += lr * (ph_sample[i] - nh_means[i]) / N;
    }

    for(int i=0; i<n_visible; i++) {
        vbias[i] += lr * (input[i] - nv_samples[i]) / N;
    }

    delete[] ph_mean;
    delete[] ph_sample;
    delete[] nv_means;
    delete[] nv_samples;
    delete[] nh_means;
    delete[] nh_samples;
}

void RBM::sample_h_given_v(int *v0_sample, double *mean, int *sample) {
    for(int i=0; i<n_hidden; i++) {
        mean[i] = propup(v0_sample, W[i], hbias[i]);
        sample[i] = binomial(1, mean[i]);
    }
}

void RBM::sample_v_given_h(int *h0_sample, double *mean, int *sample) {
    for(int i=0; i<n_visible; i++) {
        mean[i] = propdown(h0_sample, i, vbias[i]);
        sample[i] = binomial(1, mean[i]);
    }
}

double RBM::propup(int *v, double *w, double b) {
    double pre_sigmoid_activation = 0.0;
    for(int j=0; j<n_visible; j++) {
        pre_sigmoid_activation += w[j] * v[j];
    }
    pre_sigmoid_activation += b;
    return sigmoid(pre_sigmoid_activation);
}

double RBM::propdown(int *h, int i, double b) {
    double pre_sigmoid_activation = 0.0;
    for(int j=0; j<n_hidden; j++) {
        pre_sigmoid_activation += W[j][i] * h[j];
    }
    pre_sigmoid_activation += b;
    return sigmoid(pre_sigmoid_activation);
}

void RBM::gibbs_hvh(int *h0_sample, double *nv_means, int *nv_samples,
                    double *nh_means, int *nh_samples) {
    sample_v_given_h(h0_sample, nv_means, nv_samples);
    sample_h_given_v(nv_samples, nh_means, nh_samples);
}

void RBM::reconstruct(int *v, double *reconstructed_v) {
    double *h = new double[n_hidden];
    double pre_sigmoid_activation;

    for(int i=0; i<n_hidden; i++) {
        h[i] = propup(v, W[i], hbias[i]);
    }

    for(int i=0; i<n_visible; i++) {
        pre_sigmoid_activation = 0.0;
        for(int j=0; j<n_hidden; j++) {
            pre_sigmoid_activation += W[j][i] * h[j];
        }
        pre_sigmoid_activation += vbias[i];
        reconstructed_v[i] = sigmoid(pre_sigmoid_activation);
    }

    delete[] h;
}

// LogisticRegression
// Construct a logistic regression output layer
LogisticRegression::LogisticRegression(int size, int in, int out) {
    N = size;
    n_in = in;
    n_out = out;

    W = new double*[n_out];
    for(int i=0; i<n_out; i++) W[i] = new double[n_in];
    b = new double[n_out];

    for(int i=0; i<n_out; i++) {
        for(int j=0; j<n_in; j++) {
            W[i][j] = 0;
        }
        b[i] = 0;
    }
}

LogisticRegression::~LogisticRegression() {
    for(int i=0; i<n_out; i++) delete[] W[i];
    delete[] W;
    delete[] b;
}

// Train the output layer just like a BP neural network
void LogisticRegression::train(int *x, int *y, double lr) {
    double *p_y_given_x = new double[n_out];
    double *dy = new double[n_out];

    // forward pass: compute the output
    for(int i=0; i<n_out; i++) {
        p_y_given_x[i] = 0;
        for(int j=0; j<n_in; j++) {
            p_y_given_x[i] += W[i][j] * x[j];
        }
        p_y_given_x[i] += b[i];
    }
    softmax(p_y_given_x);

    // backward pass: update the weights and biases
    for(int i=0; i<n_out; i++) {
        dy[i] = y[i] - p_y_given_x[i];

        for(int j=0; j<n_in; j++) {
            W[i][j] += lr * dy[i] * x[j] / N;
        }
        b[i] += lr * dy[i] / N;
    }

    delete[] p_y_given_x;
    delete[] dy;
}

void LogisticRegression::softmax(double *x) {
    double max = 0.0;
    double sum = 0.0;

    for(int i=0; i<n_out; i++) if(max < x[i]) max = x[i];
    for(int i=0; i<n_out; i++) {
        x[i] = exp(x[i] - max);
        sum += x[i];
    }
    for(int i=0; i<n_out; i++) x[i] /= sum;
}

// Run this layer's forward pass only
void LogisticRegression::predict(int *x, double *y) {
    for(int i=0; i<n_out; i++) {
        y[i] = 0;
        for(int j=0; j<n_in; j++) {
            y[i] += W[i][j] * x[j];
        }
        y[i] += b[i];
    }
    softmax(y);
}

void test_dbn() {
    srand(0);

    double pretrain_lr = 0.1;
    int pretraining_epochs = 1000;
    int k = 1;
    double finetune_lr = 0.1;
    int finetune_epochs = 500;

    int train_N = 6;
    int test_N = 3;
    int n_ins = 6;
    int n_outs = 2;
    int hidden_layer_sizes[] = {3, 3};
    int n_layers = sizeof(hidden_layer_sizes) / sizeof(hidden_layer_sizes[0]);

    // training data
    int train_X[6][6] = {
        {1, 1, 1, 0, 0, 0},
        {1, 0, 1, 0, 0, 0},
        {1, 1, 1, 0, 0, 0},
        {0, 0, 1, 1, 1, 0},
        {0, 0, 1, 1, 0, 0},
        {0, 0, 1, 1, 1, 0}
    };
    int train_Y[6][2] = {
        {1, 0},
        {1, 0},
        {1, 0},
        {0, 1},
        {0, 1},
        {0, 1}
    };

    // construct DBN
    DBN dbn(train_N, n_ins, hidden_layer_sizes, n_outs, n_layers);

    // pretrain
    dbn.pretrain(*train_X, pretrain_lr, k, pretraining_epochs);

    // finetune
    dbn.finetune(*train_X, *train_Y, finetune_lr, finetune_epochs);

    // test data
    int test_X[3][6] = {
        {1, 1, 0, 0, 0, 0},
        {0, 0, 0, 1, 1, 0},
        {1, 1, 1, 1, 1, 0}
    };
    double test_Y[3][2];

    // test
    for(int i=0; i<test_N; i++) {
        dbn.predict(test_X[i], test_Y[i]);
        for(int j=0; j<n_outs; j++) {
            cout << test_Y[i][j] << " ";
        }
        cout << endl;
    }
}
int main() {
    test_dbn();
    return 0;
}
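For reference, the update that RBM::contrastive_divergence performs is the standard CD-k approximation to the log-likelihood gradient, where \epsilon is the learning rate, \langle\cdot\rangle_{\text{data}} the statistics computed from the input, and \langle\cdot\rangle_{k} the statistics after k steps of Gibbs sampling (the code additionally divides each update by the number of samples N, and c/b correspond to hbias/vbias):

\Delta W_{ij} = \epsilon\left(\langle h_i v_j\rangle_{\text{data}} - \langle h_i v_j\rangle_{k}\right),\qquad
\Delta c_i = \epsilon\left(\langle h_i\rangle_{\text{data}} - \langle h_i\rangle_{k}\right),\qquad
\Delta b_j = \epsilon\left(\langle v_j\rangle_{\text{data}} - \langle v_j\rangle_{k}\right)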
Judging from the training data, these results look reasonably correct: the first test sample matches the pattern of the first class, the second matches the second class, and the third contains both patterns.