OpenPose训练过程解析(4)

CPMTransformationParameter参数解析

# CPMData input layer: emits two top blobs, "data" and "label".
# Note: protobuf text format only accepts '#' comments, so annotations must not use '//'.
layer {
  name: "data"
  type: "CPMData"
  top: "data"
  top: "label"
  data_param {      # see caffe.proto, line 687 (per the original annotation)
    source: "/mnt/sdb/yangbin/COCO_kpt/lmdb"
    batch_size: 10
    backend: LMDB
  }
  cpm_transform_param {
    stride: 8
    max_rotate_degree: 40
    visualize: false
    crop_size_x: 368
    crop_size_y: 368
    scale_prob: 1
    scale_min: 0.5
    scale_max: 1.1
    target_dist: 0.6
    center_perterb_max: 40
    do_clahe: false
    num_parts: 56
    np_in_lmdb: 17
  }
}
// Settings read by the CPM data layer / transformer through cpm_transform_param().
// Field numbers 36 and 37 do not appear in this excerpt (35 is followed by 38).
message CPMTransformationParameter {
  // For data pre-processing, simple scaling and mean subtraction can be applied
  // (note: mean subtraction is normally carried out before scaling).
  optional float scale = 1 [default = 1];
  // Specify whether to mirror the data (default: false).
  optional bool mirror = 2 [default = false];
  // Specify the crop size (default 0; in practice crop_size_x and crop_size_y are set instead).
  optional uint32 crop_size = 3 [default = 0];
  // Original note: the mean file and mean values may be specified together.
  // NOTE(review): upstream Caffe's DataTransformer is believed to reject that combination -
  // confirm. In the Transform() code shown in this article, mean_file takes precedence
  // over mean_value when both are present.
  optional string mean_file = 4;
  // if specified can be repeated once  (the same mean is subtracted from every channel)
  // or can be repeated the same number of times as channels  (one mean per channel)
  repeated float mean_value = 5;
  // The label blob is shaped at (height/stride) x (width/stride) by the data layer.
  optional uint32 stride = 6 [default = 4];
  optional float scale_cvg = 7 [default = 0.5];
  optional uint32 max_cvg_len = 8 [default = 50];
  optional uint32 min_cvg_len = 9 [default = 50];
  optional bool opaque_coverage = 10 [default = true];
  optional string coverage = 11 [default = "gridbox_max"];
  optional float flip_prob = 12 [default = 0.5];
  optional float max_rotate_degree = 13 [default = 5.0];
  optional bool visualize = 14 [default = false];
  // Width / height used for the data and label blobs in the TRAIN phase.
  optional uint32 crop_size_x = 15 [default = 368];
  optional uint32 crop_size_y = 16 [default = 368];
  optional float scale_prob = 17 [default = 0.5];
  optional float scale_min = 18 [default = 0.9];
  optional float scale_max = 19 [default = 1.1];
  optional float bbox_norm_factor = 20 [default = 300];
  optional string img_header = 21 [default = "."];
  // Force the decoded image to have 3 color channels.
  optional bool force_color = 22 [default = false];
  // Force the decoded image to have 1 color channels.
  optional bool force_gray = 23 [default = false];
  optional float target_dist = 24 [default = 1.0];
  optional float center_perterb_max = 25 [default = 10.0];
  optional float sigma = 26 [default = 7.0];
  optional float sigma_center = 27 [default = 21.0];
  // Tile size and clip limit passed to clahe() when do_clahe is true.
  optional float clahe_tile_size = 28 [default = 8.0];
  optional float clahe_clip_limit = 29 [default = 4.0];
  optional bool do_clahe = 30 [default = false];
  // The label blob has 2*(num_parts+1) channels.
  optional uint32 num_parts = 31 [default = 14];
  optional uint32 num_total_augs = 32 [default = 82];
  optional string aug_way = 33 [default = "rand"];
  // When equal to 1, the image is converted to gray and back to 3 channels.
  optional uint32 gray = 34 [default = 0];
  optional uint32 np_in_lmdb = 35 [default = 16];
  // When true, TransformMetaJoints() is applied to the metadata read from the datum.
  optional bool transform_body_joint = 38 [default = true];
}

一个Datum有三个维度:channels、height和width,可以看做是少了num维度的Blob。存放数据的地方有两个:data和float_data,分别存放字节(整数)型和浮点型数据。图像数据一般是整型,放在data里;特征向量一般是浮点型,放在float_data里。label存放数据的类别标签,是整数型。encoded标识数据是否需要被解码(里面有可能放的是JPEG或者PNG之类经过编码的数据)。

// A single data record: dimensions, payload (bytes or floats), an integer label and an
// "encoded" flag.
message Datum {
  optional int32 channels = 1;    // data dimensions: channels * height * width
  optional int32 height = 2;
  optional int32 width = 3;
  // the actual image data, in bytes
  optional bytes data = 4;        // image data, stored as raw bytes
  optional int32 label = 5;
  // Optionally, the datum could also hold float data.
  repeated float float_data = 6;    // optional: the data may instead be stored as floats
  // If true data contains an encoded image that need to be decoded
  optional bool encoded = 7 [default = false];  // marks whether data must be decoded (it may hold encoded content such as JPEG or PNG)
}

DataLayerSetUp函数实现层设置

template 
CPMDataLayer::CPMDataLayer(const LayerParameter& param)
  : BasePrefetchingDataLayer(param),
    reader_(param),
    cpm_transform_param_(param.cpm_transform_param()){
}
template 
void CPMDataLayer::DataLayerSetUp(const vector*>& bottom,
      const vector*>& top) {
  cpm_data_transformer_.reset(
     new CPMDataTransformer(cpm_transform_param_, this->phase_));   // 调用DataLayerSetUp函数的类设置phase为train or test
  cpm_data_transformer_->InitRand();                                       // cpm_data_transformer_初始化

上述两部分代码都位于cpm_data_layer.cpp中。第一部分为CPMData层的构造函数:LayerParameter是描述网络中任意一层的通用参数类(例如Loss层、ReLU层、Data层……都用它来描述),param即从我们编写的网络定义(prototxt)中读入的该层参数。构造函数用param.cpm_transform_param()来初始化成员cpm_transform_param_(其声明为CPMTransformationParameter cpm_transform_param_;),这样cpm_transform_param_就包含了CPMData层在cpm_transform_param中设置的所有参数,即如下所示。第二部分为DataLayerSetUp函数的开头,它用这些参数和当前的phase创建cpm_data_transformer_。

# Values set for the CPMData layer; the notes compare each value with the default declared
# in message CPMTransformationParameter.
cpm_transform_param {
  stride: 8                 # default 4
  max_rotate_degree: 40     # default 5.0
  visualize: false          # same as default
  crop_size_x: 368          # same as default
  crop_size_y: 368          # same as default
  scale_prob: 1             # default 0.5
  scale_min: 0.5            # default 0.9
  scale_max: 1.1            # same as default
  target_dist: 0.6          # default 1.0
  center_perterb_max: 40    # default 10.0
  do_clahe: false           # same as default
  num_parts: 56             # default 14
  np_in_lmdb: 17            # default 16
}
  • 设置crop_size_x和crop_size_y
  // image: shape the data top blob, the prefetch buffers and the per-item transformed blob
  const int crop_size = this->layer_param_.cpm_transform_param().crop_size();
  const int batch_size = this->layer_param_.data_param().batch_size();
  if (crop_size > 0) {              // in practice crop_size keeps its default of 0, so this (empty) branch is not used
    // top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
    // for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
    //   this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), crop_size, crop_size);
    // }
    // //this->transformed_data_.Reshape(1, 4, crop_size, crop_size);
    // this->transformed_data_.Reshape(1, 6, crop_size, crop_size);
  } 
  else {
    const int height = this->phase_ != TRAIN ? datum.height() :    // outside TRAIN: use the datum's own height
      this->layer_param_.cpm_transform_param().crop_size_y();      // TRAIN: use crop_size_y (368 in this configuration)
    const int width = this->phase_ != TRAIN ? datum.width() :      // same rule for the width, using crop_size_x
      this->layer_param_.cpm_transform_param().crop_size_x();
    LOG(INFO) << "PREFETCH_COUNT is " << this->PREFETCH_COUNT;
    top[0]->Reshape(batch_size, datum.channels(), height, width);
    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {
      this->prefetch_[i].data_.Reshape(batch_size, datum.channels(), height, width);
    }
    //this->transformed_data_.Reshape(1, 4, height, width);
    // A single item (num = 1) with the same channel count and spatial size as the batch.
    this->transformed_data_.Reshape(1, datum.channels(), height, width);
  }
  • 设置num_parts和stride
  // label: shape the label top blob, the prefetch label buffers and the per-item label blob
  if (this->output_labels_) {
    const int stride = this->layer_param_.cpm_transform_param().stride();
    // Same height/width rule as for the image blob: crop size in TRAIN, datum size otherwise.
    const int height = this->phase_ != TRAIN ? datum.height() :
      this->layer_param_.cpm_transform_param().crop_size_y();
    const int width = this->phase_ != TRAIN ? datum.width() :
      this->layer_param_.cpm_transform_param().crop_size_x();

    int num_parts = this->layer_param_.cpm_transform_param().num_parts(); // num_parts = 56 in the COCO configuration
    // Labels have 2*(num_parts+1) channels at 1/stride of the image resolution (integer division).
    top[1]->Reshape(batch_size, 2*(num_parts+1), height/stride, width/stride);
    // Log output observed during training:
    //I1008 14:29:50.468617 33177 net.cpp:157] Top shape: 10 6 368 368 (8125440)
    //I1008 14:29:50.468626 33177 net.cpp:157] Top shape: 10 114 46 46 (2412240)

    for (int i = 0; i < this->PREFETCH_COUNT; ++i) {      //static const int PREFETCH_COUNT = 3;
      this->prefetch_[i].label_.Reshape(batch_size, 2*(num_parts+1), height/stride, width/stride);  // 10,114,46,46
    }
    this->transformed_label_.Reshape(1, 2*(num_parts+1), height/stride, width/stride);  // 1,114,46,46
  }
  • 接下来的load_batch是一个纯虚函数,因此继承BasePrefetchingDataLayer类的子类都需要实现这个函数,用于读取数据并填充数据结构
virtual void load_batch(Batch* batch) = 0;
  • 调用data transformations(mirror, scale, crop……)
    // Apply data transformations (mirror, scale, crop...)
    timer.Start();
    // Offsets of the current item inside the batch's data and label blobs.
    const int offset_data = batch->data_.offset(item_id);
    const int offset_label = batch->label_.offset(item_id);
    // Point the per-item blobs at this item's slot so the transformer writes into the batch.
    this->transformed_data_.set_cpu_data(top_data + offset_data);
    this->transformed_label_.set_cpu_data(top_label + offset_label);
    if (datum.encoded()) {
      this->cpm_data_transformer_->Transform(cv_img, &(this->transformed_data_));  // encoded datum: call Transform (image only)
    } else {
      this->cpm_data_transformer_->Transform_nv(datum, 
        &(this->transformed_data_),
        &(this->transformed_label_), cnt); // raw datum: call Transform_nv (image and label)
      ++cnt;
    }

接下来通过调用Transform或Transform_nv函数来对数据进行处理:datum.encoded()为真时调用Transform,否则调用Transform_nv。

  • Transform函数
template
void CPMDataTransformer::Transform(const cv::Mat& cv_img,
                                       Blob* transformed_blob) {
  const int img_channels = cv_img.channels();
  const int img_height = cv_img.rows;
  const int img_width = cv_img.cols;

  const int channels = transformed_blob->channels();
  const int height = transformed_blob->height();
  const int width = transformed_blob->width();
  const int num = transformed_blob->num();

  CHECK_EQ(channels, img_channels);
  CHECK_LE(height, img_height);
  CHECK_LE(width, img_width);
  CHECK_GE(num, 1);

  CHECK(cv_img.depth() == CV_8U) << "Image data type must be unsigned byte";

  const int crop_size = param_.crop_size();
  const Dtype scale = param_.scale();
  const bool do_mirror = param_.mirror() && Rand(2);
  const bool has_mean_file = param_.has_mean_file();
  const bool has_mean_values = mean_values_.size() > 0;

  CHECK_GT(img_channels, 0);
  CHECK_GE(img_height, crop_size);
  CHECK_GE(img_width, crop_size);

  Dtype* mean = NULL;
  if (has_mean_file) {
    CHECK_EQ(img_channels, data_mean_.channels());
    CHECK_EQ(img_height, data_mean_.height());
    CHECK_EQ(img_width, data_mean_.width());
    mean = data_mean_.mutable_cpu_data();
  }
  if (has_mean_values) {
    CHECK(mean_values_.size() == 1 || mean_values_.size() == img_channels) <<
     "Specify either 1 mean_value or as many as channels: " << img_channels;
    if (img_channels > 1 && mean_values_.size() == 1) {
      // Replicate the mean_value for simplicity
      for (int c = 1; c < img_channels; ++c) {
        mean_values_.push_back(mean_values_[0]);
      }
    }
  }

  int h_off = 0;
  int w_off = 0;
  cv::Mat cv_cropped_img = cv_img;
  if (crop_size) {
    CHECK_EQ(crop_size, height);
    CHECK_EQ(crop_size, width);
    // We only do random crop when we do training.
    if (phase_ == TRAIN) {
      h_off = Rand(img_height - crop_size + 1);
      w_off = Rand(img_width - crop_size + 1);
    } else {
      h_off = (img_height - crop_size) / 2;
      w_off = (img_width - crop_size) / 2;
    }
    cv::Rect roi(w_off, h_off, crop_size, crop_size);
    cv_cropped_img = cv_img(roi);
  } else {
    CHECK_EQ(img_height, height);
    CHECK_EQ(img_width, width);
  }

  CHECK(cv_cropped_img.data);

  Dtype* transformed_data = transformed_blob->mutable_cpu_data();
  int top_index;
  for (int h = 0; h < height; ++h) {
    const uchar* ptr = cv_cropped_img.ptr(h);
    int img_index = 0;
    for (int w = 0; w < width; ++w) {
      for (int c = 0; c < img_channels; ++c) {
        if (do_mirror) {
          top_index = (c * height + h) * width + (width - 1 - w);    
    //图像存储顺序:C*H*W,因此top_index之前有C个通道,每个通道有H*W个像素,
    //在当前通道top_index之前又有h*width像素,最后还要加上当前行所在的w个像素
        } else {
          top_index = (c * height + h) * width + w;
        }
        // int top_index = (c * height + h) * width + w;
        Dtype pixel = static_cast(ptr[img_index++]);
        if (has_mean_file) {
          int mean_index = (c * img_height + h_off + h) * img_width + w_off + w;
          transformed_data[top_index] =
            (pixel - mean[mean_index]) * scale;
        } else {
          if (has_mean_values) {
            transformed_data[top_index] =
              (pixel - mean_values_[c]) * scale;      //减去均值操作
          } else {
            transformed_data[top_index] = pixel * scale;
          }
        }
      }
    }
  }
}
  • Transform_nv函数

load_batch函数中:datum是数据的来源,作为Transform_nv函数中的data,应该是制作好的LMDB数据

// Excerpt from load_batch: take the next Datum from the reader's queue of filled items
// ("Waiting for data" is the message passed to pop()).
Datum& datum = *(reader_.full().pop("Waiting for data"));
// Fill this item's data and label blobs from the datum.
this->cpm_data_transformer_->Transform_nv(datum, 
        &(this->transformed_data_),
        &(this->transformed_label_), cnt);
      ++cnt;
template void CPMDataTransformer::Transform_nv(const Datum& datum, Dtype* transformed_data, Dtype* transformed_label, int cnt) {
  
  //TODO: some parameter should be set in prototxt
  int clahe_tileSize = param_.clahe_tile_size();
  int clahe_clipLimit = param_.clahe_clip_limit();
  //float targetDist = 41.0/35.0;
  AugmentSelection as = {
    false,              //bool flip
    0.0,                //float degree
    Size(),             //Size crop
    0,                  //float scale
  };
  MetaData meta;
  
  const string& data = datum.data();        //输入图像数据
  const int datum_channels = datum.channels();
  const int datum_height = datum.height();
  const int datum_width = datum.width();
  // To do: make this a parameter in caffe.proto
  //const int mode = 5; //related to datum.channels();
  const int mode = 5;

  /** 位于Transform函数中
  //const int crop_size = param_.crop_size();
  //const Dtype scale = param_.scale();
  //const bool do_mirror = param_.mirror() && Rand(2);
  //const bool has_mean_file = param_.has_mean_file();
  **/
  const bool has_uint8 = data.size() > 0;
  //const bool has_mean_values = mean_values_.size() > 0;
  int crop_x = param_.crop_size_x();
  int crop_y = param_.crop_size_y();

  CHECK_GT(datum_channels, 0);
  //CHECK_GE(datum_height, crop_size);
  //CHECK_GE(datum_width, crop_size);
  CPUTimer timer1;
  timer1.Start();
  //before any transformation, get the image from datum
  Mat img = Mat::zeros(datum_height, datum_width, CV_8UC3);
  Mat mask_all, mask_miss;
  if(mode >= 5){
    mask_miss = Mat::ones(datum_height, datum_width, CV_8UC1);
  }
  if(mode == 6){
    mask_all = Mat::zeros(datum_height, datum_width, CV_8UC1);
  }

  int offset = img.rows * img.cols;
  int dindex;
  Dtype d_element;
  for (int i = 0; i < img.rows; ++i) {
    for (int j = 0; j < img.cols; ++j) {
      Vec3b& rgb = img.at(i, j);
      for(int c = 0; c < 3; c++){
        dindex = c*offset + i*img.cols + j;  //C*H*W格式存储
        if (has_uint8)
          d_element = static_cast(static_cast(data[dindex]));
        else
          d_element = datum.float_data(dindex);
        rgb[c] = d_element;    //img.at(i, j)的c通道数据(uchar类型)
      }

      if(mode >= 5){
        dindex = 4*offset + i*img.cols + j;
        if (has_uint8)
          d_element = static_cast(static_cast(data[dindex]));
        else
          d_element = datum.float_data(dindex);
        if (round(d_element/255)!=1 && round(d_element/255)!=0){    //主要用来判断d_element是否为整数,若为小数则四舍五入(float/255会得到小数)
          cout << d_element << " " << round(d_element/255) << endl;
        }
        mask_miss.at(i, j) = d_element; //round(d_element/255);
      }

      if(mode == 6){
        dindex = 5*offset + i*img.cols + j;
        if (has_uint8)
          d_element = static_cast(static_cast(data[dindex]));
        else
          d_element = datum.float_data(dindex);
        mask_all.at(i, j) = d_element;
      }
    }
  }
  VLOG(2) << "  rgb[:] = datum: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();

  //color, contract
  if(param_.do_clahe())
    clahe(img, clahe_tileSize, clahe_clipLimit);    //直方图均衡化
  if(param_.gray() == 1){
    cv::cvtColor(img, img, CV_BGR2GRAY);
    cv::cvtColor(img, img, CV_GRAY2BGR);
  }
  VLOG(2) << "  color: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();

  int offset3 = 3 * offset;
  int offset1 = datum_width;
  int stride = param_.stride();    //stride = 8
  ReadMetaData(meta, data, offset3, offset1);
  if(param_.transform_body_joint()) // we expect to transform body joints, and not to transform hand joints
    TransformMetaJoints(meta);

  VLOG(2) << "  ReadMeta+MetaJoints: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();
  //visualize original
  if(0 && param_.visualize()) 
    visualize(img, meta, as);

  //Start transforming
  Mat img_aug = Mat::zeros(crop_y, crop_x, CV_8UC3);
  Mat mask_miss_aug, mask_all_aug ;
  //Mat mask_miss_aug = Mat::zeros(crop_y, crop_x, CV_8UC1);
  //Mat mask_all_aug = Mat::zeros(crop_y, crop_x, CV_8UC1);
  Mat img_temp, img_temp2, img_temp3; //size determined by scale
  VLOG(2) << "   input size (" << img.cols << ", " << img.rows << ")"; 
  // We only do random transform as augmentation when training.
  if (phase_ == TRAIN) {
    as.scale = augmentation_scale(img, img_temp, mask_miss, mask_all, meta, mode);
    //LOG(INFO) << meta.joint_self.joints.size();
    //LOG(INFO) << meta.joint_self.joints[0];
    as.degree = augmentation_rotate(img_temp, img_temp2, mask_miss, mask_all, meta, mode);
    //LOG(INFO) << meta.joint_self.joints.size();
    //LOG(INFO) << meta.joint_self.joints[0];
    if(0 && param_.visualize()) 
      visualize(img_temp2, meta, as);
    as.crop = augmentation_croppad(img_temp2, img_temp3, mask_miss, mask_miss_aug, mask_all, mask_all_aug, meta, mode);
    //LOG(INFO) << meta.joint_self.joints.size();
    //LOG(INFO) << meta.joint_self.joints[0];
    if(0 && param_.visualize()) 
      visualize(img_temp3, meta, as);
    as.flip = augmentation_flip(img_temp3, img_aug, mask_miss_aug, mask_all_aug, meta, mode);
    //LOG(INFO) << meta.joint_self.joints.size();
    //LOG(INFO) << meta.joint_self.joints[0];
    if(param_.visualize()) 
      visualize(img_aug, meta, as);

    // imshow("img_aug", img_aug);
    // Mat label_map = mask_miss_aug;
    // applyColorMap(label_map, label_map, COLORMAP_JET);
    // addWeighted(label_map, 0.5, img_aug, 0.5, 0.0, label_map);
    // imshow("mask_miss_aug", label_map);

    if (mode > 4){
      resize(mask_miss_aug, mask_miss_aug, Size(), 1.0/stride, 1.0/stride, INTER_CUBIC);
    }
    if (mode > 5){
      resize(mask_all_aug, mask_all_aug, Size(), 1.0/stride, 1.0/stride, INTER_CUBIC);
    }
  }
  else {
    img_aug = img.clone();
    as.scale = 1;
    as.crop = Size();
    as.flip = 0;
    as.degree = 0;
  }
  VLOG(2) << "  Aug: " << timer1.MicroSeconds()/1000.0 << " ms";
  timer1.Start();
  //LOG(INFO) << "scale: " << as.scale << "; crop:(" << as.crop.width << "," << as.crop.height 
  //          << "); flip:" << as.flip << "; degree: " << as.degree;

  //copy transformed img (img_aug) into transformed_data, do the mean-subtraction here
  offset = img_aug.rows * img_aug.cols;
  int rezX = img_aug.cols;
  int rezY = img_aug.rows;
  int grid_x = rezX / stride;
  int grid_y = rezY / stride;
  int channelOffset = grid_y * grid_x;

  for (int i = 0; i < img_aug.rows; ++i) {
    for (int j = 0; j < img_aug.cols; ++j) {
      Vec3b& rgb = img_aug.at(i, j);
      transformed_data[0*offset + i*img_aug.cols + j] = (rgb[0] - 128)/256.0;
      transformed_data[1*offset + i*img_aug.cols + j] = (rgb[1] - 128)/256.0;
      transformed_data[2*offset + i*img_aug.cols + j] = (rgb[2] - 128)/256.0;
    }
  }
  
  // label size is image size/ stride
  if (mode > 4){
    for (int g_y = 0; g_y < grid_y; g_y++){
      for (int g_x = 0; g_x < grid_x; g_x++){
        for (int i = 0; i < np; i++){
          float weight = float(mask_miss_aug.at(g_y, g_x)) /255; //mask_miss_aug.at(i, j); 
          if (meta.joint_self.isVisible[i] != 3){
            transformed_label[i*channelOffset + g_y*grid_x + g_x] = weight;
          }
        }  
        // background channel
        if(mode == 5){
          transformed_label[np*channelOffset + g_y*grid_x + g_x] = float(mask_miss_aug.at(g_y, g_x)) /255;
        }
        if(mode > 5){
          transformed_label[np*channelOffset + g_y*grid_x + g_x] = 1;
          transformed_label[(2*np+1)*channelOffset + g_y*grid_x + g_x] = float(mask_all_aug.at(g_y, g_x)) /255;
        }
      }
    }
  }  

  //putGaussianMaps(transformed_data + 3*offset, meta.objpos, 1, img_aug.cols, img_aug.rows, param_.sigma_center());
  //LOG(INFO) << "image transformation done!";
  generateLabelMap(transformed_label, img_aug, meta);

  VLOG(2) << "  putGauss+genLabel: " << timer1.MicroSeconds()/1000.0 << " ms";
  //starts to visualize everything (transformed_data in 4 ch, label) fed into conv1
  //if(param_.visualize()){
    //dumpEverything(transformed_data, transformed_label, meta);
  //}
}

你可能感兴趣的:(OpenPose训练过程解析(4))