[Server Inference Acceleration Work - 6.18]

 

1. C++: checking whether a string contains a substring

#include <fstream>
#include <iostream>
#include <string>

std::string keycode1 = "quan";
std::string keycode2 = "pack";
std::string filepath = "C:\\Users\\ryankang\\Desktop\\fbgemm-log-1.txt";
std::ifstream fin(filepath);
std::string str;
// Looping on getline() instead of !fin.eof() avoids processing the last line
// twice (or acting on a failed read).
while (std::getline(fin, str))
{
	std::string::size_type idx1 = str.find(keycode1);
	std::string::size_type idx2 = str.find(keycode2);
	if (idx1 != std::string::npos || idx2 != std::string::npos) {
		std::cout << str << std::endl;
	}
}
fin.close();
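
Note: std::string only gained a contains() member in C++23, so on earlier standards the find() != std::string::npos check above is the idiomatic substring test.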

2. After PyTorch is built from source, an environment variable must be set before the caffe2 Python API can be used

export PYTHONPATH=$PYTHONPATH:/home/ryankang/workspace/projects/pytorch/build
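
A quick sanity check after setting the variable (assuming the build completed successfully) is to import one of the caffe2 Python modules:

python -c "from caffe2.python import workspace"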

3. FBGEMM int8 quantization: the call chain of the outputObject used inside fbgemmPacked is as follows:

The reason for requantization: in quantized matrix-matrix multiplication, the products of 8-bit integers are accumulated into 32-bit intermediate results, which are then requantized to produce the 8-bit output. (How exactly is the requantization performed? If requantization is not required, can the int32 intermediate result be dequantized directly to fp32 instead? dequantize/requantize still need further study.)
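
As a partial answer to the questions above, here is a minimal sketch of the usual requantization arithmetic (the scale/zero-point names and the omitted offset corrections are illustrative assumptions, not FBGEMM's exact API): the int32 accumulator is rescaled by A_scale * B_scale / C_scale, rounded, shifted by the output zero point, and saturated to the uint8 range. Dequantizing the int32 result straight to fp32 is also legitimate; requantizing to uint8 is what you want when the next operator consumes quantized tensors (for the fp32 case FBGEMM provides a ReQuantizeForFloat output processor).

#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch only: assumes acc already has the zero-point/row-offset/col-offset
// corrections applied, and uses per-tensor scales.
inline uint8_t requantize_u8(int32_t acc,
                             float A_scale, float B_scale, float C_scale,
                             int32_t C_zero_point) {
  const float real_multiplier = A_scale * B_scale / C_scale;
  const int32_t q =
      static_cast<int32_t>(std::nearbyint(acc * real_multiplier)) + C_zero_point;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));  // saturate to [0, 255]
}

// Dequantizing the same accumulator directly to fp32 would simply be
//   float y = A_scale * B_scale * static_cast<float>(acc);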

//    Fbgemm.cc contains the fbgemmPacked implementation
template <
    typename packingAMatrix,
    typename packingBMatrix,
    typename cT,
    typename processOutputType>
void fbgemmPacked(
    PackMatrix<
        packingAMatrix,
        typename packingAMatrix::inpType,
        typename packingAMatrix::accType>& packA,
    PackMatrix<
        packingBMatrix,
        typename packingBMatrix::inpType,
        typename packingBMatrix::accType>& packB,
    cT* C,
    int32_t* C_buffer,
    uint32_t ldc,
    const processOutputType& outProcess,
    int thread_id,
    int num_threads,
    const BlockingFactors* blocking_params) {
  static_assert(
      std::is_same<
          typename packingAMatrix::accType,
          typename packingBMatrix::accType>::value,
      "Accumulation type of both matrices should be the same");

  int MCB, KCB;
  int MR;

  // Run time CPU detection
  if (cpuinfo_initialize()) {
    if (blocking_params) {
      if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
        MCB = blocking_params->MCB;
        KCB = blocking_params->KCB;
        MR = blocking_params->MR;
      }
    } else {
      if (fbgemmHasAvx512Support()) {
        MCB = PackingTraits<
            typename packingAMatrix::inpType,
            typename packingAMatrix::accType,
            inst_set_t::avx512>::MCB;
        KCB = PackingTraits<
            typename packingAMatrix::inpType,
            typename packingAMatrix::accType,
            inst_set_t::avx512>::KCB;
        MR = PackingTraits<
            typename packingAMatrix::inpType,
            typename packingAMatrix::accType,
            inst_set_t::avx512>::MR;
      } else if (fbgemmHasAvx2Support()) {
        MCB = PackingTraits<
            typename packingAMatrix::inpType,
            typename packingAMatrix::accType,
            inst_set_t::avx2>::MCB;
        KCB = PackingTraits<
            typename packingAMatrix::inpType,
            typename packingAMatrix::accType,
            inst_set_t::avx2>::KCB;
        MR = PackingTraits<
            typename packingAMatrix::inpType,
            typename packingAMatrix::accType,
            inst_set_t::avx2>::MR;

      } else {
        // TODO: Have default slower path
        assert(0 && "unsupported architecture");
        return;
      }
    }
  } else {
    throw std::runtime_error("Failed to initialize cpuinfo!");
  }

  if (!packB.isPrePacked()) {
    throw std::runtime_error("B matrix must be prepacked");
  }
  int G = packA.numGroups();
  if (G != packB.numGroups()) {
    throw std::runtime_error(
        "A.groups = " + std::to_string(G) + " and B.groups = " +
        std::to_string(packB.numGroups()) + " are not the same");
  }

  int MDim = packA.numRows();
  int KDimPerGroup = packB.numRows() / G;

  int kBlocks = (KDimPerGroup + KCB - 1) / KCB;

  // remainders
  int _kc = KDimPerGroup % KCB;

  int kc, mc;

  block_type_t blockA{0, 0, 0, 0};
  int g_begin, g_end, i_begin, i_end;
  if (G >= num_threads) {
    // When G >= nthreads, just parallelize over G
    // TODO: when G == nthreads + 1, we'll have a big load imbalance because
    // only one thread will get 2 groups.
    fbgemmGetRange(num_threads, thread_id, G, 1, g_begin, g_end);
    i_begin = 0;
    i_end = MDim;
  } else {
    // Otherwise, each group is parallelized by multiple threads.
    // nthreads_per_group is floor(nthreads / G).
    // If we use ceil, some groups won't be handled by any thread.
    int nthreads_per_group = num_threads / G;
    g_begin = std::max(std::min(thread_id / nthreads_per_group, G - 1), 0);
    g_end = std::min(g_begin + 1, G);

    int tid_of_g_begin = std::min(g_begin * nthreads_per_group, num_threads);
    int tid_of_g_end = std::min(
        (g_end == G) ? num_threads : (tid_of_g_begin + nthreads_per_group),
        num_threads);
    int nthreads_within_group = tid_of_g_end - tid_of_g_begin;
    int tid_within_group = thread_id - tid_of_g_begin;
    fbgemmGetRange(
        nthreads_within_group, tid_within_group, MDim, MR, i_begin, i_end);
  }

  for (int g = g_begin; g < g_end; ++g) {    
//    Execution driver: exeKernelObj is created from outProcess (the outputObject) and its member function execute() is invoked for every packed block
    ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
        exeKernelObj(
            packA,
            packB,
            C,
            C_buffer,
            ldc,
            outProcess,
            thread_id,
            num_threads,
            blocking_params);
    for (int i = i_begin; i < i_end; i += MCB) { // i is the element index
      mc = std::min(i_end - i, MCB);
      for (int kb = 0; kb < kBlocks; ++kb) { // kb is the block index
        kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
        // pack A matrix
        blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc};
        packA.pack(blockA);
        exeKernelObj.execute(g * kBlocks + kb);

      }
    }
  } // for each group

}
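
In other words, within each group fbgemmPacked packs A one (MCB x KCB) block at a time and hands it to exeKernelObj.execute(); the output processing we are tracing happens inside execute(), shown next.
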
// ExecuteKernelU8S8.cc implements the execute function
template <typename packingAMatrix, typename cT, typename processOutputType>
void ExecuteKernel<
    packingAMatrix,
    PackBMatrix<int8_t, typename packingAMatrix::accType>,
    cT,
    processOutputType>::execute(int kBlock) {
  // packedA_.printPackedMatrix("packedA from kernel");
  // packedB_.printPackedMatrix("packedB from kernel");

  int32_t bColBlocks = packedB_.blockCols();

  int8_t* bBuf;
  int8_t* bBuf_pf;

  uint8_t* aBuf = packedA_.getBuf(0);

  int32_t packed_rows_A = packedA_.numPackedRows();
  int32_t row_start_A = packedA_.packedRowStart();

  int group = kBlock / packedB_.blockRows();
  int NDim = packedB_.numCols();
  bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows());
  bool accum = (kBlock % packedB_.blockRows()) > 0;

  typename BaseType::jit_micro_kernel_fp fn;

  if (fbgemmHasAvx512Support()) {
    fn = BaseType::template getOrCreate<inst_set_t::avx512>(
        accum,
        packed_rows_A,
        packedB_.blockColSize(),
        packedA_.numPackedCols(),
        nbSize_);
  } else if (fbgemmHasAvx2Support()) {
    fn = BaseType::template getOrCreate<inst_set_t::avx2>(
        accum,
        packed_rows_A,
        packedB_.blockColSize(),
        packedA_.numPackedCols(),
        nbSize_);
  } else {
    // TODO: Have default slower path
    assert(0 && "unsupported architecture");
    return;
  }

  for (int jb = 0; jb < bColBlocks; ++jb) {
    if (jb == bColBlocks - 1) {
      int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
      if (nc != nbSize_) {
        if (fbgemmHasAvx512Support()) {
          fn = BaseType::template getOrCreate<inst_set_t::avx512>(
              accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
        } else if (fbgemmHasAvx2Support()) {
          fn = BaseType::template getOrCreate<inst_set_t::avx2>(
              accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
        } else {
          // TODO: Have default slower path
          assert(0 && "unsupported architecture");
          return;
        }
      }
    }

    bBuf = packedB_.getBuf(jb, kBlock);
    // prefetch addr of the next packed block of B matrix
    bBuf_pf = packedB_.getBuf(jb == bColBlocks - 1 ? jb : jb + 1, kBlock);

    // If the accumulation buffer C_buffer_ is the same as matC_ (in-place output
    // processing), then each thread uses a different part of the output buffer
    // matC_;
    // otherwise, each thread uses a different portion of the accumulation
    // buffer C_buffer_. If m is large enough (m >= nthreads * MC), then we only
    // need an (nthreads * MC) x n portion of C_buffer_, and each thread starts
    // its rows at C_buffer_ + tid * MC * ldc_; when m is very small, we
    // just use the whole m x n C_buffer_, with each thread using a different
    // portion.
    int32_t* C_buffer_row_start = C_buffer_ +
        ((C_buffer_ == reinterpret_cast<int32_t*>(matC_) ||
          num_threads_ * mbSize_ > packedA_.numRows())
             ? row_start_A * ldc_ + NDim * group
             : thread_id_ * mbSize_ * ldc_ + NDim * group);

    int32_t* C_buffer_start = C_buffer_row_start + jb * nbSize_;
    int32_t leadingDim = ldc_;
    if (packedB_.isThereColRemainder() && (jb == bColBlocks - 1)) {
      // In case we will access memory past C_buffer_, we use C_tile_ scratchpad
      // instead.
      C_buffer_start = C_tile_;
      leadingDim = nbSize_;
    }

    fn(aBuf,
       bBuf,
       bBuf_pf,
       C_buffer_start,
       packedA_.numPackedCols(),
       leadingDim);

    // Output processing is done only once per rowblock to amortize overhead
    // and for better spatial locality.
    if (lastKBlock && jb == bColBlocks - 1) {
      // When C_tile_ is used for the last column block, we need a separate
      // handling for the last column block.
      int32_t nSize =
          C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols();
      if (nSize) {
        if (fbgemmHasAvx512Support()) {
          // TODO: avx512 path
          // Currently use avx2 code
          outputProcess_.template f<inst_set_t::avx2>(
              matC_,
              C_buffer_row_start,
              {row_start_A, packed_rows_A, NDim * group, nSize},
              ldc_,
              ldc_);
        } else if (fbgemmHasAvx2Support()) {        
//    Taking the AVX2 path as an example, the outputObject invokes its inline member function f

          outputProcess_.template f<inst_set_t::avx2>(
              matC_,
              C_buffer_row_start,
              {row_start_A, packed_rows_A, NDim * group, nSize},
              ldc_,
              ldc_);
        } else {
          // TODO: Have default slower path
          assert(0 && "unsupported architecure");
        }
      }

      if (C_buffer_start == C_tile_) {
        // When C_tile_ scratchpad was used to avoid accessing memory past
        // C_buffer_ .
        if (fbgemmHasAvx512Support()) {
          // TODO: avx512 path
          // Currently use avx2 code
          outputProcess_.template f<inst_set_t::avx2>(
              matC_,
              C_tile_,
              {row_start_A,
               packed_rows_A,
               NDim * group + jb * nbSize_,
               packedB_.lastBcol()},
              ldc_,
              leadingDim);
        } else if (fbgemmHasAvx2Support()) {
          outputProcess_.template f<inst_set_t::avx2>(
              matC_,
              C_tile_,
              {row_start_A,
               packed_rows_A,
               NDim * group + jb * nbSize_,
               packedB_.lastBcol()},
              ldc_,
              leadingDim);
        } else {
          // TODO: Have default slower path
          assert(0 && "unsupported architecure");
        }
      }
    } // output processing

  } // for each j block
}
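
To summarize the call chain: fbgemmPacked constructs the ExecuteKernel object with outProcess, execute() accumulates each packed block into the int32 buffer through the JIT micro-kernel fn, and once the last k block of a row block is done it calls outputProcess_.template f<...>(), which turns that int32 row block into the final (here uint8) values in matC_. The call below (presumably from the caffe2 quantized operator code, judging by OutputTensorCPU_ and Wq_packed_) shows how these pieces are wired together:
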
  fbgemmPacked(
      packA,
      *Wq_packed_,
      Y_uint8_data,        //    final output: uint8_t* Y_uint8_data = OutputTensorCPU_(0)->template mutable_data<uint8_t>();
      Y_int32->data(),     //    temporary int32 intermediate result, defined as a member of the operator class
      M,
      outputProcObj,
      tid,
      nthreads);

4. Computing the intermediate result Y_int32_data

//    ExecuteKernel (the kernel execution class) constructor declaration and member fields
ExecuteKernel(
      PackMatrix<packingAMatrix, uint8_t, typename packingAMatrix::accType>&
          packA,
      PackMatrix<
          PackBMatrix<int8_t, typename packingAMatrix::accType>,
          int8_t,
          typename packingAMatrix::accType>& packB,
      cT* matC,
      int32_t* C_buffer,
      int32_t ldc,
      const processOutputType& outputProcess,
      int thread_id,
      int num_threads,
      const BlockingFactors* params = nullptr);

 private:
  PackMatrix<packingAMatrix, uint8_t, typename packingAMatrix::accType>&
      packedA_; ///< Packed uint8 block of matrix A.
  PackMatrix<
      PackBMatrix<int8_t, typename packingAMatrix::accType>,
      int8_t,
      typename packingAMatrix::accType>& packedB_; ///< Packed int8 matrix B.
  cT* matC_; ///< Output for matrix C.
  int32_t* C_buffer_; ///< the accumulation buffer for matrix C.
  int32_t ldc_; ///< the leading dimension of matrix C.
  const processOutputType& outputProcess_; ///< output processing function for
                                           ///< matrix C in the macro-kernel.
  int thread_id_; ///< the thread id.
  int num_threads_; ///< the total number of threads
  int32_t* C_tile_; ///< buffer for the last N block when NCB is not an exact
                    ///< multiple of N.
  int mbSize_; ///< block size in the m dimension.
  int nbSize_; ///< block size in the n dimension.
  int nrMinSize_; ///< minimum register size in the n dimension.
  int nrSize_; ///< register size in the n dimension.
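
Note that C_buffer_ here is exactly the Y_int32 buffer passed at the call site above: the JIT micro-kernel accumulates int32 partial products into it (or into the C_tile_ scratchpad for the ragged last column block), and the output processor then reads from it to produce matC_.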

5. MMdnn usage notes

Caffe to PyTorch

1. Convert the Caffe model into MMdnn's intermediate representation (IR)

$ python -m mmdnn.conversion._script.convertToIR -f caffe -d kit_imagenet -n deploy.prototxt -w VGG16_SOD_finetune.caffemodel

2. Convert the IR into code for the target framework (PyTorch, per the -f pytorch flag)

$ python -m mmdnn.conversion._script.IRToCode -f pytorch -n kit_imagenet.pb -d kit_imagenet.py -w kit_imagenet.npy -dw kit_pytorch.npy

3. Generate the PyTorch model; the .pth file contains both the model parameters and the structure information

python -m mmdnn.conversion.examples.pytorch.imagenet_test --dump vgg16.pth -n kit_imagenet.py -w kit_pytorch.npy
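
Because the dumped vgg16.pth stores the full module (structure plus weights, as noted above), it should be loadable directly with torch.load('vgg16.pth') for inference.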

 
