1. C++: check whether a string contains a substring
#include <fstream>
#include <iostream>
#include <string>

int main() {
  std::string keycode1 = "quan";
  std::string keycode2 = "pack";
  std::string filepath = "C:\\Users\\ryankang\\Desktop\\fbgemm-log-1.txt";
  std::ifstream fin(filepath);
  if (!fin.is_open()) {
    std::cerr << "Failed to open " << filepath << std::endl;
    return 1;
  }
  std::string str;
  // Loop on getline itself; testing eof() before reading would process the
  // last line twice (or an empty line) once the stream reaches end-of-file.
  while (std::getline(fin, str)) {
    // find() returns std::string::npos when the keyword is not present.
    if (str.find(keycode1) != std::string::npos ||
        str.find(keycode2) != std::string::npos) {
      std::cout << str << std::endl;
    }
  }
  fin.close();
  return 0;
}
2. After building PyTorch from source, the following environment variable must be set before the caffe2 Python API can be called:
export PYTHONPATH=$PYTHONPATH:/home/ryankang/workspace/projects/pytorch/build
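A quick sanity check that the build directory is picked up by Python:
$ python -c "from caffe2.python import core, workspace"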
3. In FBGEMM int8 quantization, the call relationships of the outputObject inside fbgemmPacked are as follows.
The reason for requantization: in quantized matrix-matrix multiplication, products of 8-bit integers are accumulated into 32-bit intermediate results, which are then requantized to produce the 8-bit output. (How exactly is the requantization done? If requantization is not needed, can the int32 intermediate result be dequantized directly to fp32? dequantize/requantize still need further study.)
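As a minimal sketch of the usual affine requantization step (the parameter names are assumptions, and this is not FBGEMM's actual implementation; the zero-point offset corrections for A and B that a real output pipeline also applies are omitted):
#include <algorithm>
#include <cmath>
#include <cstdint>

// acc approximates real_A * real_B / (a_scale * b_scale); rescale it into the
// output's quantized domain and shift by the output zero point.
inline uint8_t requantize(int32_t acc, float a_scale, float b_scale,
                          float y_scale, int32_t y_zero_point) {
  float real = acc * a_scale * b_scale;
  int32_t q = y_zero_point +
      static_cast<int32_t>(std::nearbyint(real / y_scale));
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}

// Skipping requantization and dequantizing the int32 result straight to fp32
// would simply be: float y = acc * a_scale * b_scale;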
// Fbgemm.cc contains the fbgemmPacked implementation
template <
typename packingAMatrix,
typename packingBMatrix,
typename cT,
typename processOutputType>
void fbgemmPacked(
PackMatrix<
packingAMatrix,
typename packingAMatrix::inpType,
typename packingAMatrix::accType>& packA,
PackMatrix<
packingBMatrix,
typename packingBMatrix::inpType,
typename packingBMatrix::accType>& packB,
cT* C,
int32_t* C_buffer,
uint32_t ldc,
const processOutputType& outProcess,
int thread_id,
int num_threads,
const BlockingFactors* blocking_params) {
static_assert(
std::is_same<
typename packingAMatrix::accType,
typename packingBMatrix::accType>::value,
"Accumulation type of both matrices should be the same");
int MCB, KCB;
int MR;
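  // MCB and KCB are the cache-blocking sizes along the M and K dimensions;
  // MR is the register-blocking (micro-kernel) row count. They come from
  // PackingTraits unless blocking_params overrides them.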
// Run time CPU detection
if (cpuinfo_initialize()) {
if (blocking_params) {
if (fbgemmHasAvx512Support() || fbgemmHasAvx2Support()) {
MCB = blocking_params->MCB;
KCB = blocking_params->KCB;
MR = blocking_params->MR;
}
} else {
if (fbgemmHasAvx512Support()) {
MCB = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
inst_set_t::avx512>::MCB;
KCB = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
inst_set_t::avx512>::KCB;
MR = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
inst_set_t::avx512>::MR;
} else if (fbgemmHasAvx2Support()) {
MCB = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
inst_set_t::avx2>::MCB;
KCB = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
inst_set_t::avx2>::KCB;
MR = PackingTraits<
typename packingAMatrix::inpType,
typename packingAMatrix::accType,
inst_set_t::avx2>::MR;
} else {
// TODO: Have default slower path
assert(0 && "unsupported architecture");
return;
}
}
} else {
throw std::runtime_error("Failed to initialize cpuinfo!");
}
if (!packB.isPrePacked()) {
throw std::runtime_error("B matrix must be prepacked");
}
int G = packA.numGroups();
if (G != packB.numGroups()) {
throw std::runtime_error(
"A.groups = " + std::to_string(G) + " and B.groups = " +
std::to_string(packB.numGroups()) + " are not the same");
}
int MDim = packA.numRows();
int KDimPerGroup = packB.numRows() / G;
int kBlocks = (KDimPerGroup + KCB - 1) / KCB;
// remainders
int _kc = KDimPerGroup % KCB;
int kc, mc;
block_type_t blockA{0, 0, 0, 0};
int g_begin, g_end, i_begin, i_end;
if (G >= num_threads) {
// When G >= nthreads, just parallelize over G
// TODO: when G == nthreads + 1, we'll have a big load imbalance because
// only one thread will get 2 groups.
fbgemmGetRange(num_threads, thread_id, G, 1, g_begin, g_end);
i_begin = 0;
i_end = MDim;
} else {
// Otherwise, each group is parallelized by multiple threads.
// nthreads_per_group is floor(nthreads / G).
// If we use ceil, some groups won't be handled by any thread.
int nthreads_per_group = num_threads / G;
g_begin = std::max(std::min(thread_id / nthreads_per_group, G - 1), 0);
g_end = std::min(g_begin + 1, G);
int tid_of_g_begin = std::min(g_begin * nthreads_per_group, num_threads);
int tid_of_g_end = std::min(
(g_end == G) ? num_threads : (tid_of_g_begin + nthreads_per_group),
num_threads);
int nthreads_within_group = tid_of_g_end - tid_of_g_begin;
int tid_within_group = thread_id - tid_of_g_begin;
fbgemmGetRange(
nthreads_within_group, tid_within_group, MDim, MR, i_begin, i_end);
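    // Worked example (assumed numbers): with num_threads = 8 and G = 2,
    // nthreads_per_group = 4; thread_id = 5 lands in group g_begin = 1 with
    // tid_within_group = 1, and the 4 threads of that group split the MDim
    // rows among themselves (in MR-aligned chunks via fbgemmGetRange).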
}
for (int g = g_begin; g < g_end; ++g) {
    // Execution stage: build the executor exeKernelObj from the
    // output-processing object (outProcess) and call its member function
    // execute.
    ExecuteKernel<packingAMatrix, packingBMatrix, cT, processOutputType>
        exeKernelObj(
packA,
packB,
C,
C_buffer,
ldc,
outProcess,
thread_id,
num_threads,
blocking_params);
for (int i = i_begin; i < i_end; i += MCB) { // i is the element index
mc = std::min(i_end - i, MCB);
for (int kb = 0; kb < kBlocks; ++kb) { // kb is the block index
kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
// pack A matrix
blockA = {i, mc, g * KDimPerGroup + kb * KCB, kc};
packA.pack(blockA);
exeKernelObj.execute(g * kBlocks + kb);
}
}
} // for each group
}
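For reference, the K-dimension remainder handling in the inner loop above can be checked with a tiny standalone program (the sizes are made-up examples, not FBGEMM values):
#include <iostream>
int main() {
  int KDimPerGroup = 70, KCB = 32;              // assumed example sizes
  int kBlocks = (KDimPerGroup + KCB - 1) / KCB; // ceil(70 / 32) = 3
  int _kc = KDimPerGroup % KCB;                 // remainder = 6
  for (int kb = 0; kb < kBlocks; ++kb) {
    int kc = (kb != kBlocks - 1 || _kc == 0) ? KCB : _kc;
    std::cout << "k block " << kb << " has size " << kc << std::endl; // 32, 32, 6
  }
  return 0;
}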
// ExecuteKernelU8S8.cc implements the execute function
template <typename packingAMatrix, typename cT, typename processOutputType>
void ExecuteKernel<
    packingAMatrix,
    PackBMatrix<int8_t, typename packingAMatrix::accType>,
    cT,
    processOutputType>::execute(int kBlock) {
// packedA_.printPackedMatrix("packedA from kernel");
// packedB_.printPackedMatrix("packedB from kernel");
int32_t bColBlocks = packedB_.blockCols();
int8_t* bBuf;
int8_t* bBuf_pf;
uint8_t* aBuf = packedA_.getBuf(0);
int32_t packed_rows_A = packedA_.numPackedRows();
int32_t row_start_A = packedA_.packedRowStart();
int group = kBlock / packedB_.blockRows();
int NDim = packedB_.numCols();
bool lastKBlock = packedB_.isThisLastKBlock(kBlock % packedB_.blockRows());
bool accum = (kBlock % packedB_.blockRows()) > 0;
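  // accum is false only for the first k block within a group: that call
  // overwrites the int32 buffer, later k blocks accumulate into it, and the
  // output processing below (which produces the requantized result) runs only
  // when lastKBlock is true, and only for the last column block.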
typename BaseType::jit_micro_kernel_fp fn;
if (fbgemmHasAvx512Support()) {
    fn = BaseType::template getOrCreate<inst_set_t::avx512>(
accum,
packed_rows_A,
packedB_.blockColSize(),
packedA_.numPackedCols(),
nbSize_);
} else if (fbgemmHasAvx2Support()) {
    fn = BaseType::template getOrCreate<inst_set_t::avx2>(
accum,
packed_rows_A,
packedB_.blockColSize(),
packedA_.numPackedCols(),
nbSize_);
} else {
// TODO: Have default slower path
assert(0 && "unsupported architecture");
return;
}
for (int jb = 0; jb < bColBlocks; ++jb) {
if (jb == bColBlocks - 1) {
int nc = ((packedB_.lastBcol() - 1) / nrMinSize_ + 1) * nrMinSize_;
if (nc != nbSize_) {
if (fbgemmHasAvx512Support()) {
          fn = BaseType::template getOrCreate<inst_set_t::avx512>(
accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
} else if (fbgemmHasAvx2Support()) {
          fn = BaseType::template getOrCreate<inst_set_t::avx2>(
accum, packed_rows_A, nc, packedA_.numPackedCols(), nbSize_);
} else {
// TODO: Have default slower path
assert(0 && "unsupported architecture");
return;
}
}
}
bBuf = packedB_.getBuf(jb, kBlock);
// prefetch addr of the next packed block of B matrix
bBuf_pf = packedB_.getBuf(jb == bColBlocks - 1 ? jb : jb + 1, kBlock);
    // If the accumulation buffer C_buffer_ is the same as matC_ (in-place
    // output processing), then each thread uses a different part of the
    // output buffer matC_;
    // otherwise, each thread uses a different portion of the accumulation
    // buffer C_buffer_. If m is large enough (m >= nthreads * MC), we only
    // need an (nthreads * MC) x n portion of C_buffer_, and each thread's
    // rows start at tid * MC * ldc_; when m is very small, we just use the
    // whole m x n C_buffer_, with each thread using a different portion.
int32_t* C_buffer_row_start = C_buffer_ +
        ((C_buffer_ == reinterpret_cast<int32_t*>(matC_) ||
num_threads_ * mbSize_ > packedA_.numRows())
? row_start_A * ldc_ + NDim * group
: thread_id_ * mbSize_ * ldc_ + NDim * group);
int32_t* C_buffer_start = C_buffer_row_start + jb * nbSize_;
int32_t leadingDim = ldc_;
if (packedB_.isThereColRemainder() && (jb == bColBlocks - 1)) {
// In case we will access memory past C_buffer_, we use C_tile_ scratchpad
// instead.
C_buffer_start = C_tile_;
leadingDim = nbSize_;
}
fn(aBuf,
bBuf,
bBuf_pf,
C_buffer_start,
packedA_.numPackedCols(),
leadingDim);
// Output processing is done only once per rowblock to amortize overhead
// and for better spatial locality.
if (lastKBlock && jb == bColBlocks - 1) {
// When C_tile_ is used for the last column block, we need a separate
// handling for the last column block.
int32_t nSize =
C_buffer_start == C_tile_ ? jb * nbSize_ : packedB_.numCols();
if (nSize) {
if (fbgemmHasAvx512Support()) {
// TODO: avx512 path
// Currently use avx2 code
          outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_buffer_row_start,
{row_start_A, packed_rows_A, NDim * group, nSize},
ldc_,
ldc_);
} else if (fbgemmHasAvx2Support()) {
          // Taking the AVX2 path as an example: the output object calls the
          // inlined member function f
          outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_buffer_row_start,
{row_start_A, packed_rows_A, NDim * group, nSize},
ldc_,
ldc_);
} else {
// TODO: Have default slower path
          assert(0 && "unsupported architecture");
}
}
if (C_buffer_start == C_tile_) {
// When C_tile_ scratchpad was used to avoid accessing memory past
// C_buffer_ .
if (fbgemmHasAvx512Support()) {
// TODO: avx512 path
// Currently use avx2 code
          outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_tile_,
{row_start_A,
packed_rows_A,
NDim * group + jb * nbSize_,
packedB_.lastBcol()},
ldc_,
leadingDim);
} else if (fbgemmHasAvx2Support()) {
          outputProcess_.template f<inst_set_t::avx2>(
matC_,
C_tile_,
{row_start_A,
packed_rows_A,
NDim * group + jb * nbSize_,
packedB_.lastBcol()},
ldc_,
leadingDim);
} else {
// TODO: Have default slower path
          assert(0 && "unsupported architecture");
}
}
} // output processing
} // for each j block
}
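To make the relationship between the int32 accumulation buffer (C_buffer_) and the final output (matC_) concrete, here is a minimal standalone reference sketch, not FBGEMM code: uint8/int8 products are accumulated into int32 block by block along K, and only afterwards is the buffer turned into the final output (e.g. by the requantization sketched under point 3):
#include <algorithm>
#include <cstdint>
#include <vector>

// Naive reference: A is MxK uint8, B is KxN int8, C_acc is the MxN int32
// accumulation buffer. Each k block adds into C_acc; output processing
// (requantization to uint8) would run once per row block afterwards.
void reference_u8s8_acc(const uint8_t* A, const int8_t* B, int M, int N, int K,
                        int KCB, std::vector<int32_t>& C_acc) {
  C_acc.assign(static_cast<size_t>(M) * N, 0); // zero once instead of letting
                                               // the first k block overwrite
  for (int k0 = 0; k0 < K; k0 += KCB) {        // loop over k blocks
    int kc = std::min(KCB, K - k0);
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < N; ++n)
        for (int k = k0; k < k0 + kc; ++k)
          C_acc[m * N + n] += static_cast<int32_t>(A[m * K + k]) *
              static_cast<int32_t>(B[k * N + n]);
  }
}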
fbgemmPacked(
    packA,
    *Wq_packed_,
    Y_uint8_data,    // final output: uint8_t* Y_uint8_data = OutputTensorCPU_(0)->template mutable_data<uint8_t>()
    Y_int32->data(), // temporary int32 intermediate result, defined as a class member
    M,
    outputProcObj,
    tid,
    nthreads);
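In this call the arguments line up with the fbgemmPacked signature above: Y_uint8_data is cT* C (the final uint8 output), Y_int32->data() is int32_t* C_buffer (the int32 accumulation buffer), M is passed as the leading dimension ldc, and outputProcObj is the processOutputType whose member f() performs the requantization inside ExecuteKernel::execute; tid and nthreads are the thread id and thread count.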
4. Computing the intermediate result Y_int32_data
// ExecuteKernel (the kernel executor) constructor declaration
ExecuteKernel(
    PackMatrix<packingAMatrix, uint8_t, typename packingAMatrix::accType>&
        packA,
    PackMatrix<
        PackBMatrix<int8_t, typename packingAMatrix::accType>,
        int8_t,
        typename packingAMatrix::accType>& packB,
cT* matC,
int32_t* C_buffer,
int32_t ldc,
const processOutputType& outputProcess,
int thread_id,
int num_threads,
const BlockingFactors* params = nullptr);
private:
PackMatrix<packingAMatrix, uint8_t, typename packingAMatrix::accType>&
    packedA_; ///< Packed uint8 block of matrix A.
PackMatrix<
    PackBMatrix<int8_t, typename packingAMatrix::accType>,
    int8_t,
    typename packingAMatrix::accType>& packedB_; ///< Packed int8 matrix B.
cT* matC_; ///< Output for matrix C.
int32_t* C_buffer_; ///< the accumulation buffer for matrix C.
int32_t ldc_; ///< the leading dimension of matrix C.
const processOutputType& outputProcess_; ///< output processing function for
///< matrix C in the macro-kernel.
int thread_id_; ///< the thread id.
int num_threads_; ///< the total number of threads
int32_t* C_tile_; ///< buffer for the last N block when NCB is not an exact
///< multiple of N.
int mbSize_; ///< block size in the m dimension.
int nbSize_; ///< block size in the n dimension.
int nrMinSize_; ///< minimum register size in the n dimension.
int nrSize_; ///< register size in the n dimension.
5. mmdnn usage notes
Caffe to PyTorch (via mmdnn's intermediate representation)
1. Convert the Caffe model into the intermediate representation (IR):
$ python -m mmdnn.conversion._script.convertToIR -f caffe -d kit_imagenet -n deploy.prototxt -w VGG16_SOD_finetune.caffemodel
2. Convert the IR into code for the target framework (PyTorch here):
$ python -m mmdnn.conversion._script.IRToCode -f pytorch -n kit_imagenet.pb -d kit_imagenet.py -w kit_imagenet.npy -dw kit_pytorch.npy
3. Generate the PyTorch model; the .pth file contains both the model's parameters and its structure:
$ python -m mmdnn.conversion.examples.pytorch.imagenet_test --dump vgg16.pth -n kit_imagenet.py -w kit_pytorch.npy