This article is also published on my WeChat official account; feel free to subscribe.
Chapter 4: The CM Compiler
In this chapter we use a single example, Gaussian blur, to show how a CM client (host) program and server (kernel) program are tied together and finally run. The example itself is one provided by Intel, but with no explanation of how to use it.
Section 1: Building the client program
Assume the file is named gauss_client.cpp.
#include "cm_rt.h"
#include "common/bitmap_helpers.h"
#include "common/cm_rt_helpers.h"
#include "common/isa_helpers.h"
using cm::util::bitmap::BitMap;
// Defines the number of columns per thread.
#define NUM_COLS_PER_THREAD 8
// Defines the number of rows per thread.
#define NUM_ROWS_PER_THREAD 8
// Declares coefficients for gaussian filter.
float a0 = 0, a1 = 0, a2 = 0, a3 = 0, b1 = 0, b2 = 0, coefp = 0, coefn = 0;
// Computes the coefficients for the Gaussian filter.
void CalculateCoefficients(float sigma, int order) {
const float nsigma = sigma < 0.1f ? 0.1f : sigma;
const float alpha = 1.695f / nsigma;
const float ema = (float)exp(-alpha);
const float ema2 = (float)exp(-2 * alpha);
b1 = -2 * ema;
b2 = ema2;
switch (order) {
case 0: {
const float k = (1 - ema) * (1 - ema) / (1 + 2 * alpha * ema - ema2);
a0 = k;
a1 = k * (alpha - 1) * ema;
a2 = k * (alpha + 1) * ema;
a3 = -k * ema2;
} break;
case 1: {
const float k = (1 - ema) * (1 - ema) / ema;
a0 = k * ema;
a1 = a3 = 0;
a2 = -a0;
} break;
case 2: {
const float
ea = (float)exp(-alpha),
k = -(ema2 - 1) / (2 * alpha * ema),
kn = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) /
(3 * ea + 1 + 3 * ea * ea + ea * ea * ea));
a0 = kn;
a1 = -kn * (1 + k * alpha) * ema;
a2 = kn * (1 - k * alpha) * ema;
a3 = -kn * ema2;
} break;
default:
fprintf(stderr, "gaussianFilter: invalid order parameter!\n");
return;
}
coefp = (a0 + a1) / (1 + b1 + b2);
coefn = (a2 + a3) / (1 + b1 + b2);
printf("Coefficients are: \n");
printf(" a0 = %f, a1 = %f, a2 = %f, a3 = %f, b1 = %f, b2 = %f\n", a0, a1, a2, a3, b1, b2);
}
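// Note on the coefficients: a0..a3, b1 and b2 define a recursive (IIR)
// approximation of the Gaussian (a Deriche-style filter). The kernels in
// gauss_genx.cpp apply them as a forward recurrence
//   out[n] = a0*in[n] + a1*in[n-1] - (b1*out[n-1] + b2*out[n-2])
// and a backward recurrence that uses a2 and a3 instead of a0 and a1;
// coefp and coefn seed the recursion state at the image borders when
// CLAMP_TO_EDGE is enabled in the kernel code.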
int main(int argc, char *argv[]) {
// Loads an input image named "lena.bmp".
auto input_image = BitMap::load("lena.bmp");
// Gets the width and height of the input image.
unsigned int width = input_image.getWidth();
unsigned int height = input_image.getHeight();
printf("image width = %d, height = %d\n", width, height);
// Checks the value of width, height and bpp(bits per pixel) of the image.
// Only images in 8-bit RGB format are supported.
// Only images with width and height a multiple of 8 are supported.
if (width & 7 || height & 7 || input_image.getBPP() != 24) {
std::cerr << "Error: Only images in 8-bit RGB format with width and "
<< "height a multiple of 8 are supported.\n";
std::exit(1);
}
// Copies input image to output except for the data.
auto output_image = input_image;
// Sets image size in bytes. There are a total of width*height pixels and
// each pixel occupies (out.getBPP()/8) bytes.
unsigned int img_size = width * height * output_image.getBPP() / 8;
// Sets output to blank image.
output_image.setData(new unsigned char[img_size]);
// Allocates system memory for rgb_to_rgba to convert image format from
// RGB to RGBA.
// Allocates system memory for rgba_to_rgb to convert image format from
// RGBA to RGB.
unsigned int num_pixels = width * height;
unsigned char *rgb_to_rgba = new unsigned char[num_pixels * 4];
unsigned char *rgba_to_rgb = new unsigned char[num_pixels * 4];
// Converts image format from RGB to RGBA.
// Copies the RGB values from the image and sets the 4th byte to zero.
for (int i = 0; i < num_pixels; i++) {
rgb_to_rgba[i * 4] = input_image.getData()[i * 3];
rgb_to_rgba[i * 4 + 1] = input_image.getData()[i * 3 + 1];
rgb_to_rgba[i * 4 + 2] = input_image.getData()[i * 3 + 2];
rgb_to_rgba[i * 4 + 3] = 0;
}
// Computes coefficients for gaussian filter.
float sigma = 10.0f;
int order = 0;
CalculateCoefficients(sigma, order);
// Creates a CmDevice from scratch.
// Param device: pointer to the CmDevice object.
// Param version: CM API version supported by the runtime library.
CmDevice *device = nullptr;
unsigned int version = 0;
cm_result_check(::CreateCmDevice(device, version));
// The file gauss_genx.isa is generated when the kernels in the file
// gauss_genx.cpp are compiled by the CM compiler.
// Reads in the virtual ISA from "gauss_genx.isa" to the code
// buffer.
std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");
if (isa_code.size() == 0) {
std::cerr << "Error: empty ISA binary.\n";
std::exit(1);
}
// Creates a CmProgram object consisting of the kernels loaded from the code
// buffer.
// Param isa_code.data(): Pointer to the code buffer containing the virtual
// ISA.
// Param isa_code.size(): Size in bytes of the code buffer containing the
// virtual ISA.
CmProgram *program = nullptr;
cm_result_check(device->LoadProgram(const_cast<char *>(isa_code.data()),
isa_code.size(),
program));
// For vertical direction.
// Creates the kernel.
// Param program: CM Program from which the kernel is created.
// Param "gaussianVertical": The kernel name which should be no more than 256
// bytes including the null terminator.
CmKernel *kernel_vertical = nullptr;
cm_result_check(device->CreateKernel(program,
"gaussianVertical",
kernel_vertical));
// Creates input surface with given width and height in pixels and format.
CmSurface2D *input_surface = nullptr;
cm_result_check(device->CreateSurface2D(4 * width,
height,
CM_SURFACE_FORMAT_A8,
input_surface));
// Copies system memory content to the input surface using the CPU. The
// system memory content is the data of the input image in RGBA format.
// The size of data copied is the size of data in the rgb_to_rgba.
cm_result_check(input_surface->WriteSurface(rgb_to_rgba, nullptr));
// Creates the temp surface. The width, height and format are the same as
// the input surface.
// The temp surface contains the output of kernel_vertical.
CmSurface2D *temp_surface = nullptr;
cm_result_check(device->CreateSurface2D(4 * width,
height,
CM_SURFACE_FORMAT_A8,
temp_surface));
// When a surface is created by the CmDevice a SurfaceIndex object is
// created. This object contains a unique index value that is mapped to the
// surface.
// Gets the input surface index.
SurfaceIndex *input_surface_idx = nullptr;
cm_result_check(input_surface->GetIndex(input_surface_idx));
// Gets the temp surface index.
SurfaceIndex *temp_surface_idx = nullptr;
cm_result_check(temp_surface->GetIndex(temp_surface_idx));
// Sets a per kernel argument.
// Sets input surface index as the first argument of kernel_vertical.
// Sets temp surface index as the second argument of kernel_vertical.
cm_result_check(kernel_vertical->SetKernelArg(0,
sizeof(SurfaceIndex),
input_surface_idx));
cm_result_check(kernel_vertical->SetKernelArg(1,
sizeof(SurfaceIndex),
temp_surface_idx));
// Sets the image width and height as the third and the fourth argument
// of kernel_vertical.
cm_result_check(kernel_vertical->SetKernelArg(2, 4, &width));
cm_result_check(kernel_vertical->SetKernelArg(3, 4, &height));
// Sets filter coefficients as the rest arguments of kernel_vertical.
cm_result_check(kernel_vertical->SetKernelArg(4, 4, &a0));
cm_result_check(kernel_vertical->SetKernelArg(5, 4, &a1));
cm_result_check(kernel_vertical->SetKernelArg(6, 4, &a2));
cm_result_check(kernel_vertical->SetKernelArg(7, 4, &a3));
cm_result_check(kernel_vertical->SetKernelArg(8, 4, &b1));
cm_result_check(kernel_vertical->SetKernelArg(9, 4, &b2));
cm_result_check(kernel_vertical->SetKernelArg(10, 4, &coefp));
cm_result_check(kernel_vertical->SetKernelArg(11, 4, &coefn));
// Each CmKernel can be executed by multiple concurrent threads.
// Here, for "kernel_vertical" kernel, each thread works on
// NUM_COLS_PER_THREAD columns in vertical direction.
int thread_width = width / NUM_COLS_PER_THREAD;
// Creates a CmThreadSpace object.
// There are two usage models for the thread space. One is to define the
// dependency between threads to run in the GPU. The other is to define a
// thread space where each thread can get a pair of coordinates during
// kernel execution. For this example, we use the latter usage model.
CmThreadSpace *thread_space = nullptr;
cm_result_check(device->CreateThreadSpace(thread_width,
1,
thread_space));
// Creates a task queue.
// The CmQueue is an in-order queue. Tasks get executed according to the
// order they are enqueued. The next task does not start execution until the
// current task finishes.
CmQueue *cmd_queue = nullptr;
cm_result_check(device->CreateQueue(cmd_queue));
// Creates a CmTask object.
// The CmTask object is a container for CmKernel pointers. It is used to
// enqueue the kernels for execution.
CmTask *task = nullptr;
cm_result_check(device->CreateTask(task));
// Adds a CmKernel pointer to CmTask.
// This task has one kernel.
cm_result_check(task->AddKernel(kernel_vertical));
// Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the
// function returns immediately without waiting for the GPU to start or
// finish execution of the task. The runtime will query the HW status. If
// the hardware is not busy, the runtime will submit the task to the
// driver/HW; otherwise, the runtime will submit the task to the driver/HW
// at another time.
// An event, "sync_event", is created to track the status of the task.
CmEvent *sync_event = nullptr;
cm_result_check(cmd_queue->Enqueue(task,
sync_event,
thread_space));
// Destroys a CmTask object.
// CmTask will be destroyed when CmDevice is destroyed.
// Here, the application destroys the CmTask object by itself.
cm_result_check(device->DestroyTask(task));
// For horizontal direction.
// Creates the kernel.
// Param program: CM Program from which the kernel is created.
// Param "gaussianHorizontal": The kernel name which should be no more than
// 256 bytes including the null terminator.
CmKernel *kernel_horizontal = nullptr;
cm_result_check(device->CreateKernel(program,
"gaussianHorizontal",
kernel_horizontal));
// Creates the output surface. The width, height and format are the same as
// the input surface.
// The output surface contains the output of kernel_horizontal.
CmSurface2D *output_surface = nullptr;
cm_result_check(device->CreateSurface2D(4 * width,
height,
CM_SURFACE_FORMAT_A8,
output_surface));
// Gets the output surface index.
SurfaceIndex *output_surface_idx = nullptr;
cm_result_check(output_surface->GetIndex(output_surface_idx));
// Sets a per kernel argument.
// Sets the output of kernel_vertical as the input of kernel_horizontal.
// Sets temp surface index as the first argument of kernel_horizontal.
// Sets output surface index as the second argument of kernel_horizontal.
cm_result_check(kernel_horizontal->SetKernelArg(0,
sizeof(SurfaceIndex),
temp_surface_idx));
cm_result_check(kernel_horizontal->SetKernelArg(1,
sizeof(SurfaceIndex),
output_surface_idx));
// Sets the image width and height as the third and the fourth argument
// of kernel_horizontal.
cm_result_check(kernel_horizontal->SetKernelArg(2, 4, &width));
cm_result_check(kernel_horizontal->SetKernelArg(3, 4, &height));
// Sets filter coefficients as the rest arguments of kernel_horizontal.
cm_result_check(kernel_horizontal->SetKernelArg(4, 4, &a0));
cm_result_check(kernel_horizontal->SetKernelArg(5, 4, &a1));
cm_result_check(kernel_horizontal->SetKernelArg(6, 4, &a2));
cm_result_check(kernel_horizontal->SetKernelArg(7, 4, &a3));
cm_result_check(kernel_horizontal->SetKernelArg(8, 4, &b1));
cm_result_check(kernel_horizontal->SetKernelArg(9, 4, &b2));
cm_result_check(kernel_horizontal->SetKernelArg(10, 4, &coefp));
cm_result_check(kernel_horizontal->SetKernelArg(11, 4, &coefn));
// Each CmKernel can be executed by multiple concurrent threads.
// Here, for "kernel_horizontal" kernel, each thread works on
// NUM_ROWS_PER_THREAD rows in horizontal direction.
int thread_height = height / NUM_ROWS_PER_THREAD;
// Creates a CmThreadSpace object.
// There are two usage models for the thread space. One is to define the
// dependency between threads to run in the GPU. The other is to define a
// thread space where each thread can get a pair of coordinates during
// kernel execution. For this example, we use the latter usage model.
cm_result_check(device->CreateThreadSpace(thread_height,
1,
thread_space));
// Creates a CmTask object.
// The CmTask object is a container for CmKernel pointers. It is used to
// enqueue the kernels for execution.
cm_result_check(device->CreateTask(task));
// Adds a CmKernel pointer to CmTask.
// This task has one kernel.
cm_result_check(task->AddKernel(kernel_horizontal));
// Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the
// function returns immediately without waiting for the GPU to start or
// finish execution of the task. The runtime will query the HW status. If
// the hardware is not busy, the runtime will submit the task to the
// driver/HW; otherwise, the runtime will submit the task to the driver/HW
// at another time.
// An event, "sync_event", is created to track the status of the task.
cm_result_check(cmd_queue->Enqueue(task,
sync_event,
thread_space));
// Destroys a CmTask object.
// CmTask will be destroyed when CmDevice is destroyed.
// Here, the application destroys the CmTask object by itself.
cm_result_check(device->DestroyTask(task));
// Reads the output surface content to the system memory using the CPU.
// The size of data copied is the size of data in Surface.
// It is a blocking call. The function will not return until the copy
// operation is completed.
// The dependent event "sync_event" ensures that the reading of the surface
// will not happen until its state becomes CM_STATUS_FINISHED.
cm_result_check(output_surface->ReadSurface(rgba_to_rgb,
sync_event));
// Destroys the CmDevice.
// Also destroys surfaces, kernels, tasks, thread spaces, and queues that
// were created using this device instance that have not explicitly been
// destroyed by calling the respective destroy functions.
cm_result_check(::DestroyCmDevice(device));
// Converts image format from RGBA to RGB.
unsigned char *tmp = new unsigned char[num_pixels * 3];
for (int i = 0; i < num_pixels; i++) {
tmp[i * 3] = rgba_to_rgb[i * 4];
tmp[i * 3 + 1] = rgba_to_rgb[i * 4 + 1];
tmp[i * 3 + 2] = rgba_to_rgb[i * 4 + 2];
}
output_image.setData(tmp);
// Saves the output image data into the file "blur_out.bmp".
output_image.save("blur_out.bmp");
// Frees memory.
delete[] rgb_to_rgba;
delete[] rgba_to_rgb;
// Checks result.
if (BitMap::checkResult("blur_out.bmp",
"blur_gold.bmp",
5)) {
std::cout << "PASSED" << std::endl;
return 0;
} else {
std::cout << "FAILED" << std::endl;
return -1;
}
}
Section 2: Building the server (kernel) program
Assume the file is named gauss_genx.cpp.
#include <cm/cm.h>
#define NUM_COMPONENTS 4
// number of rows we read in at once
#define NUM_ROWS_PER_ITER 8
// number of columns per thread
#define NUM_COLS_PER_THREAD 8
#define SIMD_SIZE (NUM_COLS_PER_THREAD * NUM_COMPONENTS)
#define CLAMP_TO_EDGE 1
// for horizontal direction
// number of rows per thread
#define NUM_ROWS_PER_THREAD 8
// number of columns we read in at once
#define NUM_COLS_PER_ITER 8
// Each thread processes 32 columns independently
// For now assume height is divisible by 8
extern "C" _GENX_MAIN_ void
gaussianVertical( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height,
                  float a0, float a1, float a2, float a3,
                  float b1, float b2, float coefp, float coefn )
{
matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> image;
matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> outImage;
vector<float, SIMD_SIZE> in;
vector<float, SIMD_SIZE> out;
vector<float, SIMD_SIZE> inMinusOne;
vector<float, SIMD_SIZE> outMinusOne;
vector<float, SIMD_SIZE> outMinusTwo;
uint id = get_thread_origin_x();
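// The client creates a (width / NUM_COLS_PER_THREAD) x 1 thread space, so each
// thread owns a strip of NUM_COLS_PER_THREAD pixels (SIMD_SIZE bytes in RGBA)
// and walks it over the full image height.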
#if CLAMP_TO_EDGE
matrix<uchar, 1, SIMD_SIZE> firstRow;
read( INBUF, id * SIMD_SIZE, 0, firstRow);
inMinusOne = firstRow;
inMinusOne *= 1/255.0f;
outMinusTwo = coefp * inMinusOne;
outMinusOne = outMinusTwo;
#else
inMinusOne = 0;
outMinusOne = 0;
outMinusTwo = 0;
#endif
//read in 8 rows at a time
for( int i = 0; i < height; i += NUM_ROWS_PER_ITER ) {
read( INBUF, id * SIMD_SIZE, i, image );
#pragma unroll
for( unsigned j = 0; j < NUM_ROWS_PER_ITER; j++ ) {
in = image.row(j);
in *= 1/255.0f;
//out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;
out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);
inMinusOne = in;
outMinusTwo = outMinusOne;
outMinusOne = out;
//clamp the value to [0,1]
out = cm_add(out, 0.0f, SAT);
outImage.row(j) = out * 255.0f;
}
//write back to surface
write( OUTBUF, id*SIMD_SIZE, i, outImage );
}
vector<float, SIMD_SIZE> inPlusOne;
vector<float, SIMD_SIZE> inPlusTwo;
vector<float, SIMD_SIZE> outPlusOne;
vector<float, SIMD_SIZE> outPlusTwo;
vector<float, SIMD_SIZE> temp;
#if CLAMP_TO_EDGE
matrix<uchar, 1, SIMD_SIZE> lastRow;
read( INBUF, id * SIMD_SIZE, height - 1, lastRow );
inPlusOne = lastRow;
inPlusOne *= 1/255.0f;
inPlusTwo = inPlusOne;
outPlusOne = coefn * inPlusOne;
outPlusTwo = outPlusOne;
#else
inPlusOne = 0;
inPlusTwo = 0;
outPlusOne = 0;
outPlusTwo = 0;
#endif
//read 8 rows at a time, in reverse direction
for( int i = height - NUM_ROWS_PER_ITER; i >= 0; i -= NUM_ROWS_PER_ITER ) {
read( INBUF, id * SIMD_SIZE, i, image );
read( MODIFIED(OUTBUF), id * SIMD_SIZE, i, outImage );
#pragma unroll
for( int j = NUM_ROWS_PER_ITER - 1; j >= 0; j-- ) {
in = image.row(j);
in *= 1 / 255.0f;
//temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;
temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);
inPlusTwo = inPlusOne;
inPlusOne = in;
outPlusTwo = outPlusOne;
outPlusOne = temp;
out = outImage.row(j);
out = cm_add( out * (1/255.0f), temp, SAT );
outImage.row(j) = out * 255;
}
//write back to surface
write( OUTBUF, id*SIMD_SIZE, i, outImage );
}
}
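// Helper kernel that transposes an 8x8 block of 32-bit (RGBA) pixels.
// The client program in Section 1 never creates this kernel; it is not needed
// for the blur itself.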
extern "C" _GENX_MAIN_ void
transpose( SurfaceIndex INBUF, SurfaceIndex OUTBUF, unsigned id, int width, int height ) {
matrix<uint, 8, 8> in;
matrix<uint, 8, 8> out;
for( int i = 0; i < height; i += 8 ) {
read( INBUF, id * 32, i, in );
out.row(0) = in.column(0);
out.row(1) = in.column(1);
out.row(2) = in.column(2);
out.row(3) = in.column(3);
out.row(4) = in.column(4);
out.row(5) = in.column(5);
out.row(6) = in.column(6);
out.row(7) = in.column(7);
write( OUTBUF, i * 4, id * 8, out );
}
}
// Like gaussianVertical, except we process 8 independent rows at once
extern "C" _GENX_MAIN_ void
gaussianHorizontal( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height,
                    float a0, float a1, float a2, float a3,
                    float b1, float b2, float coefp, float coefn )
{
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> image;
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> outImage;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> in;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> out;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inMinusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusTwo;
uint id = get_thread_origin_x();
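// The client creates a (height / NUM_ROWS_PER_THREAD) x 1 thread space, so each
// thread owns a band of NUM_ROWS_PER_THREAD rows and sweeps it left to right,
// then right to left.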
#if CLAMP_TO_EDGE
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> firstColumn;
read( MODIFIED(INBUF), 0, id * NUM_ROWS_PER_THREAD, firstColumn );
inMinusOne = firstColumn;
inMinusOne *= 1/255.0f;
outMinusTwo = coefp * inMinusOne;
outMinusOne = outMinusTwo;
#else
inMinusOne = 0;
outMinusOne = 0;
outMinusTwo = 0;
#endif
//read 8 rows at a time
for( int i = 0; i < width; i += NUM_COLS_PER_ITER ) {
read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );
#pragma unroll
for( unsigned j = 0; j < NUM_COLS_PER_ITER; j++ ) {
in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS);
in *= 1/255.0f;
//out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;
out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);
inMinusOne = in;
outMinusTwo = outMinusOne;
outMinusOne = out;
//clamp the value to [0,1]
out = cm_add( out, 0.0f, SAT ) * 255.0f;
outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS) = out;
}
//write back to surface
write( OUTBUF, i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );
}
//reverse direction
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusTwo;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusTwo;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> temp;
#if CLAMP_TO_EDGE
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> lastColumn;
read( MODIFIED(INBUF), (width - 1) * NUM_COMPONENTS, id * 8, lastColumn );
inPlusOne = lastColumn;
inPlusOne *= 1/255.0f;
inPlusTwo = inPlusOne;
outPlusOne = coefn * inPlusOne;
outPlusTwo = outPlusOne;
#else
inPlusOne = 0;
inPlusTwo = 0;
outPlusOne = 0;
outPlusTwo = 0;
#endif
for( int i = width - NUM_COLS_PER_ITER; i >= 0; i -= NUM_COLS_PER_ITER ) {
read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );
read( MODIFIED(OUTBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );
#pragma unroll
for( int j = NUM_COLS_PER_ITER - 1; j >= 0; j-- ) {
in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS);
in *= 1/255.0f;
//temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;
temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);
inPlusTwo = inPlusOne;
inPlusOne = in;
outPlusTwo = outPlusOne;
outPlusOne = temp;
//The mul * 1 forces out to not be coalesced with outImage, so we can use SIMD16
//operations instead of SIMD4
out = outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS) * 1.0f;
//out = outImage.select(0, j*NUM_COMPONENTS);
out = cm_add( out * (1/255.0f), temp, SAT );
outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS) = out * 255.0f;
}
write( OUTBUF, i * NUM_COMPONENTS, id * 8, outImage );
}
}
Section 3: Compilation, the most important step
In Section 1, one line of code is the key:
std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");
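The actual helper lives in common/isa_helpers.h. What it amounts to is reading the entire .isa file into a std::string, with an empty string signalling failure (which the client checks for). A minimal stand-in, using nothing but standard C++ file I/O, could look like the sketch below; the name loadIsaFile is only for illustration, and the real helper may differ in details.
#include <fstream>
#include <sstream>
#include <string>

// Minimal sketch of an ISA loader: read the whole file in binary mode and
// return its contents; an empty string means the file could not be opened.
std::string loadIsaFile(const std::string &path) {
  std::ifstream file(path, std::ios::binary);
  if (!file)
    return std::string();
  std::ostringstream buffer;
  buffer << file.rdbuf();   // slurp the entire file into the string stream
  return buffer.str();
}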
So where does this gauss_genx.isa come from? Readers who have done deep learning on NVIDIA GPUs will have run into the acronym ISA (instruction set architecture); this article will not go into that. The point here is simply to show how gauss_genx.cpp is turned into gauss_genx.isa.
First, you need to get the CM compiler.
(1) Clone the code from https://github.com/intel/cm-compiler
(2) Install VS2015 (or a later version), Python 2.7, Cygwin, CMake, unzip, and curl.
(3) Open Cygwin, change into the cm-compiler directory, and run the following:
cmake path/to/llvm/source/root
cmake --build .
(4) Run support/scripts/build.bash -s vs2015 -d -m --64
(5) Locate cmc.exe; it should be in the build.64.vs2015 folder.
(6) In a cmd window, run:
cmc.exe -isystem path\to\cm-compiler\support\include path\to\gauss_genx.cpp -march=SKL
(7) You will see that gauss_genx.isa has been generated in the folder containing cmc.exe.
(8) If you open the .isa file, you will find it is just raw binary (hexadecimal) data; a small sketch for inspecting it follows this list. How it is loaded and executed at a lower level is beyond the scope of this article.
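To confirm this from code, the following minimal sketch (plain standard C++, nothing CM-specific; it assumes gauss_genx.isa sits in the current directory) prints the first bytes of the file in hexadecimal:
#include <cstdio>
#include <fstream>
#include <vector>

int main() {
  // Open the generated virtual ISA file in binary mode.
  std::ifstream file("gauss_genx.isa", std::ios::binary);
  if (!file) {
    std::fprintf(stderr, "cannot open gauss_genx.isa\n");
    return 1;
  }
  // Read up to the first 16 bytes and print them as hex values.
  std::vector<unsigned char> head(16);
  file.read(reinterpret_cast<char *>(head.data()), head.size());
  for (std::streamsize i = 0; i < file.gcount(); ++i)
    std::printf("%02x ", head[i]);
  std::printf("\n");
  return 0;
}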
With that, the basics of CM have been covered. Much like CUDA, CM can be integrated into a deep learning framework to implement the framework's kernels.
The next update will focus mainly on deep learning applications. Why didn't I cover CUDA first? My plan is to start with Intel's stack and then, at the application level, concentrate on what most people actually use.
So the upcoming posts will take you from scratch, on a laptop, through building a deep-learning-based object detection model, integrating the trained model with C++, and shipping it as part of the final deliverables.
Stay tuned.