头文件:
编译指令:icc -o test test.c(源码中含offload指令时,icc默认同时生成主机端和MIC端代码;若要编译成native MIC程序需加 -mmic)
设置线程数量:
程序运行之前在命令行输入:
export OMP_NUM_THREADS=240
(offload模式下若设置了 MIC_ENV_PREFIX=MIC,则MIC端线程数用 export MIC_OMP_NUM_THREADS=240 设置)
offload模式(主从模式):
1.变量/函数声明:
#pragma offload_attribute(push, target(mic))
......
#pragma offload_attribute(pop)
2.上传变量:
#pragma offload target(mic) in(......)out(...)
3.判断当前代码是否在为MIC端编译(__MIC__是编译期宏,仅在生成MIC端代码时被定义,并不检测运行时是否有MIC卡可用):
#ifdef __MIC__
#else ...
#endif
4.在函数定义之前需要加上:
__attribute__((target(mic)))
以表明函数的类型。
向量化:
数据类型:__m512 Data Type
如:__m512i(int 32/64 vector), __m512(float 32 vector), __m512d(float 64 vector)
1.变量声明:__m512i v1, v2, v3;
2.变量对齐:
windows:__declspec(align(n, off))
linux:__attribute__((align(n, off)))
如:__attribute__((align(64)))
permutevar函数:
1 #include
2 #include
3 #include
4
// Demo of _mm512_mask_permutevar_epi32 inside an offload region.
// Compares a[] and b[] lane by lane, then uses the resulting mask and its
// complement to fill c1[] and c[] from the table s[], and prints both arrays.
// NOTE(review): the header names on the #include lines of this listing were
// lost; the code needs at least the declarations of printf and of the
// __m512i / __mmask16 intrinsics used below.
5 int main(void)
6 {
// a[] and b[] are the two operands of the lane-wise comparison; they are also
// passed as the index operand of the permutes further down.
7 int a[16] = {1, 9, 2, 5, 2, 7, 8, 1, 10, 14, 15, 4, 3, 7, 12, 11};
8 int b[16] = {3, 4, 6, 2, 5, 7, 9, 4, 13, 12, 7, 8, 15, 7, 2, 11};
// c receives the final combined result (stored after the second permute).
9 int c[16];
// s[] is the table the permutes read from (loaded into `source`).
10 int s[16] = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// c1 receives the intermediate result (stored after the first permute).
11 int c1[16];
// Only a and b are named in a data clause. s, c and c1 are used inside the
// region without one.
// NOTE(review): presumably this relies on the compiler's default transfer of
// variables referenced in the region — confirm.
12 #pragma offload target(mic)in(a, b)
13 {
14 __m512i va, vb, result, source;
15 __mmask16 mask, fmask;
// NOTE(review): _mm512_load_epi32 presumably expects 64-byte-aligned
// addresses; the arrays above carry no alignment attribute — confirm.
16 va = _mm512_load_epi32(a);
17 vb = _mm512_load_epi32(b);
18 source = _mm512_load_epi32(s);
19
// One mask bit per lane, set where a[i] < b[i]; printed in octal (%o).
20 mask = _mm512_cmp_epi32_mask(va, vb, _MM_CMPINT_LT);
21 printf("%o\n", mask);
// fmask is the bitwise complement of mask (the lanes where a[i] >= b[i]).
22 fmask = _mm512_knot(mask);
23 printf("%o\n", fmask);
// Start from all zeros so lanes not selected by `mask` show up as 0 in c1.
24 result = _mm512_set1_epi32(0);
// First pass: only lanes selected by `mask` are written. The recorded output
// of this program is consistent with those lanes receiving s[a[i]].
25 result = _mm512_mask_permutevar_epi32(result, mask, va, source);
26 _mm512_store_epi32(c1, result);
// Second pass: `result` is passed in again as the pass-through value, so the
// lanes filled above are kept and the `fmask` lanes are filled using b as the
// index (the recorded output is consistent with s[b[i]]).
27 result = _mm512_mask_permutevar_epi32(result, fmask, vb, source);
28
29 _mm512_store_epi32(c, result);
30 }
// Print the intermediate result c1 on one line ...
31 for(int i = 0; i < 16; i++)
32 {
33 printf("%d ", c1[i]);
34 }
35 printf("\n");
// ... then the final result c (no trailing newline is printed).
36 for(int i = 0; i < 16; i++)
37 printf("%d ", c[i]);
38 return 0;
39 }
40 //输出结果:
41 //mask打印(%o,八进制):14725
42 //转换成二进制(补足16位):0001100111010101
43 //fmask打印(%o,八进制):163052
44 //转换成二进制:1110011000101010
45 //
46 //也就是说,a[0]与b[0]的比较结果位于mask的最低位(最右边一位),a[15]与b[15]的比较结果位于最高位(最左边一位);即数组load到va、vb之后,元素在向量中是按从右到左(低位到高位)的顺序排列的。
./cmp
17 0 18 0 18 0 24 17 26 0 0 20 19 0 0 0
14725
163052
17 20 18 18 18 23 24 17 26 28 23 20 19 23 18 27