头文件:<immintrin.h>
编译指令:icc -o test test.c(默认编译成mic程序)
设置线程数量:
程序运行之前在命令行输入:
export num_threads=240
offload模式(主从模式):
1.变量/函数声明:
#pragma offload_attribute(push, target(mic))
......
#pragma offload_attribute(pop)
2.上传变量:
#pragma offload target(mic) in(......)out(...)
3.判断是否有MIC卡可用:
#ifdef __MIC__
#else ...
#endif
4.在函数定义之前需要加上:
__attribute__((target(mic)))
以表明函数的类型。
向量化:
数据类型:__m512 Data Type
如:__m512i(int 32/64 vector), __m512(float 32 vector), __m512d(float 64 vector)
1.变量声明:__m512i v1, v2, v3;
2.变量对齐:
windows:__declspec(align(n, off))
linux:__attribute__((align(n, off)))
如:__attribute__((aligned(64)))
permutevar函数:
1 #include<stdio.h> 2 #include<stdlib.h> 3 #include<immintrin.h> 4 5 int main(void) 6 { 7 int a[16] = {1, 9, 2, 5, 2, 7, 8, 1, 10, 14, 15, 4, 3, 7, 12, 11}; 8 int b[16] = {3, 4, 6, 2, 5, 7, 9, 4, 13, 12, 7, 8, 15, 7, 2, 11}; 9 int c[16]; 10 int s[16] = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; 11 int c1[16]; 12 #pragma offload target(mic)in(a, b) 13 { 14 __m512i va, vb, result, source; 15 __mmask16 mask, fmask; 16 va = _mm512_load_epi32(a); 17 vb = _mm512_load_epi32(b); 18 source = _mm512_load_epi32(s); 19 20 mask = _mm512_cmp_epi32_mask(va, vb, _MM_CMPINT_LT); 21 printf("%o\n", mask); 22 fmask = _mm512_knot(mask); 23 printf("%o\n", fmask); 24 result = _mm512_set1_epi32(0); 25 result = _mm512_mask_permutevar_epi32(result, mask, va, source); 26 _mm512_store_epi32(c1, result); 27 result = _mm512_mask_permutevar_epi32(result, fmask, vb, source); 28 29 _mm512_store_epi32(c, result); 30 } 31 for(int i = 0; i < 16; i++) 32 { 33 printf("%d ", c1[i]); 34 } 35 printf("\n"); 36 for(int i = 0; i < 16; i++) 37 printf("%d ", c[i]); 38 return 0; 39 } 40 //输出结果: 41 //mask打印:14725 42 //转换成二进制:1100111010101 43 //fmask打印:163052 44 //转换成二进制:1110011000101010 45 // 46 //也就是说在将a, b的值load到va,vb中时,是按照从右到左的顺序。
./cmp
17 0 18 0 18 0 24 17 26 0 0 20 19 0 0 0
14725
163052
17 20 18 18 18 23 24 17 26 28 23 20 19 23 18 27