Vector-addition examples on the MIC platform, including an OpenMP version, a MIC offload version, a MIC native version, a CPU+MIC offload version, and a CPU+MIC symmetric version, plus a guide to MIC network configuration. I hope this is of some help to readers who want to learn MIC programming.
1. Serial vector addition
vectoradd_cpu.cpp
#include <stdio.h>
#include <stdlib.h>

#define N 200000

void VecAdd_cpu(float* A, float* B, float* C, int size)
{
    for(int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    srand(2013);
    for(i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    VecAdd_cpu(A, B, C, N);

    for(i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -o vectoradd_cpu vectoradd_cpu.cpp
Run: ./vectoradd_cpu
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
2. OpenMP multithreaded vector addition
vectoradd_omp.cpp
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 200000

void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for(int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    srand(2013);
    for(i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    VecAdd_omp(A, B, C, N);

    for(i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -openmp -o vectoradd_omp vectoradd_omp.cpp
Run: ./vectoradd_omp
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
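By default the OpenMP runtime usually launches one thread per logical core. If you want to experiment with the thread count, the standard OMP_NUM_THREADS environment variable can be set before the run (a minimal sketch; the value 16 is just an arbitrary example):
export OMP_NUM_THREADS=16
./vectoradd_omp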
3. MIC offload multithreaded vector addition
vectoradd_mic_offload.cpp
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 200000

__attribute__((target(mic))) void offload_check(void)
{
#ifdef __MIC__
    printf("Code running on MIC\n");
#else
    printf("Code running on host\n");
#endif
}

__attribute__((target(mic)))
void VecAdd_mic(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for(int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    srand(2013);
    for(i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    #pragma offload target(mic) in(A,B: length(N)) out(C: length(N))
    {
        offload_check();
        VecAdd_mic(A, B, C, N);
    }

    for(i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -openmp -o vectoradd_mic_offload vectoradd_mic_offload.cpp
Run: ./vectoradd_mic_offload
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
Code running on MIC
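To check how much data is actually transferred between the host and the coprocessor, the Intel offload runtime can print a transfer report; as far as I recall this is controlled by the OFFLOAD_REPORT environment variable (consult the compiler documentation for the exact report levels):
export OFFLOAD_REPORT=2
./vectoradd_mic_offload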
4. MIC native multithreaded vector addition
vectoradd_mic_native.cpp
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define N 200000

void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for(int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i;
    int size = N * sizeof(float);

    float *A, *B, *C;
    A = (float*)malloc(size);
    B = (float*)malloc(size);
    C = (float*)malloc(size);

    srand(2013);
    for(i = 0; i < N; i++)
    {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    VecAdd_omp(A, B, C, N);

    for(i = 0; i < N; i += 10000)
    {
        printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
    }

    free(A);
    free(B);
    free(C);
}
Compile: icpc -O3 -openmp -mmic -o vectoradd_mic_native vectoradd_mic_native.cpp
Run:
scp /opt/intel/composer_xe_2013.0.079/compiler/lib/mic/libiomp5.so mic0:/tmp/
scp vectoradd_mic_native mic0:/tmp/
ssh mic0    (log in to the MIC card)
cd /tmp
export LD_LIBRARY_PATH=/tmp/    (set the library path; if libiomp5.so was copied to /lib64 instead, this step can be skipped)
./vectoradd_mic_native
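Alternatively, instead of logging in interactively, the native binary can be launched from the host in a single ssh command (a sketch assuming the files were copied to /tmp as above):
ssh mic0 "cd /tmp && LD_LIBRARY_PATH=/tmp ./vectoradd_mic_native"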
Output:
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
5. CPU+MIC offload parallel vector addition (MPI)
vectoradd_cpu_mic_offload.cpp
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 200000

__attribute__((target(mic))) void offload_check(int rankID)
{
#ifdef __MIC__
    printf("RankID %d running on MIC\n", rankID);
#else
    printf("RankID %d running on host\n", rankID);
#endif
}

__attribute__((target(mic)))
void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for(int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i, M;
    int myrank, root = 0, totalrank;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &totalrank);

    if(myrank == root)
        printf("total rank is:%d\n", totalrank);
    M = N / (totalrank - 1);

    if(myrank == root)
    {
        float *A, *B, *C;
        int size = N * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        srand(2013);
        for(i = 0; i < N; i++)
        {
            A[i] = rand() % 10;
            B[i] = rand() % 10;
        }

        for(i = 1; i < totalrank; i++)
        {
            MPI_Send(A + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
            MPI_Send(B + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
        }

        for(i = 1; i < totalrank; i++)
        {
            MPI_Recv(C + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD, &status);
        }
        for(i = 0; i < N; i += 10000)
        {
            printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
        }
        free(A);
        free(B);
        free(C);
    }
    else
    {
        float *A, *B, *C;
        int size = M * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        MPI_Recv(A, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);
        MPI_Recv(B, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);
        if(myrank == 1)        // CPU rank
        {
            offload_check(myrank);
            VecAdd_omp(A, B, C, M);
        }
        else if(myrank == 2)   // MIC rank
        {
            #pragma offload target(mic) in(A,B: length(M)) out(C: length(M))
            {
                offload_check(myrank);
                VecAdd_omp(A, B, C, M);
            }
        }
        MPI_Send(C, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD);
        free(A);
        free(B);
        free(C);
    }
    MPI_Finalize();
}
Compile: mpiicpc -O3 -openmp -o vectoradd_cpu_mic_offload vectoradd_cpu_mic_offload.cpp
Run: mpirun -np 3 ./vectoradd_cpu_mic_offload
// Rank 0 is the master process, rank 1 is the CPU compute process, and rank 2 is the MIC compute process (via offload).
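For the run above, N = 200000 and totalrank = 3, so each of the two worker ranks receives M = 200000 / 2 = 100000 elements; note that this simple decomposition assumes N is evenly divisible by (totalrank - 1).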
Output:
total rank is:3
RankID 1 running on host
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
RankID 2 running on MIC
6. CPU+MIC symmetric parallel vector addition (MPI)
vectoradd_cpu_mic_symmetric.cpp
#include <mpi.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 200000

void offload_check(int rankID)
{
#ifdef __MIC__
    printf("RankID %d running on MIC\n", rankID);
#else
    printf("RankID %d running on host\n", rankID);
#endif
}

void VecAdd_omp(float* A, float* B, float* C, int size)
{
    #pragma omp parallel for
    for(int i = 0; i < size; i++)
        C[i] = A[i] + B[i];
}

int main(int argc, char** argv)
{
    int i, M;
    int myrank, root = 0, totalrank;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &totalrank);

    if(myrank == root)
        printf("total rank is:%d\n", totalrank);
    M = N / (totalrank - 1);

    if(myrank == root)
    {
        float *A, *B, *C;
        int size = N * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        srand(2013);
        for(i = 0; i < N; i++)
        {
            A[i] = rand() % 10;
            B[i] = rand() % 10;
        }

        for(i = 1; i < totalrank; i++)
        {
            MPI_Send(A + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
            MPI_Send(B + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD);
        }

        for(i = 1; i < totalrank; i++)
        {
            MPI_Recv(C + (i-1)*M, M, MPI_FLOAT, i, i, MPI_COMM_WORLD, &status);
        }
        for(i = 0; i < N; i += 10000)
        {
            printf("%6d: %4.2f + %4.2f = %4.2f\n", i, A[i], B[i], C[i]);
        }
        free(A);
        free(B);
        free(C);
    }
    else
    {
        float *A, *B, *C;
        int size = M * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);

        MPI_Recv(A, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);
        MPI_Recv(B, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD, &status);

        offload_check(myrank);
        VecAdd_omp(A, B, C, M);

        MPI_Send(C, M, MPI_FLOAT, 0, myrank, MPI_COMM_WORLD);

        free(A);
        free(B);
        free(C);
    }
    MPI_Finalize();
}
Compile (build one binary for the host and, with -mmic, a separate binary for the MIC card):
mpiicpc -O3 -openmp -o vectoradd_cpu_mic_symmetric vectoradd_cpu_mic_symmetric.cpp
mpiicpc -O3 -openmp -mmic -o vectoradd_cpu_mic_symmetric.out vectoradd_cpu_mic_symmetric.cpp
Run:
scp vectoradd_cpu_mic_symmetric.out mic0:/tmp
scp /opt/intel/impi/4.1.0.024/mic/lib/*.so* mic0:/tmp
scp /opt/intel/impi/4.1.0.024/mic/bin/* mic0:/bin/
export MIC_LD_LIBRARY_PATH=/tmp
export I_MPI_MIC=enable
mpiexec.hydra -host 192.168.1.100 -n 2 ./vectoradd_cpu_mic_symmetric : -host 192.168.1.101 -n 1 -wdir /tmp /tmp/vectoradd_cpu_mic_symmetric.out
// 192.168.1.100 is the host IP and 192.168.1.101 is the MIC0 IP; see the appendix for the MIC network configuration.
// Rank 0 is the master process, rank 1 is the CPU compute process, and rank 2 is the MIC compute process.
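Before launching, it may save time to verify that the card answers on its IP and that the MIC-side binary was copied correctly (optional sanity checks):
ping -c 1 192.168.1.101
ssh mic0 ls /tmp/vectoradd_cpu_mic_symmetric.out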
Output:
total rank is:3
RankID 1 running on host
RankID 2 running on MIC
0: 9.00 + 6.00 = 15.00
10000: 7.00 + 0.00 = 7.00
20000: 1.00 + 5.00 = 6.00
30000: 7.00 + 6.00 = 13.00
40000: 7.00 + 9.00 = 16.00
50000: 8.00 + 1.00 = 9.00
60000: 8.00 + 8.00 = 16.00
70000: 0.00 + 1.00 = 1.00
80000: 4.00 + 7.00 = 11.00
90000: 0.00 + 4.00 = 4.00
100000: 7.00 + 6.00 = 13.00
110000: 3.00 + 6.00 = 9.00
120000: 2.00 + 0.00 = 2.00
130000: 8.00 + 9.00 = 17.00
140000: 9.00 + 3.00 = 12.00
150000: 1.00 + 6.00 = 7.00
160000: 0.00 + 6.00 = 6.00
170000: 6.00 + 0.00 = 6.00
180000: 4.00 + 6.00 = 10.00
190000: 0.00 + 9.00 = 9.00
7. Appendix
MIC LAN IP configuration
First, make sure the bridge-utils package is installed:
# rpm -qa | grep bridge-utils
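If the package is not installed, it can usually be added with the distribution's package manager, e.g. on RHEL/CentOS:
yum install bridge-utils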
Then stop the NetworkManager service:
/etc/init.d/NetworkManager stop
chkconfig --level 345 NetworkManager off
Edit the configuration file (be sure to note down the MAC address in ifcfg-eth1 first!):
cd /etc/sysconfig/network-scripts/
vim ifcfg-eth1
Change it to:
DEVICE="eth1"
NM_CONTROLLED="yes"
ONBOOT=yes
TYPE=Ethernet
BRIDGE=br0
HWADDR=6C:92:BF:00:43:CB
NAME="System eth1"
UUID=9c92fad9-6ecb-3e6c-eb4d-8a47c6f50c04
The last two lines (NAME and UUID) are optional: keep them if they are present, otherwise they can be omitted.
Create and edit ifcfg-br0:
vim ifcfg-br0
DEVICE=br0
TYPE=Bridge
ONBOOT=yes
DELAY=0
NM_CONTROLLED="no"
MTU=9000
NOZEROCONF=yes
BOOTPROTO=static
IPADDR=192.168.1.100
NETMASK=255.255.255.0
Assign IPADDR manually and make sure it does not collide with other hosts (leave addresses free for the MIC cards).
Restart the network service:
service network restart
After the restart, if ifconfig shows br0 with the configured address and eth1 no longer has an address, the bridge is set up correctly.
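The bridge-utils package also provides brctl, which gives an additional check that eth1 is attached to the bridge:
brctl show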
Set the IP addresses of the MIC cards:
First stop the mpss service:
service mpss stop
If micctrl --initdefaults has never been run, run it now. (Do not run it repeatedly; once, when the card is first installed, is enough.)
Edit the configuration files:
cd /etc/sysconfig/mic
vim default.conf
Remove the "#" from the line #BridgeName micbr0 and change it to the bridge name configured above, e.g.:
BridgeName br0
Change the Subnet 172.31 field to the IP address you want for the first MIC card; that is, if the host is 192.168.1.100, set it to 192.168.1.101:
Subnet 192.168.1.101
Add two lines:
MTUsize 9000
NetBits 24
Set the IP address of each MIC card:
vim mic0.conf and add:
MicIPaddress 192.168.1.101
vim mic1.conf and add:
MicIPaddress 192.168.1.102
Regenerate the MIC configuration:
micctrl --resetconfig
Restart the MIC service:
service mpss restart
Once two nodes are configured this way, they can communicate with each other. Taking the current setup as an example: node01's host side is 192.168.1.100 and its two cards are .101 and .102; node02's host side is 192.168.1.103 and its two cards are .104 and .105. Every card and every host can now ping each other and log in to each other over ssh, but logins between different nodes still require a password.
Passwordless login:
Generate an ssh key pair with ssh-keygen (covered elsewhere, so not repeated here).
scp the local ~/.ssh/id_rsa.pub (or the DSA equivalent) to the remote host and rename it, e.g. to id_rsa.node02.pub.
cat id_rsa.node02.pub >> ~/.ssh/authorized_keys
At this point you can log in to the remote host from the local machine without a password.
Then stop the mpss service, run micctrl --resetconfig, and restart mpss; after that you can also log in to the remote MIC cards without a password. A consolidated sketch of these steps is shown below.
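Putting the passwordless-login steps together, a minimal sketch might look like this (node02 and the id_rsa.node02.pub name are just the example names used above; adapt them to your own hosts):
ssh-keygen -t rsa                                                  # accept the defaults; empty passphrase
scp ~/.ssh/id_rsa.pub node02:~/id_rsa.node02.pub                   # copy the public key to the remote host
ssh node02 "cat ~/id_rsa.node02.pub >> ~/.ssh/authorized_keys"     # last time a password is needed
service mpss stop
micctrl --resetconfig
service mpss start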
Recovering from a broken bridge configuration:
First delete or rename ifcfg-br0.
Edit /etc/udev/rules.d/70-persistent-net.rules and comment out the trailing entries that are unneeded or have duplicate MAC addresses.
Edit ifcfg-eth1:
Comment out (with #) the line BRIDGE=br0.
Add BOOTPROTO=dhcp (or assign an IP address manually).
Reboot the node (service network restart alone does not seem to be enough).
After the reboot, eth1 should have an IP address obtained directly from the router.
Now restore ifcfg-br0,
edit ifcfg-eth1 to restore BRIDGE and comment out BOOTPROTO,
and reboot the node again.
Example ifcfg-br0 and ifcfg-eth1 files:
[root@node01 network-scripts]# cat ifcfg-br0
DEVICE=br0
TYPE=Bridge
ONBOOT=yes
DELAY=0
NM_CONTROLLED="no"
MTU=9000
NOZEROCONF=yes
BOOTPROTO=static
IPADDR=192.168.1.100
NETMASK=255.255.255.0
[root@node01 network-scripts]# cat ifcfg-eth1
DEVICE="eth1"
NM_CONTROLLED="yes"
ONBOOT=yes
DELAY=5
HWADDR=6C:92:BF:00:15:C5
TYPE=Ethernet
BRIDGE=br0
#BOOTPROTO=dhcp
#DEFROUTE=yes
#PEERDNS=yes
#PEERROUTES=yes
#IPV4_FAILURE_FATAL=yes
#IPV6INIT=no
MTU=9000
NAME="System eth1"
UUID=9c92fad9-6ecb-3e6c-eb4d-8a47c6f50c04
Source code:
http://hpcbbs.it168.com/forum.php?mod=attachment&aid=MzIyMnwxNDUxZTBkY3wxMzYxNDM1NjAyfDIwNzYxMDIwfDczMzc%3D