多机使用dockerswarm部署多个容器并行openmpi

将镜像导出到node节点,并开启nfs

# On the manager node: export the image to a tarball.
# (-o is the documented form of the original "docker save > file IMAGE"
#  redirection; both produce the same archive.)
docker save -o openmpi.tar mrwangwei/centos-openmpi:v1
scp openmpi.tar node1:/usr/local/src/code/
scp openmpi.tar node2:/usr/local/src/code/
# Run this on node1 and node2 (after the scp) to import the image there.
docker load < openmpi.tar

# Install NFS utilities; the manager node acts as the NFS server.
yum install nfs-utils -y

vi /etc/exports
# Export line to add. NOTE(review): the original used 192.168.220.0/16,
# whose prefix length does not match the network address (exportfs warns
# about it); /24 matches the 192.168.220.x hosts used throughout.
/usr/local/src/code 192.168.220.0/24(rw,no_root_squash)
systemctl restart rpcbind
# On CentOS 7+ the unit is usually "nfs-server"; "nfs" works as an alias
# on older releases — adjust to your distribution.
systemctl restart nfs

初始化docker swarm环境

docker swarm init --advertise-addr 192.168.220.132

--------------------------------------------------------------------------
Swarm initialized: current node (dxn1zf6l61qsb1josjja83ngz) is now a manager.

To add a worker to this swarm, run the following command:

docker swarm join \
--token SWMTKN-1-49nj1cmql0jkz5s954yi3oex3nedyz0fb0xx14ie39trti4wxv-8vxv8rssmk743ojnwacrr2e7c \
192.168.220.132:2377

To add a manager to this swarm, run 'docker swarm join-token manager' and follow the instructions.
--------------------------------------------------------------------------

# Run the join command (printed by "docker swarm init" above) on each worker node.
docker swarm join \
--token SWMTKN-1-49nj1cmql0jkz5s954yi3oex3nedyz0fb0xx14ie39trti4wxv-8vxv8rssmk743ojnwacrr2e7c \
192.168.220.132:2377

# Back on the manager: verify that all nodes have joined the swarm.
docker node list

使用docker-compose文件创建容器

vi docker-compose.yaml

version: "3.2"
services:
  # Single "master" container pinned to the swarm manager node; it only
  # runs sshd so MPI can reach it over SSH.
  mpi_master:
    image: mrwangwei/centos-openmpi:v1
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    networks:
      - mpi_overlay
    volumes:
      - "mpi_code:/usr/local/src/code"
    ports:
      - "22"   # publish sshd on an ephemeral host port

  # Worker containers (replicas: 2), restricted to worker nodes.
  mpi_node:
    image: mrwangwei/centos-openmpi:v1
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 2
      placement:
        constraints:
          - node.role == worker
    volumes:
      - "mpi_code:/usr/local/src/code"
    networks:
      - mpi_overlay

networks:
  # Overlay network shared by master and workers.
  mpi_overlay:

volumes:
  # Shared code directory backed by the manager's NFS export, so every
  # container sees the same /usr/local/src/code.
  mpi_code:
    driver: local
    driver_opts:
      type: "nfs"
      o: "addr=192.168.220.132,rw"
      device: ":/usr/local/src/code"

使用 docker stack 将服务栈部署到 swarm 集群

docker stack deploy --compose-file docker-compose.yaml example

docker service ls

docker service ps --no-trunc {serviceName}

# Enter the master container. NOTE(review): the original line was missing
# the container name entirely; look up the running container of the
# example_mpi_master service first.
docker exec -it $(docker ps -q -f name=example_mpi_master) /bin/bash
source /etc/profile
cd /usr/local/src/code/
# Resolve all tasks of the mpi_node service to build the MPI machinefile;
# "dig +short" prints just the IPs, replacing the grep|awk pipeline.
# NOTE(review): the stack was deployed as "example" — if tasks.mpi_node
# does not resolve, try tasks.example_mpi_node.
dig +short tasks.mpi_node > machines
cat machines
# Set up password-less SSH from master to each worker task (one
# ssh-copy-id per IP listed in machines).
ssh-keygen -t rsa
ssh-copy-id ...
ssh-copy-id ...
mpicc main.c
mpirun -np 10 --oversubscribe --allow-run-as-root --machinefile machines --prefix /usr/local/openmpi a.out

docker stack rm example

main.c

#include 
#include 

int main(int argc, char *argv[])
{
        int myrank, nprocs;
        char name[10];
        int name_len;
        int i,j,k,sum=0;
        MPI_Init(&argc, &argv);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
        MPI_Get_processor_name(name, &name_len);
        printf("core[%3d] of [%3d] in {%s},  in dir[ ~/托腮etMPI ]\n", myrank, nprocs, name);
        /* for(i =0 ; i<10000; i++) */
                /* for(j=0 ;j< myrank ; j++) */
                        /* sum +=j; */
        /* printf("core[%3d], sum= %12d\n", myrank,sum ); */

        MPI_Finalize();

        return 0;
}

lammps测试
docker-compose.yaml

```yaml
# Compose stack for the LAMMPS/DeepMD test — same topology as the OpenMPI
# example above: one sshd "master" on the manager node, two sshd workers,
# all sharing an NFS-backed volume for the simulation data.
version: "3"
services:
  mpi_master:
    image: lammps-gpu:deepmd
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    networks:
      - mpi_overlay
    volumes:
      - "mpi_code:/root/lammps-data"
    # Start the shell directly in the shared data directory.
    working_dir: /root/lammps-data/
  
  mpi_node:
    image: lammps-gpu:deepmd
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 2
      placement:
        constraints:
          - node.role == worker
    volumes:
      - "mpi_code:/root/lammps-data"
    networks:
        - mpi_overlay

networks:
  mpi_overlay:

volumes:
  # Shared data volume mounted from the manager's NFS export.
  mpi_code:
    driver: local
    driver_opts:
      type: "nfs"
      o: "addr=192.168.220.132,rw"
      device: ":/root/lammps-data"
```

```shell
docker stack deploy --compose-file docker-compose.yaml lammps
docker service ls
# Enter the master container for configuration. NOTE(review): the original
# text said to do this but omitted the actual command.
docker exec -it $(docker ps -q -f name=lammps_mpi_master) /bin/bash
source activate dpdev
cd /root/lammps-data/CH.airebo
yum -y install bind-utils
# Build the MPI machinefile from the DNS records of the worker tasks.
dig tasks.mpi_node | grep ^tasks|awk '{print $5}' > machines
ssh-keygen -t rsa -f ~/.ssh/id_rsa -P ''
# Push the key to the first two workers; awk 'NR==n' selects line n of the
# machinefile directly (no "cat | awk" needed).
ssh-copy-id $(awk 'NR==1' machines)
ssh-copy-id $(awk 'NR==2' machines)
# CPU run with 56 ranks, then a run using the gpu package (-pk gpu 2
# requests 2 GPUs).
mpirun --machinefile machines -np 56 /opt/lammps-3Mar20/src/lmp_mpi -in opt.in
mpirun --machinefile machines -np 28 /opt/lammps-3Mar20/src/lmp_mpi -sf gpu -pk gpu 2 -in gpu-opt.in
```

https://github.com/moby/moby/issues/37855
https://my.oschina.net/u/1787735/blog/4374958

你可能感兴趣的:(多机使用dockerswarm部署多个容器并行openmpi)