多机使用 Docker Swarm 部署多个容器并行运行 OpenMPI
2020-11-07 本文已影响0人
全村滴希望
将镜像导出到node节点,并开启nfs
# Export the image to a tarball and copy it to node1 and node2.
docker save > openmpi.tar mrwangwei/centos-openmpi:v1
scp openmpi.tar node1:/usr/local/src/code/
scp openmpi.tar node2:/usr/local/src/code/
# Import the tarball into the Docker image store
# (presumably run on node1 and node2 after the copy -- TODO confirm).
docker load < openmpi.tar
# Install the NFS utilities and edit the export table.
yum install nfs-utils -y
vi /etc/exports
# Line to put into /etc/exports: read-write export, root is not squashed.
# NOTE(review): a /16 prefix spans 192.168.0.0-192.168.255.255, which is wider
# than the 192.168.220.x addresses used in this guide -- confirm it is intended.
/usr/local/src/code 192.168.220.0/16(rw,no_root_squash)
# Restart the rpcbind and nfs services after editing the export table.
systemctl restart rpcbind
systemctl restart nfs
初始化docker swarm环境
# Initialise the swarm on this host (it becomes a manager) and advertise
# 192.168.220.132 as the address other nodes use to reach it.
docker swarm init --advertise-addr 192.168.220.132
--------------------------------------------------------------------------
Swarm initialized: current node (dxn1zf6l61qsb1josjja83ngz) is now a manager.
To add a worker to this swarm, run the following command:
docker swarm join \
--token SWMTKN-1-49nj1cmql0jkz5s954yi3oex3nedyz0fb0xx14ie39trti4wxv-8vxv8rssmk743ojnwacrr2e7c \
192.168.220.132:2377
To add a manager to this swarm, run 'docker swarm join-token manager' and follow the instructions.
--------------------------------------------------------------------------
# Add a host to the swarm as a worker, using the token and manager address
# printed by `docker swarm init`.
docker swarm join --token SWMTKN-1-49nj1cmql0jkz5s954yi3oex3nedyz0fb0xx14ie39trti4wxv-8vxv8rssmk743ojnwacrr2e7c 192.168.220.132:2377
# List the nodes that are part of the swarm.
docker node ls
使用docker-compose文件创建容器
vi docker-compose.yaml
# Swarm stack for an OpenMPI cluster: one master task on the manager node and
# two worker tasks on worker nodes, all sharing one NFS-backed code directory.
version: "3.2"

services:
  # Container from which mpirun is launched; pinned to the swarm manager.
  mpi_master:
    image: mrwangwei/centos-openmpi:v1
    # Keep sshd in the foreground so the container stays alive.
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    networks:
      - mpi_overlay
    volumes:
      - "mpi_code:/usr/local/src/code"
    ports:
      # Container port only; the published port is assigned automatically.
      - "22"

  # Compute containers reached over SSH by mpirun; scheduled on workers only.
  mpi_node:
    image: mrwangwei/centos-openmpi:v1
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 2
      placement:
        constraints:
          - node.role == worker
    volumes:
      - "mpi_code:/usr/local/src/code"
    networks:
      - mpi_overlay

networks:
  # Network shared by both services, created with default settings.
  mpi_overlay:

volumes:
  # NFS export served by 192.168.220.132, mounted through the local driver.
  mpi_code:
    driver: local
    driver_opts:
      type: "nfs"
      o: "addr=192.168.220.132,rw"
      device: ":/usr/local/src/code"
启动部署docker swarm
# Deploy the stack described by docker-compose.yaml under the name "example".
docker stack deploy --compose-file docker-compose.yaml example
# Inspect the services and the tasks of one service.
docker service ls
docker service ps --no-trunc {serviceName}
# Open a shell in a running container; replace {containerId} with the ID or
# name of the target container (presumably the mpi_master one -- the LAMMPS
# section of this guide configures things from the master container).
docker exec -it {containerId} /bin/bash

# --- the following commands run inside the container ---
source /etc/profile
cd /usr/local/src/code/
# Look up tasks.mpi_node in DNS and write the 5th field of each answer line
# (the address) into the MPI machine file.
dig tasks.mpi_node | grep ^tasks|awk '{print $5}' > machines
cat machines
# Create an SSH key pair and install the public key on every host listed in
# the machine file.
ssh-keygen -t rsa
for host in $(cat machines); do
    ssh-copy-id "$host"
done
# Compile the test program (output: a.out) and start 10 processes across the
# hosts in the machine file.
mpicc main.c
mpirun -np 10 --oversubscribe --allow-run-as-root --machinefile machines --prefix /usr/local/openmpi a.out

# --- back on the host: remove the stack when finished ---
docker stack rm example
main.c
#include <stdio.h>
#include <mpi.h>
int main(int argc, char *argv[])
{
int myrank, nprocs;
char name[10];
int name_len;
int i,j,k,sum=0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Get_processor_name(name, &name_len);
printf("core[%3d] of [%3d] in {%s}, in dir[ ~/托腮etMPI ]\n", myrank, nprocs, name);
/* for(i =0 ; i<10000; i++) */
/* for(j=0 ;j< myrank ; j++) */
/* sum +=j; */
/* printf("core[%3d], sum= %12d\n", myrank,sum ); */
MPI_Finalize();
return 0;
}
lammps测试
docker-compose.yaml
```yaml
# Swarm stack for running LAMMPS over MPI: one master task on the manager node
# and two worker tasks on worker nodes, sharing one NFS-backed data directory.
version: "3"

services:
  # Container from which mpirun is launched; pinned to the swarm manager.
  mpi_master:
    image: lammps-gpu:deepmd
    # Keep sshd in the foreground so the container stays alive.
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 1
      placement:
        constraints:
          - node.role == manager
    networks:
      - mpi_overlay
    volumes:
      - "mpi_code:/root/lammps-data"
    working_dir: /root/lammps-data/

  # Compute containers reached over SSH by mpirun; scheduled on workers only.
  mpi_node:
    image: lammps-gpu:deepmd
    command: /bin/bash -c "/usr/sbin/sshd -D"
    deploy:
      replicas: 2
      placement:
        constraints:
          - node.role == worker
    volumes:
      - "mpi_code:/root/lammps-data"
    networks:
      - mpi_overlay

networks:
  # Network shared by both services, created with default settings.
  mpi_overlay:

volumes:
  # NFS export served by 192.168.220.132, mounted through the local driver.
  mpi_code:
    driver: local
    driver_opts:
      type: "nfs"
      o: "addr=192.168.220.132,rw"
      device: ":/root/lammps-data"
```
```shell
# Deploy the stack under the name "lammps" and list its services.
docker stack deploy --compose-file docker-compose.yaml lammps
docker service ls

# Enter the master container and do the configuration from there.
source activate dpdev
cd /root/lammps-data/CH.airebo
# bind-utils is the package that ships dig.
yum install -y bind-utils
# Write the address (5th field) of every tasks.mpi_node DNS answer into the
# MPI machine file.
dig tasks.mpi_node | grep ^tasks | awk '{print $5}' > machines
# Create a key pair without a passphrase and install the public key on the
# first and second host of the machine file.
ssh-keygen -t rsa -f ~/.ssh/id_rsa -P ''
ssh-copy-id $(awk 'NR==1' machines)
ssh-copy-id $(awk 'NR==2' machines)
# Run with 56 MPI processes using opt.in.
mpirun --machinefile machines -np 56 /opt/lammps-3Mar20/src/lmp_mpi -in opt.in
# Run with 28 MPI processes using the gpu suffix and `-pk gpu 2`, with gpu-opt.in.
mpirun --machinefile machines -np 28 /opt/lammps-3Mar20/src/lmp_mpi -sf gpu -pk gpu 2 -in gpu-opt.in
```
https://github.com/moby/moby/issues/37855
https://my.oschina.net/u/1787735/blog/4374958