Slurm installation and usage (CentOS 7)

1. Prepare three machines and set up /etc/hosts

# Run on all three machines so every node can resolve the others
echo "192.168.108.138 m1" >> /etc/hosts
echo "192.168.108.139 s2" >> /etc/hosts
echo "192.168.108.140 s3" >> /etc/hosts

# On each machine, set its own hostname (m1, s2, or s3)
echo $hostname > /etc/hostname

hostnamectl set-hostname $hostname
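
To avoid repeating this by hand on every node, the same setup can be scripted; a minimal sketch, assuming root SSH access from the current machine to all three nodes:

# push the full hosts list to every node, then set each node's own hostname
for node in "192.168.108.138 m1" "192.168.108.139 s2" "192.168.108.140 s3"; do
    ip=${node%% *}; name=${node##* }
    ssh root@$ip 'printf "%s\n" "192.168.108.138 m1" "192.168.108.139 s2" "192.168.108.140 s3" >> /etc/hosts'
    ssh root@$ip "hostnamectl set-hostname $name"
done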

2. Create the slurm user (the UID must be identical on every node; 412 is used here)

export SLURMUSER=412 
groupadd -g $SLURMUSER slurm 

useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm

id slurm
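
If the user and group were created correctly, id slurm should report the same UID and GID on every node, along the lines of:

uid=412(slurm) gid=412(slurm) groups=412(slurm)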

3. Disable the firewall and SELinux

systemctl stop firewalld
systemctl disable firewalld
vim /etc/selinux/config    # set SELINUX=disabled for a permanent change (takes effect after a reboot)

setenforce 0               # disable immediately for the current session
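
Instead of editing the file in vim, the permanent setting can also be applied non-interactively; the following sed makes the same change (a reboot is still needed for it to take full effect):

sed -i 's/^SELINUX=.*/SELINUX=disabled/' /etc/selinux/config
getenforce    # Permissive after setenforce 0, Disabled after the reboot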

4. Install the OpenHPC (OHPC) repository

yum install http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/x86_64/ohpc-release-1.3-1.el7.x86_64.rpm
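
A quick way to confirm the repository was registered (repo ids may differ slightly between OpenHPC releases):

yum repolist | grep -i openhpc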

5. Install dependencies

yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad -y

6. Install the server side (on m1)

yum -y install ohpc-slurm-server
vim  /etc/slurm/slurm.conf

ControlMachine=m1
### CPUs = Sockets * CoresPerSocket * ThreadsPerCore
NodeName=m1,s[2-3] CPUs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Procs=1 State=IDLE

PartitionName=clients Nodes=s[2-3] Default=YES MaxTime=INFINITE State=UP
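
The CPUs, Sockets, CoresPerSocket, and ThreadsPerCore values must match the real hardware of the compute nodes, otherwise a node may end up in a drained or invalid state. Two ways to read the correct values before editing slurm.conf:

lscpu | egrep 'Socket|Core|Thread|^CPU\(s\)'    # hardware topology as seen by the OS
slurmd -C                                       # prints the node configuration slurmd detects (run on a node with slurmd installed)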

7. Install the client side (on s2 and s3)

yum -y  install ohpc-slurm-client

scp -pr $m1IP:/etc/slurm/slurm.conf /etc/slurm/
scp -pr $m1IP:/etc/munge/munge.key /etc/munge/munge.key
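
Munge authentication only works if the key is byte-identical on every node and readable only by the munge user; after copying it is worth fixing ownership and comparing checksums:

chown munge:munge /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
md5sum /etc/munge/munge.key    # should match the checksum of the key on m1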

8. Start the services

Server side (m1)
systemctl start munge

systemctl start slurmctld

Client side (s2, s3)
systemctl start munge
systemctl start slurmd
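
To have everything come back after a reboot, enable the same services in addition to starting them, and check that they are running:

# on m1
systemctl enable munge slurmctld
systemctl status slurmctld
# on s2 and s3
systemctl enable munge slurmd
systemctl status slurmd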

9. Check status with sinfo

# sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE   NODELIST
normal*      up   infinite      2  drain   s[2-3]

### Fixing nodes in the drain state

scontrol update NodeName=s[2-3] State=RESUME

# sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE   NODELIST
normal*      up   infinite      2  idle    s[2-3]
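
If a node drops back into the drain state, Slurm records the reason, which is worth reading before resuming it again; for example:

sinfo -R                              # drained/down nodes with their recorded reasons
scontrol show node s2 | grep Reason   # per-node detail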

10. Run jobs

# srun hostname
s2

# srun -N 2 -l hostname
0: s2

1: s3
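
Non-interactive jobs are submitted with sbatch and a job script; a minimal sketch (the script name and options are only illustrative):

cat > test.sh <<'EOF'
#!/bin/bash
#SBATCH --job-name=test
#SBATCH --nodes=2
#SBATCH --output=test_%j.out
srun hostname
EOF
sbatch test.sh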

11. Other commands

squeue -a    # list jobs

scancel [job id]    # cancel a job

# scontrol show config
# scontrol show partition
# scontrol show node

# scontrol show jobs
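
A typical job-management sequence that ties these together (the job id 100 is only a placeholder):

sbatch test.sh            # submit, note the reported job id
squeue -a                 # confirm it is pending or running
scontrol show job 100     # inspect details
scancel 100               # cancel it if needed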

12. Comparison with PBS (reference: https://blog.csdn.net/weixin_39497034/article/details/79100799)
Command                     PBS Pro                  SLURM
Submit batch job            qsub [job script]        sbatch [job script]
Request interactive shell   qsub -I /bin/bash        srun --pty /bin/bash
Delete job                  qdel [job id]            scancel [job id]
Queue status                qstat -q                 sinfo
Job status                  qstat -f [job id]        scontrol show job [job id]
Node status                 pbsnodes [node name]     scontrol show node [node id]

