1、准备3台 机器 并设置 hosts
echo 192.168.108.138 m1 >> /etc/hosts
echo 192.168.108.139 s2 >> /etc/hosts
echo 192.168.108.140 s3 >> /etc/hosts
echo $hostname > /etc/hostname
hostnamectl set-hostname $hostname
2、创建slurm 用户(id 一定要是 412)
export SLURMUSER=412
groupadd -g $SLURMUSER slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm
id slurm
3、关闭防火墙和 SELinux
systemctl stop firewalld
systemctl disable firewalld
vim /etc/selinux/config   # 将其中的 SELINUX 改为 SELINUX=disabled(永久关闭,需重启后生效)
setenforce 0
4、安装ohpc 源
yum install http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/x86_64/ohpc-release-1.3-1.el7.x86_64.rpm
5、安装依赖
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad -y
6、安装server 端(m1 机器)
yum -y install ohpc-slurm-server
编辑 /etc/slurm/slurm.conf,设置以下内容:
ControlMachine=m1
### 注意:CPUs = Sockets*CoresPerSocket*ThreadsPerCore(此处 1 = 1*1*1)
NodeName=m1,s[2-3] CPUs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Procs=1 State=IDLE
PartitionName=clients Nodes=s[2-3] Default=YES MaxTime=INFINITE State=UP
7. 安装 client 端 (s2,s3)
yum -y install ohpc-slurm-client
scp -pr $m1IP:/etc/slurm/slurm.conf /etc/slurm/
scp -pr $m1IP:/etc/munge/munge.key /etc/munge/munge.key
8 启动服务
server 端 (m1)
systemctl start munge
systemctl start slurmctld
client 端(s2,s3)
systemctl start munge
systemctl start slurmd
9、 sinfo 查看状态
# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
normal* up infinite 2 drain s[2-3]
### drain 状态修复
scontrol update NodeName=s[2-3] State=RESUME
# sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
normal* up infinite 2 idle s[2-3]
10、运行作业
# srun -N 2 -l hostname
0: s2
1: s3
11、 其他命令
squeue -a #查询作业
scancel [job id] #取消作业
# scontrol show config
# scontrol show partition
# scontrol show node
# scontrol show jobs
12 、对比 PBS(参考 https://blog.csdn.net/weixin_39497034/article/details/79100799)
Command PBS Pro SLURM
Submit batch job qsub [job script] sbatch [job script]
Request interactive shell qsub -I /bin/bash srun --pty /bin/bash
Delete job qdel [job id] scancel [job id]
Queue status qstat -q sinfo
Job status qstat -f [job id] scontrol show job [job id]
Node status pbsnodes [node name] scontrol show node [node id]