说明:
一. 集群环境搭建分为:【NTP】、【MUNGE】、【Slurm】
网络拓扑
计算机名称 IP地址 角色
master 192.168.114.242 管理节点(master)
compute1 192.168.114.243 计算节点(compute1)
compute2 192.168.114.244 计算节点(compute2)
集群节点基本操作
关闭SELinux
# vi /etc/sysconfig/selinux #注释:修改内容
SELINUX=disabled
关闭 Firewall
# systemctl stop firewalld.service
# systemctl disable firewalld.service
# vi /etc/hosts #注释:增加内容
192.168.114.242 slurm-master
192.168.114.243 slurm-compute1
192.168.114.244 slurm-compute2
# reboot
创建 munge 和 slurm 用户
# export MUNGEUSER=1001 && groupadd -g $MUNGEUSER munge
# useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge
# export SLURMUSER=1002 && groupadd -g $SLURMUSER slurm
# useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm
安装依赖软件(NTP、MUNGE、Slurm 全部软件包安装)
# yum install -y epel-release axel yum-axelget
# yum install -y ntp ntpdate openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad python3-pip perl-ExtUtils-MakeMaker gcc rpm-build json-c json-c-devel http-parser http-parser-devel mysql-devel libaio net-tools epel-release openssh-clients munge munge-libs munge-devel rng-tools
集群节点安装 NTP
# systemctl enable ntpd.service
# ntpdate pool.ntp.org
# systemctl start ntpd
# rngd -r /dev/urandom
管理节点(master)安装 MUNGE
# /usr/sbin/create-munge-key -r
# dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
# chown munge: /etc/munge/munge.key && chmod 400 /etc/munge/munge.key
# scp -p /etc/munge/munge.key root@192.168.114.243:/etc/munge #注释:同步到计算节点
# scp -p /etc/munge/munge.key root@192.168.114.244:/etc/munge #注释:同步到计算节点
集群节点启动 MUNGE 服务
# chown -R munge: /etc/munge/ /var/log/munge/ && chmod 0700 /etc/munge/ /var/log/munge/
# systemctl enable munge
# systemctl start munge
# systemctl status munge
集群节点安装 Slurm
# cd /usr/local
# rpmbuild -ta --with mysql slurm-20.11.2.tar.bz2 #注释:编译Slurm
# cd /root/rpmbuild/RPMS/x86_64
# yum localinstall slurm-*.rpm -y #注释:安装Slurm
管理节点(master)配置 Slurm
# cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf
# cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
# cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf
# vi /etc/slurm/slurm.conf #注释:替换内容
SlurmctldHost=slurm-master
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=root
StateSaveLocation=/var/spool
ClusterName=cluster-slurm
JobCompHost=slurm-master
JobCompLoc=slurm_jobcomp_db
JobCompPass=123456
JobCompPort=3306
JobCompType=jobcomp/mysql
JobCompUser=root
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log
NodeName=slurm-compute[1-2] CPUs=4 RealMemory=8192 State=UNKNOWN
PartitionName=debug Nodes=slurm-compute[1-2] Default=YES MaxTime=INFINITE State=UP
# vi /etc/slurm/slurmdbd.conf #注释:替换内容
AuthInfo=/var/run/munge/munge.socket.2
AuthType=auth/munge
DbdHost=slurm-master
DebugLevel=info
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
SlurmUser=root
StoragePass=123456
StorageType=accounting_storage/mysql
StorageUser=root
StorageLoc=slurm_acct_db
# vi /etc/slurm/cgroup.conf #注释:替换内容
CgroupAutomount=yes
ConstrainCores=no
ConstrainRAMSpace=no
# scp -p /etc/slurm/slurm.conf root@192.168.114.243:/etc/slurm/ #注释:同步到计算节点
# scp -p /etc/slurm/slurm.conf root@192.168.114.244:/etc/slurm/
# scp -p /etc/slurm/slurmdbd.conf root@192.168.114.243:/etc/slurm/ #注释:同步到计算节点
# scp -p /etc/slurm/slurmdbd.conf root@192.168.114.244:/etc/slurm/
# scp -p /etc/slurm/cgroup.conf root@192.168.114.243:/etc/slurm/ #注释:同步到计算节点
# scp -p /etc/slurm/cgroup.conf root@192.168.114.244:/etc/slurm/
集群节点执行(创建 Slurm 的 spool 目录和日志文件,Slurm 默认不创建)
# mkdir /var/spool/slurmctld && chown slurm: /var/spool/slurmctld && chmod 755 /var/spool/slurmctld
# mkdir /var/log/slurm && touch /var/log/slurm/slurmctld.log && chown slurm: /var/log/slurm/slurmctld.log
# touch /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log && chown slurm: /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log
# mkdir /var/spool/slurmd && chown slurm: /var/spool/slurmd && chmod 755 /var/spool/slurmd
# touch /var/log/slurm/slurmd.log && chown slurm: /var/log/slurm/slurmd.log
# touch /var/log/slurm/slurmdbd.log && chown slurm: /var/log/slurm/slurmdbd.log
# touch /var/log/slurm/slurm.log && chown slurm: /var/log/slurm/slurm.log
管理节点(master)安装 MySQL
# rpm -e --nodeps mariadb-devel-5.5.68-1.el7.x86_64 mariadb-libs-5.5.68-1.el7.x86_64
# cd /usr/local
# tar -xvf mysql-5.7.28-1.el7.x86_64.rpm-bundle.tar
# rpm -ivh mysql-community-common-5.7.28-1.el7.x86_64.rpm
# rpm -ivh mysql-community-libs-5.7.28-1.el7.x86_64.rpm
# rpm -ivh mysql-community-client-5.7.28-1.el7.x86_64.rpm
# rpm -ivh mysql-community-server-5.7.28-1.el7.x86_64.rpm
# rpm -ivh mysql-community-devel-5.7.28-1.el7.x86_64.rpm
# rpm -ivh mysql-community-libs-compat-5.7.28-1.el7.x86_64.rpm
# systemctl start mysqld.service
# grep "password" /var/log/mysqld.log # 查询mysql root初始密码
# mysql -u root -p # 登录 MySQL
# set global validate_password_policy=LOW;
# set global validate_password_length=6;
# ALTER USER 'root'@'localhost' IDENTIFIED BY '123456';
创建slurm的数据库
# create database slurm_acct_db;
# create database slurm_jobcomp_db;
设置主机访问虚拟机 mysql,如果不是在虚拟机中运行,请忽略
# use mysql;
# select host from user where user='root';
# update user set host = '%' where user = 'root';
# select host from user where user='root';
# flush privileges;
管理节点(master)启动 Slurm 服务
# systemctl enable slurmdbd.service
# systemctl start slurmdbd.service
# systemctl status slurmdbd.service
# systemctl enable slurmctld.service
# systemctl start slurmctld.service
# systemctl status slurmctld.service
计算节点(compute)启动 Slurm 服务
# systemctl enable slurmd.service
# systemctl start slurmd.service
# systemctl status slurmd.service