Slurm22.11.2 入门教程01-Centos7.6 安装 Slurm


说明:

一. 集群环境搭建分为:【NTP】、【MUNGE】、【Slurm】

网络拓扑

计算机名称                IP地址                           角色

 master                     192.168.114.242           管理节点(master)

 compute1                 192.168.114.243           计算节点(compute1)

 compute2                 192.168.114.244           计算节点(compute2)


集群节点基本操作


关闭SELinux


         # vi /etc/sysconfig/selinux              #注释:修改内容

                SELINUX=disabled                                       

关闭 Firewall

          # systemctl stop firewalld.service

          # systemctl disable firewalld.service

          # vi /etc/hosts                               #注释:增加内容

                  192.168.114.242  slurm-master

                  192.168.114.243  slurm-compute1

                  192.168.114.244  slurm-compute2

          # reboot


创建 munge 和 slurm 用户

         # export MUNGEUSER=1001 && groupadd -g $MUNGEUSER munge

         # useradd  -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge  -s /sbin/nologin munge

        # export SLURMUSER=1002 && groupadd -g $SLURMUSER slurm

        # useradd  -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm  -s /bin/bash slurm


安装依赖软件(NTP、MUNGE、Slurm 全部软件包安装)

         # yum install -y epel-release axel yum-axelget

         # yum install -y openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad python3-pip perl-ExtUtils-MakeMaker gcc rpm-build json-c json-c-devel http-parser http-parser-devel mysql-devel libaio net-tools epel-release openssh-clients munge munge-libs munge-devel rng-tools ntp ntpdate


集群节点安装 NTP

          # systemctl enable ntpd.service

          # ntpdate pool.ntp.org

          # systemctl start ntpd

          # rngd -r /dev/urandom


管理节点(master)安装 MUNGE

          # /usr/sbin/create-munge-key -r

          # dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key

          # chown munge: /etc/munge/munge.key && chmod 400 /etc/munge/munge.key

          # scp -p /etc/munge/munge.key root@192.168.114.243:/etc/munge                     #注释:同步到计算节点

          # scp -p /etc/munge/munge.key root@192.168.114.244:/etc/munge                    #注释:同步到计算节点


集群节点启动 MUNGE 服务

         # chown -R munge: /etc/munge/ /var/log/munge/ && chmod 0700 /etc/munge/ /var/log/munge/

         # systemctl enable munge

         # systemctl start munge

        # systemctl status munge


集群节点安装 Slurm

          # cd /usr/local

          # rpmbuild -ta --with mysql slurm-22.11.2.tar.bz2                                                  #注释:编译Slurm

          # cd /root/rpmbuild/RPMS/x86_64

         # yum localinstall slurm-*.rpm -y                                                                             #注释:安装Slurm


管理节点(master)配置 Slurm

          # cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf

          # cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf

          # cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf

          # vi /etc/slurm/slurm.conf               #注释:替换内容

                  SlurmctldHost=slurm-master

                  SlurmctldPidFile=/var/run/slurmctld.pid

                  SlurmctldPort=6817

                  SlurmdPidFile=/var/run/slurmd.pid

                  SlurmdPort=6818

                  SlurmdSpoolDir=/var/spool/slurmd

                 SlurmUser=root

                 StateSaveLocation=/var/spool

                 ClusterName=cluster-slurm

                JobCompHost=slurm-master

                JobCompLoc=slurm_jobcomp_db

               JobCompPass=123456

               JobCompPort=3306

               JobCompType=jobcomp/mysql

               JobCompUser=root

               SlurmctldDebug=info

               SlurmctldLogFile=/var/log/slurm/slurmctld.log

               SlurmdDebug=info

               SlurmdLogFile=/var/log/slurm/slurmd.log

               NodeName=slurm-compute[1-2] CPUs=4 RealMemory=8192 State=UNKNOWN

               PartitionName=debug Nodes=slurm-compute[1-2] Default=YES MaxTime=INFINITE State=UP

         # vi /etc/slurm/slurmdbd.conf          #注释:替换内容

               AuthInfo=/var/run/munge/munge.socket.2

               AuthType=auth/munge

               DbdHost=slurm-master

              DebugLevel=info

              LogFile=/var/log/slurm/slurmdbd.log

              PidFile=/var/run/slurmdbd.pid

              SlurmUser=root

              StoragePass=123456

               StorageType=accounting_storage/mysql

              StorageUser=root

              StorageLoc=slurm_acct_db

        # vi /etc/slurm/cgroup.conf               #注释:替换内容

              CgroupAutomount=yes

              ConstrainCores=no

             ConstrainRAMSpace=no

        # scp -p /etc/slurm/slurm.conf root@192.168.114.243:/etc/slurm/                           #注释:同步到计算节点

        # scp -p /etc/slurm/slurm.conf root@192.168.114.244:/etc/slurm/

        # scp -p /etc/slurm/slurmdbd.conf root@192.168.114.243:/etc/slurm/                    #注释:同步到计算节点

        # scp -p /etc/slurm/slurmdbd.conf root@192.168.114.244:/etc/slurm/

        # scp -p /etc/slurm/cgroup.conf root@192.168.114.243:/etc/slurm/                        #注释:同步到计算节点

        # scp -p /etc/slurm/cgroup.conf root@192.168.114.244:/etc/slurm/


集群节点执行(创建slurm日志文件,slurm默认不创建)

        # mkdir /var/spool/slurmctld && chown slurm: /var/spool/slurmctld && chmod 755 /var/spool/slurmctld

        # mkdir /var/log/slurm && touch /var/log/slurm/slurmctld.log && chown slurm: /var/log/slurm/slurmctld.log

        # touch /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log && chown slurm: /var/log/slurm/slurm_jobacct.log /var/log/slurm/slurm_jobcomp.log

        # mkdir /var/spool/slurmd && chown slurm: /var/spool/slurmd && chmod 755 /var/spool/slurmd

        # touch /var/log/slurm/slurmd.log && chown slurm: /var/log/slurm/slurmd.log

        # touch /var/log/slurm/slurmdbd.log && chown slurm: /var/log/slurm/slurmdbd.log

       # touch /var/log/slurm/slurm.log && chown slurm: /var/log/slurm/slurm.log


管理节点(master)安装 Mysql

        # rpm -e --nodeps mariadb-devel-5.5.68-1.el7.x86_64 mariadb-libs-5.5.68-1.el7.x86_64

        # cd /usr/local

        # tar -xvf mysql-5.7.28-1.el7.x86_64.rpm-bundle.tar

        # rpm -ivh mysql-community-common-5.7.28-1.el7.x86_64.rpm

        # rpm -ivh mysql-community-libs-5.7.28-1.el7.x86_64.rpm

        # rpm -ivh mysql-community-client-5.7.28-1.el7.x86_64.rpm

        # rpm -ivh mysql-community-server-5.7.28-1.el7.x86_64.rpm

        # rpm -ivh mysql-community-devel-5.7.28-1.el7.x86_64.rpm

        # rpm -ivh mysql-community-libs-compat-5.7.28-1.el7.x86_64.rpm

        # systemctl start mysqld.service

        # grep "password" /var/log/mysqld.log      # 查询mysql root初始密码

       # mysql -u root -p                                       # 登录 MySQL

             # set global validate_password_policy=LOW;

             # set global validate_password_length=6;

             # ALTER USER 'root'@'localhost' IDENTIFIED BY '123456';

      创建slurm的数据库

             # create database slurm_acct_db;

             # create database slurm_jobcomp_db;

      设置主机访问虚拟机 mysql,如果不是在虚拟机中运行,请忽略

             # use mysql;

             # select host from user where user='root';

             # update user set host = '%' where user = 'root';

            # select host from user where user='root';

            # flush privileges;


管理节点(master)启动 Slurm 服务

          # systemctl enable slurmdbd.service

          # systemctl start slurmdbd.service

          # systemctl status slurmdbd.service

          # systemctl enable slurmctld.service

          # systemctl start slurmctld.service

          # systemctl status slurmctld.service


计算节点(compute)启动 Slurm 服务

         # systemctl enable slurmd.service

         # systemctl start slurmd.service

         # systemctl status slurmd.service

你可能感兴趣的:(Slurm22.11.2 入门教程01-Centos7.6 安装 Slurm)