在头节点机器上流程化安装 SLURM 的脚本。

自己写了一个,记录如下:

#!/bin/sh

# Streamlined SLURM installation for the head node.
# Run as root. Make sure the environment is clean first (use the 'clean'
# script if needed); all work happens inside SLURM_DIR.

# NOTE(review): the trailing '*' looks like a redacted last octet —
# fill in the real addresses before running.
HEADNODE_IP="192.168.192.*"

NODE1_IP="192.168.192.*"

SLURM_DIR=/home/slurm

mkdir -p "$SLURM_DIR"

# Everything below depends on the working directory; abort if we cannot enter it.
cd "$SLURM_DIR" || exit 1

 

# Fetch the MUNGE source tarball unless it is already present.
# (Fixed: the original 'if' was missing 'then' — a syntax error.)
if [ ! -f "munge_0.5.10.orig.tar.bz2" ]; then
  wget https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/munge/0.5.10-1/munge_0.5.10.orig.tar.bz2
fi

# Fetch the SLURM source tarball unless it is already present.
if [ ! -f "slurm-18.08.5-2.tar.bz2" ]; then
  wget https://download.schedmd.com/slurm/slurm-18.08.5-2.tar.bz2
fi

 

# Install build dependencies (RHEL/CentOS family only).
# Test the grep result directly instead of inspecting $? afterwards.
if grep -q 'Red Hat' /proc/version; then

  yum install -y gcc openssl-devel readline-devel

  # Fixed: 'bzip2-devel.bz2*' was a typo'd package glob.
  yum install -y bzip2-devel

  yum install -y 'zlib-devel*'

  yum install -y 'pam*'

  yum install -y 'perl*'

  # Needed for 'rpmbuild -tb/-ta' below.
  yum install -y rpm-build

fi

 

# Create the munge and slurm system users with fixed UID/GID so that the
# head node and every compute node agree on file ownership.
export MUNGEUSER=991

groupadd -g "$MUNGEUSER" munge

# Fixed: home directory was misspelled '/bar/lib/munge'.
useradd -m -d /var/lib/munge -u "$MUNGEUSER" -g munge -s /sbin/nologin munge

export SLURMUSER=992

groupadd -g "$SLURMUSER" slurm

useradd -m -d /var/lib/slurm -u "$SLURMUSER" -g slurm -s /bin/bash slurm

 

#rename machine

file="/etc/hosts"

# Informational: show which cluster-subnet addresses this host carries.
# (Quoted so the shell cannot glob-expand the pattern.)
ifconfig | grep '192\.168\.192\.'

# Fixed: the original test was inverted — it reported "can not find hosts
# file" when /etc/hosts EXISTED and only appended entries when it did not.
if [ ! -f "$file" ]; then

  echo "can not find hosts file" >&2

else

  {
    echo "$HEADNODE_IP worker"
    echo "$NODE1_IP worker1"
  } >> "$file"

fi

hostnamectl --static set-hostname worker

 

echo 'install munge'

# rpmbuild -tb expects the canonical <name>-<version>.tar.bz2 naming.
mv munge_0.5.10.orig.tar.bz2 munge-0.5.10.tar.bz2

rpmbuild -tb --clean munge-0.5.10.tar.bz2

cd /root/rpmbuild/RPMS/x86_64 || exit 1

rpm --install munge*.rpm

echo 'install mariadb'

yum -y install mariadb-server mariadb-devel

echo 'create munge key'

/usr/sbin/create-munge-key -r

# munged refuses to start unless the key is 0400 and its directories are
# owned by the munge user.  Fixed: the original applied chmod 400 to the
# /var/log/munge DIRECTORY as well, which makes it untraversable and
# breaks logging — a directory needs 700.
chown munge: /etc/munge/munge.key /var/log/munge

chmod 400 /etc/munge/munge.key
chmod 700 /var/log/munge

 

echo 'install slurm'

# Fixed: the munge step leaves the cwd in /root/rpmbuild/RPMS/x86_64, so
# the relative tarball path would not resolve — return to SLURM_DIR first.
cd "$SLURM_DIR" || exit 1

rpmbuild -ta --clean slurm-18.08.5-2.tar.bz2

cd /root/rpmbuild/RPMS/x86_64 || exit 1

rpm --install slurm*.rpm

#configure slurm.conf
# Fixed: the original if/else was never closed (missing 'fi'), and the
# "file not exit" message had a typo.
if [ ! -f /etc/slurm/slurm.conf ]; then

  echo "file not exist" >&2

else

  cat > /etc/slurm/slurm.conf <<EOF
ControlMachine=worker
ControlAddr=$HEADNODE_IP
MpiDefault=none
ProctrackType=proctrack/pgid
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
TaskPlugin=task/none
FastSchedule=1
SchedulerType=sched/backfill
SelectType=select/linear
AccountingStorageType=accounting_storage/none
ClusterName=buhpc
JobAcctGatherType=jobacct_gather/none
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdLogFile=/var/log/slurmd.log
NodeName=worker NodeAddr=$HEADNODE_IP CPUs=1 Procs=1 State=UNKNOWN
NodeName=worker1 NodeAddr=$NODE1_IP CPUs=1 Procs=1 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
EOF

fi

# Distribute the shared key and config to the compute node.
scp /etc/munge/munge.key root@worker1:/etc/munge/

scp /etc/slurm/slurm.conf root@worker1:/etc/slurm/

# Create spool/log locations with the ownership slurmctld requires.
# Fixed: these seven commands were fused onto a single line with no
# separators in the original, so none of them could run.
mkdir -p /var/spool/slurmctld
chown slurm: /var/spool/slurmctld
chmod 755 /var/spool/slurmctld
touch /var/log/slurmctld.log
chown slurm: /var/log/slurmctld.log
touch /var/log/slurm_jobacct.log /var/log/slurm_jobcomp.log
chown slurm: /var/log/slurm_jobacct.log /var/log/slurm_jobcomp.log

 

# Bring up the SLURM stack on the head node.

# SLURM ports must be reachable from the compute nodes.
systemctl stop firewalld

# The authentication daemon has to be running before slurmd/slurmctld.
systemctl start munge

# Enable and start the SLURM daemons, compute daemon first.
for svc in slurmd.service slurmctld.service; do
  systemctl enable "$svc"
  systemctl start "$svc"
done

参考:

OpenMPI 是一种高性能消息传递库

https://www.cnblogs.com/aaron-agu/p/5700650.html

你可能感兴趣的:(linux运维)