linux集群之并行

实验环境

OS:CentOS 5.10 x86_64(一台admin,两台计算节点)

主机名和IP对应关系如下

admin: 192.168.78.11

node1:192.168.78.12

node2:192.168.78.13

 

软件版本:PBS

torque-3.0.6.tar.gz

maui-3.3.1.tar.gz

openmpi-1.8.1.tar.bz2

并行软件:

apoa1.tar.gz

NAMD_2.9_Linux-x86_64-multicore.tar.gz

 

一:环境配置

1.修改hosts文件,添加内容如下

192.168.78.11  admin

192.168.78.12  node1

192.168.78.13  node2


2.设置无密码访问

执行ssh-keygen,一直按Enter键即可;然后进入.ssh目录,生成认证文件authorized_keys,并设置权限

[root@admin ~]# cd .ssh/

[root@admin .ssh]# ls

id_rsa id_rsa.pub

[root@admin .ssh]# cp id_rsa.pub authorized_keys

[root@admin .ssh]# chmod 600 authorized_keys

[root@admin .ssh]# ll

total 12

-rw------- 1 root root  394 Aug 23 03:52 authorized_keys

-rw------- 1 root root 1675 Aug 23 03:50 id_rsa

-rw-r--r-- 1 root root  394 Aug 23 03:50 id_rsa.pub


3.然后复制.ssh目录到所有计算节点

 [root@admin~]# for i in 1 2 ; do scp -r /root/.ssh node$i:/root/ ; done

第一次要输入两台计算节点的root密码,以后都是无密码访问了


4.复制hosts文件到所有计算节点

[root@admin ~]#for i in 1 2 ; do scp /etc/hosts node$i:/etc/ ; done


5.配置nfs服务

把管理节点上的/export作为共享目录

[root@admin~]#mkdir -p /export/{apps,home,scripts,source}                  //其中apps为软件共享目录,home为共享家目录

[root@admin ~]#cat /etc/exports

/export  192.168.78.0/255.255.255.0(rw,sync)


6.启动nfs服务并检查启动是否成功

[root@admin~]#chkconfig portmap on ; /etc/init.d/portmap start

Starting portmap:                                         [  OK  ]

[root@admin~]#chkconfig nfs on ; /etc/init.d/nfs start

[root@admin~]#showmount -e localhost

Export list for localhost:

/export 192.168.78.0/255.255.255.0

[root@admin ~]#


7.配置autofs

[root@admin ~]#cat /etc/auto.master

/home   /etc/auto.home    --timeout=1200

/share  /etc/auto.share   --timeout=1200

[root@admin ~]#cat /etc/auto.share

*                                             admin:/export/&

[root@admin ~]#cat /etc/auto.home

*              -nfsvers=3            admin:/export/home/&

[root@admin ~]#


8.启动autofs服务

[root@admin~]#chkconfig autofs on ; /etc/init.d/autofs start


9.复制auto.master auto.share auto.home到所有计算节点

[root@admin ~]#for i in 1 2; do scp /etc/auto.master node$i:/etc/; done

[root@admin ~]#for i in 1 2; do scp /etc/auto.share node$i:/etc/; done

[root@admin ~]#for i in 1 2; do scp /etc/auto.home node$i:/etc/; done


10.启动autofs服务

[root@admin ~]#for i in 1 2; do ssh node$i /etc/init.d/autofs start; done

[root@admin ~]#for i in 1 2; do ssh node$i chkconfig autofs on; done


11.配置NIS服务

[root@admin ~]#yum -y install ypserv

[root@admin~]#nisdomainname wjcyf.com

[root@admin~]#echo "/bin/nisdomainname wjcyf.com">>/etc/rc.local

[root@admin~]#echo "NISDOMAIN=wjcyf.com">>/etc/sysconfig/network

[root@admin ~]#cp /usr/share/doc/ypserv-2.19/securenets /var/yp/

[root@admin ~]#vi /var/yp/securenets

修改后内容如下

[root@admin~]#grep -v "^#" /var/yp/securenets

255.0.0.0      127.0.0.0

255.255.255.0          192.168.78.0

[root@admin ~]#


12.启动NIS服务

[root@admin~]#/etc/init.d/ypserv start ;chkconfig ypserv on

Starting YP server services:                              [  OK  ]

[root@admin~]#/etc/init.d/yppasswdd start ;chkconfig yppasswdd on

Starting YP passwd service:                               [  OK  ]

[root@admin ~]#


13.修改/etc/default/useradd文件

将HOME=/home更改为HOME=/export/home


14./etc/skel目录下创建.ssh目录并在.ssh目录下建立一个名为config的文件,设置如下

[root@admin~]#mkdir /etc/skel/.ssh

[root@admin~]#touch /etc/skel/.ssh/config

[root@admin ~]#cat /etc/skel/.ssh/config

StrictHostKeyChecking     no

UserKnownHostsFile        /dev/null

[root@admin~]#chmod 600 /etc/skel/.ssh/config


15.创建用于同步用户的命令

◆在/usr/local/sbin目录下创建了一个名为sync_users的脚本,内容如下:

#!/bin/bash

YPINIT=/usr/lib64/yp/ypinit

for USER in $(sed -n '/export/p' /etc/passwd | awk -F ":" '{print$1}')
do
                  if [ -z "$USER" ]; then
                           $YPINIT -m
                  else
                           usermod -d /home/$USER $USER
                 fi
done

                           $YPINIT -m

◆赋予可执行权限

chmod 755 /usr/local/sbin/sync_users

◆以后执行sync_users命令就可以同步新创建的用户


16.创建一个测试用户wjc,并同步该用户

[root@admin~]#useradd wjc

[root@admin~]#echo wjc | passwd --stdin wjc

[root@admin~]#sync_users


注:以后每添加一个新用户,都需要执行sync_users命令

 

17. 配置NIS客户端,在所有计算节点上安装ypbind,RHEL默认已经安装

[root@admin~]#grep -v "^#" /etc/yp.conf

ypserver       admin


18.复制/etc/yp.conf到所有计算节点上

[root@admin ~]#for i in 1 2; do scp /etc/yp.conf node$i:/etc/; done


19.修改/etc/nsswitch.conf文件(在node1上操作)

[root@node1 ~]# vi /etc/nsswitch.conf

在第33、34、35、38行的files后面添加nis

    33 passwd:     files nis

    34 shadow:     files nis

    35 group:         files nis

    36

    37 #hosts:      db files nisplus nis dns

    38 hosts:        files nis dns


20.复制nsswitch.conf到node2

[root@node1 ~]#scp /etc/nsswitch.conf node2:/etc/


21.设置nisdomainname名称

[root@admin ~]#for i in 1 2; do ssh node$i nisdomainname wjcyf.com; done

[root@admin ~]#for i in 1 2; do ssh node$i 'echo "/bin/nisdomainname wjcyf.com">>/etc/rc.local'; done

[root@admin ~]#for i in 1 2; do ssh node$i 'echo "NISDOMAIN=wjcyf.com">>/etc/sysconfig/network'; done


22.启动ypbind服务

[root@admin ~]#for i in 1 2; do ssh node$i /etc/init.d/ypbind start; done

Binding to the NIS domain: [  OK  ]

Listening for an NIS domain server.

Binding to the NIS domain: [  OK  ]

Listening for an NIS domain server..


23.验证NIS服务配置是否正确

[root@node1~]#ypcat passwd

wjc:$1$tsPKQvPP$Kwom9qG/DNR1w/Lq./cQV.:500:500::/home/wjc:/bin/bash

[root@admin ~]#for i in 1 2; do ssh node$i id wjc; done
uid=500(wjc) gid=500(wjc) groups=500(wjc)
uid=500(wjc) gid=500(wjc) groups=500(wjc)

由上面输出可知,NIS服务配置正确

 

二:安装和配置torque(管理节点)

1.首先安装openmpi

[root@adminparallel]#tar xjvf openmpi-1.8.1.tar.bz2 -C /usr/local/src/

[root@adminparallel]#cd /usr/local/src/openmpi-1.8.1/

[root@admin openmpi-1.8.1]# ./configure --prefix=/share/apps/openmpi

[root@admin openmpi-1.8.1]# make

[root@admin openmpi-1.8.1]# make install

[root@admin openmpi-1.8.1]# cp -r examples/ /share/apps/openmpi


2.添加环境变量,在/share/scripts目录下先建立一个Path.sh,以后也方便计算节点添加环境变量

[root@adminscripts]#pwd

/share/scripts

[root@adminscripts]#cat Path.sh

#!/bin/bash

grep openmpi /etc/bashrc || cat >>/etc/bashrc <<EOF

export PATH=/share/apps/openmpi/bin:\$PATH

export LD_LIBRARY_PATH=/share/apps/openmpi/lib:\$LD_LIBRARY_PATH

EOF

[root@adminscripts]#

[root@adminscripts]#sh Path.sh

[root@adminscripts]#source /etc/bashrc


3.测试openmpi是否安装成功

[root@adminscripts]#which mpirun

/share/apps/openmpi/bin/mpirun

[root@admin scripts]# which mpiexec

/share/apps/openmpi/bin/mpiexec


4.安装torque

[root@adminparallel]#tar xzvf torque-3.0.6.tar.gz -C /share/source/

[root@adminparallel]#cd /share/source/torque-3.0.6/

[root@admin torque-3.0.6]# ./configure  --enable-syslog --enable-nvidia-gpus --enable-cpuset --disable-gui --with-rcp=scp --with-sendmail

[root@admin torque-3.0.6]# make

[root@admin torque-3.0.6]# make install

[root@admin torque-3.0.6]# pwd

/share/source/torque-3.0.6

[root@admin torque-3.0.6]# cat install.sh

cd /share/source/torque-3.0.6

make install

[root@admin torque-3.0.6]#


5.初始化torque创建默认队列

[root@admin torque-3.0.6]# ./torque.setup root

initializing TORQUE (admin: root@admin)

PBS_Server admin: Create mode and server database exists,

do you wish to continue y/(n)?y

root    26351    1  0 06:44 ?        00:00:00 pbs_server -t create

Max open servers: 10239

Max open servers: 10239

[root@admin torque-3.0.6]#


6.查看创建的默认队列batch

[root@admin torque-3.0.6]# qmgr -c "p s"

#

# Create queues and set their attributes.

#

#

# Create and define queue batch

#

create queue batch

set queue batch queue_type = Execution

set queue batch resources_default.nodes = 1

set queue batch resources_default.walltime= 01:00:00

set queue batch enabled = True

set queue batch started = True

#

# Set server attributes.

#

set server scheduling = True

set server acl_hosts = admin

set server admins= root@admin

set server operators = root@admin

set server default_queue = batch

set server log_events = 511

set server mail_from = adm

set server scheduler_iteration = 600

set server node_check_rate = 150

set server tcp_timeout = 6

set server mom_job_sync = True

set server keep_completed = 300

[root@admin torque-3.0.6]#


7.更改队列batch部分属性,以满足实际需求

[root@admin torque-3.0.6]# qmgr -c "s q batch resources_default.walltime=24:00:00"

[root@admin torque-3.0.6]# qmgr -c "s s query_other_jobs=true"


8.建立mom配置文件,用于复制到所有计算节点

[root@adminmom_priv]#pwd

/var/spool/torque/mom_priv

[root@adminmom_priv]#cat config

$pbsserver     admin

$logevent      225


9.创建节点信息文件

[root@adminserver_priv]#pwd

/var/spool/torque/server_priv

[root@adminserver_priv]#cat nodes

node1 

node2

[root@adminserver_priv]#


10.查看目前节点信息均为down状态

[root@adminserver_priv]#pbsnodes -a

node1

    state = down

    np = 1

    ntype = cluster

    mom_service_port = 15002

    mom_admin_port = 15003

    gpus = 0

 

node2

    state = down

    np = 1

    ntype = cluster

    mom_service_port = 15002

    mom_admin_port = 15003

    gpus = 0

 [root@adminserver_priv]#


11.复制pbs_server启动脚本,并设置开机自动启动

[root@admin torque-3.0.6]# pwd

/share/source/torque-3.0.6

[root@admin torque-3.0.6]# cp contrib/init.d/pbs_server /etc/init.d/

[root@admin torque-3.0.6]# chmod 755 /etc/init.d/pbs_server

[root@admin torque-3.0.6]# chkconfig pbs_server on


12.复制pbs_mom脚本,方便复制到计算节点

[root@admin torque-3.0.6]# cp contrib/init.d/pbs_mom /etc/init.d/

 

13.安装maui

[root@adminparallel]#tar xzvf maui-3.3.1.tar.gz -C /usr/local/src/

[root@admin ~]#cd /usr/local/src/maui-3.3.1/

[root@admin maui-3.3.1]# ./configure --prefix=/usr/local/maui --with-pbs=/usr/local

[root@admin maui-3.3.1]# make

[root@admin maui-3.3.1]# make install


14.复制maui启动脚本,设置正确路径,并设置为开机启动

[root@admin maui-3.3.1]# cp etc/maui.d /etc/init.d/mauid

[root@admin maui-3.3.1]# vi /etc/init.d/mauid

将MAUI_PREFIX=/opt/maui更改为MAUI_PREFIX=/usr/local/maui

[root@admin maui-3.3.1]# chmod 755 /etc/init.d/mauid

[root@admin maui-3.3.1]# chkconfig mauid on


15.启动maui调度服务

[root@admin maui-3.3.1]# /etc/init.d/mauid start

Starting MAUI Scheduler:                                  [  OK  ]

[root@admin maui-3.3.1]#


16.添加maui命令环境变量

[root@admin maui-3.3.1]# vi /etc/bashrc

export PATH=/share/apps/openmpi/bin:/usr/local/maui/bin:$PATH

[root@admin maui-3.3.1]# source /etc/bashrc


17.安装并行软件到共享目录

[root@adminnamd]#tar xzvf NAMD_2.9_Linux-x86_64-multicore.tar.gz -C /share/apps/

[root@adminnamd]#tar xzvf apoa1.tar.gz -C /share/apps/

[root@adminapps]#pwd

/share/apps

[root@adminapps]#mv NAMD_2.9_Linux-x86_64-multicore/ namd


18.添加namd命令环境变量,同时也添加到Path.sh方便计算节点添加环境变量

[root@admin maui-3.3.1]# vi /etc/bashrc

export PATH=/share/apps/openmpi/bin:/usr/local/maui/bin:/share/apps/namd:$PATH

[root@admin maui-3.3.1]# source /etc/bashrc

[root@adminscripts]#which namd2

/share/apps/namd/namd2

[root@adminscripts]#cat Path.sh

#!/bin/bash

grep openmpi /etc/bashrc || cat >>/etc/bashrc <<EOF

export PATH=/share/apps/openmpi/bin:/share/apps/namd:\$PATH

EOF

[root@adminscripts]#

至此管理端配置完成

 

三:计算节点配置torque

1.计算节点安装torque

[root@admin ~]#for i in 1 2; do ssh node$i sh /share/source/torque-3.0.6/install.sh; done


2.复制mom配置文件到计算节点

[root@admin ~]#for i in 1 2; do scp /var/spool/torque/mom_priv/config node$i:/var/spool/torque/mom_priv/; done


3.复制mom启动脚本到计算节点,启动pbs_mom服务,并设置开机启动

[root@admin ~]#for i in 1 2; do scp /etc/init.d/pbs_mom node$i:/etc/init.d/; done

[root@admin ~]#for i in 1 2; do ssh node$i /etc/init.d/pbs_mom start; done

Starting TORQUE Mom: [  OK  ]

Starting TORQUE Mom: [  OK  ]

[root@admin ~]#for i in 1 2; do ssh node$i chkconfig pbs_mom on; done


4.设置环境变量

[root@admin ~]#for i in 1 2; do ssh node$i sh /share/scripts/Path.sh; done


5.测试环境变量设置是否正确

[root@admin ~]#for i in 1 2; do ssh node$i which mpirun; done

/share/apps/openmpi/bin/mpirun

/share/apps/openmpi/bin/mpirun

[root@admin ~]#for i in 1 2; do ssh node$i which namd2; done

/share/apps/namd/namd2

/share/apps/namd/namd2

[root@admin ~]#


6.此时再观察计算节点状态,已经变成free了,即可以提交任务到计算节点了

[root@adminapps]#pbsnodes -a

node1

    state = free

    np = 1

    ntype = cluster

    status=rectime=1408751492,varattr=,jobs=,state=free,netload=12996103,gres=,loadave=0.01,ncpus=1,physmem=1024932kb,availmem=2082428kb,totmem=2165536kb,idletime=0,nusers=0,nsessions=0,uname=Linux node1 2.6.18-371.el5 #1 SMP Tue Oct 1 08:35:08 EDT 2013 x86_64,opsys=linux

    mom_service_port = 15002

    mom_admin_port = 15003

    gpus = 0

 

node2

    state = free

    np = 1

    ntype = cluster

    status=rectime=1408751482,varattr=,jobs=,state=free,netload=12983275,gres=,loadave=0.03,ncpus=1,physmem=1024932kb,availmem=2082444kb,totmem=2165536kb,idletime=0,nusers=0,nsessions=0,uname=Linux node2 2.6.18-371.el5 #1 SMP Tue Oct 1 08:35:08 EDT 2013 x86_64,opsys=linux

    mom_service_port = 15002

    mom_admin_port = 15003

    gpus = 0

 

[root@adminapps]#

 

 

四:验证并行集群是否搭建成功

1.在管理节点上以建立的wjc用户登录,首先设置节点间无密码互访,操作和root用户一样,只是不需要复制.ssh目录


2.复制NAMD使用的测试算例apoa1到当前目录下

[wjc@admin ~]$cp -r /share/apps/apoa1/ ./


3.创建PBS脚本

[wjc@admin~]$touch test.pbs

脚本内容如下

[wjc@admin ~]$cat test.pbs

#!/bin/bash

#PBS -N wjcjob1

#PBS -j oe

#PBS -l nodes=2:ppn=1

NP=`cat $PBS_NODEFILE | wc -l`

echo "This job's id is $PBS_JOBID@$PBS_QUEUE"

echo "This job's workdir is $PBS_O_WORKDIR"

echo "This job is running on following nodes:"

cat $PBS_NODEFILE

echo "This job begins at:" `date`

echo

echo

cd $PBS_O_WORKDIR

mpirun -np $NP -machinefile $PBS_NODEFILE namd2 apoa1/apoa1.namd

echo

echo

echo "This job stops at:" `date`

[wjc@admin ~]$


4.提交任务

[wjc@admin ~]$qsub test.pbs

5.查看作业运行状态

[wjc@admin~]$qstat

Job id                  Name            User            Time Use S Queue

-------------------------------------------------------- -------- - -----

1.admin                 wjcjob1          wjc                   0 R batch         

[wjc@admin~]$qstat -n

 

admin:

                                                                        Req'd Req'd   Elap

JobID              Username   Queue   Jobname         SessID NDS   TSK   Memory Time  S Time

--------------------------------------- ---------------- ------ ----- ------ ------ ----- - -----

1.admin           wjc         batch   wjcjob1           6676     2     2   --  24:00 R  --

  node2/0+node1/0

[wjc@admin ~]$

由上面可知作业已经在node1和node2上运行了

至此,linux并行集群搭建完成

 

五:并行集群的监控

1.可以使用ganglia,具体搭建方法可以参考我的这两篇博客

http://blog.csdn.net/wjciayf/article/details/38342921

http://blog.csdn.net/wjciayf/article/details/38347247

 


本文出自 “永不止步” 博客,谢绝转载!

你可能感兴趣的:(linux,集群,并行)