Setting up a Linux RHCS cluster on Red Hat Enterprise Linux 5.5
RHCS is Red Hat's Cluster Suite. Through the Conga project, the luci web interface is used to install and configure the cluster software on the nodes.
RHEL 4 supports 16+ nodes
RHEL 5 supports 100+ nodes
[Figure 68-1]
[root@target ~]# uname -n
target.test.com
[root@target ~]# cat /etc/sysconfig/network
HOSTNAME=target.test.com
[root@target ~]# cat /etc/hosts
127.0.0.1 localhost.localdomain localhost
::1 localhost6.localdomain6 localhost6
192.168.2.100 target.test.com target
192.168.2.10 node1.test.com node1
192.168.2.20 node2.test.com node2
Partition the disk
[root@target ~]# fdisk -l
[root@target ~]# fdisk /dev/sdb
[root@target ~]# partprobe /dev/sdb
[root@target ~]# more /proc/partitions
Install the iSCSI target
[root@target ~]# yum list all |grep scsi
scsi-target-utils.i386 0.0-6.20091205snap.el5_4.1
[root@target ~]# yum install scsi-target* -y
[root@target ~]# service tgtd start
[root@target ~]# chkconfig tgtd on
[root@target ~]# netstat -tunlp |grep 3260
Create the target (controller) and the logical unit; keep the system time consistent across all the servers.
Use IP-based binding for access control, and append the rules to the boot script (/etc/rc.d/rc.local) so they survive a reboot.
[root@target ~]# date
Thu Oct 20 14:38:09 CST 2011
[root@target ~]# tgtadm --lld iscsi --op new --mode target --tid=1 --targetname iqn.2011-10.com.test.target:test
[root@target ~]# echo "tgtadm --lld iscsi --op new --mode target --tid=1 --targetname iqn.2011
-10.com.test.target:test">>/etc/rc.d/rc.local
[root@target ~]# tgtadm --lld iscsi --op new --mode=logicalunit --tid=1 --lun=1 --backing-store /dev/sdb1
[root@target ~]# echo "tgtadm --lld iscsi --op new --mode=logicalunit --tid=1 --lun=1 --backin
g-store /dev/sdb1">>/etc/rc.d/rc.local
[root@target ~]# tgtadm --lld iscsi --op bind --mode=target --tid=1 --initiator-address=192.168.2.0/24
[root@target ~]# echo "tgtadm --lld iscsi --op bind --mode=target --tid=1 --initiator-address=
192.168.2.0/24">>/etc/rc.d/rc.local
[root@target ~]# yum list all |grep luci
luci.i386 0.12.2-12.el5 rhel-cluster
[root@target ~]# yum install luci -y
Initializing luci takes three steps:
#luci_admin init
#service luci restart    # the first start must use restart
#https://localhost:8084/
[root@target ~]# luci_admin init
[root@target ~]# service luci restart
Shutting down luci: [ OK ]
Starting luci: Generating https SSL certificates... done
[ OK ]
Point your web browser to https://target.test.com:8084 to access luci
[root@target ~]# chkconfig luci on
[root@node1 ~]# uname -n
node1.test.com
[root@node1 ~]# cat /etc/hosts
127.0.0.1 localhost.localdomain localhost
::1 localhost6.localdomain6 localhost6
192.168.2.10 node1.test.com node1
192.168.2.20 node2.test.com node2
192.168.2.100 target.test.com target
[root@node1 ~]# cat /etc/sysconfig/network
HOSTNAME=node1.test.com
[root@node1 ~]# yum list all|grep scsi
iscsi-initiator-utils.i386 6.2.0.871-0.16.el5 rhel-server
[root@node1 ~]# yum list all|grep ricci
ricci.i386 0.12.2-12.el5 rhel-cluster
[root@node1 ~]# yum install ricci iscsi* -y
[root@node1 ~]# vim /etc/iscsi/initiatorname.iscsi
InitiatorName=iqn.2011-10.com.test.node1:init1
[root@node1 ~]# service iscsi start
[root@node1 ~]# chkconfig iscsi on
Discover the target
[root@node1 ~]# iscsiadm --mode discovery --type sendtargets --portal 192.168.2.100
192.168.2.100:3260,1 iqn.2011-10.com.test.target:test
Log in to the target
[root@node1 ~]# iscsiadm --mode node --targetname iqn.2011-10.com.test.target:test --portal 192.168.2.100 --login
Logging in to [iface: default, target: iqn.2011-10.com.test.target:test, portal: 192.168.2.100,3260]
Login to [iface: default, target: iqn.2011-10.com.test.target:test, portal: 192.168.2.100,3260]: successful
[root@node2 ~]# uname -n
node2.test.com
[root@node2 ~]# cat /etc/sysconfig/network
HOSTNAME=node2.test.com
[root@node2 ~]# cat /etc/hosts
127.0.0.1 localhost.localdomain localhost
::1 localhost6.localdomain6 localhost6
192.168.2.10 node1.test.com node1
192.168.2.20 node2.test.com node2
192.168.2.100 target.test.com target
Install ricci and the iSCSI initiator
[root@node2 ~]# yum list all |grep iscsi
iscsi-initiator-utils.i386 6.2.0.871-0.16.el5 rhel-server
[root@node2 ~]# yum list all |grep ricci
ricci.i386 0.12.2-12.el5 rhel-cluster
[root@node2 ~]# yum install ricci iscsi* -y
[root@node2 ~]# vim /etc/iscsi/initiatorname.iscsi
InitiatorName=iqn.2011-10.com.test.node2:init2
[root@node2 ~]# service iscsi start
[root@node2 ~]# chkconfig iscsi on
[root@node2 ~]# iscsiadm --mode discovery --type sendtargets --portal 192.168.2.100
192.168.2.100:3260,1 iqn.2011-10.com.test.target:test
[root@node2 ~]# iscsiadm --mode node --targetname iqn.2011-10.com.test.target:test --portal 192.168.2.100 --login
Logging in to [iface: default, target: iqn.2011-10.com.test.target:test, portal: 192.168.2.100,3260]
Login to [iface: default, target: iqn.2011-10.com.test.target:test, portal: 192.168.2.100,3260]: successful
On the target, the show operation lists which initiators are logged in.
[root@target ~]# tgtadm --lld iscsi --op show --mode target
Target 1: iqn.2011-10.com.test.target:test
System information:
Driver: iscsi
State: ready
I_T nexus information:
I_T nexus: 1
Initiator: iqn.2011-10.com.test.node1:init1
Connection: 0
IP Address: 192.168.2.10
I_T nexus: 2
Initiator: iqn.2011-10.com.test.node2:init2
Connection: 0
IP Address: 192.168.2.20
LUN information:
LUN: 0
Type: controller
SCSI ID: IET 00010000
SCSI SN: beaf10
Size: 0 MB
Online: Yes
Removable media: No
Backing store type: rdwr
Backing store path: None
LUN: 1
Type: disk
SCSI ID: IET 00010001
SCSI SN: beaf11
Size: 1645 MB
Online: Yes
Removable media: No
Backing store type: rdwr
Backing store path: /dev/sdb1
Account information:
ACL information:
192.168.2.0/24
Configuring the cluster with luci
The firewall must be turned off, the hostnames must resolve on every machine, and the luci and ricci services must be running.
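A minimal sketch of those prerequisite checks (run on the target and on both nodes; it assumes the stock iptables service and the /etc/hosts entries shown earlier):
service iptables stop        # turn the firewall off for this lab setup
chkconfig iptables off       # keep it off across reboots
getent hosts target.test.com node1.test.com node2.test.com   # every name should resolve
service ricci status         # on the nodes
service luci status          # on the management host (target)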
Start ricci
[root@node1 ~]# service ricci start
[root@node1 ~]# chkconfig ricci on
The cluster mounts the shared disk automatically. Apache must not be started manually on a node; it is managed by the cluster and is brought up according to the service rules (see the note after the httpd installation below).
The iSCSI login only has to be done once; to log in again, use:
#service iscsi restart
#chkconfig ricci on
[root@node1 ~]# yum install httpd*
[root@node2 ~]# yum install httpd*
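Because rgmanager, not init, is supposed to control Apache (as noted above), it is worth confirming on both nodes that httpd is installed but not enabled at boot; a short sketch:
chkconfig httpd off      # the cluster, not init, will start Apache
service httpd status     # should report "httpd is stopped" until the cluster service runs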
The cluster file system (GFS) provides:
1. Distributed locking
2. Propagation of changes to every node
[root@target ~]# fdisk /dev/sdb
[root@target ~]# partprobe /dev/sdb
[root@target ~]# more /proc/partitions
Do not make the partition too small, or the cluster may report errors.
[root@node1 ~]# partprobe /dev/sdb
[root@node1 ~]# more /proc/sdb
/proc/sdb: No such file or directory
[root@node1 ~]# more /proc/partitions
[root@node2 ~]# partprobe /dev/sdb
[root@node2 ~]# more /proc/partitions
Both nodes must rescan so the new partition is visible.
1. First, create the cluster.
[Figure 68-2]
Cause of the error during validation: I had forgotten to turn off the firewall on node1. Other errors can also appear at this step of cluster creation; likely causes are hostname-resolution problems or ricci not running, so troubleshoot along those lines.
[Figure 68-3]
2. Then create the clustered logical volume; the shared device is used as a cluster volume by every node.
Create the physical volume
[root@node1 ~]# partprobe /dev/sdb
[root@node1 ~]# more /proc/partitions
[root@node1 ~]# pvcreate /dev/sdb
Physical volume "/dev/sdb" successfully created
Create the volume group
[root@node1 ~]# vgcreate vg01 /dev/sdb
/dev/cdrom: open failed: Read-only file system
/dev/cdrom: open failed: Read-only file system
Attempt to close device '/dev/cdrom' which is not open.
/dev/cdrom: open failed: Read-only file system
Attempt to close device '/dev/cdrom' which is not open.
Clustered volume group "vg01" successfully created
[root@node1 ~]# vgscan
Reading all physical volumes. This may take a while...
Found volume group "vg01" using metadata type lvm2
Found volume group "VolGroup00" using metadata type lvm2
[root@node1 ~]# pvdisplay
[root@node1 ~]# vgdisplay
--- Volume group ---
Clustered yes
[root@node2 ~]# vgdisplay
--- Volume group ---
Clustered yes
If the other nodes cannot see the clustered logical volume:
1. Check whether they are in sync.
2. If they are not, restart the clvmd service on each node to resynchronize, as shown below.
[root@node1 ~]# service clvmd restart
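The same restart plus a quick scan on the other node confirms that the clustered volume group is visible everywhere; a sketch for node2:
service clvmd restart    # resync the clustered LVM metadata
vgscan                   # vg01 should be listed
lvscan                   # clustered logical volumes, once created, appear here as well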
[Figure 68-4]
Create the logical volume
[root@node1 ~]# lvcreate -L 1000M -n lv01 vg01
/dev/cdrom: open failed: Read-only file system
Logical volume "lv01" created
[root@node1 ~]# lvsan
-bash: lvsan: command not found
[root@node1 ~]# lvscan
ACTIVE '/dev/vg01/lv01' [1000.00 MB] inherit
ACTIVE '/dev/VolGroup00/LogVol00' [199.38 GB] inherit
ACTIVE '/dev/VolGroup00/LogVol01' [512.00 MB] inherit
[root@node1 ~]#
Create the GFS file system
lock_dlm: the distributed lock protocol
lock_nolock: the single-node lock (if the cluster is down, the file system can only be accessed by mounting it with the single-node lock)
[root@node1 ~]# gfs_mkfs -p lock_dlm -t test:lv01 -j 3 /dev/vg01/lv01
This will destroy any data on /dev/vg01/lv01.
Are you sure you want to proceed? [y/n] y
Device: /dev/vg01/lv01
Blocksize: 4096
Filesystem Size: 157652
Journals: 3
Resource Groups: 8
Locking Protocol: lock_dlm
Lock Table: test:lv01
Syncing...
All Done
[root@node1 ~]#
// -j sets the number of journals; each node that will mount the file system needs its own journal
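If the cluster stack is down and lock_dlm is unavailable (the single-node case mentioned above), the file system can still be reached by overriding the lock protocol at mount time. A minimal sketch, creating a temporary mount point and assuming only one node ever mounts it this way:
mkdir -p /mnt/rescue
mount -t gfs -o lockproto=lock_nolock /dev/vg01/lv01 /mnt/rescue   # emergency single-node access only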
Configuring cluster management in luci
1. Define the fence device
[Figure 68-5]
Create and distribute the fence key
[Figure 68-6]
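Luci generates and distributes the fence_xvm key automatically; if it ever has to be done by hand, a rough sketch (generated on the physical host, then copied to each node) would be:
dd if=/dev/urandom of=/etc/cluster/fence_xvm.key bs=4096 count=1   # 4 KB random key, matching the size shown below
scp /etc/cluster/fence_xvm.key node1:/etc/cluster/
scp /etc/cluster/fence_xvm.key node2:/etc/cluster/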
Once it has been created, the fence configuration can be inspected in the files below.
[root@node1 ~]# ll /etc/cluster/
total 8
-rw-r----- 1 root root 476 Oct 20 15:58 cluster.conf
-rw------- 1 root root 4096 Oct 20 15:57 fence_xvm.key
[root@node1 ~]# cat /etc/cluster/cluster.conf
<?xml version="1.0"?>
<cluster alias="test" config_version="2" name="test">
<fence_daemon clean_start="0" post_fail_delay="0" post_join_delay="3"/>
<clusternodes>
<clusternode name="node1.test.com" nodeid="1" votes="1">
<fence/>
</clusternode>
<clusternode name="node2.test.com" nodeid="2" votes="1">
<fence/>
</clusternode>
</clusternodes>
<cman expected_votes="1" two_node="1"/>
<fencedevices/>
<rm>
<failoverdomains/>
<resources/>
</rm>
</cluster>
[root@node1 ~]# file -s /etc/cluster/fence_xvm.key
/etc/cluster/fence_xvm.key: data
[root@node1 ~]# file -s /etc/cluster/cluster.conf
/etc/cluster/cluster.conf: XML 1.0 document text
Add the fence device
[Figure 68-7]
Apply the fence to the node
[Figure 68-8]
Apply the same fence to node2
[Figure 68-9]
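After these luci steps, cluster.conf gains a fencedevices entry plus a per-node fence method, roughly as sketched below; the device name myfence and the domain values (the virtual machine names) are assumptions that depend on the virtualization setup:
<fencedevices>
        <fencedevice agent="fence_xvm" name="myfence"/>
</fencedevices>
<!-- inside each <clusternode> block -->
<fence>
        <method name="1">
                <device name="myfence" domain="node1"/>
        </method>
</fence>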
Add a failover domain
[Figure 68-10]
A smaller priority value means higher priority; if the priorities are equal, the active node is chosen at random.
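In cluster.conf the failover domain ends up looking roughly like the block below; the name webfail is an assumption, and the priority numbers illustrate that node1 would be preferred over node2:
<failoverdomains>
        <failoverdomain name="webfail" ordered="1" restricted="1">
                <failoverdomainnode name="node1.test.com" priority="1"/>
                <failoverdomainnode name="node2.test.com" priority="2"/>
        </failoverdomain>
</failoverdomains>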
Add the resources
Add the cluster IP address
[Figure 68-11]
Add the cluster file system
[Figure 68-12]
Create the script resource
[Figure 68-13]
Add the service
[Figure 68-14]
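Taken together, the resources and the service land in the <rm> section of cluster.conf roughly as sketched below. The cluster IP 192.168.2.200, the device /dev/vg01/lv01, the mount point /var/www/html and the service name www match the output that follows; the resource names docroot and httpd and the domain webfail are illustrative assumptions, and luci fills in further attributes (such as fsid) on its own:
<resources>
        <ip address="192.168.2.200" monitor_link="1"/>
        <clusterfs device="/dev/vg01/lv01" fstype="gfs" mountpoint="/var/www/html" name="docroot"/>
        <script file="/etc/init.d/httpd" name="httpd"/>
</resources>
<service autostart="1" domain="webfail" name="www" recovery="relocate">
        <ip ref="192.168.2.200"/>
        <clusterfs ref="docroot"/>
        <script ref="httpd"/>
</service>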
[root@node2 ~]# service httpd status
httpd (pid 11730) is running...
[root@node2 ~]# mount
/dev/mapper/VolGroup00-LogVol00 on / type ext3 (rw)
proc on /proc type proc (rw)
sysfs on /sys type sysfs (rw)
devpts on /dev/pts type devpts (rw,gid=5,mode=620)
/dev/sda1 on /boot type ext3 (rw)
tmpfs on /dev/shm type tmpfs (rw)
none on /proc/sys/fs/binfmt_misc type binfmt_misc (rw)
sunrpc on /var/lib/nfs/rpc_pipefs type rpc_pipefs (rw)
/dev/hdc on /media/RHEL_5.5 i386 DVD type iso9660 (ro,noexec,nosuid,nodev,uid=0)
/dev/hdc on /mnt/cdrom type iso9660 (ro)
none on /sys/kernel/config type configfs (rw)
/dev/mapper/vg01-lv01 on /var/www/html type gfs (rw,hostdata=jid=0:id=262146:first=1)
[root@node2 ~]# cd /var/www/html/
[root@node2 html]# ll
total 0
[root@node2 html]# vim index.html
hello my first cluster test1!
[Figure 68-15]
[root@node2 html]# tail -f /var/log/messages
Oct 20 16:38:08 localhost kernel: GFS: fsid=test:lv01.0: jid=0: Looking at journal...
Oct 20 16:38:08 localhost kernel: GFS: fsid=test:lv01.0: jid=0: Done
Oct 20 16:38:08 localhost kernel: GFS: fsid=test:lv01.0: jid=1: Trying to acquire journal lock...
Oct 20 16:38:08 localhost kernel: GFS: fsid=test:lv01.0: jid=1: Looking at journal...
Oct 20 16:38:09 localhost kernel: GFS: fsid=test:lv01.0: jid=1: Done
Oct 20 16:38:09 localhost kernel: GFS: fsid=test:lv01.0: jid=2: Trying to acquire journal lock...
Oct 20 16:38:09 localhost kernel: GFS: fsid=test:lv01.0: jid=2: Looking at journal...
Oct 20 16:38:09 localhost kernel: GFS: fsid=test:lv01.0: jid=2: Done
Oct 20 16:38:11 localhost avahi-daemon[2980]: Registering new address record for 192.168.2.200 on eth0.
Oct 20 16:38:13 localhost clurgmgrd[5924]: <notice> Service service:www started
[root@node2 html]# ip addr list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast qlen 1000
link/ether 00:0c:29:ee:ab:6a brd ff:ff:ff:ff:ff:ff
inet 192.168.2.20/24 brd 192.168.2.255 scope global eth0
inet 192.168.2.200/24 scope global secondary eth0
inet6 fe80::20c:29ff:feee:ab6a/64 scope link
valid_lft forever preferred_lft forever
3: sit0: <NOARP> mtu 1480 qdisc noop
link/sit 0.0.0.0 brd 0.0.0.0
[root@node2 html]#
[Figure 68-16]
Suppose node2 fails: stop the service on node2 and check the status of every node in the cluster.
[root@node2 ~]# service httpd stop
[root@node1 ~]# service httpd status
httpd is stopped
[root@node1 ~]# tail -f /var/log/messages
Oct 20 16:28:01 localhost clurgmgrd[6009]: <notice> Reconfiguring
Oct 20 16:29:46 localhost ccsd[5883]: Update of cluster.conf complete (version 8 -> 9).
Oct 20 16:29:55 localhost clurgmgrd[6009]: <notice> Reconfiguring
Oct 20 16:31:11 localhost ccsd[5883]: Update of cluster.conf complete (version 9 -> 10).
Oct 20 16:31:30 localhost clurgmgrd[6009]: <notice> Reconfiguring
Oct 20 16:38:44 localhost ccsd[5883]: Update of cluster.conf complete (version 10 -> 11).
Oct 20 16:39:03 localhost clurgmgrd[6009]: <notice> Reconfiguring
Oct 20 16:40:27 localhost ccsd[5883]: Update of cluster.conf complete (version 11 -> 12).
Oct 20 16:40:37 localhost clurgmgrd[6009]: <notice> Reconfiguring
Oct 20 16:40:38 localhost clurgmgrd[6009]: <notice> Initializing service:www
Oct 20 17:10:24 localhost clurgmgrd[6009]: <notice> Recovering failed service service:www
Oct 20 17:10:24 localhost kernel: GFS 0.1.34-12.el5 (built Mar 11 2010 14:05:45) installed
Oct 20 17:10:24 localhost kernel: Lock_DLM (built Mar 11 2010 14:05:42) installed
Oct 20 17:10:24 localhost kernel: Lock_Nolock (built Mar 11 2010 14:05:41) installed
Oct 20 17:10:24 localhost kernel: Trying to join cluster "lock_dlm", "test:lv01"
Oct 20 17:10:25 localhost kernel: Joined cluster. Now mounting FS...
Oct 20 17:10:25 localhost kernel: GFS: fsid=test:lv01.1: jid=1: Trying to acquire journal lock...
Oct 20 17:10:25 localhost kernel: GFS: fsid=test:lv01.1: jid=1: Looking at journal...
Oct 20 17:10:25 localhost kernel: GFS: fsid=test:lv01.1: jid=1: Done
Oct 20 17:10:28 localhost avahi-daemon[2980]: Registering new address record for 192.168.2.200 on eth0.
Oct 20 17:10:29 localhost clurgmgrd[6009]: <notice> Service service:www started
[root@node1 ~]# service httpd status
httpd (pid 13086) is running...
[root@node1 ~]# ip addr status
Command "status" is unknown, try "ip address help".
[root@node1 ~]# ip addr list
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast qlen 1000
link/ether 00:0c:29:b0:cc:45 brd ff:ff:ff:ff:ff:ff
inet 192.168.2.10/24 brd 192.168.2.255 scope global eth0
inet 192.168.2.200/24 scope global secondary eth0
inet6 fe80::20c:29ff:feb0:cc45/64 scope link
valid_lft forever preferred_lft forever
3: sit0: <NOARP> mtu 1480 qdisc noop
link/sit 0.0.0.0 brd 0.0.0.0
[root@node1 ~]#
[Figure 68-17]
Commands for checking node and cluster status
[root@node1 cluster]# clustat
[root@node1 cluster]# cman_tool status
[root@node1 cluster]# ccs_tool lsnode
[root@node1 cluster]# ccs_tool lsfence
[root@node1 cluster]# service cman status
[root@node1 cluster]# service rgmanager status
[root@node1 ~]# clustat -i 1    # refresh the view every 1 second
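Beyond watching the status, rgmanager's clusvcadm can move or restart the service by hand; a short sketch, assuming the service name www used above:
clusvcadm -r www -m node2.test.com   # relocate service www to node2
clusvcadm -R www                     # restart it on the node where it is running
clusvcadm -d www                     # disable (stop) the service
clusvcadm -e www                     # enable (start) it again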