本实验部署DRBD + HEARTBEAT + NFS 环境,建立一个高可用(HA)的文件服务器集群。在方案中,通
过DRBD保证了服务器数据的完整性和一致性。DRBD类似于一个网络RAID-1功能。当你将数据写入本
地文件系统时,数据还将会被发送到网络中另一台主机上,以相同的形式记录在另一个文件系统中。主节
点与备节点的数据可以保证实时相互同步。当本地主服务器出现故障时,备份服务器上还会保留有一份相
同的数据,可以继续使用。在高可用(HA)中使用DRBD功能,可以代替使用一个共享盘阵。因为数据同时
存在于本地主服务器和备份服务器上。切换时,远程主机只要使用它上面的那份备份数据,就可以继续提
供主服务器上相同的服务,并且client用户对主服务器的故障无感知。
虚拟机操作系统:Red Hat Enterprise Linux Server release 5.4
A: master 计算机名:Server1 eth0: 192.168.20.101
B:slave 计算机名:Server2 eth0: 192.168.20.102
Heartbeat 虚拟IP:192.168.20.188
两台服务器将/dev/sda4互为镜像
两台服务器/etc/exports配置相同
同步服务器时间
[root@server1 ~]# hwclock -s
两个主机的hosts文件里包含2台主机的ip地址和主机名称
[root@server1 ~]# echo "192.168.20.101 server1" >>/etc/hosts
[root@server1 ~]# echo "192.168.20.102 server2" >>/etc/hosts
[root@server2 ~]# echo "192.168.20.101 server1" >>/etc/hosts
[root@server2 ~]# echo "192.168.20.102 server2" >>/etc/hosts
在server1和server2做以下操作
配置yum服务
[root@server1 ~]# mkdir /mnt/cdrom
[root@server1 ~]# mount /dev/cdrom /mnt/cdrom/
mount: block device /dev/cdrom is write-protected, mounting read-only
[root@server1 ~]# vim /etc/yum.repos.d/rhel-debuginfo.repo
[rhel-server]
name=Red Hat Enterprise Linux server
baseurl=file:///mnt/cdrom/Server
enabled=1
gpgcheck=1
gpgkey=file:///mnt/cdrom/RPM-GPG-KEY-redhat-release
[rhel-cluster]
name=Red Hat Enterprise Linux cluster
baseurl=file:///mnt/cdrom/Cluster
enabled=1
gpgcheck=1
gpgkey=file:///mnt/cdrom/RPM-GPG-KEY-redhat-release
[rhel-clusterstorage]
name=Red Hat Enterprise Linux clusterstorage
baseurl=file:///mnt/cdrom/ClusterStorage
enabled=1
gpgcheck=1
gpgkey=file:///mnt/cdrom/RPM-GPG-KEY-redhat-release
两主机分区大小一致
[root@server1 ~]# fdisk -l
Disk /dev/sda: 21.4 GB, 21474836480 bytes
255 heads, 63 sectors/track, 2610 cylinders
Units = cylinders of 16065 * 512 = 8225280 bytes
Device Boot Start End Blocks Id System
/dev/sda1 * 1 13 104391 83 Linux
/dev/sda2 14 1288 10241437+ 83 Linux
/dev/sda3 1289 1415 1020127+ 82 Linux swap / Solaris
[root@server1 ~]# fdisk /dev/sda
The number of cylinders for this disk is set to 2610.
There is nothing wrong with that, but this is larger than 1024,
and could in certain setups cause problems with:
1) software that runs at boot time (e.g., old versions of LILO)
2) booting and partitioning software from other OSs
(e.g., DOS FDISK, OS/2 FDISK)
Command (m for help): n
Command action
e extended
p primary partition (1-4)
p
Selected partition 4
First cylinder (1416-2610, default 1416):
Using default value 1416
Last cylinder or +size or +sizeM or +sizeK (1416-2610, default 2610): +1G
Command (m for help): w
The partition table has been altered!
Calling ioctl() to re-read partition table.
WARNING: Re-reading the partition table failed with error 16: Device or resource busy.
The kernel still uses the old table.
The new table will be used at the next reboot.
Syncing disks.
[root@server1 ~]# partprobe /dev/sda
[root@server1 ~]# cat /proc/partitions
major minor #blocks name
8 0 20971520 sda
8 1 104391 sda1
8 2 10241437 sda2
8 3 1020127 sda3
8 4 987997 sda4
Server1和server2 做以下相同操作
4.1.安装 drbd包
[root@server1 ~]# ll
total 424
drwxr-xr-x 2 root root 4096 Mar 2 14:26 Desktop
-rw------- 1 root root 1300 Mar 2 20:34 anaconda-ks.cfg
-rw-r--r-- 1 root root 221868 May 7 16:15 drbd83-8.3.8-1.el5.centos.i386.rpm
-rw-r--r-- 1 root root 35768 Mar 2 20:33 install.log
-rw-r--r-- 1 root root 4713 Mar 2 20:33 install.log.syslog
-rw-r--r-- 1 root root 125974 May 7 16:15 kmod-drbd83-8.3.8-1.el5.centos.i686.rpm
-rw-r--r-- 1 root root 240 Mar 2 12:38 scsrun.log
[root@server1 ~]# yum localinstall *.rpm --nogpgcheck -y
加载DRBD模块
[root@server1 ~]# modprobe drbd
查看模块加载
[root@server1 ~]# lsmod |grep drbd
drbd 228528 4
4.2.修改配置文件
[root@server1 ~]# cat /etc/drbd.conf
#
# please have a a look at the example configuration file in
# /usr/share/doc/drbd83/drbd.conf
#
[root@server1 ~]# cp /usr/share/doc/drbd83-8.3.8/drbd.conf /etc/
cp: overwrite `/etc/drbd.conf'? y
[root@server1 ~]# vim /etc/drbd.conf
[root@server1 ~]# cd /etc/drbd.d/
[root@server1 drbd.d]# ll
total 4
-rwxr-xr-x 1 root root 1418 Jun 4 2010 global_common.conf
[root@server1 drbd.d]# cp global_common.conf global_common.conf.bak
[root@server1 drbd.d]# vim global_common.conf
1 global {
2 usage-count no;
3 #minor-count dialog-refresh disable-ip-verification
4 }
5
6 common {
7 protocol C;
8
9 startup{
10 wfc-timeout 120;
11 degr-wfc-timeout 120;
12 }
13 disk {
14 on-io-error detach;
15 fencing resource-only;
16
17 }
18 net {
19 cram-hmac-alg "sha1";
20 shared-secret "mydrbdlab";
21 }
22 syncer {
23 rate 100M;
24 }
25 }
[root@server1 drbd.d]# vim web.res
resource web{
on server1{
device /dev/drbd0;
disk /dev/sda4;
address 192.168.20.101:7789;
meta-disk internal;
}
on server2{
device /dev/drbd0;
disk /dev/sda4;
address 192.168.20.102:7789;
meta-disk internal;
}
}
4.3.检测配置文件
[root@server1 drbd.d]# dd if=/dev/zero bs=1M count=1 of=/dev/sda4;sync
[root@server1 drbd.d]# drbdadm create-md web
Writing meta data...
initializing activity log
NOT initialized bitmap
New drbd meta data block successfully created.
启动服务时两主机服务同时启动
[root@server1 drbd.d]# service drbd start
Starting DRBD resources: [
web
Found valid meta data in the expected location, 1011703808 bytes into /dev/sda4.
d(web) s(web) n(web) ]outdated-wfc-timeout has to be shorter than degr-wfc-timeout
outdated-wfc-timeout implicitly set to degr-wfc-timeout (120s)
[root@server1 ~]# drbd-overview
0:web Connected Secondary/Secondary Inconsistent/Inconsistent C r----
[root@server1 ~]#
[root@server1 drbd.d]# cat /proc/drbd
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by [email protected], 2010-06-04 08:04:16
0: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:987928
[root@server2 drbd.d]# cat /proc/drbd
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by [email protected], 2010-06-04 08:04:16
0: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:987928
此时发现两server没有主服务
创建文件夹
[root@server1 ~]# mkdir /data
4.4.只在主节点上执行
把server1 作为主节点
[root@server1 ~]# cd /etc/drbd.d/
[root@server1 drbd.d]# drbdadm -- --overwrite-data-of-peer primary web
[root@server1 drbd.d]# drbd-overview
0:web SyncSource Primary/Secondary UpToDate/Inconsistent C r----
[=================>..] sync'ed: 91.8% (84248/987928)K delay_probe: 70
格式化
[root@server1 drbd.d]# mkfs -t ext3 -L drbdweb /dev/drbd0
[root@server1 drbd.d]# mkdir /mnt/1
[root@server1 drbd.d]# mount /dev/drbd0 /mnt/1
[root@server1 drbd.d]# df -h
Filesystem Size Used Avail Use% Mounted on
/dev/sda2 9.5G 2.6G 6.4G 29% /
/dev/sda1 99M 12M 83M 12% /boot
tmpfs 97M 0 97M 0% /dev/shm
/dev/hdc 2.8G 2.8G 0 100% /media/RHEL_5.4 i386 DVD
/dev/hdc 2.8G 2.8G 0 100% /mnt/cdrom
/dev/drbd0 950M 18M 885M 2% /mnt/1
查看状态
[root@server1 drbd.d]# service drbd status
drbd driver loaded OK; device status:
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by [email protected], 2010-06-04 08:04:16
m:res cs ro ds p mounted fstype
0:web Connected Primary/Secondary UpToDate/UpToDate C /mnt/1 ext
[root@server2 ~]# service drbd status
drbd driver loaded OK; device status:
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by [email protected], 2010-06-04 08:04:16
m:res cs ro ds p mounted fstype
0:web Connected Secondary/Primary UpToDate/UpToDate C
此时发现server1为主服务,server2为辅助服务
两台服务器都修改nfs配置文件如下
[root@server1 ~]# vim /etc/exports
/data *(rw,sync,insecure,no_root_squash,no_wdelay)
[root@server1 ~]# service portmap start
Starting portmap: [ OK ]
[root@server1 ~]# chkconfig portmap on
[root@server1 ~]# service nfs start
[root@server1 ~]# chkconfig nfs on
两台服务器都修改nfs启动脚本。
将/etc/init.d/nfs 脚本中的stop部分中的killproc nfsd -2 修改为 -9
[root@server1 ~]# vim /etc/init.d/nfs
stop)
# Stop daemons.
echo -n $"Shutting down NFS mountd: "
killproc rpc.mountd
echo
echo -n $"Shutting down NFS daemon: "
killproc nfsd -9
echo
if [ -n "$RQUOTAD" -a "$RQUOTAD" != "no" ]; then
echo -n $"Shutting down NFS quotas: "
killproc rpc.rquotad
RETVAL=$?
echo
修改Heartbeat配置
在server1和server2做以下操作
安装Heartbeat套件
[root@server1 ~]# yum localinstall heartbeat-2.1.4-9.el5.i386.rpm heartbeat-pils-2.1.4-10.el5.i386.rpm heartbeat-stonith-2.1.4-10.el5.i386.rpm libnet-1.1.4-3.el5.i386.rpm perl-MailTools-1.77-1.el5.noarch.rpm --nogpgcheck -y
[root@server1 ~]# cd /usr/share/doc/heartbeat-2.1.4/
Heartbeat配置
[root@server1 heartbeat-2.1.4]# cp authkeys haresources ha.cf /etc/ha.d/
Ha.cf配置,2台机子不一样的地方标红,填写对方服务器IP地址
[root@server1 heartbeat-2.1.4]# vim ha.cf
debugfile /var/log/ha-debug
logfile /var/log/ha-log
logfacility local0
keepalive 2
deadtime 10
udpport 694
ucast eth0 192.168.20.102
ping 192.168.20.1
auto_failback off
node server1
node server2
haresources配置
2台机子相同
[root@server1 ha.d]# echo "server1 IPaddr::192.168.20.188/24/eth0 drbddisk::web Filesystem::/dev/drbd0::/data::ext3 killnfsd" >> haresources
authkeys 配置相同
auth 1
1 crc
#2 sha1 HI!
#3 md5 Hello!
Killnfsd配置相同(手动编写)
[root@server1 ha.d]# echo "killall -9 nfsd; /etc/init.d/nfs restart; exit 0 " >> resource.d/killnfsd
设置文档权限
[root@server1 ha.d]# chmod 600 /etc/ha.d/authkeys
[root@server1 ha.d]# chmod 755 /etc/ha.d/resource.d/killnfsd
开启Heartbeat服务
[root@server1 ha.d]# service heartbeat start
Starting High-Availability services:
2012/05/08_01:28:21 INFO: Resource is stopped
[ OK ]
[root@server1 ha.d]# chkconfig heartbeat on
1.在测试机上将192.168.20.188:/data挂载到本地/mnt/nfs
[root@node1 ~]# mkdir /mnt/nfs
[root@node1 ~]# mount 192.168.20.188:/data /mnt/nfs/
2.在测试机上创建shell,二秒一个
[root@node1 nfs]# vim /mnt/test.sh
# Failover probe: touch a file on the NFS mount every 2 seconds, printing
# timestamps before and after, so a service interruption during DRBD/Heartbeat
# failover shows up as a gap or a "Stale NFS file handle" error in the output.
# NOTE(review): `date` is expanded unquoted, so its output is word-split and
# re-joined with single spaces (e.g. "May  8" becomes "May 8") — this matches
# the transcript output below; quoting it would change the printed format.
while true
do
echo ---\> trying touch x : `date`
touch x
echo \<----- done touch x : `date`
echo
sleep 2
done
3.启动两个server的heartbeat服务
[root@server1 ~]# service heartbeat start
[root@server2 ~]# service heartbeat start
此时主服务server1的节点起来
[root@server1 ~]# drbd-overview
0:web Connected Primary/Secondary UpToDate/UpToDate C r---- /data ext3 950M 18M 885M 2%
[root@server2 ~]# drbd-overview
0:web Connected Secondary/Primary UpToDate/UpToDate C r----
[root@server1 ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:C7:C5:3A
inet addr:192.168.20.101 Bcast:192.168.20.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fec7:c53a/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:7113 errors:0 dropped:0 overruns:0 frame:0
TX packets:38268 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:949734 (927.4 KiB) TX bytes:53973073 (51.4 MiB)
Interrupt:67 Base address:0x2000
eth0:0 Link encap:Ethernet HWaddr 00:0C:29:C7:C5:3A
inet addr:192.168.20.188 Bcast:192.168.20.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
Interrupt:67 Base address:0x2000
Server2的备节点没有起来
[root@server2 ha.d]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:02:49:2E
inet addr:192.168.20.102 Bcast:192.168.20.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fe02:492e/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:782303 errors:0 dropped:0 overruns:0 frame:0
TX packets:98572 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:1153260066 (1.0 GiB) TX bytes:10589710 (10.0 MiB)
Interrupt:67 Base address:0x2000
4.把主节点server1的heartbeat服务停止,则备节点接管服务
[root@server1 ~]# service heartbeat stop
[root@server1 ~]# drbd-overview
0:web Connected Secondary/Primary UpToDate/UpToDate C r----
[root@server2 ~]# drbd-overview
0:web Connected Primary/Secondary UpToDate/UpToDate C r---- /data ext3 950M 18M 885M 2%
Server1
[root@server1 ~]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:C7:C5:3A
inet addr:192.168.20.101 Bcast:192.168.20.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fec7:c53a/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:6608 errors:0 dropped:0 overruns:0 frame:0
TX packets:37581 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:887449 (866.6 KiB) TX bytes:53874964 (51.3 MiB)
Interrupt:67 Base address:0x2000
lo Link encap:Local Loopback
inet addr:127.0.0.1 Mask:255.0.0.0
inet6 addr: ::1/128 Scope:Host
UP LOOPBACK RUNNING MTU:16436 Metric:1
RX packets:6517 errors:0 dropped:0 overruns:0 frame:0
TX packets:6517 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:0
RX bytes:7925837 (7.5 MiB) TX bytes:7925837 (7.5 MiB)
Server2
[root@server2 ha.d]# ifconfig
eth0 Link encap:Ethernet HWaddr 00:0C:29:02:49:2E
inet addr:192.168.20.102 Bcast:192.168.20.255 Mask:255.255.255.0
inet6 addr: fe80::20c:29ff:fe02:492e/64 Scope:Link
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:782303 errors:0 dropped:0 overruns:0 frame:0
TX packets:98572 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:1153260066 (1.0 GiB) TX bytes:10589710 (10.0 MiB)
Interrupt:67 Base address:0x2000
eth0:0 Link encap:Ethernet HWaddr 00:0C:29:02:49:2E
inet addr:192.168.20.188 Bcast:192.168.20.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
Interrupt:67 Base address:0x2000
5.测试脚本
[root@node1 ~]# cd /mnt/nfs/
[root@node1 nfs]# bash /mnt/test.sh
终端显示如下
---> trying touch x : Tue May 8 13:14:54 CST 2012
<----- done touch x : Tue May 8 13:14:54 CST 2012
---> trying touch x : Tue May 8 13:14:56 CST 2012
<----- done touch x : Tue May 8 13:14:56 CST 2012
---> trying touch x : Tue May 8 13:14:58 CST 2012
<----- done touch x : Tue May 8 13:14:58 CST 2012
---> trying touch x : Tue May 8 13:15:00 CST 2012
touch: cannot touch `x': Stale NFS file handle
<----- done touch x : Tue May 8 13:15:00 CST 2012
---> trying touch x : Tue May 8 13:15:07 CST 2012
<----- done touch x : Tue May 8 13:15:07 CST 2012
---> trying touch x : Tue May 8 13:15:09 CST 2012
<----- done touch x : Tue May 8 13:15:09 CST 2012
---> trying touch x : Tue May 8 13:15:11 CST 2012
<----- done touch x : Tue May 8 13:15:11 CST 2012
至此,server2接管服务成功,实验已实现所需的功能;也可手动在nfs挂载目录里建立文件,来回切换server1和server2的drbd服务来进行测试。