Pacemaker + Corosync + DRBD + MySQL Active/Standby on Ubuntu 14.04

1. Installing the DRBD Device

See the previous post: http://blog.csdn.net/nnuljl/article/details/38266737

2. The Heartbeat Service: Corosync

Install the heartbeat service on both nodes:

apt-get install -y --force-yes corosync

Start corosync. A problem you may run into: service corosync start gives no response.

root@controller1:~# vim /etc/default/corosync
# start corosync at boot [yes|no]
START=yes
Change the default no to yes; otherwise service corosync start does nothing.
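
After the change, corosync should start normally on both nodes (a quick sanity check; output omitted here):

root@controller1:~# service corosync start
root@controller1:~# service corosync status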

3. Configuring Corosync

root@controller1:~# vim /etc/corosync/corosync.conf


The configuration file is as follows:

# Please read the openais.conf.5 manual page

totem {
        version: 2

        # How long before declaring a token lost (ms)
        token: 3000

        # How many token retransmits before forming a new configuration
        token_retransmits_before_loss_const: 10

        # How long to wait for join messages in the membership protocol (ms)
        join: 60

        # How long to wait for consensus to be achieved before starting a new round of membership configuration (ms)
        consensus: 3600

        # Turn off the virtual synchrony filter
        vsftype: none

        # Number of messages that may be sent by one processor on receipt of the token
        max_messages: 20

        # Limit generated nodeids to 31-bits (positive signed integers)
        clear_node_high_bit: yes

        # Disable encryption
        secauth: off

        # How many threads to use for encryption/decryption
        threads: 0

        # Optionally assign a fixed node id (integer)
        # nodeid: 1234

        # This specifies the mode of redundant ring, which may be none, active, or passive.
        rrp_mode: active

        interface {
                # The following values need to be set based on your environment 
                ringnumber: 0
                bindnetaddr: 192.168.2.11
                mcastaddr: 226.94.1.4
                mcastport: 5405
        }
        interface {
                ringnumber: 1
                bindnetaddr: 192.168.99.2
                mcastaddr: 226.94.1.4
                mcastport: 5405
        }

}

amf {
        mode: disabled
}

quorum {
        # Quorum for the Pacemaker Cluster Resource Manager
        provider: corosync_votequorum
        expected_votes: 1
}

aisexec {
        user:   root
        group:  root
}

logging {
        fileline: off
        to_stderr: yes
        to_logfile: yes
        logfile: /var/log/corosync/corosync.log
        to_syslog: yes
        syslog_facility: daemon
        debug: off
        timestamp: on
        logger_subsys {
                subsys: AMF
                debug: off
                tags: enter|leave|trace1|trace2|trace3|trace4|trace6
        }
}
The only part that needs editing is the interface sections inside the totem block:

rrp_mode: active

        interface {
                # The following values need to be set based on your environment 
                ringnumber: 0
                bindnetaddr: 192.168.2.11
                mcastaddr: 226.94.1.4
                mcastport: 5405
        }
        interface {
                ringnumber: 1
                bindnetaddr: 192.168.99.2
                mcastaddr: 226.94.1.4
                mcastport: 5405
        }
Multiple heartbeat interfaces can be configured; just give each one a distinct ringnumber. bindnetaddr is the address the ring binds to, and mcastaddr/mcastport define the multicast group used for cluster traffic.
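
Once corosync is running, the health of both rings can be checked with corosync-cfgtool (the exact output depends on the environment, so it is omitted here):

root@controller1:~# corosync-cfgtool -s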

Check the heartbeat cluster membership. On node 1:

root@controller1:~# corosync-cmapctl |grep members
runtime.totem.pg.mrp.srp.members.1084752395.config_version (u64) = 0
runtime.totem.pg.mrp.srp.members.1084752395.ip (str) = r(0) ip(192.168.2.11) r(1) ip(192.168.99.2) 
runtime.totem.pg.mrp.srp.members.1084752395.join_count (u32) = 1
runtime.totem.pg.mrp.srp.members.1084752395.status (str) = joined
runtime.totem.pg.mrp.srp.members.1084752396.config_version (u64) = 0
runtime.totem.pg.mrp.srp.members.1084752396.ip (str) = r(0) ip(192.168.2.12) r(1) ip(127.0.0.1) 
runtime.totem.pg.mrp.srp.members.1084752396.join_count (u32) = 1
runtime.totem.pg.mrp.srp.members.1084752396.status (str) = joined
And on node 2:

root@controller2:~# corosync-cmapctl |grep members
runtime.totem.pg.mrp.srp.members.1084752395.config_version (u64) = 0
runtime.totem.pg.mrp.srp.members.1084752395.ip (str) = r(0) ip(192.168.2.11) r(1) ip(192.168.99.2) 
runtime.totem.pg.mrp.srp.members.1084752395.join_count (u32) = 1
runtime.totem.pg.mrp.srp.members.1084752395.status (str) = joined
runtime.totem.pg.mrp.srp.members.1084752396.config_version (u64) = 0
runtime.totem.pg.mrp.srp.members.1084752396.ip (str) = r(0) ip(192.168.2.12) r(1) ip(127.0.0.1) 
runtime.totem.pg.mrp.srp.members.1084752396.join_count (u32) = 1
runtime.totem.pg.mrp.srp.members.1084752396.status (str) = joined
Note that one address shows up as 127.0.0.1: on node 2, the NIC facing 192.168.99.2 had to be removed due to hardware constraints, so that ring falls back to the local address.

4. Installing Pacemaker

apt-get install -y --force-yes pacemaker

5. Configuring Pacemaker

OCF logging configuration:

root@controller1:/usr/lib/ocf/lib/heartbeat# ls
apache-conf.sh  ocf-binaries     ocf-rarun        ocf-shellfuncs  sapdb-nosha.sh
http-mon.sh     ocf-directories  ocf-returncodes  ora-common.sh   sapdb.sh

Edit the ocf-directories file:

# Binaries and binary options for use in Resource Agents

prefix=/usr
exec_prefix=/usr
: ${INITDIR:=/etc/init.d}
: ${HA_DIR:=/etc/ha.d}
: ${HA_RCDIR:=$HA_DIR/rc.d}
: ${HA_CONFDIR=$HA_DIR/conf}
: ${HA_CF:=$HA_DIR/ha.cf}
: ${HA_VARLIB:=/var/lib/heartbeat}
: ${HA_RSCTMP:=/var/run/resource-agents}
: ${HA_RSCTMP_OLD:=/var/run/heartbeat/rsctmp}
: ${HA_FIFO:=/var/lib/heartbeat/fifo}
: ${HA_BIN:=/usr/lib/heartbeat}
: ${HA_SBIN_DIR:=/usr/sbin}
: ${HA_DATEFMT:="%Y/%m/%d_%T "}
: ${HA_DEBUGLOG:=/dev/null}
: ${HA_RESOURCEDIR:=$HA_DIR/resource.d}
: ${HA_DOCDIR:=/usr/share/doc/heartbeat}
: ${__SCRIPT_NAME:=`basename $0`}
: ${HA_VARRUN:=/var/run/}
: ${HA_VARLOCK:=/var/lock/subsys/}
: ${HA_LOGFILE:=/var/log/ha/ha.log}
: ${HA_DEBUGLOG:=/var/log/ha/ha_debug.log}
Append the last two lines and create the ha directory under /var/log; logs will then be written whenever OCF resources run. Note that HA_DEBUGLOG is already given a default of /dev/null further up in the file, and since : ${VAR:=value} only assigns when the variable is unset, that earlier line may need to be removed or commented out for the new value to take effect.
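
A minimal way to create the log directory:

root@controller1:~# mkdir -p /var/log/ha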

6. Configuring Pacemaker Service Resources

crm status shows the cluster state:

root@controller1:/usr/lib/ocf/lib/heartbeat# crm status
Last updated: Wed Jul 30 13:03:46 2014
Last change: Wed Jul 30 12:39:21 2014 via crm_attribute on controller2
Stack: corosync
Current DC: controller1 (1084752395) - partition with quorum
Version: 1.1.10-42f2063
2 Nodes configured
20 Resources configured


Node controller2 (1084752396): standby
Online: [ controller1 ]

controller2 was deliberately put into standby by the author; in the initial state both nodes should show as Online.
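
Standby can be toggled per node; this is presumably how controller2 was parked:

root@controller1:~# crm node standby controller2
root@controller1:~# crm node online controller2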

root@controller1:~# crm ra
crm(live)ra# classes
lsb
ocf / heartbeat linbit openstack pacemaker rabbitmq redhat
service
stonith
upstart
These are the Pacemaker resource agent classes; the agent scripts are what actually control the services.

crm(live)ra# help

This level contains commands which show various information about
the installed resource agents. It is available both at the top
level and at the `configure` level.

Available commands:

        classes          list classes and providers
        list             list RA for a class (and provider)
        meta             show meta data for a RA
        providers        show providers for a RA and a class
        help             show help (help topics for list of topics)
        end              go back one level
        quit             exit the program
Here crm(live)ra# meta IPaddr2 shows the metadata for IPaddr2, the agent used to bind a virtual IP.
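
The same metadata, including the agent's parameters (ip, cidr_netmask, nic, and so on), can also be pulled up directly from the shell:

root@controller1:~# crm ra meta ocf:heartbeat:IPaddr2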

Configure the service resources:

root@controller1:~# crm configure
crm(live)configure# 
This is an interactive shell; help lists the available commands:
Available commands:

        node             define a cluster node
        primitive        define a resource
        monitor          add monitor operation to a primitive
        group            define a group
        clone            define a clone
        ms               define a master-slave resource
        rsc_template     define a resource template
        location         a location preference
        colocation       colocate resources
        order            order resources
        rsc_ticket       resources ticket dependency
        property         set a cluster property
        rsc_defaults     set resource defaults
        fencing_topology node fencing order
        role             define role access rights
        user             define user access rights
        op_defaults      set resource operations defaults
        schema           set or display current CIB RNG schema
        show             display CIB objects
        edit             edit CIB objects
        filter           filter CIB objects
        delete           delete CIB objects
        default-timeouts set timeouts for operations to minimums from the meta-data
        rename           rename a CIB object
        modgroup         modify group
        refresh          refresh from CIB
        erase            erase the CIB
        ptest            show cluster actions if changes were committed
        rsctest          test resources as currently configured
        cib              CIB shadow management
        cibstatus        CIB status management and editing
        template         edit and import a configuration from a template
        commit           commit the changes to the CIB
        verify           verify the CIB with crm_verify
        upgrade          upgrade the CIB to version 1.0
        save             save the CIB to a file
        load             import the CIB from a file
        graph            generate a directed graph
        xml              raw xml
        help             show help (help topics for list of topics)
        end              go back one level
A few of these deserve explanation: primitive defines a single resource; group bundles several resources together so they can be operated on as one; location defines node stickiness, i.e. which node a resource prefers to run on (see the sketch below); colocation forces resources to run on the same node, or forbids them from doing so; order defines the order in which resources are started.
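For illustration, a location constraint pinning the MySQL group to controller1 with a score of 100 could look like the line below (the constraint name and score are invented for the example and are not part of the configuration shown next):

location l_mysql_prefers_c1 g_1_mysql 100: controller1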

crm(live)configure# show
node $id="1084752395" controller1 \
        attributes standby="off"
node $id="1084752396" controller2 \
        attributes standby="on"
...
primitive p_ip_admin ocf:heartbeat:IPaddr2 \
        params ip="192.168.99.3" cidr_netmask="24" \
        op monitor interval="30s"
primitive p_fs_mysql ocf:heartbeat:Filesystem \
        params device="/share-s/mysql" directory="/var/lib/mysql" fstype="ext4" options="relatime" \
        op start interval="0" timeout="120s" \
        op stop interval="0" timeout="180s" \
        op monitor interval="60s" timeout="120s" \
        meta target-role="Started"
primitive p_mysql ocf:heartbeat:mysql \
        params additional_parameters="--bind-address=0.0.0.0" config="/etc/mysql/my.cnf" pid="/var/run/mysqld/mysqld.pid" socket="/var/run/mysqld/mysqld.sock" log="/var/log/mysql/mysqld.log"
primitive p_drbd_mysql ocf:linbit:drbd \
        params drbd_resource="mysql" \
        op start interval="0" timeout="240s" \
        op stop interval="0" timeout="180s" \
        op promote interval="0" timeout="180s" \
        op demote interval="0" timeout="180s" \
        op monitor interval="30s" role="Slave" \
        op monitor interval="29s" role="Master"
group g_1_mysql p_ip_admin p_fs_mysql p_mysql
ms ms_drbd_mysql p_drbd_mysql \
        meta notify="true" clone-max="2"
colocation c_mysql_on_drbd inf: g_1_mysql ms_drbd_mysql:Master
order o_drbd_before_mysql inf: ms_drbd_mysql:promote g_1_mysql:start

The configuration above defines the DRBD resource, the virtual IP resource, the filesystem mount resource, and the MySQL service resource.

Once the resources are defined, verify and commit them in the interactive shell:

root@controller1:~# crm configure
crm(live)configure# verify 
WARNING: p_mysql: default timeout 20s for start is smaller than the advised 120
WARNING: p_mysql: default timeout 20s for stop is smaller than the advised 120
crm(live)configure# commit 
INFO: apparently there is nothing to commit
INFO: try changing something first
After the commit, the resources run under Pacemaker and can be watched with crm status or crm_mon.

Last updated: Wed Jul 30 13:20:54 2014
Last change: Wed Jul 30 12:39:21 2014 via crm_attribute on controller2
Stack: corosync
Current DC: controller1 (1084752395) - partition with quorum
Version: 1.1.10-42f2063
2 Nodes configured
20 Resources configured


Node controller2 (1084752396): standby
Online: [ controller1 ]

 Resource Group: g_1_mysql
     p_ip_admin (ocf::heartbeat:IPaddr2):       Started controller1
     p_fs_mysql (ocf::heartbeat:Filesystem):    Started controller1
     p_mysql    (ocf::heartbeat:mysql): Started controller1
You should see the related resources here; the DRBD resources would normally be shown as well, but the author's test environment has since been switched to iSCSI shared storage, so that output is no longer available.
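
As an aside, the two verify warnings about p_mysql seen earlier can be avoided by declaring explicit operation timeouts at or above the advised minimums, along these lines (a sketch extending the primitive shown above):

primitive p_mysql ocf:heartbeat:mysql \
        params additional_parameters="--bind-address=0.0.0.0" config="/etc/mysql/my.cnf" pid="/var/run/mysqld/mysqld.pid" socket="/var/run/mysqld/mysqld.sock" log="/var/log/mysql/mysqld.log" \
        op start interval="0" timeout="120s" \
        op stop interval="0" timeout="120s"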

7. Pacemaker Maintenance

When Pacemaker restarts, resources migrate to the other node. A node can also be put into standby on demand with crm node standby (run on the node currently holding the resources) and brought back with crm node online.

To restart a service, use crm resource restart: for example, crm resource restart g_1_mysql restarts all of the MySQL resources, while crm resource stop p_mysql shuts MySQL down. Resources can also be moved to a specific node explicitly, as sketched below.
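
Assuming both nodes are online, a resource can be migrated and the resulting move constraint cleared again afterwards:

root@controller1:~# crm resource migrate g_1_mysql controller2
root@controller1:~# crm resource unmigrate g_1_mysql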

8. Notes on Configuring STONITH

Pacemaker provides fencing through STONITH; configuring a STONITH resource is enough to enable it. One thing to note: stonith-action must be set, otherwise fencing shows no visible effect.

property $id="cib-bootstrap-options" \
        dc-version="1.1.10-42f2063" \
        cluster-infrastructure="corosync" \
        stonith-enabled="false" \
        no-quorum-policy="ignore" \
        stonith-action="reboot"
To enable fencing, set stonith-enabled to true and also set stonith-action; the available property names and their values can be discovered with Tab completion in the interactive shell:

crm(live)configure# property 
batch-limit=                  enable-startup-probes=        pe-error-series-max=          stonith-enabled=
cluster-delay=                is-managed-default=           pe-input-series-max=          stonith-timeout=
cluster-recheck-interval=     maintenance-mode=             pe-warn-series-max=           stop-all-resources=
crmd-transition-delay=        migration-limit=              placement-strategy=           stop-orphan-actions=
dc-deadtime=                  no-quorum-policy=             remove-after-stop=            stop-orphan-resources=
default-action-timeout=       node-health-green=            shutdown-escalation=          symmetric-cluster=
default-resource-stickiness=  node-health-red=              start-failure-is-fatal=       
election-timeout=             node-health-strategy=         startup-fencing=              
enable-acl=                   node-health-yellow=           stonith-action=               
crm(live)configure# property stonith-action=
stonith-action (enum, [reboot]): Action to send to STONITH device
    Action to send to STONITH device  Allowed values: reboot, poweroff, off
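For example, fencing with a reboot action can be switched on at the configure level (remember to commit):

crm(live)configure# property stonith-enabled="true" stonith-action="reboot"
crm(live)configure# commit
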
The STONITH agents available on the system can be listed as follows:

root@controller1:~# crm ra
crm(live)ra# classes
lsb
ocf / heartbeat linbit openstack pacemaker rabbitmq redhat
service
stonith
upstart
crm(live)ra# list stonith
apcmaster                   apcmastersnmp               apcsmart                    baytech                     bladehpi
cyclades                    drac3                       external/drac5              external/dracmc-telnet      external/hetzner
external/hmchttp            external/ibmrsa             external/ibmrsa-telnet      external/ipmi               external/ippower9258
external/kdumpcheck         external/libvirt            external/nut                external/rackpdu            external/riloe
external/ssh                external/vcenter            external/vmware             external/xen0               external/xen0-ha
fence_legacy                fence_pcmk                  ibmhmc                      ipmilan                     meatware
null                        nw_rpc100s                  rcd_serial                  rps10                       ssh
suicide                     wti_mpc                     wti_nps  
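As an illustrative sketch only, a STONITH resource built on the external/ipmi agent from the list above might be configured like this; the IPMI address and credentials are placeholders, and the location constraint keeps a node from being in charge of fencing itself:

crm(live)configure# primitive p_stonith_c2 stonith:external/ipmi \
        params hostname="controller2" ipaddr="192.168.2.202" userid="admin" passwd="secret" interface="lan" \
        op monitor interval="60s"
crm(live)configure# location l_stonith_c2 p_stonith_c2 -inf: controller2
crm(live)configure# commit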
If you repost this article, please credit the source: http://blog.csdn.net/nnuljl
