Test machines
CentOS 6.5: pacemaker + corosync + cman + resource agents
192.168.14.107
192.168.14.101
1. Install pacemaker (corosync is pulled in as a dependency)
[root@oserver1 ~]# yum -y install pacemaker
Installed:
pacemaker.x86_64 0:1.1.14-8.el6
Installed as dependencies:
clusterlib.x86_64 0:3.0.12.1-78.el6 corosync.x86_64 0:1.4.7-5.el6 corosynclib.x86_64 0:1.4.7-5.el6 libibverbs.x86_64 0:1.1.8-4.el6
libqb.x86_64 0:0.17.1-2.el6 librdmacm.x86_64 0:1.0.21-0.el6 pacemaker-cli.x86_64 0:1.1.14-8.el6 pacemaker-cluster-libs.x86_64 0:1.1.14-8.el6
pacemaker-libs.x86_64 0:1.1.14-8.el6 perl-TimeDate.noarch 1:1.16-13.el6 rdma.noarch 0:6.8_4.1-1.el6 resource-agents.x86_64 0:3.9.5-34.el6_8.2
Complete!
2. Install pcs
[root@oserver1 ~]# yum -y install pcs
Installed:
pcs.x86_64 0:0.9.148-7.el6
Installed as dependencies:
ccs.x86_64 0:0.16.2-86.el6 compat-readline5.x86_64 0:5.2-17.1.el6 python-clufter.x86_64 0:0.56.2-1.el6 ruby.x86_64 0:1.8.7.374-4.el6_6
ruby-irb.x86_64 0:1.8.7.374-4.el6_6 ruby-libs.x86_64 0:1.8.7.374-4.el6_6 ruby-rdoc.x86_64 0:1.8.7.374-4.el6_6 rubygems.noarch 0:1.3.7-5.el6
Complete!
3. Install PostgreSQL 9.5
#yum install https://download.postgresql.org/pub/repos/yum/9.5/redhat/rhel-6-x86_64/pgdg-centos95-9.5-2.noarch.rpm
Installed:
pgdg-centos95.noarch 0:9.5-2
[root@oserver1 ~]# yum install postgresql95-server postgresql95-contrib
Installed:
postgresql95-contrib.x86_64 0:9.5.3-2PGDG.rhel6 postgresql95-server.x86_64 0:9.5.3-2PGDG.rhel6
Installed as dependencies:
postgresql95.x86_64 0:9.5.3-2PGDG.rhel6 postgresql95-libs.x86_64 0:9.5.3-2PGDG.rhel6
Complete!
[root@oserver1 ~]#
4. Replace the pgsql RA with the version from the ClusterLabs repository
[root@oserver1 ~]# wget https://raw.github.com/ClusterLabs/resource-agents/a6f4ddf76cb4bbc1b3df4c9b6632a6351b63c19e/heartbeat/pgsql
# cp pgsql /usr/lib/ocf/resource.d/heartbeat/
# chmod 755 /usr/lib/ocf/resource.d/heartbeat/pgsql
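A quick, hedged way to confirm the replaced RA is the replication-aware version (rep_mode is one of the parameters used in the pcs configuration later):
# grep -c rep_mode /usr/lib/ocf/resource.d/heartbeat/pgsql     (a non-zero count means the new RA with rep_mode support is in place)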
5. PostgreSQL configuration
node1
[root@oserver1 heartbeat]# mkdir -p /alidata/data/pgdata
[root@oserver1 heartbeat]# chown postgres:postgres /alidata/data/pgdata
[root@oserver1 heartbeat]# su postgres -c '/usr/pgsql-9.5/bin/initdb -D /alidata/data/pgdata'
bash-4.1$ /usr/pgsql-9.5/bin/initdb -D /alidata/data/pgdata
postgresql.conf
listen_addresses = '*'
wal_level = hot_standby
synchronous_commit = on
archive_mode = on
archive_command = 'cp %p /alidata/data/pg_archive/%f'
max_wal_senders=5
wal_keep_segments = 32
hot_standby = on
restart_after_crash = off
wal_sender_timeout = 5000    # replication_timeout was renamed to wal_sender_timeout in PostgreSQL 9.3+
wal_receiver_status_interval = 2
max_standby_streaming_delay = -1
max_standby_archive_delay = -1
hot_standby_feedback = on
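Note: archive_command above copies WAL into /alidata/data/pg_archive, so that directory must also exist on node1 and be writable by postgres; a sketch mirroring the node2 steps further below:
# mkdir -p /alidata/data/pg_archive
# chown postgres:postgres /alidata/data/pg_archive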
pg_hba.conf
host    replication    pgrepuser    0.0.0.0/0    md5
host all all 0.0.0.0/0 md5
postgres=# CREATE ROLE pgrepuser REPLICATION LOGIN PASSWORD '123456';
ALTER USER postgres WITH PASSWORD 'postgres';
Start
[root@oserver1 data]# ./start.sh
server starting
[root@oserver1 data]# < 2016-06-30 10:35:17.986 CST >LOG:  redirecting log output to logging collector process
< 2016-06-30 10:35:17.986 CST >HINT:  Future log output will appear in directory "pg_log".
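start.sh itself is not shown in these notes; a minimal sketch, assuming it simply wraps pg_ctl for the data directory initialized above:
#!/bin/bash
# hypothetical start.sh: start PostgreSQL 9.5 as the postgres user
su - postgres -c '/usr/pgsql-9.5/bin/pg_ctl -D /alidata/data/pgdata start'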
node2
[root@oserver2 data]# mkdir -p /alidata/data/pgdata
[root@oserver2 data]# chown postgres:postgres /alidata/data/pgdata
[root@oserver2 data]# chmod 700 /alidata/data/pgdata
[root@oserver2 data]# mkdir /alidata/data/pg_archive
[root@oserver2 data]# chown postgres:postgres pg_archive
# su - postgres -c '/usr/pgsql-9.5/bin/pg_basebackup -D /alidata/data/pgdata -F p -x -h oserver1 -p 5432 -U pgrepuser -P -v'
Password:
transaction log start point: 0/4000028 on timeline 1
38894/38894 kB (100%), 1/1 tablespace
transaction log end point: 0/40000F8
pg_basebackup: base backup completed
bash-4.1$
Create /alidata/data/pgdata/recovery.conf to set up streaming replication
bash-4.1$ more recovery.conf
standby_mode = 'on'
primary_conninfo = 'host=192.168.16.104 port=5432 user=pgrepuser password=123456 application_name=node2'
restore_command = 'cp /alidata/data/pg_archive/%f %p'
recovery_target_timeline = 'latest'
bash-4.1$
Start
[root@oserver2 data]# ./start.sh
server starting
[root@oserver2 data]# < 2016-06-30 11:38:04.977 CST >LOG:  redirecting log output to logging collector process
< 2016-06-30 11:38:04.977 CST >HINT:  Future log output will appear in directory "pg_log".
[root@oserver2 data]#
Confirm that replication is running (run on oserver1):
# su - postgres -c 'psql -c "select client_addr,sync_state from pg_stat_replication;"'
bash-4.1$ /usr/pgsql-9.5/bin/psql -c "select client_addr,sync_state from pg_stat_replication;"
client_addr | sync_state
----------------+------------
192.168.16.106 | async
(1 row)
bash-4.1$
Stop PostgreSQL on both nodes before continuing: standby first, then the primary.
==================================
The two nodes need time synchronization, key-based SSH trust, and hostname resolution.
Hostname/IP resolution must work on every node, and each node's hostname must match the output of "uname -n"; therefore /etc/hosts on both nodes must contain:
192.168.14.107 oserver1
192.168.14.101 oserver2
Time synchronization: configure NTP against time1-7.aliyun.com
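A minimal sketch of one way to do this on CentOS 6 (assuming the ntp/ntpdate packages are installed; the exact approach is an assumption):
# ntpdate time1-7.aliyun.com                                   (one-off sync on both nodes)
# echo "server time1-7.aliyun.com iburst" >> /etc/ntp.conf     (then keep ntpd running against the same server)
# service ntpd start
# chkconfig ntpd on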
2) Set up key-based SSH between the two nodes, which can be done with commands like the following:
oserver1:
# ssh-keygen -t rsa
# ssh-copy-id -i ~/.ssh/id_rsa.pub root@oserver2
oserver2:
# ssh-keygen -t rsa
# ssh-copy-id -i ~/.ssh/id_rsa.pub root@oserver1
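A quick check that both the key-based login and the uname -n requirement are satisfied (hostnames as in /etc/hosts above):
[root@oserver1 ~]# ssh oserver2 'uname -n'     (should print oserver2 without asking for a password)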
Generate the authentication key used for inter-node communication:
[root@oserver1 ~]# corosync-keygen
Corosync Cluster Engine Authentication key generator.
Gathering 1024 bits for key from /dev/random.
Press keys on your keyboard to generate entropy.
Press keys on your keyboard to generate entropy (bits = 128).
Press keys on your keyboard to generate entropy (bits = 192).
Press keys on your keyboard to generate entropy (bits = 256).
Press keys on your keyboard to generate entropy (bits = 976).
Writing corosync key to /etc/corosync/authkey.
[root@oserver1 corosync]# scp authkey corosync.conf oserver2:/etc/corosync
pcs
Check cluster node status:
pcs status
Disable STONITH:
# pcs property set stonith-enabled=false
View the configuration:
# pcs config
pcs resource
pcs status
http://clusterlabs.org/quickstart-redhat-6.html
[root@oserver2 network-scripts]# yum install cman
Installed:
cman.x86_64 0:3.0.12.1-78.el6
Dependency Installed:
fence-agents.x86_64 0:4.0.15-12.el6 fence-virt.x86_64 0:0.2.3-19.el6 ipmitool.x86_64 0:1.8.15-2.el6 modcluster.x86_64 0:0.16.2-35.el6
net-snmp-utils.x86_64 1:5.5-57.el6 openais.x86_64 0:1.1.1-7.el6 openaislib.x86_64 0:1.1.1-7.el6 perl-Net-Telnet.noarch 0:3.03-11.el6
pexpect.noarch 0:2.3-6.el6 python-suds.noarch 0:0.4.1-3.el6 ricci.x86_64 0:0.16.2-86.el6 sg3_utils.x86_64 0:1.28-12.el6
telnet.x86_64 1:0.17-48.el6
Dependency Updated:
net-snmp.x86_64 1:5.5-57.el6 net-snmp-libs.x86_64 1:5.5-57.el6 sg3_utils-libs.x86_64 0:1.28-12.el6
Complete!
ccs was already installed as a dependency of pcs.
resource-agents was already installed as a dependency of pacemaker.
=================================================================================
The approach below must be used on CentOS 6:
http://clusterlabs.org/quickstart-redhat-6.html
[ALL] # yum install pacemaker cman pcs ccs resource-agents
[ONE] # ccs -f /etc/cluster/cluster.conf --createcluster pacemaker1
[ONE] # ccs -f /etc/cluster/cluster.conf --addnode oserver1
[ONE] # ccs -f /etc/cluster/cluster.conf --addnode oserver2
Tell CMAN how to send its fencing requests to Pacemaker:
[ONE] # ccs -f /etc/cluster/cluster.conf --addfencedev pcmk agent=fence_pcmk
[ONE] # ccs -f /etc/cluster/cluster.conf --addmethod pcmk-redirect oserver1
[ONE] # ccs -f /etc/cluster/cluster.conf --addmethod pcmk-redirect oserver2
[ONE] # ccs -f /etc/cluster/cluster.conf --addfenceinst pcmk oserver1 pcmk-redirect port=oserver1
[ONE] # ccs -f /etc/cluster/cluster.conf --addfenceinst pcmk oserver2 pcmk-redirect port=oserver2
copy /etc/cluster/cluster.conf to all the other nodes
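For two nodes that copy is a single scp, and the generated file should look roughly like the sketch below (config_version and ordering will differ):
# scp /etc/cluster/cluster.conf oserver2:/etc/cluster/
<cluster config_version="9" name="pacemaker1">
  <clusternodes>
    <clusternode name="oserver1" nodeid="1">
      <fence>
        <method name="pcmk-redirect">
          <device name="pcmk" port="oserver1"/>
        </method>
      </fence>
    </clusternode>
    <clusternode name="oserver2" nodeid="2">
      <fence>
        <method name="pcmk-redirect">
          <device name="pcmk" port="oserver2"/>
        </method>
      </fence>
    </clusternode>
  </clusternodes>
  <fencedevices>
    <fencedevice agent="fence_pcmk" name="pcmk"/>
  </fencedevices>
</cluster>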
Since the firewall may interfere, it can be disabled:
# service iptables stop
# chkconfig iptables off
start
[ALL] # echo "CMAN_QUORUM_TIMEOUT=0" >> /etc/sysconfig/cman
[ALL] # service cman start
[ALL] # service pacemaker start
Startup can also be done directly with service pacemaker start.
[root@oserver1 ~]# pcs status
Cluster name: pacemaker1
Last updated: Wed Jun 29 11:39:50 2016
Last change: Wed Jun 29 11:34:06 2016 by root via cibadmin on oserver2
Stack: cman
Current DC: oserver1 (version 1.1.14-8.el6-70404b0) - partition with quorum
2 nodes and 0 resources configured
Online: [ oserver1 oserver2 ]
Full list of resources:
PCSD Status:
oserver1: Offline
oserver2: Offline
[ONE] # pcs property set stonith-enabled=false
Further settings can be applied after startup.
The STONITH feature is not needed here:
# pcs property set stonith-enabled=false
which is equivalent to
crm configure property stonith-enabled=false
Since the cluster usually has only two nodes, loss of quorum is ignored with the setting below:
# pcs property set no-quorum-policy=ignore
Migrate the service to the other node after a single failure:
# pcs resource defaults migration-threshold=1
Location of cib.xml:
[root@oserver1 ~]# ls /var/lib/pacemaker/cib/cib.xml
Validate the CIB:
crm_verify -L
Comparison of pcs and crmsh:
http://www.bkjia.com/xtzh/612799.html
Add a resource: floating IP address
# pcs resource create ClusterIP ocf:heartbeat:IPaddr2 ip=192.168.16.111 cidr_netmask=32 op monitor interval=30s
[root@oserver1 ~]# pcs status
Cluster name: pacemaker1
Last updated: Wed Jun 29 18:00:21 2016
Last change: Wed Jun 29 17:55:24 2016 by root via cibadmin on oserver1
Stack: cman
Current DC: oserver2 (version 1.1.14-8.el6-70404b0) - partition with quorum
2 nodes and 1 resource configured
Online: [ oserver1 oserver2 ]
Full list of resources:
ClusterIP (ocf::heartbeat:IPaddr2): Started oserver1
PCSD Status:
oserver1: Offline
oserver2: Offline
[root@oserver1 ~]#
[root@oserver1 ~]# pcs resource show ClusterIP
Resource: ClusterIP (class=ocf provider=heartbeat type=IPaddr2)
Attributes: ip=192.168.16.111 cidr_netmask=32
Meta Attrs: target-role=Started
Operations: start interval=0s timeout=20s (ClusterIP-start-interval-0s)
stop interval=0s timeout=20s (ClusterIP-stop-interval-0s)
monitor interval=30s (ClusterIP-monitor-interval-30s)
[root@oserver1 ~]#
Perform a failover
[root@oserver1 ~]# service pacemaker stop
[root@oserver2 ~]# pcs status
Cluster name: pacemaker1
Last updated: Wed Jun 29 18:04:20 2016
Last change: Wed Jun 29 17:55:24 2016 by root via cibadmin on oserver1
Stack: cman
Current DC: oserver2 (version 1.1.14-8.el6-70404b0) - partition WITHOUT quorum
2 nodes and 1 resource configured
Online: [ oserver2 ]
OFFLINE: [ oserver1 ]
Full list of resources:
ClusterIP (ocf::heartbeat:IPaddr2): Started oserver2
PCSD Status:
oserver1: Offline
oserver2: Offline
Prevent Resources from Moving after Recovery
crm configure rsc_defaults resource-stickiness=100
Colocation constraints: constraints between services that must run on the same node
crm configure colocation website-with-ip INFINITY: WebSite ClusterIP
Controlling Resource Start/Stop Ordering
ordering constraint
crm configure order apache-after-ip mandatory: ClusterIP WebSite
Specifying a Preferred Location
# crm configure location prefer-pcmk-1 WebSite 50: pcmk-1
Manually Moving Resources Around the Cluster
# crm resource move WebSite pcmk-1
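For reference, rough pcs equivalents of these crmsh examples (the WebSite/ClusterIP/pcmk-1 names come from the ClusterLabs quickstart, not from this cluster):
# pcs resource defaults resource-stickiness=100
# pcs constraint colocation add WebSite with ClusterIP INFINITY
# pcs constraint order ClusterIP then WebSite
# pcs constraint location WebSite prefers pcmk-1=50
# pcs resource move WebSite pcmk-1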
postgres pcs config
vi config.pcs
==============================
pcs cluster cib pgsql_cfg
pcs -f pgsql_cfg property set no-quorum-policy="ignore"
pcs -f pgsql_cfg property set stonith-enabled="false"
pcs -f pgsql_cfg resource defaults resource-stickiness="INFINITY"
pcs -f pgsql_cfg resource defaults migration-threshold="1"
--a resource fails over to the other node after a single failure
pcs -f pgsql_cfg resource create vip-master IPaddr2 \
ip="192.168.16.112" \
nic="eth0" \
cidr_netmask="24" \
op start timeout="60s" interval="0s" on-fail="restart" \
op monitor timeout="60s" interval="10s" on-fail="restart" \
op stop timeout="60s" interval="0s" on-fail="block"
pcs -f pgsql_cfg resource create vip-rep IPaddr2 \
ip="192.168.16.113" \
nic="eth0" \
cidr_netmask="24" \
meta migration-threshold="0" \
op start timeout="60s" interval="0s" on-fail="stop" \
op monitor timeout="60s" interval="10s" on-fail="restart" \
op stop timeout="60s" interval="0s" on-fail="ignore"
pcs -f pgsql_cfg resource create pgsql pgsql \
pgctl="/usr/pgsql-9.5/bin/pg_ctl" \
psql="/usr/pgsql-9.5/bin/psql" \
pgdata="/alidata/data/pgdata/" \
rep_mode="sync" \
node_list="oserver1 oserver2" \
restore_command="cp /alidata/data/pg_archive/%f %p" \
primary_conninfo_opt="password=postgres keepalives_idle=60 keepalives_interval=5 keepalives_count=5" \
master_ip="192.168.16.113" \
restart_on_promote='true' \
op start timeout="60s" interval="0s" on-fail="restart" \
op monitor timeout="60s" interval="4s" on-fail="restart" \
op monitor timeout="60s" interval="3s" on-fail="restart" role="Master" \
op promote timeout="60s" interval="0s" on-fail="restart" \
op demote timeout="60s" interval="0s" on-fail="stop" \
op stop timeout="60s" interval="0s" on-fail="block" \
op notify timeout="60s" interval="0s"
--node_list: the PRI node and all HS nodes, as a space-separated list of node names (the output of the uname -n command)
--primary_conninfo_opt is appended to the primary_conninfo that the RA writes into recovery.conf:
--primary_conninfo = 'host=${OCF_RESKEY_master_ip} port=${OCF_RESKEY_pgport} user=${OCF_RESKEY_repuser} application_name=${NODENAME} ${OCF_RESKEY_primary_conninfo_opt}'
pcs -f pgsql_cfg resource master msPostgresql pgsql \
master-max=1 master-node-max=1 clone-max=2 clone-node-max=1 notify=true
--create a master/slave resource from the previously created pgsql resource
--the clone allows the resource to run on both nodes at the same time
pcs -f pgsql_cfg resource group add master-group vip-master vip-rep
--create a resource group: a set of resources that need to be located together, start sequentially, and stop in reverse order
pcs -f pgsql_cfg constraint colocation add master-group with Master msPostgresql INFINITY
--keep the two VIP resources on the same node as the PostgreSQL master
pcs -f pgsql_cfg constraint order promote msPostgresql then start master-group symmetrical=false score=INFINITY
--start the VIPs when PostgreSQL is promoted from slave to master
pcs -f pgsql_cfg constraint order demote msPostgresql then stop master-group symmetrical=false score=0
--stop the VIPs when PostgreSQL is demoted from master to slave
pcs cluster push cib pgsql_cfg    -- this command from the official document is wrong; the correct one is:
pcs cluster cib-push pgsql_cfg
================
sh config.pcs
All of this ends up in /var/lib/pacemaker/cib/cib.xml
[root@oserver2 data]# crm_mon -Afr -1
Last updated: Thu Jun 30 15:04:19 2016 Last change: Thu Jun 30 15:04:14 2016 by root via cibadmin on oserver1
Stack: cman
Current DC: oserver2 (version 1.1.14-8.el6-70404b0) - partition with quorum
2 nodes and 5 resources configured
Online: [ oserver1 oserver2 ]
Full list of resources:
ClusterIP (ocf::heartbeat:IPaddr2): Started oserver2
Master/Slave Set: msPostgresql [pgsql]
Slaves: [ oserver1 oserver2 ]
Resource Group: master-group
vip-master (ocf::heartbeat:IPaddr2): Stopped
vip-rep (ocf::heartbeat:IPaddr2): Stopped
Node Attributes:
* Node oserver1:
+ master-pgsql : -INFINITY
+ pgsql-status : HS:alone
+ pgsql-xlog-loc : 0000000008000098
* Node oserver2:
+ master-pgsql : -INFINITY
+ pgsql-status : HS:alone
+ pgsql-xlog-loc : 0000000008000098
Migration Summary:
* Node oserver1:
* Node oserver2:
[root@oserver2 data]#
Note: PostgreSQL must be stopped on all nodes before starting Pacemaker.
Start the resources
[root@oserver1 data]# pcs resource enable master-group
[root@oserver2 data]# crm_mon -Afr -1
Last updated: Thu Jun 30 15:31:24 2016 Last change: Thu Jun 30 15:04:32 2016 by root via crm_attribute on oserver1
Stack: cman
Current DC: oserver2 (version 1.1.14-8.el6-70404b0) - partition with quorum
2 nodes and 5 resources configured
Online: [ oserver1 oserver2 ]
Full list of resources:
ClusterIP (ocf::heartbeat:IPaddr2): Started oserver2
Master/Slave Set: msPostgresql [pgsql]
Masters: [ oserver1 ]
Slaves: [ oserver2 ]
Resource Group: master-group
vip-master (ocf::heartbeat:IPaddr2): Started oserver1
vip-rep (ocf::heartbeat:IPaddr2): Started oserver1
Node Attributes:
* Node oserver1:
+ master-pgsql : 1000
+ pgsql-data-status : LATEST
+ pgsql-master-baseline : 0000000008000098
+ pgsql-status : PRI
* Node oserver2:
+ master-pgsql : -INFINITY
+ pgsql-data-status : DISCONNECT
+ pgsql-status : HS:alone
Migration Summary:
* Node oserver1:
* Node oserver2:
[root@oserver2 data]#
Q: Starting cman reports an error.
A: Stop the NetworkManager service and disable it at boot:
# service NetworkManager stop
# chkconfig NetworkManager off
Q: Why do the two nodes keep showing each other as offline after Pacemaker starts, with the monitored resources (such as the virtual IP) started locally on each node?
A: This is probably split-brain; try disabling the firewall and SELinux:
# service iptables stop
# vi /etc/selinux/config      (set SELINUX=disabled)
It may also be caused by the clocks of the two nodes being out of sync.
Q: The call status of the pgsql resource differs between the two nodes, and at that point pcs resource enable master-group can no longer bring it up.
Solution:
[root@oserver1 data]# pcs resource cleanup master-group
Waiting for 4 replies from the CRMd.... OK
Cleaning up vip-master on oserver1, removing fail-count-vip-master
Cleaning up vip-master on oserver2, removing fail-count-vip-master
Cleaning up vip-rep on oserver1, removing fail-count-vip-rep
Cleaning up vip-rep on oserver2, removing fail-count-vip-rep
[root@oserver1 data]# pcs resource cleanup msPostgresql
Waiting for 2 replies from the CRMd.. OK
Cleaning up pgsql:0 on oserver1, removing fail-count-pgsql
Cleaning up pgsql:0 on oserver2, removing fail-count-pgsql
Alibaba Cloud does not support multicast; the workaround is to switch from multicast to unicast:
https://yq.aliyun.com/articles/15262
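The linked article switches corosync from multicast to unicast (udpu). A minimal corosync.conf sketch for this two-node setup, assuming corosync 1.x and the 192.168.14.0/24 network used above (values are illustrative):
totem {
    version: 2
    secauth: on
    transport: udpu
    interface {
        ringnumber: 0
        bindnetaddr: 192.168.14.0
        mcastport: 5405
        member {
            memberaddr: 192.168.14.107
        }
        member {
            memberaddr: 192.168.14.101
        }
    }
}
logging {
    to_syslog: yes
}
For the cman stack used above, the corresponding change is believed to be transport="udpu" on the <cman/> element in /etc/cluster/cluster.conf.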
If /var/log/messages shows an error about the pgsql RA lock file (left behind after an unclean stop on the old master), remove it:
[root@oserver1 data]# rm /var/lib/pgsql/tmp/PGSQL.lock