I. Preparation
1. Install CentOS 7 on both hosts, then set the hostname to node1 and node2 respectively.
On host 1, run:
hostnamectl set-hostname node1
On host 2, run:
hostnamectl set-hostname node2
2. Configure three network interfaces on node1 and node2, with IP addresses assigned as follows:
node1:
eno16777736: 192.168.119.10
eno33554960: 192.168.3.10
eno50332184: 192.168.4.10
node2:
eno16777736: 192.168.119.11
eno33554960: 192.168.3.11
eno50332184: 192.168.4.11
VIPs:
master_vip 192.168.119.12, bound to interface eno16777736: the floating IP used to reach the master host.
rep_vip 192.168.119.13, bound to interface eno50332184: the floating IP used to reach the standby host.
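The pcs configuration in step 15 below refers to the nodes by name (node_list="node1 node2"), so it helps if the names resolve on both hosts. A minimal /etc/hosts sketch (an addition on top of the original steps; adjust to your own subnets):
# /etc/hosts on both node1 and node2
192.168.119.10 node1
192.168.119.11 node2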
3. Disable SELinux (on both node1 and node2)
Edit /etc/selinux/config and change the SELINUX= line (SELINUX=permissive or SELINUX=enforcing) to SELINUX=disabled.
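The change in /etc/selinux/config only takes effect after a reboot. To also stop enforcement immediately in the running session, you can additionally run:
# switch to permissive mode right away (disabled applies at next boot)
setenforce 0
getenforce    # should now print Permissive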
4. Disable the firewall (on both node1 and node2)
systemctl stop firewalld.service
systemctl disable firewalld.service
II. Installation and Configuration
1. Install PostgreSQL, Pacemaker, and Corosync via yum (on both node1 and node2)
yum -y install postgresql-server postgresql-libs postgresql-contrib postgresql-devel pacemaker corosync pcs
The PostgreSQL version installed by default is 9.2. If you need PostgreSQL 9.6 instead, see: Installing PostgreSQL 9.6.
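A quick optional check of what was actually installed:
# verify package versions
rpm -q postgresql-server pacemaker corosync pcs
psql --version    # e.g. psql (PostgreSQL) 9.2.x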
2. Configure corosync (on both node1 and node2)
Edit /etc/corosync/corosync.conf so that it contains the following:
quorum {
    provider: corosync_votequorum
    expected_votes: 2
}
# The aisexec and service sections below are corosync 1.x leftovers kept
# from older examples; corosync 2.x on CentOS 7 ignores them (pacemaker
# runs as its own systemd service, started in step 13).
aisexec {
    user: root
    group: root
}
service {
    name: pacemaker
    ver: 0
}
totem {
    version: 2
    secauth: off
    interface {
        ringnumber: 0
        bindnetaddr: 192.168.3.0    # dedicated heartbeat subnet (eno33554960)
        mcastaddr: 239.255.1.1
    }
}
logging {
    to_syslog: yes
}
3. Initialize the database (on node1)
# su - postgres
$ mkdir /var/lib/pgsql/pg_archive
$ cd /var/lib/pgsql/data
$ initdb
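Running initdb with no arguments works here because the postgres user's login profile on CentOS 7 sets PGDATA=/var/lib/pgsql/data. If you prefer to be explicit, either of the following is a roughly equivalent alternative (not an extra step):
$ initdb -D /var/lib/pgsql/data
# or, as root, the CentOS wrapper:
# postgresql-setup initdb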
4. Edit the main database configuration file postgresql.conf (path: /var/lib/pgsql/data). The key changes are as follows (on node1):
listen_addresses = '*'
wal_level = hot_standby
synchronous_commit = on
archive_mode = on
archive_command = 'cp %p /var/lib/pgsql/pg_archive/%f'
max_wal_senders = 5
wal_keep_segments = 32
hot_standby = on
replication_timeout = 5000    # renamed to wal_sender_timeout in PostgreSQL 9.3+
wal_receiver_status_interval = 2
max_standby_streaming_delay = -1
max_standby_archive_delay = -1
restart_after_crash = off
hot_standby_feedback = on
5. Edit the client authentication file pg_hba.conf (on node1). Note that trust authentication is used here to keep the lab simple; in production you would restrict these entries (e.g. with md5).
local   all           all                            trust
host    all           all           127.0.0.1/32     trust
host    replication   postgres      192.168.0.0/16   trust
host    all           postgres      192.168.0.0/16   trust
6. Start the database (on node1)
$ pg_ctl -D /var/lib/pgsql/data start
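To confirm the instance is up and the replication-related settings took effect, for example:
# confirm the server accepts connections, then spot-check a setting
$ psql -c "select version();"
$ psql -c "show wal_level;"    # should print hot_standby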
7. Initialize the standby database by taking a base backup directly from the primary (on node2)
# su - postgres
$ rm -rf /var/lib/pgsql/data/*
$ pg_basebackup -h 192.168.119.10 -U postgres -D /var/lib/pgsql/data -X stream -P
$ mkdir /var/lib/pgsql/pg_archive
8. Create the streaming-replication file recovery.conf under /var/lib/pgsql/data/ (on node2), with the following content; see the note on the host address after this block.
standby_mode = 'on'
primary_conninfo = 'host=192.168.119.12 port=5432 user=postgres application_name=node2'
restore_command = 'cp /var/lib/pgsql/pg_archive/%f %p'
recovery_target_timeline = 'latest'
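Note: the master VIP 192.168.119.12 used in primary_conninfo is only created by Pacemaker in step 15, so it does not exist yet. For the initial verification in steps 9-10 you can point at node1's physical address instead; once the cluster is running, the pgsql resource agent rewrites recovery.conf from its master_ip parameter anyway. For example:
primary_conninfo = 'host=192.168.119.10 port=5432 user=postgres application_name=node2'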
9. Start the standby database (on node2)
$ pg_ctl -D /var/lib/pgsql/data/ start
10. Verify that the standby is replicating (on node1)
# su - postgres
$ psql -c "select client_addr,sync_state from pg_stat_replication;"
  client_addr   | sync_state
----------------+------------
 192.168.119.11 | sync
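You can also check from the standby side: on node2, pg_is_in_recovery() should return t (true):
# on node2
$ psql -c "select pg_is_in_recovery();"
 pg_is_in_recovery
-------------------
 t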
11. Stop the database (on both node1 and node2)
$ pg_ctl -D /var/lib/pgsql/data stop
$ exit
12. Start corosync (on both node1 and node2)
systemctl start corosync
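Before starting pacemaker, it is worth checking that corosync has formed the ring and sees both members, for example:
# ring status: should report "ring 0 active with no faults" on each node
corosync-cfgtool -s
# membership: both node IDs should appear
corosync-cmapctl | grep members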
13. Start pacemaker (on both node1 and node2)
systemctl start pacemaker
14. Check the cluster status (on node1 or node2). The flags mean: -A show node attributes, -f show fail counts, -r show inactive resources, -1 print once and exit.
# crm_mon -Afr -1
15. Create the resource configuration script: create a file named cluster.pcs with the following content (on node1):
pcs cluster cib pgsql_cfg
pcs -f pgsql_cfg property set no-quorum-policy="ignore"
pcs -f pgsql_cfg property set stonith-enabled="false"
pcs -f pgsql_cfg resource defaults resource-stickiness="INFINITY"
pcs -f pgsql_cfg resource defaults migration-threshold="1"
pcs -f pgsql_cfg resource create vip-master IPaddr2 \
    ip="192.168.119.12" \
    nic="eno16777736" \
    cidr_netmask="24" \
    op start timeout="60s" interval="0s" on-fail="restart" \
    op monitor timeout="60s" interval="10s" on-fail="restart" \
    op stop timeout="60s" interval="0s" on-fail="block"
pcs -f pgsql_cfg resource create vip-rep IPaddr2 \
    ip="192.168.119.13" \
    nic="eno50332184" \
    cidr_netmask="24" \
    meta migration-threshold="0" \
    op start timeout="60s" interval="0s" on-fail="stop" \
    op monitor timeout="60s" interval="10s" on-fail="restart" \
    op stop timeout="60s" interval="0s" on-fail="ignore"
pcs -f pgsql_cfg resource create pgsql pgsql \
    pgctl="/usr/bin/pg_ctl" \
    psql="/usr/bin/psql" \
    pgdata="/var/lib/pgsql/data/" \
    rep_mode="sync" \
    node_list="node1 node2" \
    restore_command="cp /var/lib/pgsql/pg_archive/%f %p" \
    primary_conninfo_opt="keepalives_idle=60 keepalives_interval=5 keepalives_count=5" \
    master_ip="192.168.119.12" \
    restart_on_promote='true' \
    op start timeout="60s" interval="0s" on-fail="restart" \
    op monitor timeout="60s" interval="4s" on-fail="restart" \
    op monitor timeout="60s" interval="3s" on-fail="restart" role="Master" \
    op promote timeout="60s" interval="0s" on-fail="restart" \
    op demote timeout="60s" interval="0s" on-fail="stop" \
    op stop timeout="60s" interval="0s" on-fail="block" \
    op notify timeout="60s" interval="0s"
pcs -f pgsql_cfg resource master msPostgresql pgsql \
    master-max=1 master-node-max=1 clone-max=2 clone-node-max=1 notify=true
pcs -f pgsql_cfg resource group add master-group vip-master
pcs -f pgsql_cfg resource group add slave-group vip-rep
pcs -f pgsql_cfg constraint colocation add master-group with Master msPostgresql INFINITY
pcs -f pgsql_cfg constraint order promote msPostgresql then start master-group symmetrical=false score=INFINITY
pcs -f pgsql_cfg constraint order demote msPostgresql then stop master-group symmetrical=false score=0
pcs -f pgsql_cfg constraint colocation add slave-group with slave msPostgresql INFINITY
pcs -f pgsql_cfg constraint order promote msPostgresql then start slave-group symmetrical=false score=INFINITY
pcs -f pgsql_cfg constraint order demote msPostgresql then stop slave-group symmetrical=false score=0
pcs cluster cib-push pgsql_cfg
16. Load the configuration (on node1)
# sh cluster.pcs
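The script builds the whole configuration offline in the pgsql_cfg CIB file (pcs cluster cib / pcs -f) and pushes it in one shot with cib-push at the end, so the cluster never sees a half-applied configuration. After loading, you can review what was applied:
# full cluster configuration as pcs sees it
pcs config
# brief resource state summary
pcs status resources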
17. Verify that the master/standby cluster was created successfully
[root@node1 data]# crm_mon -Afr -1
Stack: corosync
Current DC: node1 (version 1.1.18-11.el7_5.3-2b07d5c5a9) - partition with quorum
Last updated: Fri Aug 24 02:58:34 2018
Last change: Fri Aug 24 02:56:50 2018 by root via crm_attribute on node1

2 nodes configured
4 resources configured

Online: [ node1 node2 ]

Full list of resources:

 Master/Slave Set: msPostgresql [pgsql]
     Masters: [ node1 ]
     Slaves: [ node2 ]
 Resource Group: master-group
     vip-master (ocf::heartbeat:IPaddr2): Started node1
 Resource Group: slave-group
     vip-rep (ocf::heartbeat:IPaddr2): Started node2

Node Attributes:
* Node node1:
    + master-pgsql          : 1000
    + pgsql-data-status     : LATEST
    + pgsql-master-baseline : 000000000601C458
    + pgsql-status          : PRI
* Node node2:
    + master-pgsql          : 100
    + pgsql-data-status     : STREAMING|SYNC
    + pgsql-status          : HS:sync

Migration Summary:
* Node node1:
* Node node2:
III. Failover Testing
1. Forcibly kill the database processes on the master node (on node1)
# killall -9 postgres
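The 3-second Master-role monitor configured in step 15 detects the dead postmaster quickly, and Pacemaker promotes node2. To watch the failover as it happens, run crm_mon on the surviving node without -1 (it refreshes continuously; Ctrl-C to exit):
# live view of the failover
crm_mon -Afr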
2. Check the cluster status again (on node1 or node2)
[root@node1 data]# crm_mon -Afr -1
Stack: corosync
Current DC: node1 (version 1.1.18-11.el7_5.3-2b07d5c5a9) - partition with quorum
Last updated: Fri Aug 24 03:09:25 2018
Last change: Fri Aug 24 03:08:52 2018 by root via crm_attribute on node2

2 nodes configured
4 resources configured

Online: [ node1 node2 ]

Full list of resources:

 Master/Slave Set: msPostgresql [pgsql]
     Masters: [ node2 ]
     Stopped: [ node1 ]
 Resource Group: master-group
     vip-master (ocf::heartbeat:IPaddr2): Started node2
 Resource Group: slave-group
     vip-rep (ocf::heartbeat:IPaddr2): Stopped

Node Attributes:
* Node node1:
    + master-pgsql          : -INFINITY
    + pgsql-data-status     : DISCONNECT
    + pgsql-status          : STOP
* Node node2:
    + master-pgsql          : 1000
    + pgsql-data-status     : LATEST
    + pgsql-master-baseline : 000000000601C4B8
    + pgsql-status          : PRI

Migration Summary:
* Node node1:
   pgsql: migration-threshold=1 fail-count=1 last-failure='Fri Aug 24 03:08:47 2018'
* Node node2:

Failed Actions:
* pgsql_monitor_3000 on node1 'not running' (7): call=56, status=complete, exitreason='',
    last-rc-change='Fri Aug 24 03:08:47 2018', queued=0ms, exec=0ms
3. Recover node1 from the failure (on node1). node1's old data directory may have diverged from the new timeline on node2, so rebuild it with a fresh base backup from the new master, remove the pgsql resource agent's lock file, and then clear the failure record so Pacemaker can restart the resource:
# su - postgres
$ rm -rf /var/lib/pgsql/data/
$ pg_basebackup -h 192.168.119.11 -U postgres -D /var/lib/pgsql/data -X stream -P
$ rm /var/lib/pgsql/tmp/PGSQL.lock
$ exit
# pcs resource cleanup msPostgresql
4. Check the cluster status again after recovery (on node1 or node2)
[root@node1 data]# crm_mon -Afr -1
Stack: corosync
Current DC: node1 (version 1.1.18-11.el7_5.3-2b07d5c5a9) - partition with quorum
Last updated: Fri Aug 24 03:15:07 2018
Last change: Fri Aug 24 03:15:00 2018 by root via crm_attribute on node2

2 nodes configured
4 resources configured

Online: [ node1 node2 ]

Full list of resources:

 Master/Slave Set: msPostgresql [pgsql]
     Masters: [ node2 ]
     Slaves: [ node1 ]
 Resource Group: master-group
     vip-master (ocf::heartbeat:IPaddr2): Started node2
 Resource Group: slave-group
     vip-rep (ocf::heartbeat:IPaddr2): Started node1

Node Attributes:
* Node node1:
    + master-pgsql          : 100
    + pgsql-data-status     : STREAMING|SYNC
    + pgsql-status          : HS:sync
* Node node2:
    + master-pgsql          : 1000
    + pgsql-data-status     : LATEST
    + pgsql-master-baseline : 000000000601C4B8
    + pgsql-status          : PRI

Migration Summary:
* Node node1:
* Node node2:
As the output shows, after node1 is rebuilt and rejoins, the cluster is healthy again: node2 remains the master and node1 is back as a synchronous standby, with the roles simply swapped relative to the initial state.
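As a final end-to-end check (not part of the original transcript), you can verify that each VIP serves the role it should, assuming both 192.168.119.12 and 192.168.119.13 are reachable from your client machine:
# via the master VIP: pg_is_in_recovery() should return f (primary)
psql -h 192.168.119.12 -U postgres -c "select pg_is_in_recovery();"
# via the standby VIP: should return t (hot standby)
psql -h 192.168.119.13 -U postgres -c "select pg_is_in_recovery();"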