$ vim /pg_data/pgsql11/data/postgresql.conf
shared_preload_libraries = 'repmgr'
$ sudo systemctl restart postgres11
$ vim /pg_data/pgsql11/repmgr.conf
failover='automatic' # 开启自动故障转移
promote_command='/usr/local/pgsql11/bin/repmgr standby promote -f /pg_data/pgsql11/repmgr.conf --log-to-file'
follow_command='/usr/local/pgsql11/bin/repmgr standby follow -f /pg_data/pgsql11/repmgr.conf --log-to-file --upstream-node-id=%n'
monitoring_history=yes # 启动监控参数
reconnect_interval=5 # 每间隔5s尝试一次重连
reconnect_attempts=3 # 当primary节点宕机后,重试连接次数,默认为6
monitor_interval_secs=5 # 监控数据采集间隔(秒),默认为2
location='location1' # 定义location
priority=99 # 定义节点权重
$ sudo systemctl restart repmgr11
[postgres@172-16-104-57 pgsql11]$ repmgr -f /pg_data/pgsql11/repmgr.conf witness register -h172.16.104.7 -Urepmgr -drepmgr --force
INFO: connecting to witness node "172-16-104-57" (ID: 4)
INFO: connecting to primary node
INFO: "repmgr" extension is already installed
INFO: witness registration complete
NOTICE: witness node "172-16-104-57" (ID: 4) successfully registered
[postgres@172-16-104-57 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
1 | 172-16-104-7 | standby | running | 172-16-104-56 | location1 | 100 | 18 | host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2
2 | 172-16-104-55 | standby | running | 172-16-104-56 | location1 | 99 | 18 | host=172-16-104-55 user=repmgr dbname=repmgr connect_timeout=2
3 | 172-16-104-56 | primary | * running | | location1 | 100 | 18 | host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2
4 | 172-16-104-57 | witness | * running | 172-16-104-7 | location1 | 0 | n/a | host= user=repmgr dbname=repmgr connect_timeout=2
[postgres@172-16-104-57 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
1 | 172-16-104-7 | standby | running | 172-16-104-56 | location1 | 100 | 18 | host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2
2 | 172-16-104-55 | standby | running | 172-16-104-56 | location1 | 99 | 18 | host=172-16-104-55 user=repmgr dbname=repmgr connect_timeout=2
3 | 172-16-104-56 | primary | * running | | location1 | 100 | 18 | host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2
4 | 172-16-104-57 | witness | * running | 172-16-104-7 | location1 | 0 | n/a | host= user=repmgr dbname=repmgr connect_timeout=2
[postgres@172-16-104-56 ~]$ sudo systemctl stop postgres11
-- 旧primary节点不可达后,会按照配置文件中的reconnect_attempts和reconnect_interval多次重试连接primary节点,以确认primary节点是否真的无法使用
[postgres@172-16-104-57 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
1 | 172-16-104-7 | standby | running | ? 172-16-104-56 | location1 | 100 | 18 | host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2
2 | 172-16-104-55 | standby | running | ? 172-16-104-56 | location1 | 99 | 18 | host=172-16-104-55 user=repmgr dbname=repmgr connect_timeout=2
3 | 172-16-104-56 | primary | ? unreachable | ? | location1 | 100 | | host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2
4 | 172-16-104-57 | witness | * running | 172-16-104-7 | location1 | 0 | n/a | host= user=repmgr dbname=repmgr connect_timeout=2
WARNING: following issues were detected
- unable to connect to node "172-16-104-7" (ID: 1)'s upstream node "172-16-104-56" (ID: 3)
- unable to determine if node "172-16-104-7" (ID: 1) is attached to its upstream node "172-16-104-56" (ID: 3)
- unable to connect to node "172-16-104-55" (ID: 2)'s upstream node "172-16-104-56" (ID: 3)
- unable to determine if node "172-16-104-55" (ID: 2) is attached to its upstream node "172-16-104-56" (ID: 3)
- unable to connect to node "172-16-104-56" (ID: 3)
- node "172-16-104-56" (ID: 3) is registered as an active primary but is unreachable
HINT: execute with --verbose option to see connection error messages
-- 当见证服务器确认旧primary节点不可达后,根据一定的规则选择standby节点并提升为新的primary节点
[postgres@172-16-104-57 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
1 | 172-16-104-7 | standby | ! running as primary | | location1 | 100 | 19 | host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2
2 | 172-16-104-55 | standby | running | ? 172-16-104-56 | location1 | 99 | 18 | host=172-16-104-55 user=repmgr dbname=repmgr connect_timeout=2
3 | 172-16-104-56 | primary | ? unreachable | ? | location1 | 100 | | host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2
4 | 172-16-104-57 | witness | * running | 172-16-104-7 | location1 | 0 | n/a | host= user=repmgr dbname=repmgr connect_timeout=2
WARNING: following issues were detected
- node "172-16-104-7" (ID: 1) is registered as standby but running as primary
- unable to connect to node "172-16-104-55" (ID: 2)'s upstream node "172-16-104-56" (ID: 3)
- unable to determine if node "172-16-104-55" (ID: 2) is attached to its upstream node "172-16-104-56" (ID: 3)
- unable to connect to node "172-16-104-56" (ID: 3)
- node "172-16-104-56" (ID: 3) is registered as an active primary but is unreachable
HINT: execute with --verbose option to see connection error messages
-- 提升新的primary节点完成后,旧的primary节点被置为failed
[postgres@172-16-104-57 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
1 | 172-16-104-7 | primary | * running | | location1 | 100 | 19 | host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2
2 | 172-16-104-55 | standby | running | ? 172-16-104-56 | location1 | 99 | 18 | host=172-16-104-55 user=repmgr dbname=repmgr connect_timeout=2
3 | 172-16-104-56 | primary | - failed | ? | location1 | 100 | | host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2
4 | 172-16-104-57 | witness | * running | 172-16-104-7 | location1 | 0 | n/a | host= user=repmgr dbname=repmgr connect_timeout=2
WARNING: following issues were detected
- unable to connect to node "172-16-104-55" (ID: 2)'s upstream node "172-16-104-56" (ID: 3)
- unable to determine if node "172-16-104-55" (ID: 2) is attached to its upstream node "172-16-104-56" (ID: 3)
- unable to connect to node "172-16-104-56" (ID: 3)
HINT: execute with --verbose option to see connection error messages
[postgres@172-16-104-55 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf standby follow
NOTICE: attempting to find and follow current primary
INFO: local node 2 can attach to follow target node 1
DETAIL: local node's recovery point: 0/47000098; follow target node's fork point: 0/47000098
INFO: creating replication slot as user "repmgr"
NOTICE: setting node 2's upstream to node 1
NOTICE: restarting server using "sudo systemctl restart postgres11"
WARNING: unable to connect to old upstream node 3 to remove replication slot
HINT: if reusing this node, you should manually remove any inactive replication slots
WARNING: node "172-16-104-55" attached in state "startup"
DETAIL: standby attached to upstream node "172-16-104-7" (ID: 1)
[postgres@172-16-104-57 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
1 | 172-16-104-7 | primary | * running | | location1 | 100 | 19 | host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2
2 | 172-16-104-55 | standby | running | 172-16-104-7 | location1 | 99 | 18 | host=172-16-104-55 user=repmgr dbname=repmgr connect_timeout=2
3 | 172-16-104-56 | primary | - failed | ? | location1 | 100 | | host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2
4 | 172-16-104-57 | witness | * running | 172-16-104-7 | location1 | 0 | n/a | host= user=repmgr dbname=repmgr connect_timeout=2
WARNING: following issues were detected
- unable to connect to node "172-16-104-56" (ID: 3)
HINT: execute with --verbose option to see connection error messages
[postgres@172-16-104-56 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf node rejoin -d'host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2' --force-rewind
NOTICE: executing pg_rewind
DETAIL: pg_rewind command is "/usr/local/pgsql11/bin/pg_rewind -D '/pg_data/pgsql11/data' --source-server='host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2'"
NOTICE: 0 files copied to /pg_data/pgsql11/data
INFO: creating replication slot as user "repmgr"
NOTICE: setting node 3's upstream to node 1
WARNING: unable to ping "host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2"
NOTICE: starting server using "sudo systemctl start postgres11"
WARNING: node "172-16-104-56" attached in state "startup"
INFO: waiting for node "172-16-104-56" (ID: 3) to connect to new primary; 1 of max 60 attempts (parameter "node_rejoin_timeout")
DETAIL: node "172-16-104-7" (ID: 3) is currrently attached to its upstream node in state "startup"
DETAIL: node 3 is now attached to node 1
[postgres@172-16-104-57 ~]$ repmgr -f /pg_data/pgsql11/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
1 | 172-16-104-7 | primary | * running | | location1 | 100 | 19 | host=172-16-104-7 user=repmgr dbname=repmgr connect_timeout=2
2 | 172-16-104-55 | standby | running | 172-16-104-7 | location1 | 99 | 18 | host=172-16-104-55 user=repmgr dbname=repmgr connect_timeout=2
3 | 172-16-104-56 | standby | running | 172-16-104-7 | location1 | 100 | 18 | host=172-16-104-56 user=repmgr dbname=repmgr connect_timeout=2
4 | 172-16-104-57 | witness | * running | 172-16-104-7 | location1 | 0 | n/a | host= user=repmgr dbname=repmgr connect_timeout=2