一、简介

实现热备切换，但是是要手动建立触发文件实现，对于一些HA场景来说，需要当主机down了后，备机自动切换，经查询资料知道pgpool-II可以实现这种功能。本文基于PG流复制基础上，以pgpool-II实现主备切换。

pgpool双机集群架构图

　　基于PGPool的双机集群如上图所示：pg主节点和备节点实现流复制热备，pgpool1，pgpool2作为中间件，将主备pg节点加入集群，实现读写分离，负载均衡和HA故障自动切换。两pgpool节点可以委托一个虚拟ip节点作为应用程序访问的地址，两节点之间通过watchdog进行监控，当pgpool1宕机时，pgpool2会自动接管虚拟ip继续对外提供不间断服务。

二、前提

在配置pgpool之前需分别在两台规划机上安装好pg数据库，且配置好了流复制环境，关于流复制配置参考之前文章：PostgreSQL流复制热备

三、节点信息

节点信息

四、部署PGPool集群

#给root及postgres系统账号配置ssh免密登录（master&&slave）

[root@localhost data]# ssh-keygen -t rsa

[root@localhost data]# ssh-copy-id -i ~/.ssh/id_rsa.pub root@master

[root@localhost data]# ssh-copy-id -i ~/.ssh/id_rsa.pub root@slave

[postgres@localhost data]$ ssh-keygen -t rsa

[postgres@localhost data]$ ssh-copy-id -i ~/.ssh/id_rsa.pub postgres@master

[postgres@localhost data]$ ssh-copy-id -i ~/.ssh/id_rsa.pub postgres@slave

#安装PGPool（master&&slave）

[root@localhost ~]# cd /usr/local/src/

[root@localhost src]# wget http://www.pgpool.net/mediawiki/images/pgpool-II-3.6.0.tar.gz

[root@localhost src]# tar xf pgpool-II-3.6.0.tar.gz

[root@localhost src]#cd pgpool-II-3.6.0/

[root@localhost pgpool-II-3.6.0]# ./configure --prefix=/app/pgpool --with-pgsql=/app/postgres

[root@localhost pgpool-II-3.6.0]# make && make install

[root@localhost pgpool-II-3.6.0]# chown -R postgres.postgres /app/pgpool/

[root@localhost sql]# su - postgres

[postgres@slave ~]$ cd /usr/local/src/pgpool-II-3.6.0/src/sql/

[postgres@slave sql]$ make && make install

#配置pgpool（master&&slave）

#配置pgpool环境变量

[postgres@master sql]$ cd /home/postgres

[postgres@master etc]$ vim .bashrc

添加如下：

PGPOOLHOME=/app/pgpool

export PGPOOLHOME

PATH=$PATH:$HOME/.local/bin:$HOME/bin:$PGHOME/bin:$PGPOOLHOME/bin

export PATH

[postgres@master ~]$ source .bashrc

#配置pool_hba.conf， pool_hba.conf是对登录用户进行验证的，要和pg的pg_hba.conf保持一致，要么都是trust，要么都是md5验证方式，这里采用了md5验证方式如下设置：

[postgres@master ~]$ cd /app/pgpool/etc/

[postgres@master etc]$ cp pool_hba.conf.sample pool_hba.conf

[postgres@master etc]$ vim pool_hba.conf

# "local" is for Unix domain socket connections only

local all all md5

# IPv4 local connections:

host all all 0.0.0.0/0 md5

host all all 0/0 md5

#配置pcp.conf，pcp.conf配置用于pgpool自己登陆管理使用的，一些操作pgpool的工具会要求提供密码等，配置如下

[postgres@master ~]$ cd /app/pgpool/etc/

[postgres@master etc]$ cp pcp.conf.sample pcp.conf

[postgres@master etc]$ pg_md5 postgres

e8a48653851e28c69d0506508fb27fc5

[postgres@master etc]$ vim pcp.conf

添加：

postgres:e8a48653851e28c69d0506508fb27fc5

[postgres@master etc]$ cp pgpool.conf.sample pgpool.conf

[postgres@master etc]$ pg_md5 -p -m -u postgres pool_passwd

#配置系统命令权限，配置 ifconfig, arping 执行权限，执行failover_stream.sh需要用到，可以让其他普通用户执行。创建两个日志文件目录

[root@localhost sql]# chmod u+s /sbin/ifconfig;chmod u+s /usr/sbin

[root@localhost sql]#mkdir /var/log/pgpool;chown -R postgres.postgres /var/log/pgpool;mkdir /var/run/pgpool;chown -R postgres.postgres /var/run/pgpool

#master配置pgpool.conf

[postgres@master ~]$ cd /app/pgpool/etc/

[postgres@master etc]$ vim pgpool.conf

# CONNECTIONS

listen_addresses = '*'

port = 9999

pcp_listen_addresses = '*'

pcp_port = 9898

# - Backend Connection Settings -

backend_hostname0 = 'master'

backend_port0 = 5432

backend_weight0 = 1

backend_data_directory0 = '/app/postgres/data'

backend_flag0 = 'ALLOW_TO_FAILOVER'

backend_hostname1 = 'slave'

backend_port1 = 5432

backend_weight1 = 1

backend_data_directory1 = '/app/postgres/data'

backend_flag1 = 'ALLOW_TO_FAILOVER'

# - Authentication -

enable_pool_hba = on

pool_passwd = 'pool_passwd'

# FILE LOCATIONS

pid_file_name = '/app/pgpool/pgpool.pid'

replication_mode = off

load_balance_mode = on

master_slave_mode = on

master_slave_sub_mode = 'stream'

sr_check_period = 5

sr_check_user = 'repuser'

sr_check_password = 'repuser'

sr_check_database = 'postgres'

#------------------------------------------------------------------------------

# HEALTH CHECK 健康检查

#------------------------------------------------------------------------------

health_check_period = 10 # Health check period

# Disabled (0) by default

health_check_timeout = 20

# Health check timeout

# 0 means no timeout

health_check_user = 'postgres'

# Health check user

health_check_password = 'postgres' #数据库密码

# Password for health check user

health_check_database = 'postgres'

#必须设置，否则primary数据库down了，pgpool不知道，不能及时切换。从库流复制还在连接数据，报连接失败。

#只有下次使用pgpool登录时，发现连接不上，然后报错，这时候，才知道挂了，pgpool进行切换。

#主备切换的命令行配置

#------------------------------------------------------------------------------

# FAILOVER AND FAILBACK

#------------------------------------------------------------------------------

failover_command = '/app/pgpool/failover_stream.sh %H '

#------------------------------------------------------------------------------

# WATCHDOG

#------------------------------------------------------------------------------

# - Enabling -

use_watchdog = on

# - Watchdog communication Settings -

wd_hostname = 'master'

# Host name or IP address of this watchdog

# (change requires restart)

wd_port = 9000

# port number for watchdog service

# (change requires restart)

# - Virtual IP control Setting -

delegate_IP = '10.1.83.168'

# delegate IP address

# If this is empty, virtual IP never bring up.

# (change requires restart)

if_cmd_path = '/sbin'

# path to the directory where if_up/down_cmd exists

# (change requires restart)

if_up_cmd = 'ifconfig ens33:0 inet $_IP_$ netmask 255.255.254.0'

# startup delegate IP command

# (change requires restart)

# eth1根据现场机器改掉

if_down_cmd = 'ifconfig ens33:0 down'

# shutdown delegate IP command

# (change requires restart)

# eth1根据现场机器改掉

# -- heartbeat mode --

wd_heartbeat_port = 9694

# Port number for receiving heartbeat signal

# (change requires restart)

wd_heartbeat_keepalive = 2

# Interval time of sending heartbeat signal (sec)

# (change requires restart)

wd_heartbeat_deadtime = 30

# Deadtime interval for heartbeat signal (sec)

# (change requires restart)

heartbeat_destination0 = 'slave'

# Host name or IP address of destination 0

# for sending heartbeat signal.

# (change requires restart)

heartbeat_destination_port0 = 9694

# Port number of destination 0 for sending

# heartbeat signal. Usually this is the

# same as wd_heartbeat_port.

# (change requires restart)

heartbeat_device0 = 'ens33'

# Name of NIC device (such like 'eth0')

# used for sending/receiving heartbeat

# signal to/from destination 0.

# This works only when this is not empty

# and pgpool has root privilege.

# (change requires restart)

# eth1根据现场机器改掉

# - Other pgpool Connection Settings -

other_pgpool_hostname0 = 'slave' #对端

# Host name or IP address to connect to for other pgpool 0

# (change requires restart)

other_pgpool_port0 = 9999

# Port number for othet pgpool 0

# (change requires restart)

other_wd_port0 = 9000

# Port number for othet watchdog 0

# (change requires restart)

#slave配置pgpool.conf

[postgres@master ~]$ cd /app/pgpool/etc/

[postgres@master etc]$ vim pgpool.conf

# CONNECTIONS

listen_addresses = '*'

port = 9999

pcp_listen_addresses = '*'

pcp_port = 9898

# - Backend Connection Settings -

backend_hostname0 = 'master'

backend_port0 = 5432

backend_weight0 = 1

backend_data_directory0 = '/app/postgres/data'

backend_flag0 = 'ALLOW_TO_FAILOVER'

backend_hostname1 = 'slave'

backend_port1 = 5432

backend_weight1 = 1

backend_data_directory1 = '/app/postgres/data'

backend_flag1 = 'ALLOW_TO_FAILOVER'

# - Authentication -

enable_pool_hba = on

pool_passwd = 'pool_passwd'

# FILE LOCATIONS

pid_file_name = '/app/pgpool/pgpool.pid'

replication_mode = off

load_balance_mode = on

master_slave_mode = on

master_slave_sub_mode = 'stream'

sr_check_period = 5

sr_check_user = 'repuser'

sr_check_password = 'repuser'

sr_check_database = 'postgres'

#------------------------------------------------------------------------------

# HEALTH CHECK 健康检查

#------------------------------------------------------------------------------

health_check_period = 10 # Health check period

# Disabled (0) by default

health_check_timeout = 20

# Health check timeout

# 0 means no timeout

health_check_user = 'postgres'

# Health check user

health_check_password = 'postgres' #数据库密码

# Password for health check user

health_check_database = 'postgres'

#必须设置，否则primary数据库down了，pgpool不知道，不能及时切换。从库流复制还在连接数据，报连接失败。

#只有下次使用pgpool登录时，发现连接不上，然后报错，这时候，才知道挂了，pgpool进行切换。

#主备切换的命令行配置

#------------------------------------------------------------------------------

# FAILOVER AND FAILBACK

#------------------------------------------------------------------------------

failover_command = '/app/pgpool/failover_stream.sh %H '

#------------------------------------------------------------------------------

# WATCHDOG

#------------------------------------------------------------------------------

# - Enabling -

use_watchdog = on

# - Watchdog communication Settings -

wd_hostname = 'slave' #本端

# Host name or IP address of this watchdog

# (change requires restart)

wd_port = 9000

# port number for watchdog service

# (change requires restart)

# - Virtual IP control Setting -

delegate_IP = 'vip'

# delegate IP address

# If this is empty, virtual IP never bring up.

# (change requires restart)

if_cmd_path = '/sbin'

# path to the directory where if_up/down_cmd exists

# (change requires restart)

if_up_cmd = 'ifconfig eth1:0 inet $_IP_$ netmask 255.255.255.0'

# startup delegate IP command

# (change requires restart)

# eth1根据现场机器改掉

if_down_cmd = 'ifconfig eth1:0 down'

# shutdown delegate IP command

# (change requires restart)

# eth1根据现场机器改掉

# -- heartbeat mode --

wd_heartbeat_port = 9694

# Port number for receiving heartbeat signal

# (change requires restart)

wd_heartbeat_keepalive = 2

# Interval time of sending heartbeat signal (sec)

# (change requires restart)

wd_heartbeat_deadtime = 30

# Deadtime interval for heartbeat signal (sec)

# (change requires restart)

heartbeat_destination0 = 'master' #对端

# Host name or IP address of destination 0

# for sending heartbeat signal.

# (change requires restart)

heartbeat_destination_port0 = 9694

# Port number of destination 0 for sending

# heartbeat signal. Usually this is the

# same as wd_heartbeat_port.

# (change requires restart)

heartbeat_device0 = 'ens33'

# Name of NIC device (such like 'eth0')

# used for sending/receiving heartbeat

# signal to/from destination 0.

# This works only when this is not empty

# and pgpool has root privilege.

# (change requires restart)

# eth1根据现场机器改掉

# - Other pgpool Connection Settings -

other_pgpool_hostname0 = 'master' #对端

# Host name or IP address to connect to for other pgpool 0

# (change requires restart)

other_pgpool_port0 = 9999

# Port number for othet pgpool 0

# (change requires restart)

other_wd_port0 = 9000

# Port number for othet watchdog 0

# (change requires restart)

#配置文件里，故障处理配置的是failover_command = '/app/pgpool/failover_stream.sh %H '，因此，需要在/app/pgpool目录中写个failover_stream.sh脚本：
[postgres@master etc]$ cd /app/pgpool

[postgres@master pgpool]$ touch failover_stream.sh

#注意这里使用了promote 而不是触发文件，触发文件来回切换有问题，编辑内容如下：

[postgres@master pgpool]$ vim failover_stream.sh

#! /bin/sh

# Failover command for streaming replication.

# Arguments: $1: new master hostname.

new_master=$1

trigger_command="$PGHOME/bin/pg_ctl promote -D $PGDATA"

# Prompte standby database.

/usr/bin/ssh -T $new_master $trigger_command

exit 0;

#如果是其他用户创建的，需要赋予postgres可执行权限，例如：

[postgres@master pgpool]$ chmod 777 /app/pgpool/failover_stream.sh

五、PGPool集群管理

#启动集群

[postgres@slave pgpool]$ pg_ctl start -D $PGDATA

[postgres@master pgpool]$ pgpool -n -d -D > /var/log/pgpool/pgpool.log 2>&1 &

#查看集群节点状态

[postgres@master pgpool]$ psql -h 10.1.83.168 -p 9999

Password:

psql (9.6.0)

Type "help" for help.

postgres=# show pool_nodes;

node_id | hostname | port | status | lb_weight | role | select_cnt | load_balance_node | replication_delay

---------+----------+------+--------+-----------+---------+------------+-------------------+-------------------

0 | master | 5432 | up | 0.500000 | primary | 0 | true | 0

1 | slave | 5432 | up | 0.500000 | standby | 0 | false | 0

(2 rows)

#在slave上节点也是psql -h 10.1.83.168 -p 9999，双pgpool使用虚拟ip，做到高可用。

#Pgpool的HA

#模拟master端pgpool宕机

[postgres@master pgpool]$ pgpool -m fast stop

2020-01-08 18:07:57: pid 8535: LOG: stop request sent to pgpool. waiting for termination...

.done.

[1]+ 完成 pgpool -n -d -D > /var/log/pgpool/pgpool.log 2>&1

[postgres@master pgpool]$ psql -h 10.1.83.168 -p 9999

Password:

psql (9.6.0)

Type "help" for help.

postgres=# show pool_nodes;

node_id | hostname | port | status | lb_weight | role | select_cnt | load_balance_node | replication_delay

---------+----------+------+--------+-----------+---------+------------+-------------------+-------------------

0 | master | 5432 | up | 0.500000 | primary | 0 | false | 0

1 | slave | 5432 | up | 0.500000 | standby | 0 | true | 0

(2 rows)

[postgres@master pgpool]$ pgpool -n -d -D > /var/log/pgpool/pgpool.log 2>&1 &

#访问成功，在master节点上的pgpool宕机后，由slave节点的pgpool接管vip和集群服务，并未中断应用访问。#在master上重新启动pgpool后，定制slave上的pgpool服务，结果一样。

#模拟master端pg primary宕机

[postgres@master pgpool]$ pg_ctl stop -D $PGDATA

waiting for server to shut down...... done

server stopped

[postgres@master pgpool]$ psql -h 10.1.83.168 -p 9999

Password:

psql (9.6.0)

Type "help" for help.

postgres=# show pool_nodes;

node_id | hostname | port | status | lb_weight | role | select_cnt | load_balance_node | replication_delay

---------+----------+------+--------+-----------+---------+------------+-------------------+-------------------

0 | master | 5432 | down | 0.500000 | standby | 0 | false | 0

1 | slave | 5432 | up | 0.500000 | primary | 0 | true | 0

(2 rows)

#slave已经被切换成primary，且master节点状态是down

#修复master节点重新加入集群

[postgres@master pgpool]$ cd $PGDATA

[postgres@master data]$ mv recovery.done recovery.conf#一定要把.done改成.conf

[postgres@master data]$ pg_ctl start -D $PGDATA

server starting

[postgres@master data]$ LOG: redirecting log output to logging collector process

HINT: Future log output will appear in directory "/app/postgres/log".

[postgres@master data]$ pcp_attach_node -d -U postgres -h 10.1.83.168 -p 9898 -n 0

#注意master的node_id是0，所以-n 0

Password:

#提示输入密码，输入pcp管理密码。

DEBUG: recv: tos="m", len=8

DEBUG: recv: tos="r", len=21

DEBUG: send: tos="C", len=6

DEBUG: recv: tos="c", len=20

pcp_attach_node -- Command Successful

DEBUG: send: tos="X", len=4

[postgres@master data]$ psql -h 10.1.83.168 -p 9999

Password:

psql (9.6.0)

Type "help" for help.

postgres=# show pool_nodes;

node_id | hostname | port | status | lb_weight | role | select_cnt | load_balance_node | replication_delay

---------+----------+------+--------+-----------+---------+------------+-------------------+-------------------

0 | master | 5432 | up | 0.500000 | standby | 0 | false | 0

1 | slave | 5432 | up | 0.500000 | primary | 0 | true | 0

(2 rows)

#master直接down机

当前master节点是primay，我们直接将master服务器直接关机后，发现实现了主备切换，master已经down了，而slave已经被切换成了primary：

#master

[root@master ~]# reboot

#slave

[postgres@slave ~]$ psql -h 10.1.83.168 -p 9999

Password:

psql (9.6.0)

Type "help" for help.

postgres=# show pool_nodes;

node_id | hostname | port | status | lb_weight | role | select_cnt | load_balance_node | replication_delay

---------+----------+------+--------+-----------+---------+------------+-------------------+-------------------

0 | master | 5432 | down | 0.500000 | standby | 0 | false | 0

1 | slave | 5432 | up | 0.500000 | primary | 1 | true | 0

(2 rows)

#数据线同步

在主备切换时，修复节点并重启后，由于数据发生变化，或修复的节点数据发生变化再按照流复制模式加入集群，很可能报时间线不同步错误：

[postgres@master data]$ mv recovery.done recovery.conf

[postgres@master data]$ pg_ctl start -D $PGDATA

[postgres@master data]$ less /app/postgres/log/postgresql-2020-01-09_142207.log

LOG: database system was shut down at 2020-01-09 13:29:13 CST

LOG: entering standby mode

LOG: consistent recovery state reached at 0/20000098

LOG: invalid record length at 0/20000098: wanted 24, got 0

LOG: database system is ready to accept read only connections

LOG: fetching timeline history file for timeline 13 from primary server

FATAL: could not start WAL streaming: ERROR: requested starting point 0/20000000 on timeline 12 is not in this server's history

DETAIL: This server's history forked from timeline 12 at 0/1E000098.

LOG: new timeline 13 forked off current database system timeline 12 before current recovery point 0/20000098

FATAL: could not start WAL streaming: ERROR: requested starting point 0/20000000 on timeline 12 is not in this server's history

DETAIL: This server's history forked from timeline 12 at 0/1E000098.

LOG: new timeline 13 forked off current database system timeline 12 before current recovery point 0/20000098

FATAL: could not start WAL streaming: ERROR: requested starting point 0/20000000 on timeline 12 is not in this server's history

DETAIL: This server's history forked from timeline 12 at 0/1E000098.

LOG: new timeline 13 forked off current database system timeline 12 before current recovery point 0/20000098

产生这种情况，需要根据pg_rewind工具同步数据时间线，具体分六步走。

#停掉需要做同步的节点pg服务

[postgres@master data]$ pg_ctl stop -D $PGDATA

#同步new master节点上时间线

[postgres@master data]$ pg_rewind --target-pgdata=/app/postgres/data --source-server='host=slave port=5432 user=postgres dbname=postgres password=postgres'

#修改pg_hba.conf与 recovery.done文件

[postgres@master data]$ cd $PGDATA

[postgres@master data]$ mv recovery.done recovery.conf

[postgres@master data]$ vim recovery.conf

primary_conninfo = 'host=slave port=5432 user=repuser password=repuser'

[postgres@master data]$ vi pg_hba.conf

host replication repuser slave md5

#重启pg服务

[postgres@master data]$ pg_ctl start -D $PGDATA

#重新加入集群

[postgres@master data]$ pcp_attach_node -d -U postgres -h 10.1.83.168 -p 9898 -n 0

#启动pgpool

[postgres@master data]$ pgpool -n -d -D > /var/log/pgpool/pgpool.log 2>&1 &

#查看集群节点状态

[postgres@master data]$ psql -h 10.1.83.168 -p 9999

Password:

psql (9.6.0)

Type "help" for help.

postgres=# show pool_nodes;

node_id | hostname | port | status | lb_weight | role | select_cnt | load_balance_node | replication_delay

---------+----------+------+--------+-----------+---------+------------+-------------------+-------------------

0 | master | 5432 | up | 0.500000 | standby | 0 | true | 0

1 | slave | 5432 | up | 0.500000 | primary | 1 | false | 0

(2 rows)

PGPool-II+PG流复制实现HA

一、简介

二、前提

三、节点信息

四、部署PGPool集群

#给root及postgres系统账号配置ssh免密登录（master&&slave）

#安装PGPool（master&&slave）

#配置pgpool（master&&slave）

#配置pgpool环境变量

#配置pool_hba.conf， pool_hba.conf是对登录用户进行验证的，要和pg的pg_hba.conf保持一致，要么都是trust，要么都是md5验证方式，这里采用了md5验证方式如下设置：

#配置pcp.conf，pcp.conf配置用于pgpool自己登陆管理使用的，一些操作pgpool的工具会要求提供密码等，配置如下

#配置系统命令权限，配置 ifconfig, arping 执行权限，执行failover_stream.sh需要用到，可以让其他普通用户执行。创建两个日志文件目录

#master配置pgpool.conf

#slave配置pgpool.conf

五、PGPool集群管理

#启动集群

#查看集群节点状态

#Pgpool的HA

#模拟master端pgpool宕机

#模拟master端pg primary宕机

#master直接down机

#数据线同步

你可能感兴趣的:(PGPool-II+PG流复制实现HA)

PGPool-II+PG流复制实现HA

一、简介

二、前提

三、节点信息

四、部署PGPool集群

#给root及postgres系统账号配置ssh免密登录（master&&slave）

#安装PGPool（master&&slave）

#配置pgpool（master&&slave）

#配置pgpool环境变量

#配置pool_hba.conf， pool_hba.conf是对登录用户进行验证的，要和pg的pg_hba.conf保持一致，要么都是trust，要么都是md5验证方式，这里采用了md5验证方式如下设置：

#配置pcp.conf，pcp.conf配置用于pgpool自己登陆管理使用的，一些操作pgpool的工具会要求提供密码等，配置如下

#配置系统命令权限，配置 ifconfig, arping 执行权限 ，执行failover_stream.sh需要用到，可以让其他普通用户执行。创建两个日志文件目录

#master配置pgpool.conf

#slave配置pgpool.conf

五、PGPool集群管理

#启动集群

#查看集群节点状态

#Pgpool的HA

#模拟master端pgpool宕机

#模拟master端pg primary宕机

#master直接down机

#数据线同步

你可能感兴趣的:(PGPool-II+PG流复制实现HA)

#配置系统命令权限，配置 ifconfig, arping 执行权限，执行failover_stream.sh需要用到，可以让其他普通用户执行。创建两个日志文件目录