yum install corosync pacemaker
Alternatively, fetch the packages directly from the CentOS RPM package site.
The RPM list below is what yum reported during installation; the same information can be obtained the same way on CentOS 7 (see the download sketch after the list).
Corosync
corosynclib.x86_64 0:1.4.7-6.el6
corosync.x86_64 0:1.4.7-6.el6
Pacemaker
clusterlib.x86_64 0:3.0.12.1-84.el6_10.1
cman.x86_64 0:3.0.12.1-84.el6_10.1
modcluster.x86_64 0:0.16.2-35.el6
openais.x86_64 0:1.1.1-7.el6
openaislib.x86_64 0:1.1.1-7.el6
ricci.x86_64 0:0.16.2-87.el6
pacemaker-cli.x86_64 0:1.1.18-3.el6
pacemaker-cluster-libs.x86_64 0:1.1.18-3.el6
pacemaker-libs.x86_64 0:1.1.18-3.el6
pacemaker.x86_64 0:1.1.18-3.el6
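If the target machines have no direct internet access, one way to fetch these RPMs together with their dependencies is yumdownloader from the yum-utils package, run on a connected host; a minimal sketch, assuming /tmp/rpms as a scratch directory:
yum install -y yum-utils
yumdownloader --resolve --destdir=/tmp/rpms corosync pacemaker pcs
# copy /tmp/rpms to the target node, then:
yum localinstall -y /tmp/rpms/*.rpm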
Environment
pcmk-1 192.168.0.1
pcmk-2 192.168.0.2
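The nodes must be able to resolve each other's hostnames. A sketch of the /etc/hosts entries for the addresses listed above, needed only if DNS does not already resolve the names (run on both nodes):
cat >> /etc/hosts <<EOF
192.168.0.1 pcmk-1
192.168.0.2 pcmk-2
EOF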
Command-line cluster shell programs cover every aspect of managing and configuring the cluster.
pcs and crmsh are the two most popular command-line shells.
yum install -y pacemaker pcs psmisc policycoreutils-python
This document uses pcs to manage the cluster (crmsh can achieve the same results, just with slightly different syntax).
[ALL] Permanently disable the firewall and set SELinux to permissive
setenforce 0
sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config
systemctl mask firewalld.service
systemctl stop firewalld.service
iptables --flush
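If you prefer to keep firewalld running instead of masking it, the built-in high-availability service (which covers the corosync, pacemaker, and pcsd ports) can be allowed instead; a sketch assuming firewalld is active:
firewall-cmd --permanent --add-service=high-availability
firewall-cmd --reload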
[ALL] Start and enable the pcsd daemon
systemctl start pcsd.service
systemctl enable pcsd.service
[ALL] Set a password for the hacluster user
echo mysupersecretpassword | passwd --stdin hacluster
[pcmk-1] Configure Corosync (authenticate the cluster nodes)
pcs cluster auth pcmk-1 pcmk-2
[pcmk-1] Create the cluster (this must be run on the same node as the previous step)
pcs cluster setup --name mycluster pcmk-1 pcmk-2
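pcs cluster setup writes /etc/corosync/corosync.conf on every node. Roughly, the generated file looks like the sketch below (values follow the cluster and node names used above; the exact contents vary between pcs versions):
cat /etc/corosync/corosync.conf
totem {
    version: 2
    secauth: off
    cluster_name: mycluster
    transport: udpu
}
nodelist {
    node {
        ring0_addr: pcmk-1
        nodeid: 1
    }
    node {
        ring0_addr: pcmk-2
        nodeid: 2
    }
}
quorum {
    provider: corosync_votequorum
    two_node: 1
}
logging {
    to_logfile: yes
    logfile: /var/log/cluster/corosync.log
    to_syslog: yes
}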
Using pcs
pcs --help
pcs status --help
[ONE] Start the cluster
pcs cluster start --all
Alternatively, run the following command on each cluster node separately:
pcs cluster start
Or:
systemctl start corosync.service
systemctl start pacemaker.service
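Starting the daemons by hand does not make them start at boot. If the cluster should come back automatically after a reboot, also enable it on every node (or enable the corosync and pacemaker units with systemctl):
pcs cluster enable --all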
Check the cluster status
[root@node0 ~]# corosync-cfgtool -s
Printing ring status.
Local node ID 1
RING ID 0
id = 192.168.0.70
status = ring 0 active with no faults
[root@node0 ~]#
[root@node0 ~]# corosync-cmapctl | grep members
runtime.totem.pg.mrp.srp.members.1.config_version (u64) = 0
runtime.totem.pg.mrp.srp.members.1.ip (str) = r(0) ip(192.168.0.70)
runtime.totem.pg.mrp.srp.members.1.join_count (u32) = 1
runtime.totem.pg.mrp.srp.members.1.status (str) = joined
runtime.totem.pg.mrp.srp.members.2.config_version (u64) = 0
runtime.totem.pg.mrp.srp.members.2.ip (str) = r(0) ip(192.168.0.71)
runtime.totem.pg.mrp.srp.members.2.join_count (u32) = 1
runtime.totem.pg.mrp.srp.members.2.status (str) = joined
runtime.totem.pg.mrp.srp.members.3.config_version (u64) = 0
runtime.totem.pg.mrp.srp.members.3.ip (str) = r(0) ip(192.168.0.72)
runtime.totem.pg.mrp.srp.members.3.join_count (u32) = 1
runtime.totem.pg.mrp.srp.members.3.status (str) = joined
Check the Pacemaker installation
ps -axf
42995 ? SLsl 1:52 corosync
43017 ? Ss 0:03 /usr/sbin/pacemakerd -f
43018 ? Ss 0:04 \_ /usr/libexec/pacemaker/cib
43019 ? Ss 0:03 \_ /usr/libexec/pacemaker/stonithd
43020 ? Ss 0:03 \_ /usr/libexec/pacemaker/lrmd
43021 ? Ss 0:03 \_ /usr/libexec/pacemaker/attrd
43022 ? Ss 0:03 \_ /usr/libexec/pacemaker/pengine
43023 ? Ss 0:04 \_ /usr/libexec/pacemaker/crmd
Check the pcs status
[root@node0 ~]# pcs status
Cluster name: cluster_test01
Stack: corosync
Current DC: node2 (version 1.1.21-4.el7-f14e36fd43) - partition with quorum
Last updated: Mon Jul 20 13:20:53 2020
Last change: Mon Jul 20 05:46:34 2020 by root via crm_resource on node0
3 nodes configured
2 resources configured
Online: [ node0 node1 node2 ]
Full list of resources:
VIP (ocf::heartbeat:IPaddr2): Started node1
haproxy (systemd:haproxy): Started node1
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: active/enabled
[root@node0 ~]#
Check the cluster status in XML format
[root@node0 ~]# pcs status xml
<?xml version="1.0"?>
<crm_mon version="1.1.21">
<summary>
<stack type="corosync" />
<current_dc present="true" version="1.1.21-4.el7-f14e36fd43" name="node2" id="3" with_quorum="true" />
<last_update time="Mon Jul 20 13:27:45 2020" />
<last_change time="Mon Jul 20 05:46:34 2020" user="root" client="crm_resource" origin="node0" />
<nodes_configured number="3" expected_votes="unknown" />
<resources_configured number="2" disabled="0" blocked="0" />
<cluster_options stonith-enabled="false" symmetric-cluster="true" no-quorum-policy="ignore" maintenance-mode="false" />
</summary>
<nodes>
<node name="node0" id="1" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="0" type="member" />
<node name="node1" id="2" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="false" resources_running="2" type="member" />
<node name="node2" id="3" online="true" standby="false" standby_onfail="false" maintenance="false" pending="false" unclean="false" shutdown="false" expected_up="true" is_dc="true" resources_running="0" type="member" />
</nodes>
<resources>
<resource id="VIP" resource_agent="ocf::heartbeat:IPaddr2" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="node1" id="2" cached="false"/>
</resource>
<resource id="haproxy" resource_agent="systemd:haproxy" role="Started" active="true" orphaned="false" blocked="false" managed="true" failed="false" failure_ignored="false" nodes_running_on="1" >
<node name="node1" id="2" cached="false"/>
</resource>
</resources>
<node_attributes>
<node name="node0">
</node>
<node name="node1">
</node>
<node name="node2">
</node>
</node_attributes>
<node_history>
<node name="node2">
<resource_history id="VIP" orphan="false" migration-threshold="1000000">
<operation_history call="39" task="stop" last-rc-change="Mon Jul 20 05:02:28 2020" last-run="Mon Jul 20 05:02:28 2020" exec-time="287ms" queue-time="2ms" rc="0" rc_text="ok" />
</resource_history>
</node>
<node name="node0">
</node>
<node name="node1">
<resource_history id="VIP" orphan="false" migration-threshold="1000000">
<operation_history call="10" task="start" last-rc-change="Mon Jul 20 05:47:31 2020" last-run="Mon Jul 20 05:47:31 2020" exec-time="416ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="11" task="monitor" interval="30000ms" last-rc-change="Mon Jul 20 05:47:31 2020" exec-time="666ms" queue-time="0ms" rc="0" rc_text="ok" />
</resource_history>
<resource_history id="haproxy" orphan="false" migration-threshold="1000000">
<operation_history call="12" task="start" last-rc-change="Mon Jul 20 05:47:31 2020" last-run="Mon Jul 20 05:47:31 2020" exec-time="2835ms" queue-time="0ms" rc="0" rc_text="ok" />
<operation_history call="13" task="monitor" interval="5000ms" last-rc-change="Mon Jul 20 05:47:34 2020" exec-time="10ms" queue-time="0ms" rc="0" rc_text="ok" />
</resource_history>
</node>
</node_history>
<fence_history>
</fence_history>
<tickets>
</tickets>
<bans>
</bans>
</crm_mon>
Check whether the configuration is valid
crm_verify -L -V
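On a freshly created cluster with no fence devices defined, crm_verify typically reports errors because STONITH is enabled by default but not configured. For a test setup it can be disabled, which is what the stonith-enabled="false" value in the XML output above reflects; do not leave it disabled in production:
pcs property set stonith-enabled=false
crm_verify -L -V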
Create a resource
[root@pcmk-1 ~]# pcs resource create ClusterIP ocf:heartbeat:IPaddr2 \
    ip=192.168.122.120 cidr_netmask=32 op monitor interval=30s
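The haproxy resource that appears in the status output earlier can be created in the same way using the systemd standard; a minimal sketch, assuming haproxy is installed and configured identically on all nodes (the 5s monitor interval matches the operation history shown above):
pcs resource create haproxy systemd:haproxy op monitor interval=5s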
List the available resource standards
[root@node0 ~]# pcs resource standards
lsb
ocf
service
systemd
[root@node0 ~]#
View resource details
[root@node0 ~]# pcs resource
VIP (ocf::heartbeat:IPaddr2): Started node0
haproxy (systemd:haproxy): Started node0
[root@node0 ~]# pcs resource show VIP
Resource: VIP (class=ocf provider=heartbeat type=IPaddr2)
Attributes: cidr_netmask=32 ip=192.168.0.75
Operations: monitor interval=30s (VIP-monitor-interval-30s)
start interval=0s timeout=20s (VIP-start-interval-0s)
stop interval=0s timeout=20s (VIP-stop-interval-0s)
List the available OCF resource providers
[root@node0 ~]# pcs resource providers
heartbeat
openstack
pacemaker
List the resource agents available for a given standard and provider (see the pcs resource describe example after this list)
[root@node0 ~]# pcs resource agents ocf:heartbeat
aliyun-vpc-move-ip
apache
aws-vpc-move-ip
awseip
awsvip
azure-lb
clvm
conntrackd
CTDB
db2
Delay
dhcpd
docker
Dummy
ethmonitor
exportfs
Filesystem
galera
garbd
iface-vlan
IPaddr
IPaddr2
IPsrcaddr
iSCSILogicalUnit
iSCSITarget
LVM
LVM-activate
lvmlockd
MailTo
mysql
nagios
named
nfsnotify
nfsserver
nginx
NodeUtilization
oraasm
oracle
oralsnr
pgsql
portblock
postfix
rabbitmq-cluster
redis
Route
rsyncd
SendArp
slapd
Squid
sybaseASE
symlink
tomcat
vdo-vol
VirtualDomain
Xinetd
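To inspect which parameters a particular agent accepts (for example the IPaddr2 agent behind the VIP resource above), query its metadata:
pcs resource describe ocf:heartbeat:IPaddr2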
Simulate a failover
[root@node0 ~]# pcs cluster stop node0
node0: Stopping Cluster (pacemaker)...
node0: Stopping Cluster (corosync)...
[root@node0 ~]#
[root@node0 ~]# pcs status
Error: cluster is not currently running on this node
[root@node1 ~]# pcs status
Cluster name: cluster_test01
Stack: corosync
Current DC: node2 (version 1.1.21-4.el7-f14e36fd43) - partition with quorum
Last updated: Mon Jul 20 13:44:44 2020
Last change: Mon Jul 20 05:46:34 2020 by root via crm_resource on node0
3 nodes configured
2 resources configured
Online: [ node1 node2 ]
OFFLINE: [ node0 ]
Full list of resources:
VIP (ocf::heartbeat:IPaddr2): Started node1
haproxy (systemd:haproxy): Started node1
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: active/enabled
[root@node0 ~]# pcs cluster start node0
node0: Starting Cluster (corosync)...
node0: Starting Cluster (pacemaker)...
[root@node0 ~]#
[root@node0 ~]#
[root@node0 ~]# pcs status
Cluster name: cluster_test01
Stack: corosync
Current DC: node2 (version 1.1.21-4.el7-f14e36fd43) - partition with quorum
Last updated: Mon Jul 20 13:45:13 2020
Last change: Mon Jul 20 05:46:34 2020 by root via crm_resource on node0
3 nodes configured
2 resources configured
Online: [ node0 node1 node2 ]
Full list of resources:
VIP (ocf::heartbeat:IPaddr2): Started node1
haproxy (systemd:haproxy): Started node1
Daemon Status:
corosync: active/enabled
pacemaker: active/enabled
pcsd: active/enabled
[root@node0 ~]#
Prevent resources from moving back to another node after recovery
In most circumstances, it is highly desirable to prevent healthy resources from being moved around the cluster. Moving a resource almost always requires a period of downtime, and for complex services such as databases that period can be quite long.
[root@node0 ~]# pcs resource defaults
No defaults set
[root@node0 ~]#
[root@node0 ~]# pcs resource defaults --help
Usage: pcs resource defaults...
defaults [options]
Set default values for resources, if no options are passed, lists
currently configured defaults. Defaults do not apply to resources which
override them with their own defined values.
[root@node0 ~]# pcs resource defaults resource-stickiness=100
Warning: Defaults do not apply to resources which override them with their own defined values
[root@node0 ~]# pcs resource defaults
resource-stickiness=100
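With resource-stickiness=100, a resource that has failed over stays on its current node instead of migrating back when the original node recovers. A manual move is still possible; it works by adding a location constraint that should be cleared afterwards. A short sketch, assuming the pcs 0.9.x syntax used throughout this document:
pcs resource move VIP node0
pcs constraint            # lists the cli-prefer-VIP location constraint created by the move
pcs resource clear VIP    # removes that constraint so stickiness governs placement again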
STONITH (Shoot The Other Node In The Head) protects your data from corruption by rogue nodes or unintended concurrent access.
Just because a node is unresponsive does not mean it has stopped accessing your data. The only way to be 100% sure your data is safe is to use STONITH to ensure that the node is truly offline before the data is allowed to be accessed from another node.
STONITH also has a role to play when a clustered service cannot be stopped. In that case, the cluster uses STONITH to force the whole node offline, making it safe to start the service elsewhere.
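A minimal sketch of actually configuring fencing, assuming the nodes expose IPMI BMCs and the fence-agents-ipmilan package is installed; the device name, address, and credentials below are placeholders rather than values from this setup (repeat per node, then re-enable STONITH):
pcs stonith create fence-node1 fence_ipmilan pcmk_host_list="node1" \
    ipaddr=192.168.0.201 login=admin passwd=changeme lanplus=1 \
    op monitor interval=60s
pcs property set stonith-enabled=true
pcs stonith show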