故障重现

keepalived配置如下

# vi /etc/keepalived/keepalived.conf
! Configuration File for keepalived
global_defs {
   notification_email {
         root@localhost
   }
   notification_email_from [email protected]
   smtp_connect_timeout 3
   smtp_server 127.0.0.1
   router_id LVS_DEVEL
}
vrrp_script chk_maintaince_down {
   script "[[ -f /etc/keepalived/down ]] && exit 1 || exit 0"
   interval 1
   weight 2
}
vrrp_script chk_haproxy {
    script "killall -0 haproxy"
    interval 1
    weight 2
}
vrrp_instance VI_1 {
    interface eth0
    state MASTER
    priority 100
    virtual_router_id 125
    garp_master_delay 1
    authentication {
        auth_type PASS
        auth_pass 1e3459f77aba4ded
    }
    track_interface {
       eth0
    }
    virtual_ipaddress {
        172.16.25.10/16 dev eth0 label eth0:0
    }
    track_script {
        chk_haproxy
        chk_maintaince_down
    }
    notify_master "/etc/keepalived/notify.sh master 172.16.25.10"
    notify_backup "/etc/keepalived/notify.sh backup 172.16.25.10"
    notify_fault "/etc/keepalived/notify.sh fault 172.16.25.10"
}
vrrp_instance VI_2 {
    interface eth0
    state BACKUP
    priority 99
    virtual_router_id 126
    garp_master_delay 1
    authentication {
        auth_type PASS
        auth_pass 7615c4b7f518cede
    }
    track_interface {
       eth0
    }
    virtual_ipaddress {
        172.16.25.11/16 dev eth0 label eth0:1
    }
    track_script {
        chk_haproxy
        chk_maintaince_down
    }
    notify_master "/etc/keepalived/notify.sh master 172.16.25.11"
    notify_backup "/etc/keepalived/notify.sh backup 172.16.25.11"
    notify_fault "/etc/keepalived/notify.sh fault 172.16.25.11"
}
# vi /etc/keepalived/notify.sh
#!/bin/bash
# Author: Jason.Yu 
# description: An example of notify script
#
contact='root@localhost'
notify() {
    mailsubject="`hostname` to be $1: $2 floating"
    mailbody="`date '+%F %H:%M:%S'`: vrrp transition, `hostname` changed to be $1"
    echo $mailbody | mail -s "$mailsubject" $contact
}
case "$1" in
    master)
        notify master $2
        /etc/rc.d/init.d/haproxy start
        exit 0
    ;;
    backup)
        notify backup $2
        /etc/rc.d/init.d/haproxy stop
        exit 0
    ;;
    fault)
        notify fault $2
        /etc/rc.d/init.d/haproxy stop
        exit 0
    ;;
    *)
        echo 'Usage: `basename $0` {master|backup|fault}'
        exit 1
    ;;
esac

引发的故障1:keepalived宕机恢复后VIP集体漂移故障

Keepalived双主模型中vrrp_script中权重改变故障排查_第1张图片

引发的故障2:haproxy服务停止后重启VIP集体漂移故障

Keepalived双主模型中vrrp_script中权重改变故障排查_第2张图片



原因

每次主备状态切换时,会引发notify_backup,而在notify.sh脚本中backup部分会执行/etc/rc.d/init.d/haproxy stop,导致权重在2个节点上都改变一次,从而单一节点上对于所有instance的权重都处于最大或者最小,故VIP集体漂移也就不奇怪了;

wKioL1NjgxGDtI_tAANRGF5oM7E741.jpg

wKioL1NjgzeAEzh0AAPRB17UL2E301.jpg


解决方法

修改notify.sh脚本,在处理backup部分,只发送通知邮件,而无需刻意停止haproxy服务;

# vi /etc/keepalived/notify.sh
#!/bin/bash
# Author: Jason.Yu 
# description: An example of notify script
#
contact='root@localhost'
notify() {
    mailsubject="`hostname` to be $1: $2 floating"
    mailbody="`date '+%F %H:%M:%S'`: vrrp transition, `hostname` changed to be $1"
    echo $mailbody | mail -s "$mailsubject" $contact
}
case "$1" in
    master)
        notify master $2
        /etc/rc.d/init.d/haproxy start
        exit 0
    ;;
    backup)
        notify backup $2
       # /etc/rc.d/init.d/haproxy stop # 注释掉或删除此行
        exit 0
    ;;
    fault)
        notify fault $2
        # /etc/rc.d/init.d/haproxy stop # 同上
        exit 0
    ;;
    *)
        echo 'Usage: `basename $0` {master|backup|fault}'
        exit 1
    ;;
esac


调整后的正常权重改变流程

wKiom1Njg52TKUfZAAJwmR7i0yw788.jpg


vrrp_script中节点权重改变算法

vrrp_script 里的script返回值为0时认为检测成功,其它值都会当成检测失败;

  1. weight 为正时,脚本检测成功时此weight会加到priority上,检测失败时不加;

    1. 主失败:

      1. 主 priority < 从 priority + weight 时会切换。

    2. 主成功:

      1. 主 priority + weight > 从 priority + weight 时,主依然为主

  2. weight 为负时,脚本检测成功时此weight不影响priority,检测失败时priority – abs(weight)

    1. 主失败:

      1. 主 priority – abs(weight) < 从priority 时会切换主从

    2. 主成功:

      1. 主 priority > 从priority 主依然为主