任务:需要在nagios中定义服务去检测3个DC的状态(1.主机状态、2.consul cluster状态、3.nomad cluster状态),只要其中某个服务状态失效,就触发nagios eventhandler去改变dns服务器的链接文件,如上图所示。
脚本:脚本中的服务器地址和实际的不同
脚本1:该脚本检测3个DC的服务状态,根据检测到的结果会输出目前dns应该链接的文件名,nagios上会显示该文件名。如果dns没有链接到正确的文件名,nagios就会报警并触发event-handler。
#!/bin/bash
# Probe host status, consul cluster status and nomad cluster status of the
# three data centers, then fetch the geo file the DNS server currently links to.
#
# DC:US (tier1001 and tier1002)
# DC:EU (tier2001 and tier2002)
# DC:AS (tier3001 and tier3002)
#
# Link file expected for each situation:
#   all DCs up -> axel-geo_us_eu_as.yml (default)
#   DC-EU down -> axel-geo_us_as.yml
#   DC-AS down -> axel-geo_us_eu.yml
#   DC-US down -> axel-geo_eu_as.yml
#
# Variables set here: PING_<host> (status word printed by check_ping),
# CON_<dc> / NOM_<dc> (0 = NRPE check passed, 1 = failed), FILE (current link).

readonly PLUGINS=/usr/lib64/nagios/plugins

# Print the second word of check_ping's output for host $1 ("OK" when healthy).
ping_status() {
  "${PLUGINS}/check_ping" -4 -H "$1" -w 3000.0,80% -c 5000.0,100% -p 5 \
    | awk '{print $2}'
}

# Run NRPE command $2 on host $1 with its output discarded; the return status
# is the one check_nrpe reported.
nrpe_passes() {
  "${PLUGINS}/check_nrpe" -H "$1" -c "$2" &>/dev/null
}

# Host status of every DC, via the stock check_ping plugin.
PING_1001=$(ping_status tier1001)
PING_1002=$(ping_status tier1002)
PING_2001=$(ping_status tier2001)
PING_2002=$(ping_status tier2002)
PING_3001=$(ping_status tier3001)
PING_3002=$(ping_status tier3002)

# Consul cluster status of every DC, via a script called remotely through NRPE.
if nrpe_passes tier1001.axel.network check_consul_cluster; then CON_US=0; else CON_US=1; fi
if nrpe_passes tier2001.axel.network check_consul_cluster; then CON_EU=0; else CON_EU=1; fi
if nrpe_passes tier3001.axel.network check_consul_cluster; then CON_AS=0; else CON_AS=1; fi

# Nomad cluster status of every DC, via a script called remotely through NRPE.
if nrpe_passes tier1001.axel.network check_nomad_cluster; then NOM_US=0; else NOM_US=1; fi
if nrpe_passes tier2001.axel.network check_nomad_cluster; then NOM_EU=0; else NOM_EU=1; fi
if nrpe_passes tier3001.axel.network check_nomad_cluster; then NOM_AS=0; else NOM_AS=1; fi

# Name of the file the DNS server currently links to.
FILE=$("${PLUGINS}/check_nrpe" -H romeo.zencoo.com -c check_pdns_link)
nrpe_rc=$?
if (( nrpe_rc != 0 )); then
  # When check_nrpe itself fails, FILE holds its error text rather than a file
  # name. Report UNKNOWN instead of comparing that text against the expected
  # link names, which would raise a bogus CRITICAL with a switch target.
  echo "UNKNOWN: check_pdns_link failed (rc=${nrpe_rc}): ${FILE}"
  exit 3
fi
if [[ -z "$FILE" ]]; then
  echo '$FILE is NULL'
  exit 1
fi
# Fold the individual probe results into one flag per data center.
# A DC is up (0) only when both of its hosts answered ping with "OK" and its
# consul and nomad checks both passed; anything else marks it down (1).
# Globals read:    PING_1001 PING_1002 PING_2001 PING_2002 PING_3001 PING_3002
#                  CON_US CON_EU CON_AS NOM_US NOM_EU NOM_AS
# Globals written: PING_US PING_EU PING_AS US EU AS
service() {
  # Ping verdict per DC: both hosts of the DC must report "OK".
  if [[ "$PING_1001" == "OK" && "$PING_1002" == "OK" ]]; then PING_US=0; else PING_US=1; fi
  # The second operand must be PING_2002; testing PING_2001 twice hid an outage
  # of the second EU host.
  if [[ "$PING_2001" == "OK" && "$PING_2002" == "OK" ]]; then PING_EU=0; else PING_EU=1; fi
  if [[ "$PING_3001" == "OK" && "$PING_3002" == "OK" ]]; then PING_AS=0; else PING_AS=1; fi

  # Overall verdict per DC. String comparison is deliberate: an empty or unset
  # flag must count as "down", never as 0.
  if [[ "$PING_US" == "0" && "$CON_US" == "0" && "$NOM_US" == "0" ]]; then US=0; else US=1; fi
  if [[ "$PING_EU" == "0" && "$CON_EU" == "0" && "$NOM_EU" == "0" ]]; then EU=0; else EU=1; fi
  if [[ "$PING_AS" == "0" && "$CON_AS" == "0" && "$NOM_AS" == "0" ]]; then AS=0; else AS=1; fi
}
service

# Decide whether the DNS link file has to be switched. Exit status 2 makes
# Nagios raise an alert and run the event handler; the single line printed in
# that case is the name of the file the link should point to.

# Every DC healthy: the link must be the all-DC file.
if (( US == 0 && EU == 0 && AS == 0 )); then
  if [[ "$FILE" == "axel-geo_us_eu_as.yml" ]]; then
    echo "all-DC-is ok,->already axel-geo_us_eu_as.yml"
    exit 0
  fi
  echo "axel-geo_us_eu_as.yml"
  exit 2
fi

# At least one DC is down. Rules are tried in the order US, EU, AS; the first
# down DC whose matching file is not linked yet wins.
# NOTE(review): with two DCs down at once the requested file can alternate
# between checks, because no file excludes two DCs - confirm the intended
# behaviour for that case.
for rule in "US:axel-geo_eu_as.yml" "EU:axel-geo_us_as.yml" "AS:axel-geo_us_eu.yml"; do
  dc_flag=${rule%%:*}
  wanted=${rule#*:}
  if [[ "${!dc_flag}" -eq 1 && "$FILE" != "$wanted" ]]; then
    echo "$wanted"
    exit 2
  fi
done

# The link already matches the current situation.
echo "link file is ${FILE}"
exit 0
脚本2:event-handler脚本(nagios触发event-handler时执行该脚本)
#!/bin/bash
# Event handler for the check_dc_status service, which probes host, consul and
# nomad status of all DCs and prints the file the DNS link should point to.
#   $1 - service state reported by Nagios (acts only on CRITICAL)
#   $2 - service output, i.e. one of the four file names below
#
#   all DCs up -> axel-geo_us_eu_as.yml (default)
#   DC-EU down -> axel-geo_us_as.yml
#   DC-AS down -> axel-geo_us_eu.yml
#   DC-US down -> axel-geo_eu_as.yml
WORKDIR=/usr/lib64/nagios/plugins
DATE=$(date +%Y%m%d%H%M%S)
LOG=/tmp/.dns_linkfile

# From here on stdout and stderr are appended to the log file.
exec &>>"${LOG}"

# OK means the link is already correct, and every other non-CRITICAL state is
# ignored as well; only CRITICAL requests a switch.
[[ "$1" == "CRITICAL" ]] || exit 0

# Translate the requested file name into the NRPE command defined on the DNS
# servers: axel-geo_<suffix>.yml -> update_<suffix>.
case "$2" in
  axel-geo_us_eu_as.yml | axel-geo_us_as.yml | axel-geo_us_eu.yml | axel-geo_eu_as.yml)
    link_suffix=${2#axel-geo_}
    REMOTE_CMD="update_${link_suffix%.yml}"
    ;;
  *)
    # Output did not name a known link file.
    echo "${DATE}--warining,no file match"
    exit 1
    ;;
esac

echo "${DATE}--${WORKDIR}/check_nrpe -H {ns1,ns2}.zencoo.com -c ${REMOTE_CMD}"
# Ask both DNS servers to switch their link file.
for dns_host in DNS1 DNS2; do
  "${WORKDIR}/check_nrpe" -H "${dns_host}" -c "${REMOTE_CMD}"
done
exit 0
脚本3:更改DNS服务器上的链接文件
#!/bin/bash
# Report or switch the geo file that ${DIR}/${LN} links to. Called through NRPE
# by the check_dc_status and change_dns_linkfile scripts.
#   $1 = check                           print the file name currently linked
#   $1 = us_eu_as | us_as | us_eu | eu_as  link axel-geo_<$1>.yml and reload pdns
# Exit status: 0 on success or when nothing had to change, 1 on any error.
LOG=/tmp/.dns_linkfile
DATE=$(date +%Y%m%d%H%M%S)
DIR=/etc/pdns
LN=axel-geo.yml

# Append one line to the log file.
log_line() {
  printf '%s\n' "$*" >>"${LOG}"
}

# Target of the symlink; empty when the path is missing or not a symlink.
FILE=$(readlink -- "${DIR}/${LN}" 2>/dev/null)

case "$1" in
  check)
    # Print the bare file name. The switch below creates the link with an
    # absolute target, while the status check compares against bare names such
    # as axel-geo_us_eu_as.yml; printing the raw target would never match.
    echo "${FILE##*/}"
    exit 0
    ;;
  us_eu_as | us_as | us_eu | eu_as)
    TAGETFILE="${DIR}/axel-geo_${1}.yml"
    ;;
  *)
    log_line "${DATE}-invalid argument '${1}'"
    exit 1
    ;;
esac

# Resolve a relative link target against DIR so it can be compared with the
# absolute TAGETFILE.
current_target=${FILE}
if [[ -n "${current_target}" && "${current_target}" != /* ]]; then
  current_target="${DIR}/${current_target}"
fi

if [[ ! -f "${TAGETFILE}" ]]; then
  # Double quotes so the path and timestamp are actually written to the log.
  log_line "${TAGETFILE} does not exist/${DATE}"
  exit 1
fi

if [[ "${current_target}" == "${TAGETFILE}" ]]; then
  log_line "${DATE}-Link file is correct, no need to switch"
  exit 0
fi

log_line "${HOSTNAME}/${DATE} ln -snf ${TAGETFILE} ${DIR}/${LN}"
if ! sudo /usr/bin/ln -snf "${TAGETFILE}" "${DIR}/${LN}"; then
  # Do not reload pdns when the link could not be replaced.
  log_line "${DATE}-ln failed"
  exit 1
fi

if sudo /bin/pdns_control reload; then
  log_line "${DATE}-reload dns ok"
else
  log_line "${DATE}-reload dns failed"
  exit 1
fi
exit 0
nagios配置 #定义检测服务,定义event-handler
# Service that checks the DC status and switches the DNS link file through an
# event handler when the check alerts.
define service{
use generic-service
host_name xxx
service_description check_dc_status
contact_groups admins,admins_jabber
# Invokes the script that checks the service status (script 1).
check_command check_nrpe_t60!check_dc_status
# Invokes the event command defined below.
event_handler change_dns_linkfile
}
# Event handler command for the service above.
define command {
# $SERVICESTATE$ and $SERVICEOUTPUT$ correspond to $1 and $2 in script 2.
command_name change_dns_linkfile
command_line $USER1$/eventhandlers/change_dns_linkfile $SERVICESTATE$ $SERVICEOUTPUT$
}
puppet配置 #脚本1和脚本2会通过nrpe调用脚本3,需要定义相应的命令以及对应的参数
<%# NRPE commands emitted only for the two DNS servers: each one runs dns_file_check.sh with the argument named after the command (check, us_eu_as, us_eu, us_as, eu_as). -%>
<% if @fqdn == 'dns1xxxx' or @fqdn == 'dns2xxxx' -%>
command[check_pdns_link]=<%= @pluginsdir %>/dns_file_check.sh check
command[update_us_eu_as]=<%= @pluginsdir %>/dns_file_check.sh us_eu_as
command[update_us_eu]=<%= @pluginsdir %>/dns_file_check.sh us_eu
command[update_us_as]=<%= @pluginsdir %>/dns_file_check.sh us_as
command[update_eu_as]=<%= @pluginsdir %>/dns_file_check.sh eu_as
<% end -%>
第一次弄nagios event-handler,感觉很乱,脚本还要再继续完善