shell脚本-切换软链接文件(nagios监控)_第1张图片

任务:需要在nagios中定义服务去检测3个DC的状态(1.主机状态、2.consul cluster状态、3.nomad cluster状态),只要其中某个服务状态失效,就触发nagios event-handler去改变dns服务器的链接文件,如上图所示。

脚本:脚本中的服务器地址和实际的不同

脚本1:该脚本检测3个DC的服务状态,根据检测到的结果会输出目前dns应该链接的文件名,nagios上会显示该文件名。如果dns没有链接到正确的文件名,nagios就会报警并触发event-handler。

#!/bin/bash
# Detects DC host status, consul cluster status and nomad cluster status.
DATE=$(date +%Y%m%d%H%M%S)

# Data centres and their hosts:
#   US: tier1001, tier1002
#   EU: tier2001, tier2002
#   AS: tier3001, tier3002
#
# DNS link file expected for each situation:
#   all DCs up -> axel-geo_us_eu_as.yml (default)
#   DC-EU down -> axel-geo_us_as.yml
#   DC-AS down -> axel-geo_us_eu.yml
#   DC-US down -> axel-geo_eu_as.yml

# Host status of the three DCs, probed with the check_ping plugin shipped with Nagios.
# Prints the second word of the plugin output for host $1.
ping_state() {
  /usr/lib64/nagios/plugins/check_ping -4 -H "$1" -w 3000.0,80% -c 5000.0,100% -p 5 \
    | awk '{print $2}'
}

# DC US
PING_1001=$(ping_state tier1001)
PING_1002=$(ping_state tier1002)
# DC EU
PING_2001=$(ping_state tier2001)
PING_2002=$(ping_state tier2002)
# DC AS
PING_3001=$(ping_state tier3001)
PING_3002=$(ping_state tier3002)

# Consul and nomad cluster status of the three DCs. Each probe runs a script on
# the remote host through NRPE; only the exit status of check_nrpe is used.
# Prints 0 when command $2 on host $1 succeeds, 1 otherwise.
nrpe_flag() {
  if /usr/lib64/nagios/plugins/check_nrpe -H "$1" -c "$2" >/dev/null 2>&1; then
    echo 0
  else
    echo 1
  fi
}

# consul cluster: US, EU, AS
CON_US=$(nrpe_flag tier1001.axel.network check_consul_cluster)
CON_EU=$(nrpe_flag tier2001.axel.network check_consul_cluster)
CON_AS=$(nrpe_flag tier3001.axel.network check_consul_cluster)

# nomad cluster: US, EU, AS
NOM_US=$(nrpe_flag tier1001.axel.network check_nomad_cluster)
NOM_EU=$(nrpe_flag tier2001.axel.network check_nomad_cluster)
NOM_AS=$(nrpe_flag tier3001.axel.network check_nomad_cluster)

# Ask the DNS server (through NRPE) which file its link currently points to.
# check_nrpe reports its own failures as text on stdout, so the exit status is
# checked too; otherwise that text would be mistaken for a file name and
# reported as a wrong link.
if ! FILE=$(/usr/lib64/nagios/plugins/check_nrpe -H romeo.zencoo.com -c check_pdns_link); then
  echo "check_pdns_link failed: ${FILE}"
  exit 1
fi
# The DNS-side script re-links with an absolute target, so the reported value
# can carry a directory prefix, while the comparisons further down use bare
# file names. Keep only the file name.
FILE=${FILE##*/}
if [ -z "$FILE" ]; then
  echo '$FILE is NULL'
  exit 1
fi

# Folds the individual probe results into one flag per DC.
# Globals read:    PING_1001 PING_1002 PING_2001 PING_2002 PING_3001 PING_3002
#                  CON_US CON_EU CON_AS NOM_US NOM_EU NOM_AS
# Globals written: PING_US PING_EU PING_AS US EU AS
# A DC flag (US/EU/AS) is 0 only when ping, consul and nomad are all healthy
# for that DC; in every other case it is 1.
service() {
  # Host level: both hosts of a DC must report "OK".
  if [[ "$PING_1001" == "OK" && "$PING_1002" == "OK" ]]; then PING_US=0; else PING_US=1; fi
  # EU needs tier2001 AND tier2002 (not tier2001 twice).
  if [[ "$PING_2001" == "OK" && "$PING_2002" == "OK" ]]; then PING_EU=0; else PING_EU=1; fi
  if [[ "$PING_3001" == "OK" && "$PING_3002" == "OK" ]]; then PING_AS=0; else PING_AS=1; fi

  # DC level: string comparison on purpose, so an empty/unset flag counts as failed.
  if [[ "$PING_US" == "0" && "$CON_US" == "0" && "$NOM_US" == "0" ]]; then US=0; else US=1; fi
  if [[ "$PING_EU" == "0" && "$CON_EU" == "0" && "$NOM_EU" == "0" ]]; then EU=0; else EU=1; fi
  if [[ "$PING_AS" == "0" && "$CON_AS" == "0" && "$NOM_AS" == "0" ]]; then AS=0; else AS=1; fi
}

service

# Decide whether the DNS link has to be switched.
#   exit 0 - the link is already the wanted one; a status text is printed
#   exit 2 - the link must change; only the wanted file name is printed, so
#            Nagios raises the alert and passes that name to the event handler
# The wanted file is derived first and compared with the current link
# afterwards. With more than one DC down this gives one stable answer (US is
# examined before EU, EU before AS) instead of a different file on each run.
if [[ "$US" != "0" ]]; then
  WANTED="axel-geo_eu_as.yml"
elif [[ "$EU" != "0" ]]; then
  WANTED="axel-geo_us_as.yml"
elif [[ "$AS" != "0" ]]; then
  WANTED="axel-geo_us_eu.yml"
else
  WANTED="axel-geo_us_eu_as.yml"
fi

if [[ "$FILE" != "$WANTED" ]]; then
  echo "$WANTED"
  exit 2
fi

if [[ "$WANTED" == "axel-geo_us_eu_as.yml" ]]; then
  echo "all-DC-is ok,->already axel-geo_us_eu_as.yml"
else
  echo "link file is ${FILE}"
fi
exit 0

脚本2:nagios触发event-handler时执行的脚本

#!/bin/bash
# Nagios event handler for the DC status service.
#   $1 - service state passed by Nagios (OK, CRITICAL, ...)
#   $2 - service output, i.e. the link file name printed by
#        check_service_status.sh, one of:
#          axel-geo_us_eu_as.yml  all DCs up (default)
#          axel-geo_us_as.yml     DC-EU down
#          axel-geo_us_eu.yml     DC-AS down
#          axel-geo_eu_as.yml     DC-US down
# Everything written to stdout/stderr from here on is appended to ${LOG}.

WORKDIR=/usr/lib64/nagios/plugins
DATE=$(date +%Y%m%d%H%M%S)
LOG=/tmp/.dns_linkfile
exec >>"${LOG}" 2>&1

# Only a CRITICAL state asks for a link switch; any other state is a no-op.
if [ "$1" != "CRITICAL" ]; then
  exit 0
fi

# Map the requested link file to the NRPE command defined on the DNS servers:
# axel-geo_<dcs>.yml -> update_<dcs>
case "$2" in
  axel-geo_us_eu_as.yml | axel-geo_us_as.yml | axel-geo_us_eu.yml | axel-geo_eu_as.yml)
    dc_set=${2#axel-geo_}
    dc_set=${dc_set%.yml}
    REMOTE_CMD="update_${dc_set}"
    ;;
  *)
    # Unknown file name: log it and give up.
    echo "${DATE}--warining,no file match"
    exit 1
    ;;
esac

# Log the call, then have both DNS servers switch their link through NRPE.
echo "${DATE}--${WORKDIR}/check_nrpe -H {ns1,ns2}.zencoo.com -c ${REMOTE_CMD}"
for dns_host in DNS1 DNS2; do
  "${WORKDIR}/check_nrpe" -H "${dns_host}" -c "${REMOTE_CMD}"
done
exit 0

脚本3:更改DNS服务上的链接文件

#!/bin/bash
# Called (through NRPE) by the check_dc_status and change_dns_linkfile scripts.
#   $1 = check                        print the current target of ${DIR}/${LN}
#   $1 = us_eu_as|us_as|us_eu|eu_as   point ${DIR}/${LN} at that variant and
#                                     reload pdns
# Exit status: 0 on success or when nothing had to change, 1 on any failure.
LOG=/tmp/.dns_linkfile
DATE=$(date +%Y%m%d%H%M%S)
DIR=/etc/pdns
LN=axel-geo.yml
# Link target exactly as stored in the symlink; empty when ${LN} is not a
# symlink. readlink is used instead of parsing `ls -l` output.
FILE=$(readlink "${DIR}/${LN}")

case "$1" in
check)
   echo "$FILE"
   exit 0
   ;;
us_eu_as)
   TAGETFILE="${DIR}/axel-geo_us_eu_as.yml"
   ;;
us_as)
   TAGETFILE="${DIR}/axel-geo_us_as.yml"
   ;;
us_eu)
   TAGETFILE="${DIR}/axel-geo_us_eu.yml"
   ;;
eu_as)
   TAGETFILE="${DIR}/axel-geo_eu_as.yml"
   ;;
*)
   # Double quotes: log the offending argument and the timestamp, not the
   # literal text "$1".
   echo "${DATE}-$1 error" >>${LOG}
   exit 1
   ;;
esac

if [ ! -f "${TAGETFILE}" ]; then
  # Double quotes so that the file name and the timestamp are expanded.
  echo "${TAGETFILE} does not exist/${DATE}" >>${LOG}
  exit 1
fi

if [ "$FILE" == "$TAGETFILE" ]; then
  echo "${DATE}-Link file is correct, no need to switch" >>${LOG}
  exit 0
fi

echo "${HOSTNAME}/${DATE} ln -snf $TAGETFILE ${DIR}/${LN}" >>${LOG}
# Reload pdns only after the link was really switched.
if ! sudo /usr/bin/ln -snf "$TAGETFILE" "${DIR}/${LN}"; then
  echo "${DATE}-ln failed" >>${LOG}
  exit 1
fi
if sudo /bin/pdns_control reload; then
  echo "${DATE}-reload dns ok" >>${LOG}
else
  echo "${DATE}-reload dns failed" >>${LOG}
  exit 1
fi
exit 0

nagios配置 #定义检测服务,定义event-handler

define service{
        use                             generic-service
        host_name                         xxx
        service_description             check_dc_status
        contact_groups                  admins,admins_jabber
        check_command                   check_nrpe_t60!check_dc_status  ; runs the DC status check script (script 1)
        event_handler                   change_dns_linkfile             ; invokes the event-handler command
        }

define command {
        command_name    change_dns_linkfile  ; the two macros on command_line arrive as $1 and $2 in script 2
        command_line    $USER1$/eventhandlers/change_dns_linkfile $SERVICESTATE$ $SERVICEOUTPUT$
        }

puppet配置 #脚本1和脚本2会通过nrpe调用脚本3,需要定义相应的命令以及对应的参数

<% if @fqdn == 'dns1xxxx' or @fqdn == 'dns2xxxx' -%>
<%# NRPE commands rendered only for the two DNS hosts; each one runs dns_file_check.sh with a single argument (check, or the DC set to link to). -%>
command[check_pdns_link]=<%= @pluginsdir %>/dns_file_check.sh check                    
command[update_us_eu_as]=<%= @pluginsdir %>/dns_file_check.sh us_eu_as
command[update_us_eu]=<%= @pluginsdir %>/dns_file_check.sh us_eu
command[update_us_as]=<%= @pluginsdir %>/dns_file_check.sh us_as
command[update_eu_as]=<%= @pluginsdir %>/dns_file_check.sh eu_as
<% end -%>

第一次弄nagios event-handler,感觉很乱,脚本还要再继续完善