Shell脚本监控Storm集群,邮件报警

这个脚本只是一个临时方案,是在全面的监控、报警系统完成之前,为了节省一定的人力而写的。

之前写过一个脚本用来监控Storm的Nimbus和Supervisor进程,在检测到进程不存在时会进行重启。在实际使用中发现,该脚本虽然会不断尝试重启,但在某些情况下重启并不能成功,人为介入仍然是必要的。所以对脚本进行了修改,增加了对重启结果的检测:如果检测到重启失败,则会通过邮件通知相关人员。遇到的困难主要在于获取ssh操作的返回结果,解决思路比较简单,脚本内容如下:

#!/bin/bash
# Watchdog loop for a Storm cluster.
#
# Every round it:
#   1. restarts the local Nimbus and UI daemons when jps does not list them,
#   2. asks every host in `slaves` (over ssh) to restart its Supervisor when
#      jps on that host does not list one,
#   3. waits 40 s, re-checks the hosts whose Supervisor was down and, for each
#      one that is still down, calls the mail script and stops watching it,
#   4. re-checks Nimbus (if it was down in step 1) and calls the mail script
#      when the restart did not take,
#   5. sleeps 20 s and starts over.
#
# Needs: jps and storm on PATH (locally and on the slaves), non-interactive
# ssh access to the slaves, and the mail script referenced below.

slaves=(cdn36 cdn37 cdn39 cdn21 cdn22 cdn23)
mail_script=/data/www/mail/bin/start.sh

# Remote snippet: report the Supervisor PID, or try to start a Supervisor.
# - "\$1" is escaped so that awk (not the remote shell) sees $1; unescaped,
#   the remote shell expands it to nothing and awk prints the whole jps line.
# - The worker directory is removed before the Supervisor is restarted.
# - stdout, stderr and stdin of the background job are all detached from the
#   ssh session; otherwise ssh keeps the channel open and the command
#   substitution on the local side can block until the Supervisor exits.
restart_cmd='source /etc/profile; source ~/.bash_profile;
sid=$(jps | grep supervisor | awk "{print \$1}");
if [ "$sid" = "" ]; then
    echo "supervisor is dead trying to start supervisor!";
    mkdir -p ~/rzx;
    rm -fr /data/tmp/storm/worker;
    nohup storm supervisor >supervisor.log 2>&1 </dev/null &
else
    echo " supervisor is alived,"${sid};
fi'

# Remote snippet: only report whether a Supervisor is running (no restart).
check_cmd='source /etc/profile; source ~/.bash_profile;
sid=$(jps | grep supervisor | awk "{print \$1}");
if [ "$sid" = "" ]; then
    echo "supervisor is still dead!";
else
    echo " supervisor is alived,"${sid};
fi'

#######################################
# Invoke the mail script for an alert.
# Arguments: $1 - mail title, $2 - mail body
# NOTE(review): the original assigned the title/body to shell variables but
# neither exported nor passed them, so the mail script could not see them.
# They are handed over as positional arguments here on the assumption that
# start.sh accepts "<title> <body>" -- confirm against start.sh.
#######################################
send_alert() {
    sh "$mail_script" "$1" "$2"
}

while true; do
    echo "==========  $(date)    ==============="

    # --- Nimbus (local) ---
    nid=$(jps -l | grep 'nimbus' | awk '{print $1}')
    if [[ -z "$nid" ]]; then
        echo 'storm nimbus is dead!'
        echo 'trying to start nimbus...'
        nohup storm nimbus >nimbus.log &
        echo 'finish starting!'
    else
        echo "storm nimbus id: $nid"
    fi

    # --- Storm UI (local) ---
    # The UI's own PID decides whether the UI is restarted (the original
    # tested the Nimbus PID here by mistake).
    ui_pid=$(jps -l | grep 'backtype.storm.ui.core' | awk '{print $1}')
    if [[ -z "$ui_pid" ]]; then
        echo 'storm ui process is dead!'
        echo 'trying to start storm ui'
        nohup storm ui >ui.log &
        echo 'finish starting storm ui!'
    else
        echo "storm ui id: $ui_pid"
    fi

    # --- Supervisors (remote): first pass, restart where missing ---
    dead_nodes=()
    for node in "${slaves[@]}"; do
        # -n keeps ssh from reading this script's stdin.
        status=$(ssh -n "$node" "$restart_cmd")
        # The remote snippet prints "alived" only when a Supervisor PID was
        # found; anything else (including an ssh failure) counts as dead.
        if [[ "$status" == *alived* ]]; then
            echo "${node}'s${status}"
        else
            dead_nodes+=("$node")
            echo "${node}'s supervisor is dead!"
        fi
    done

    # Give the restarted Supervisors time to come up.
    sleep 40

    # --- Supervisors (remote): second pass, verify the restarts ---
    if (( ${#dead_nodes[@]} > 0 )); then
        echo "check dead supervisor!"
        for node in "${dead_nodes[@]}"; do
            status=$(ssh -n "$node" "$check_cmd")
            if [[ "$status" == *alived* ]]; then
                echo "${node}'s${status}"
            else
                echo "${node}'s supervisor is still dead, send the email to admin!"
                send_alert "Supervisor--is--dead" \
                    "${node}'s--supervisor--is--dead,please--check--the--server!"
                # Stop watching this node so the alert is not repeated every
                # round. Exact-name comparison avoids touching other hosts
                # whose names merely contain this one.
                remaining=()
                for host in "${slaves[@]}"; do
                    [[ "$host" == "$node" ]] || remaining+=("$host")
                done
                slaves=("${remaining[@]}")
            fi
        done
    else
        echo "no dead supervisor!"
    fi

    # --- Nimbus: verify the restart attempted at the top of this round ---
    if [[ -z "$nid" ]]; then
        nid=$(jps -l | grep 'nimbus' | awk '{print $1}')
        if [[ -z "$nid" ]]; then
            echo "nimbus is still dead, send the email to admin!"
            send_alert "Nimbus--is--dead" \
                "Nimbus--is--dead,please--check--the--server!"
        else
            echo "Nimbus is restarted!"
        fi
    else
        echo
    fi

    echo "sleeping 20s..."
    sleep 20
done


你可能感兴趣的:(storm,linux,storm,邮件,监控)