Zookeeper+HDFS+YARN监控

功能:

  1. 监控主机是否ping异常

  2. 监控主机上的ZooKeeper、HDFS、YARN服务进程是否存在;如果发现某个服务进程缺失(down),则发送告警并自动重启该服务


监控脚本

beh_serv_mon.sh

#!/bin/bash
#
# beh_serv_mon.sh - ZooKeeper / HDFS / YARN service monitor.
#
# For every host listed in $HD_MON_HOME/beh_service.ini this script:
#   1. pings the host and raises "<host>_network_error" when it is
#      unreachable;
#   2. otherwise runs jps on the host over ssh and, for every configured
#      service that is not in the jps output, raises "<host>_<service>_miss"
#      and starts the service again over ssh.
# Every alert is appended once to $HD_MON_LOG/hd_mon_log and sent through
# $HD_MON_HOME/sms_send.sh to each number in phonelist.
#
# Config line format:   <host>:,<Service>,<Service>,...
# Service names are the process names printed by jps. Lines containing '#'
# are ignored. The leading comma after ':' is optional.

# Phone numbers to alert; leave the list empty to disable SMS sending
# (alerts are still written to the log file).
phonelist=(
1234567890X
)
# Directory holding this script, beh_service.ini and sms_send.sh.
HD_MON_HOME=/opt/beh/utility/crontab
# Directory the alert log (hd_mon_log) is written to.
HD_MON_LOG=/opt/beh/utility/crontab
# Java home; $JAVA_HOME/bin/jps is the command run on each monitored host.
JAVA_HOME=/opt/beh/core/jdk
# Hadoop home. Not expanded by this script: the restart commands are sent
# unexpanded and resolved on the remote host after it sources /etc/profile.
HADOOP_HOME=/opt/beh/core/hadoop
# ZooKeeper home (same remark as HADOOP_HOME).
ZOOKEEPER_HOME=/opt/beh/core/zookeeper
# Load the Hadoop environment variables (disabled).
#source /etc/profile

#######################################
# Record an alert: append it once to the log, then text it to every
# configured phone number.
# Globals:   HD_MON_LOG, HD_MON_HOME, phonelist (read)
# Arguments: $1 - alert text
#######################################
notify() {
  local msg=$1
  local phonenumber

  # Log outside the phone loop so that an empty phonelist still leaves a
  # trace and several numbers do not duplicate the log line.
  printf '%s\n' "$msg" >> "$HD_MON_LOG/hd_mon_log"

  for phonenumber in "${phonelist[@]}"; do
    sh "$HD_MON_HOME/sms_send.sh" "$phonenumber" "$msg"
  done
}

#######################################
# Tell whether a host answers ping.
# Arguments: $1 - host name
# Returns:   0 when reachable; 1 when ping exits non-zero (no reply at all,
#            unresolved name, ...) or its output mentions "unreach".
#######################################
host_reachable() {
  local out

  out=$(ping -i 1 -c 3 "$1" 2>&1) || return 1
  if grep -qi 'unreach' <<< "$out"; then
    return 1
  fi
  return 0
}

#######################################
# Start one service on a remote host.
# Arguments: $1 - host name, $2 - jps process name of the service
# Outputs:   "<host>_<service>_not_found" on stdout for unknown services
# Returns:   exit status of ssh, or 1 for an unknown service
#######################################
restart_service() {
  local host=$1
  local service=$2
  local cmd

  # Single quotes are deliberate: $HADOOP_HOME / $ZOOKEEPER_HOME must be
  # expanded by the remote shell after it has sourced /etc/profile.
  # shellcheck disable=SC2016
  case "$service" in
    DFSZKFailoverController)
      cmd='$HADOOP_HOME/sbin/hadoop-daemon.sh start zkfc'
      ;;
    JournalNode)
      cmd='$HADOOP_HOME/sbin/hadoop-daemon.sh start journalnode'
      ;;
    NameNode)
      cmd='$HADOOP_HOME/sbin/hadoop-daemon.sh start namenode'
      ;;
    QuorumPeerMain)
      cmd='$ZOOKEEPER_HOME/bin/zkServer.sh start'
      ;;
    ResourceManager)
      cmd='$HADOOP_HOME/sbin/yarn-daemon.sh start resourcemanager'
      ;;
    JobHistoryServer)
      cmd='$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver'
      ;;
    ApplicationHistoryServer)
      cmd='$HADOOP_HOME/sbin/yarn-daemon.sh start timelineserver'
      ;;
    DataNode)
      cmd='$HADOOP_HOME/sbin/hadoop-daemon.sh start datanode'
      ;;
    NodeManager)
      cmd='$HADOOP_HOME/sbin/yarn-daemon.sh start nodemanager'
      ;;
    *)
      echo "${host}_${service}_not_found"
      return 1
      ;;
  esac

  ssh "$host" "source /etc/profile;$cmd"
}

#######################################
# Check one host: network first, then every expected service.
# Globals:   JAVA_HOME (read)
# Arguments: $1 - host name, $2.. - expected jps process names
# Outputs:   progress lines on stdout
#######################################
check_host() {
  local host=$1
  shift
  local service jps_out running

  echo "${host}----------------"

  # Host does not answer ping: alert and move on to the next host.
  if ! host_reachable "$host"; then
    echo "${host}_network_error"
    notify "${host}_network_error"
    return 0
  fi

  # Fetch the process list once. If ssh/jps itself fails we cannot tell
  # which services are up, so raise one alert instead of reporting (and
  # trying to restart) every service.
  if ! jps_out=$(ssh "$host" "$JAVA_HOME/bin/jps"); then
    echo "${host}_jps_error"
    notify "${host}_jps_error"
    return 0
  fi
  # jps prints "<pid> <name>"; keep the names only.
  running=$(awk '{print $2}' <<< "$jps_out")

  for service in "$@"; do
    # Whole-line, fixed-string match; one or more instances count as up.
    if grep -qxF -- "$service" <<< "$running"; then
      echo "1:include"
    else
      echo "0:not_include"
      notify "${host}_${service}_miss"
      restart_service "$host" "$service"
    fi
  done
}

#######################################
# Entry point: walk the config file and check every host in it.
# Globals:   HD_MON_HOME (read)
# Returns:   0 after all hosts were processed, 1 if the config is unreadable
#######################################
main() {
  local conf="$HD_MON_HOME/beh_service.ini"
  local entry host services field
  local -a fields expected

  if [[ ! -r "$conf" ]]; then
    printf 'beh_serv_mon: cannot read config file: %s\n' "$conf" >&2
    return 1
  fi

  # The config is read on fd 3 so that ssh, which reads stdin, cannot
  # swallow the remaining lines.
  while IFS= read -r entry <&3 || [[ -n "$entry" ]]; do
    host=${entry%%:*}
    host=${host//[[:space:]]/}
    if [[ -z "$host" ]]; then
      continue
    fi

    # Second ':'-separated field holds the service list.
    services=
    if [[ "$entry" == *:* ]]; then
      services=${entry#*:}
      services=${services%%:*}
    fi

    # Split on commas and drop empty items (e.g. the one produced by the
    # leading comma) and stray whitespace / carriage returns.
    expected=()
    IFS=',' read -r -a fields <<< "$services"
    for field in "${fields[@]}"; do
      field=${field//[[:space:]]/}
      if [[ -n "$field" ]]; then
        expected+=("$field")
      fi
    done

    check_host "$host" "${expected[@]}"
  done 3< <(grep -v '#' -- "$conf")

  return 0
}

main "$@"


配置文件

beh_service.ini

hadoop001:,NameNode,ResourceManager,DFSZKFailoverController,QuorumPeerMain,JournalNode,ApplicationHistoryServer
hadoop011:,NameNode,ResourceManager,DFSZKFailoverController,QuorumPeerMain,JournalNode,JobHistoryServer
hadoop012:,DataNode,NodeManager,QuorumPeerMain,JournalNode
hadoop013:,DataNode,NodeManager
hadoop014:,DataNode,NodeManager
hadoop015:,DataNode,NodeManager
hadoop016:,DataNode,NodeManager
hadoop017:,DataNode,NodeManager
hadoop018:,DataNode,NodeManager
hadoop023:,DataNode,NodeManager
hadoop024:,DataNode,NodeManager
hadoop025:,DataNode,NodeManager
hadoop026:,DataNode,NodeManager
hadoop027:,DataNode,NodeManager
hadoop028:,DataNode,NodeManager
hadoop030:,DataNode,NodeManager
hadoop031:,DataNode,NodeManager
hadoop032:,DataNode,NodeManager
hadoop033:,DataNode,NodeManager
hadoop034:,DataNode,NodeManager
hadoop035:,DataNode,NodeManager
hadoop021:,DataNode,NodeManager
hadoop022:,DataNode,NodeManager
hadoop053:,DataNode,NodeManager
hadoop054:,DataNode,NodeManager
hadoop055:,DataNode,NodeManager
hadoop056:,DataNode,NodeManager
hadoop057:,DataNode,NodeManager
hadoop058:,DataNode,NodeManager
hadoop059:,DataNode,NodeManager


短信脚本

备注:可自行修改为适应本地环境的其他通知脚本

示例:sms_send.sh

#!/bin/sh
#
# sms_send.sh - queue one SMS by inserting a row into the Oracle table
# NEWEBA.UNICOM_REPORT_SMS.
#
# Usage:   sh sms_send.sh <phone_number> <message>
# Returns: 2 on bad arguments, otherwise the exit status of sqlplus.

# Oracle connection settings.
username=username
password=userpasswd
SID=sidname

main() {
  if [ "$#" -lt 2 ]; then
    echo "usage: sms_send.sh <phone_number> <message>" >&2
    return 2
  fi

  V_NUMBER=$1      # argument 1: phone number
  V_STR=$2         # argument 2: message text

  # The number is written into the statement unquoted, so accept digits
  # only (with an optional leading '+').
  case ${V_NUMBER#+} in
    '' | *[!0-9]*)
      echo "sms_send.sh: invalid phone number: $V_NUMBER" >&2
      return 2
      ;;
  esac

  # Double every single quote so the message stays one SQL string literal.
  V_STR_SQL=$(printf '%s' "$V_STR" | sed "s/'/''/g")

  # Log in with CONNECT on stdin instead of argv, so the password does not
  # show up in the process list.
  # SET DEFINE OFF keeps '&' in a message from being read as a substitution
  # variable.
  # WHENEVER SQLERROR makes sqlplus exit non-zero when a statement fails.
  sqlplus -s /nolog <<EOF
WHENEVER SQLERROR EXIT FAILURE
SET DEFINE OFF
CONNECT $username/$password@$SID
INSERT INTO NEWEBA.UNICOM_REPORT_SMS
  (ROW_NO, MSISDN, MESSAGE, FLAG, SEND_TM)
      SELECT SQE_UNICOM_REPORT_SMS.NEXTVAL,
             $V_NUMBER,
             '$V_STR_SQL', 0, SYSDATE
FROM DUAL;
COMMIT;
EOF
}

main "$@"


crontab配置

备注:一般使用hadoop用户启动hadoop相关服务,并且各主机之间已配置ssh互信。使用crontab定时执行监控脚本:可以选取2台主机,其中一台在每小时整点执行,另一台在每小时半点执行。这里选取了hadoop001和hadoop002。

[hadoop@hadoop001 ~]$ crontab -l
0  * * * * /opt/beh/utility/crontab/beh_serv_mon.sh > /opt/beh/utility/crontab/beh_serv_mon.log 2>&1
[hadoop@hadoop002 ~]$ crontab -l
30 * * * * /opt/beh/utility/crontab/beh_serv_mon.sh > /opt/beh/utility/crontab/beh_serv_mon.log 2>&1


你可能感兴趣的:(Zookeeper+HDFS+YARN监控)