Spark Streaming application status monitoring and auto-restart script

There are plenty of open-source options for Spark application status monitoring, submission, and scheduled restarts: Hue, Azkaban, Oozie, and so on. But some customers hand you a big-data environment with none of these tools installed, or simply won't allow them. In that case the only option is to submit the Spark application the most primitive way and monitor its status with a shell script.

The script checks the application status every 10 minutes and restarts the application if it is not running. At the customer's own request it also does a forced daily restart: in the early morning the application is killed, whether running or not, and then resubmitted. Writing a script for this is a last resort, but some environments really are that primitive. The script is as follows:

#!/bin/bash
# check spark app state
# if not running, restart spark app
#
# */10 * * * * /opt/spark/autorestart-sparkapp.sh 2>&1
#
# resolve the script's own directory ($(pwd) is unreliable when run from cron)
basePath=$(cd "$(dirname "$0")" && pwd)
# user that submits these apps; matching on it avoids killing other users' jobs
proxyUser="submit-user"
# the extra match conditions below guard against killing the wrong application
applicationType="SPARK"
appStatus="RUNNING"
# temporary log directory
logDir=/tmp/aaaaaaaalog
# lock file recording the date of the last forced early-morning restart
initLockFile=${basePath}/autoInitSparkApp.lock
isNeedInit=false
nowDate=$(date "+%Y%m%d")
nowTime=$(date "+%H%M")

# max seconds to wait for a newly submitted app to get out of the ACCEPTED state
maxStartAppWait=600
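
# Forced daily restart: the lock file stores the date (YYYYMMDD) of the last
# forced restart and is rewritten at the end of the script, so the
# kill-and-resubmit below runs at most once per day, in the 00:00-02:00 window.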

if [ ! -e "${initLockFile}" ] && [[ ${nowTime} < "0200" ]]; then
  isNeedInit=true
elif [ -e "${initLockFile}" ]; then
  initDate=$(cat "${initLockFile}")
  if [ "X${initDate}" != "X${nowDate}" ] && [[ ${nowTime} < "0200" ]]; then
      isNeedInit=true
  fi
fi

if [ ! -d "$logDir" ] ; then
  mkdir $logDir
fi
# 用于临时存储spark 应用列表
jobListFile=/tmp/aaalog/jobList.txt
# aaaaaa之类代表应用名称,匹配误杀,多个判断条件安全一点
allAppNames=("aaaaaa" "bbbbbb" "cccccc")

yarn application -list 2>/dev/null|awk '{print $0;}'|grep -E "aaaaaa|bbbbbb|cccccc" > ${jobListFile}
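# Each line of `yarn application -list` is whitespace-separated with columns:
#   Application-Id  Application-Name  Application-Type  User  Queue  State  Final-State  Progress  Tracking-URL
# e.g. (illustrative values only):
#   application_1500000000000_0001  aaaaaa  SPARK  submit-user  default  RUNNING  UNDEFINED  10%  http://host:8088/...
# The awk field positions $1..$6 below rely on this column order.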


declare isRunning=false

for appName in "${allAppNames[@]}"
do
  isRunning=false
  jobId=""
  while read -r line
  do
    jobId=$(echo $line | awk '{print $1}')
    jobName=$(echo $line | awk '{print $2}')
    appType=$(echo $line | awk '{print $3}')
    user=$(echo $line | awk '{print $4}')
    queue=$(echo $line | awk '{print $5}')
    jobStatus=$(echo $line | awk '{print $6}')
    if [ "$appName" = "$jobName" ] && [ "$proxyUser" = "$user" ] && [ "$applicationType" = "$appType" ] && [ "$appStatus" = "$jobStatus" ]; then
       # found our app, and it is RUNNING
       isRunning=true
       break
    elif [ "$appName" = "$jobName" ] && [ "$proxyUser" = "$user" ] && [ "$applicationType" = "$appType" ]; then
       # found our app in some other state (e.g. ACCEPTED); keep jobId for the checks below
       isRunning=false
       break
    else
       jobId=""
       jobName=""
       jobStatus=""
       isRunning=false
    fi
  done < "$jobListFile"

  if [ $isRunning = true ]; then
    echo "Spark application $appName is running!"
    # during the forced daily restart window, kill even a healthy app;
    # the next cron run will resubmit it
    if [ ${isNeedInit} = true ] && [ ! -z "$jobId" ]; then
       yarn application -kill $jobId
    fi
    jobId=""
  else
    finishTime=0
    timeDiff=0
    if [ ! -z "$jobId" ]; then
       # Finish-Time from `yarn application -status` stays 0 while the app is alive
       finishTime=$(yarn application -status $jobId 2>/dev/null | grep "Finish-Time" | awk '{print $NF}')
       if [ "$finishTime" -eq 0 ]; then
         # seconds elapsed since the app started (Start-Time is reported in milliseconds)
         timeDiffExpr=$(date +%s)-$(yarn application -status $jobId 2>/dev/null | grep "Start-Time" | awk '{print $NF}' | sed 's!$!/1000!')
         timeDiff=$(echo $timeDiffExpr | bc)
         # an app stuck (e.g. in ACCEPTED) for more than $maxStartAppWait seconds is killed and resubmitted
         if [ "$timeDiff" -gt "$maxStartAppWait" ]; then
            yarn application -kill $jobId
            sleep 15s
         else
            # still within the startup grace period; check again on the next run
            continue
         fi
       fi
    fi

    if [ x"$appName" == x"${allAppNames[0]}" ];then
       echo "Spark Submit $appName to Cluster!!"
       # aaaaa 的应用提交脚本
       nohup sh $basePath/run-aaaaaa-cluster.sh  2>&1 >/dev/null &
    elif [ x"$appName" == x"${allAppNames[1]}" ];then
       echo "Spark Submit $appName to Cluster!!"
       #  bbbbbb的应用提交脚本
       nohup sh $basePath/run-bbbbbb-cluster.sh 2>&1 >/dev/null &
    elif [ x"$appName" == x"${allAppNames[2]}" ];then
       echo "Spark Submit $appName to Cluster!!"
       #  cccccc的应用提交脚本
       nohup sh $basePath/run-cccccc-cluster.sh 2>&1 >/dev/null &
    fi
    sleep 30s
  fi
done

if [ ${isNeedInit} = true ]; then
  # after the forced daily kill, clear the streaming checkpoint directories
  # so the apps restart with fresh state
  hadoop fs -rm -r -f /tmp/aaaaaa/checkpoint
  hadoop fs -rm -r -f /tmp/bbbbbb/checkpoint
  # record today's date so the forced restart runs only once per day
  echo ${nowDate} > ${initLockFile}
fi
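
The run-xxxxxx-cluster.sh submit scripts referenced above are not part of this post. As a minimal sketch, assuming they are plain spark-submit wrappers (the jar path, main class, and resource settings below are placeholder assumptions):

#!/bin/bash
# run-aaaaaa-cluster.sh -- sketch only; jar path, main class and resources are assumed
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --name aaaaaa \
  --class com.example.AaaaaaStreamingApp \
  --driver-memory 2g \
  --executor-memory 4g \
  --num-executors 4 \
  /opt/spark/jobs/aaaaaa.jar

Whatever the real submit command looks like, --name must match the corresponding entry in allAppNames exactly and the script must run as proxyUser, because the monitor matches both fields against the yarn application -list output. The monitor itself is installed with the crontab line shown in the script header (crontab -e as the submitting user).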

 
