监控指标包括:应用是否启用、磁盘使用率是否超过告警值、CPU使用率是否超过告警值、内存使用率是否超过告警值、IP能否ping通、远程IP的端口telnet能否成功、文件夹中最后修改时间、数据库是否能连接/查询数据库结果
先介绍一下背景:正在做的系统采用分布式部署,前台、后台、数据库一共10台机器,只要其中一台出现问题(例如某台机器磁盘空间满了),系统就会出异常。但客户提供的运维工具很不好用,只能自己写一个极简的脚本来监控本系统相关的指标。
思路:1.写一个监控配置文本,用于记录要监控的机器及监控参数,格式:要监控的IP|监控类型|监控指标|监控关键参数,例如 11.211.19.23|SYSTEMCFG|DISK_SPACE_USAGE|80。我这边每台机器都挂载了同一个文件目录,于是直接将配置文本放到挂载目录里面;2.再写一个监控shell脚本,从监控配置文本中读取监控参数,将监控结果追加写入一个txt文件;3.有了监控结果,是发短信还是告警之类的都好说;4.定时监控可以在crontab中配置执行频率。
monitcfg.txt 监控配置文本(脚本读取的路径为 /filedata/ftp/monit/monitcfg.txt)
11.211.19.21|PROGRAM|IS_ENABLED|PDFWorkServer
11.211.19.21|PROGRAM|IS_ENABLED|CompassServer
11.211.19.21|PROGRAM|IS_ENABLED|GrabServer
11.211.19.20|PROGRAM|IS_ENABLED|WorkServer
11.211.19.22|PROGRAM|IS_ENABLED|WorkServer
11.211.19.22|PROGRAM|IS_ENABLED|PDFWorkServer
11.211.19.23|PROGRAM|IS_ENABLED|tomcat
11.211.19.24|PROGRAM|IS_ENABLED|tomcat
11.211.19.25|PROGRAM|IS_ENABLED|tomcat
11.211.19.23|SYSTEMCFG|DISK_SPACE_USAGE|80
11.211.19.24|SYSTEMCFG|DISK_SPACE_USAGE|80
11.211.19.25|SYSTEMCFG|DISK_SPACE_USAGE|80
11.211.19.22|SYSTEMCFG|DISK_SPACE_USAGE|80
11.211.19.20|SYSTEMCFG|DISK_SPACE_USAGE|80
11.211.19.21|SYSTEMCFG|DISK_SPACE_USAGE|80
11.211.19.23|SYSTEMCFG|CPU_USAGE|90
11.211.19.24|SYSTEMCFG|CPU_USAGE|90
11.211.19.25|SYSTEMCFG|CPU_USAGE|90
11.211.19.22|SYSTEMCFG|CPU_USAGE|90
11.211.19.20|SYSTEMCFG|CPU_USAGE|90
11.211.19.21|SYSTEMCFG|CPU_USAGE|90
11.211.19.23|NETWORK|NETWORK_PING|11.211.19.21
11.211.19.21|NETWORK|NETWORK_PING|11.211.19.23
11.211.19.23|NETWORK|NETWORK_TELNET|11.211.19.21,9090
11.211.19.21|NETWORK|NETWORK_TELNET|11.211.19.23,8300
11.211.19.21|FILES|FOLDER_OVERTIME_VALID|/filedata/ftp/as_gd/,GOrder
11.211.19.21|FILES|FOLDER_OVERTIME_VALID|/filedata/ftp/as_face/,FOrder
11.211.19.21|FILES|FOLDER_OVERTIME_VALID|/filedata/ftp/as_aicenter/,AOrder
11.211.19.21|DB|DB_CONN|ORACLE,smartItil,smart@2019,11.211.19.26:1522/smart
11.211.19.21|DB|DB_CONN|ORACLE,test,ddd,11.211.19.26:1522/test
11.211.19.21|DB|DB_DATA_MONIT|ORACLE,smartItil,smart@2019,11.211.19.26:1522/smart,select count(*) from tbl_wo
监控shell脚本monit.shell
#!/bin/bash
# Banner printed once at the start of every run.
printf '%s\n' '*************************************'

# Command-line filters; only config entries matching them are processed.
#   $1 - monitor type to run
#   $2 - monitor code to run
#   $3 - extra filter on the entry's parameter (used by the FILES checks)
searchMonitType="${1}"
searchMonitCode="${2}"
searchMonitPara="${3}"
#######################################
# Check whether an application process is running and append the outcome
# as one pipe-delimited record to the result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - process name, matched as a whole word against `ps -ef` lines
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Appends "ip|type|code|name|result|sendMsg|datetime|description" to $6.
#   result is 1 when at least one process matches, else 0; sendMsg is the
#   inverse (1 = alert).
#######################################
procNameIsEnabled() {
  local procName=$4
  local matched='' procNumber result sendMsg descs

  # One match set drives both the running/not-running decision and the PID
  # list, so the reported PIDs always belong to the processes that were
  # counted. An empty name never matches anything.
  if [[ -n "$procName" ]]; then
    matched=$(ps -ef | grep -w -- "$procName" | grep -v grep)
  fi

  descs="IP:${1},应用:${procName}"
  if [[ -z "$matched" ]]; then
    result=0
    sendMsg=1
    descs+="未启用"
  else
    # Join the PIDs with spaces so the record stays on a single line.
    procNumber=$(awk '{ printf "%s%s", sep, $2; sep = " " }' <<< "$matched")
    result=1
    sendMsg=0
    descs+="已启用,进程ID:${procNumber}"
  fi
  printf '%s\n' "$1|$2|$3|$procName|$result|$sendMsg|$5|$descs" >> "$6"
}
#######################################
# Compare every mounted filesystem's usage with an alarm threshold and
# append the outcome to the result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - alarm threshold in percent (non-negative integer)
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   One alarm record (sendMsg=1) per filesystem whose usage is >= $4, or a
#   single OK record (sendMsg=0) carrying the highest usage seen when no
#   filesystem reaches the threshold.
# Returns:
#   1 without writing a record when the threshold is not an integer.
#######################################
diskSpaceUsage() {
  local threshold=$4
  local dfOutput fsName fsSize fsUsed fsAvail fsPct fsMount
  local isAlarm=0 maxPer=0 sendMsg

  if ! [[ "$threshold" =~ ^[0-9]+$ ]]; then
    printf 'diskSpaceUsage: invalid threshold: %s\n' "$threshold" >&2
    return 1
  fi

  dfOutput=$(df -Ph)
  # The last read variable soaks up the rest of the line, so mount points
  # that contain spaces stay intact.
  while read -r fsName fsSize fsUsed fsAvail fsPct fsMount; do
    # Only real mount points; this also drops the header line.
    [[ "$fsMount" == /* ]] || continue
    fsPct=${fsPct%\%}
    # df prints "-" when it cannot compute a percentage; skip those rows.
    [[ "$fsPct" =~ ^[0-9]+$ ]] || continue
    if [ "$fsPct" -gt "$maxPer" ]; then
      maxPer=$fsPct
    fi
    if [ "$fsPct" -ge "$threshold" ]; then
      isAlarm=1
      sendMsg=1
      printf '%s\n' "$1|$2|$3|$threshold|$fsPct|$sendMsg|$5|IP:${1},目录:${fsMount}磁盘使用率超过${fsPct}%,其中目录总大小${fsSize},已用${fsUsed},剩余${fsAvail}" >> "$6"
    fi
  done <<< "$dfOutput"

  if [ "$isAlarm" -le 0 ]; then
    sendMsg=0
    printf '%s\n' "$1|$2|$3|$threshold|$maxPer|$sendMsg|$5|IP:${1}磁盘空间未超过上限${threshold}%" >> "$6"
  fi
}
#######################################
# Compare the user-CPU percentage reported by `top` with an alarm
# threshold and append the outcome to the result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - alarm threshold in percent (integer)
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Appends one record; sendMsg is 1 when usage >= threshold, else 0.
# Returns:
#   1 without writing a record when no usage figure can be parsed.
#######################################
cpuUsage() {
  local threshold=$4
  local cpuCurUsage sendMsg

  # Take the number directly in front of "us" on the first Cpu line instead
  # of a fixed column: top prints "%Cpu(s):100.0 us" with no space after the
  # colon at 100%, and older versions print "12.5%us". Only the integer part
  # is kept.
  cpuCurUsage=$(top -b -n 1 | awk '
    /Cpu/ && match($0, /[0-9.]+[ %]*us/) {
      value = substr($0, RSTART, RLENGTH)
      sub(/[ %]*us$/, "", value)
      sub(/\..*$/, "", value)
      if (value == "") value = 0
      print value
      exit
    }')

  if ! [[ "$cpuCurUsage" =~ ^[0-9]+$ ]]; then
    printf 'cpuUsage: could not read CPU usage from top output\n' >&2
    return 1
  fi

  if [ "$cpuCurUsage" -ge "$threshold" ]; then
    sendMsg=1
    printf '%s\n' "$1|$2|$3|$threshold|$cpuCurUsage|$sendMsg|$5|IP:${1},CPU使用率:${cpuCurUsage}%,异常,超过阀值${threshold}," >> "$6"
  else
    sendMsg=0
    printf '%s\n' "$1|$2|$3|$threshold|$cpuCurUsage|$sendMsg|$5|IP:${1},CPU使用率:${cpuCurUsage}%,正常" >> "$6"
  fi
}
#######################################
# Compare memory usage (from `free`) with an alarm threshold and append
# the outcome to the result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - alarm threshold in percent (integer)
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Appends one record; sendMsg is 1 when usage >= threshold, else 0.
# Returns:
#   1 without writing a record when no usage figure can be computed.
#######################################
memerySpaceUsage() {
  local threshold=$4
  local memeryCurUsage sendMsg

  # Columns are looked up by header name because `free` has two layouts:
  #   newer: total used free shared buff/cache available
  #   older: total used free shared buffers cached
  # Usage is (total - available) when "available" exists, otherwise
  # (used - buffers - cached). Data rows carry a leading "Mem:" label, so a
  # header in position i maps to data field i + 1.
  memeryCurUsage=$(LC_ALL=C free | awk '
    NR == 1 { for (i = 1; i <= NF; i++) col[$i] = i + 1; next }
    $1 == "Mem:" {
      total = $2
      if (total <= 0) exit
      if ("available" in col)
        inUse = total - $(col["available"])
      else if (("buffers" in col) && ("cached" in col))
        inUse = $3 - $(col["buffers"]) - $(col["cached"])
      else
        inUse = $3
      printf "%d\n", inUse * 100 / total
      exit
    }')

  if ! [[ "$memeryCurUsage" =~ ^-?[0-9]+$ ]]; then
    printf 'memerySpaceUsage: could not read memory usage from free output\n' >&2
    return 1
  fi

  if [ "$memeryCurUsage" -ge "$threshold" ]; then
    sendMsg=1
    printf '%s\n' "$1|$2|$3|$threshold|$memeryCurUsage|$sendMsg|$5|IP:${1},内存使用率:${memeryCurUsage}%,异常,超过阀值${threshold}," >> "$6"
  else
    sendMsg=0
    printf '%s\n' "$1|$2|$3|$threshold|$memeryCurUsage|$sendMsg|$5|IP:${1},内存使用率:${memeryCurUsage}%,正常" >> "$6"
  fi
}
#######################################
# Send a single ICMP echo request to a host and append the outcome to the
# result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - host or IP to ping
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Appends one record; result is 1 and sendMsg 0 when ping succeeds,
#   result 0 and sendMsg 1 otherwise.
#######################################
networkPing() {
  local target=$4
  local result sendMsg descs

  descs="IP:${1},ping:${target}"
  # All ping output is discarded; only the exit status matters.
  if ping -c1 "$target" &>/dev/null; then
    result=1
    sendMsg=0
    descs+="成功"
  else
    result=0
    sendMsg=1
    descs+="失败"
  fi
  # Quoted so a result path containing spaces is still a valid redirect.
  printf '%s\n' "$1|$2|$3|$target|$result|$sendMsg|$5|$descs" >> "$6"
}
#######################################
# Try to open a TCP connection with telnet and append the outcome to the
# result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - target in the form "host,port"
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Appends one record; result is 1 and sendMsg 0 when telnet prints
#   "Connected", result 0 and sendMsg 1 otherwise.
#######################################
networkTelnet() {
  local testIp testPort rest
  local result sendMsg descs

  IFS=',' read -r testIp testPort rest <<< "$4"
  descs="IP:${1},telnet ${testIp}:${testPort}"

  # Probe the configured host/port. A single newline is fed on stdin so
  # telnet sees end-of-input right away instead of waiting for a terminal.
  if printf '\n' | telnet "$testIp" "$testPort" | grep -q 'Connected'; then
    result=1
    descs+="成功"
    sendMsg=0
  else
    result=0
    descs+="失败"
    sendMsg=1
  fi
  printf '%s\n' "$1|$2|$3|$4|$result|$sendMsg|$5|$descs" >> "$6"
}
#######################################
# Find the most recently modified entry in a folder (optionally only
# entries whose name matches a key) and append its modification time to
# the result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - "folder,key"; key is a grep pattern applied to entry names and may
#        be empty or omitted to consider every entry
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Prints the epoch timestamp and the formatted time to stdout, and
#   appends one record whose result field is "YYYY-mm-dd HH:MM:SS" (empty
#   when the path cannot be stat'ed). sendMsg is always 0.
#######################################
fileFolderOvertimeValid() {
  local testPath valiKey rest
  local newestName targetPath LAST_MODIFY_TIMESTAMP
  local result='' sendMsg descs

  IFS=',' read -r testPath valiKey rest <<< "$4"
  descs="IP:${1},文件夹 ${testPath}"

  # `ls -t` prints one name per line, newest first, so names containing
  # spaces survive (unlike picking a column out of `ls -l`).
  if [[ -n "$valiKey" ]]; then
    newestName=$(ls -t -- "$testPath" | grep -- "$valiKey" | head -n 1)
  else
    newestName=$(ls -t -- "$testPath" | head -n 1)
  fi
  descs+="最后修改的文件名称${newestName}"

  # With no matching entry the folder itself is stat'ed.
  if [[ -n "$newestName" ]]; then
    targetPath="${testPath%/}/${newestName}"
  else
    targetPath=$testPath
  fi

  if LAST_MODIFY_TIMESTAMP=$(stat -c %Y -- "$targetPath"); then
    echo "$LAST_MODIFY_TIMESTAMP"
    result=$(date '+%Y-%m-%d %H:%M:%S' -d "@${LAST_MODIFY_TIMESTAMP}")
    echo "$result"
  fi
  descs+=",最后修改时间:${result}"
  sendMsg=0
  printf '%s\n' "$1|$2|$3|$4|$result|$sendMsg|$5|$descs" >> "$6"
}
#######################################
# Check whether a database accepts a login and append the outcome to the
# result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - "dbType,user,password,connectIdentifier"; only ORACLE is handled
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Prints the description to stdout and appends one record (the password
#   is left out of it). result is 1 and sendMsg 0 on success; result 0 and
#   sendMsg 1 when sqlplus exits non-zero or the type is unsupported.
#######################################
dbConn() {
  local dbType dbUser dbPwd dbSid rest
  local connStatus result sendMsg descs

  IFS=',' read -r dbType dbUser dbPwd dbSid rest <<< "$4"
  descs="IP:${1},连接${dbType}类型数据库,用户:${dbUser},SID:${dbSid}"
  echo "$descs"

  if [[ "$dbType" == "ORACLE" ]]; then
    # Credentials go to sqlplus on stdin, so they never appear in `ps`.
    # NOTE(review): a password containing "@" (as in the sample config) may
    # need double quotes inside the connect string - confirm with SQL*Plus.
    sqlplus /nolog >/dev/null <<EOF
whenever sqlerror exit 1;
connect $dbUser/$dbPwd@$dbSid;
exit
EOF
    connStatus=$?
  else
    printf 'dbConn: unsupported database type: %s\n' "$dbType" >&2
    connStatus=1
  fi

  # Any non-zero status is a failure, including 127 when sqlplus is missing.
  if [ "$connStatus" -ne 0 ]; then
    result=0
    descs+=",连接失败"
    sendMsg=1
  else
    result=1
    descs+=",连接成功"
    sendMsg=0
  fi
  printf '%s\n' "$1|$2|$3|$dbType,$dbUser,$dbSid|$result|$sendMsg|$5|$descs" >> "$6"
}
#######################################
# Run a query against a database and append its output to the result file.
# Arguments:
#   $1 - local IP address (recorded as-is)
#   $2 - monitor type (recorded as-is)
#   $3 - monitor code (recorded as-is)
#   $4 - "dbType,user,password,connectIdentifier,sql"; everything after the
#        fourth comma is the SQL text, so the statement may contain commas.
#        Only ORACLE is handled.
#   $5 - timestamp for the record
#   $6 - result file to append to
# Outputs:
#   Prints the description and the query output to stdout and appends one
#   record (the password is left out of it). sendMsg is always 0; the result
#   field is empty for unsupported database types.
#######################################
dbExec() {
  local dbType dbUser dbPwd dbSid dbSql
  local rowsCount result='' sendMsg descs

  # The last variable receives the remainder of the string, commas included.
  IFS=',' read -r dbType dbUser dbPwd dbSid dbSql <<< "$4"
  descs="IP:${1},操作${dbType}类型数据库,用户:${dbUser},SID:${dbSid},SQL:${dbSql}"
  echo "$descs"

  if [[ "$dbType" == "ORACLE" ]]; then
    # Headings and feedback are switched off so only the data comes back,
    # and the statement always gets exactly one terminating ";".
    # NOTE(review): the password is visible in `ps` while sqlplus runs.
    rowsCount=$(sqlplus -s "${dbUser}/${dbPwd}@${dbSid}" <<EOF
set heading off feedback off pagesize 0 verify off echo off
${dbSql%;};
quit;
EOF
)
    # Collapse all whitespace so the record stays on a single line.
    rowsCount=$(printf '%s' "$rowsCount" | tr -s '[:space:]' ' ')
    rowsCount=${rowsCount# }
    rowsCount=${rowsCount% }
    result=${rowsCount}
    echo "${rowsCount}"
    descs+=",查询结果为${rowsCount}"
  else
    printf 'dbExec: unsupported database type: %s\n' "$dbType" >&2
  fi
  sendMsg=0
  printf '%s\n' "$1|$2|$3|$dbType,$dbUser,$dbSid|$result|$sendMsg|$5|$descs" >> "$6"
}
# ---------------------------------------------------------------------------
# Main: read the shared monitor configuration and run every entry that
# belongs to this host and matches the command-line filters.
#
# Config line format: monitIp|monitType|monitCode|monitPara
# Results are appended to a per-day file under /filedata/ftp/monit/records/.
# ---------------------------------------------------------------------------

# Collect every global-scope address of this host. A host with several
# addresses is matched on any of them; 127.0.0.1 is used only when none
# could be detected.
localIpAddr='127.0.0.1'
localIps=()
detectedIps=$(ip addr | awk '/inet.*global/ { sub(/\/.*/, "", $2); print $2 }')
while IFS= read -r detectedIp; do
  if [[ -n "$detectedIp" ]]; then
    localIps+=("$detectedIp")
  fi
done <<< "$detectedIps"
if [ "${#localIps[@]}" -eq 0 ]; then
  localIps=("$localIpAddr")
fi
localIpAddr=${localIps[0]}
#echo "local IP:" $localIpAddr

monitcfgfile="/filedata/ftp/monit/monitcfg.txt"
datetime=$(date '+%Y-%m-%d %H:%M:%S')
monitRlsFile="/filedata/ftp/monit/records/$(date '+%Y%m%d').txt"

if [[ ! -r "$monitcfgfile" ]]; then
  printf 'monitor config not readable: %s\n' "$monitcfgfile" >&2
  exit 1
fi

# The config is read on fd 3 so commands started inside the loop cannot
# swallow config lines from stdin. "|| [[ -n $line ]]" keeps a final line
# that has no trailing newline.
while read -r line <&3 || [[ -n "$line" ]]; do
  line=${line%$'\r'}   # tolerate CRLF line endings
  printf '%s\n' "$line"
  IFS='|' read -r monitIp monitType monitCode monitPara extraFields <<< "$line"

  # Skip entries that are meant for another machine.
  matchedIp=''
  for candidateIp in "${localIps[@]}"; do
    if [[ "$candidateIp" == "$monitIp" ]]; then
      matchedIp=$candidateIp
      break
    fi
  done
  [[ -n "$matchedIp" ]] || continue

  # The command-line filters are deliberately left unquoted on the right
  # hand side so they keep working as glob patterns.
  [[ "$monitType" == $searchMonitType ]] || continue
  [[ "$monitCode" == $searchMonitCode ]] || continue

  # The check functions report the address the entry was configured for.
  localIpAddr=$matchedIp

  case "$monitType" in
    PROGRAM)
      case "$monitCode" in
        IS_ENABLED)
          procNameIsEnabled "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        *)
          echo "not monitCode"
          ;;
      esac
      ;;
    NETWORK)
      case "$monitCode" in
        NETWORK_PING)
          networkPing "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        NETWORK_TELNET)
          networkTelnet "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        *)
          echo "not monitCode"
          ;;
      esac
      ;;
    SYSTEMCFG)
      case "$monitCode" in
        DISK_SPACE_USAGE)
          diskSpaceUsage "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        CPU_USAGE)
          cpuUsage "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        MEMERY_USAGE)
          memerySpaceUsage "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        *)
          echo "not monitCode"
          ;;
      esac
      ;;
    FILES)
      case "$monitCode" in
        FILE_OVERTIME_VALID)
          echo "FILE_OVERTIME_VALID"
          ;;
        FOLDER_OVERTIME_VALID)
          # The third command-line argument narrows the run to entries
          # whose parameter matches it.
          if [[ "$monitPara" == *$searchMonitPara* ]]; then
            fileFolderOvertimeValid "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          fi
          ;;
        *)
          echo "not monitCode"
          ;;
      esac
      ;;
    DB)
      case "$monitCode" in
        DB_CONN)
          dbConn "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        DB_DATA_MONIT)
          dbExec "$localIpAddr" "$monitType" "$monitCode" "$monitPara" "$datetime" "$monitRlsFile"
          ;;
        *)
          echo "not monitCode"
          ;;
      esac
      ;;
    *)
      echo "not monit type"
      ;;
  esac
done 3< "$monitcfgfile"