1、设置钉钉机器人报警
详见阿里云官方文档 https://help.aliyun.com/document_detail/106247.html?spm=5176.2020520101.0.0.57824df5GNP6Jn
重点:获取生成的webhook机器人地址
https://oapi.dingtalk.com/robot/send?access_token=8b21a85c6d22d25dca5bbee5a207bc31e93022d4f68ac1d8b45ddbc9xxxxxxxx
2、创建钉钉告警发送 python脚本 [email protected]
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import json
import sys
import os
# HTTP headers sent with every webhook request: the body is JSON encoded as UTF-8.
headers = {'Content-Type': 'application/json;charset=utf-8'}
# DingTalk robot webhook URL; the access_token here is a masked sample and
# must be replaced with the one generated for your own robot.
api_url = "https://oapi.dingtalk.com/robot/send?access_token=8b21a85c6d22d25dca5bbee5a207bc31e93022d4f68ac1d8b45ddbc9xxxxxxxx"
def msg(text):
    """Send ``text`` as a plain-text message to the DingTalk robot webhook.

    Builds a ``msgtype: text`` payload, POSTs it as JSON to the module-level
    ``api_url`` using the module-level ``headers``, and prints the raw
    response body to stdout.

    Args:
        text: The message content to deliver to the DingTalk group.
    """
    json_text = {
        "msgtype": "text",
        # Uncomment the "at" section below to @-mention people by phone number.
        # "at": {
        #     "atMobiles": [
        #         "13812345678"
        #     ],
        #     "isAtAll": False
        # },
        "text": {
            "content": text
        }
    }
    # A timeout keeps a hung connection from blocking the caller indefinitely.
    response = requests.post(api_url, json.dumps(json_text), headers=headers, timeout=10)
    # print(...) with a single argument is valid on both Python 2 and Python 3.
    print(response.content)
if __name__ == '__main__':
    # The alert text is taken from the first command-line argument only.
    if len(sys.argv) < 2:
        # Fail with a clear usage message instead of an IndexError traceback.
        sys.stderr.write("usage: %s <message>\n" % sys.argv[0])
        sys.exit(1)
    text = sys.argv[1]
    msg(text)
####说明:脚本中被注释掉的 at 代码段用于@指定手机号码的人;如需@某人,请取消该段注释并填写对应手机号;不想@任何人则保持该段注释即可
测试脚本
# python [email protected] 'nagios监控测试 2020.04.01'
返回 {"errcode":0,"errmsg":"ok"},说明发送成功
钉钉群会收到消息:
将[email protected]脚本放至 /usr/local/nagios/libexec/目录,并赋予可执行权限(chmod +x),因为后续的nagios命令和shell脚本都是直接执行该脚本
至此,告警发送脚本就设置完成了,接下来就来配置nagios,通过调用告警脚本实现nagios的告警信息实时地通过钉钉发送出来。
在/usr/local/nagios/etc/objects目录,编辑命令配置文件commands.cfg (使用之前的py脚本发送告警信息)
增加如下内容
########################### notify by dingding ###################################################
# 'notify-host-by-DD' command definition
# Host notification command: printf renders the full notification details and
# pipes them to the notifier script's stdin, while a one-line summary built
# from Nagios macros is passed as the script's first argument.
# NOTE(review): the Python notifier shown earlier in this document reads only
# its first argument (sys.argv[1]), so the piped details do not appear to be
# included in the DingTalk message - confirm.
define command{
command_name notify-host-by-DD
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /usr/local/nagios/libexec/[email protected] '[$NOTIFICATIONTYPE$] Host Alert: $HOSTNAME$ is $HOSTSTATE$ 【ECS-nagios监控】'
}
# 'notify-service-by-DD' command definition
# Service notification command: printf renders the full notification details
# and pipes them to the notifier script's stdin, while a one-line summary built
# from Nagios macros is passed as the script's first argument.
# NOTE(review): the Python notifier shown earlier in this document reads only
# its first argument (sys.argv[1]), so the piped details do not appear to be
# included in the DingTalk message - confirm.
define command{
command_name notify-service-by-DD
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTALIAS$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /usr/local/nagios/libexec/[email protected] '[$NOTIFICATIONTYPE$] Service Alert: $HOSTALIAS$/$SERVICEDESC$ is $SERVICESTATE$ 【ECS-nagios监控】'
}
####################################################################################################
修改联系人配置文件 contacts.cfg,将默认告警方式改为钉钉
# Contact 'nagiosadmin': inherits from the generic-contact template, is
# notified 24x7 for both hosts and services, and uses the DingTalk commands
# (notify-service-by-DD / notify-host-by-DD) as its notification commands.
define contact{
contact_name nagiosadmin ; Short name of user
use generic-contact ; Inherit default values from generic-contact template (defined above)
service_notification_period 24x7 ;
host_notification_period 24x7 ;
alias Nagios Admin ; Full name of user
email nagios@localhost ;
service_notification_commands notify-service-by-DD ;
host_notification_commands notify-host-by-DD ;
}
修改完成后,用以下命令检查配置文件是否有误:
[root@yunwei_server ~]# /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
Nagios Core 4.2.1
Copyright (c) 2009-present Nagios Core Development Team and Community Contributors
Copyright (c) 1999-2009 Ethan Galstad
Last Modified: 09-06-2016
License: GPL
Website: https://www.nagios.org
Reading configuration data...
Read main config file okay...
Read object config files okay...
Running pre-flight check on configuration data...
Checking objects...
Checked 19 services.
Checked 43 hosts.
Checked 2 host groups.
Checked 0 service groups.
Checked 1 contacts.
Checked 1 contact groups.
Checked 33 commands.
Checked 5 time periods.
Checked 0 host escalations.
Checked 0 service escalations.
Checking for circular paths...
Checked 43 hosts
Checked 0 service dependencies
Checked 0 host dependencies
Checked 5 timeperiods
Checking global event handlers...
Checking obsessive compulsive processor commands...
Checking misc settings...
Total Warnings: 0
Total Errors: 0
Things look okay - No serious problems were detected during the pre-flight check
------------------------------------------------------------------------------------------------------------
最后的 Total Warnings 和 Total Errors 均为0,则说明配置文件正常
重启nagios服务 #service nagios restart
【判断网站状态,出现502等状态即告警】
[root@yunwei_server ~]# cat /home/shell/Web_Status_Check.sh
#!/bin/bash
############## web status check #############
# Probes www.webcheck.com, appends the HTTP status code to
# /tmp/web_status.log, and sends a DingTalk alert when the code is neither
# 200 nor 000.

# -I: HEAD request; -m 10: give up after 10 seconds; -o /dev/null -s: discard
# the response and progress output; -w: print only the HTTP status code.
webcheck_code=$(curl -I -m 10 -o /dev/null -s -w '%{http_code}' www.webcheck.com)
# curl prints 000 when it received no HTTP response. Map an empty result
# (e.g. curl could not be run) to the same value so the comparison below
# never sees an empty operand.
webcheck_code=${webcheck_code:-000}
sleep 0.2
time=$(date "+%Y-%m-%d %H:%M:%S")
echo "www.webcheck.com URL status is ${webcheck_code}, on ${time} " >>/tmp/web_status.log
echo "-------------------------------------------------------- " >>/tmp/web_status.log
# Alert only on a real non-200 HTTP status; 000 (no response) is skipped.
if [[ "${webcheck_code}" != "200" && "${webcheck_code}" != "000" ]]; then
  # NOTE(review): the notifier path below looks garbled in this document
  # (possibly an e-mail-obfuscation artifact) - substitute the real script path.
  /usr/local/nagios/libexec/[email protected] "[webcheck.com网站告警] www.webcheck.com HTTP_status is ${webcheck_code} at ${time}, 请及时查看!"
fi
【判断mysql同步状态】
[root@yunwei_server ~]# cat /home/shell/mysql-slavestatuscheck.sh
#!/bin/bash
# Checks MySQL replication via "show slave status", appends the result to
# /home/shell/logs/mysql_master_slave_check.log, and sends a DingTalk alert
# when the check fails.
time=$(date "+%Y-%m-%d %H:%M:%S")
# The account and password for the remote MySQL connection are kept in the
# defaults file. An array keeps each argument as its own word.
mysql_cmd=(mysql --defaults-extra-file=/etc/my.cnf)
log_file=/home/shell/logs/mysql_master_slave_check.log

# Keep the last field of every status line matching the pattern, one value per
# array element, in the order the server prints them.
# NOTE(review): presumably array[0]=Slave_IO_Running, array[1]=Slave_SQL_Running
# and array[2]=Seconds_Behind_Master; Last_SQL_Errno is captured but never
# compared - confirm against the actual "show slave status" output.
mapfile -t array < <("${mysql_cmd[@]}" -e "show slave status\G" \
  | grep -E '_Running:|Behind_Master|Last_SQL_Errno' \
  | awk '{ print $NF }')

if [[ "${array[0]}" == "Yes" && "${array[1]}" == "Yes" && "${array[2]}" == "0" ]]; then
  echo "【Mysql主从】 Mysql Slave is OK, ${time}" >>"${log_file}"
else
  echo "【Mysql主从】 Mysql Slave is error, ${time}, please check!" >>"${log_file}"
  # NOTE(review): the notifier path below looks garbled in this document
  # (possibly an e-mail-obfuscation artifact) - substitute the real script path.
  /usr/local/nagios/libexec/[email protected] "【Mysql主从】 Mysql Slave is error, ${time}, please check!" >>/dev/null
  # The original "break" was removed: it is only valid inside a loop and made
  # bash print an error here.
fi
给上述脚本设置定时任务
# Run the website status check every 15 minutes.
*/15 * * * * /home/shell/Web_Status_Check.sh #网站状态检查每15分钟一次
# Run the MySQL replication check at minute 30 of every second hour.
30 */2 * * * /home/shell/mysql-slavestatuscheck.sh #mysql主从同步检查,每两个小时
2020.04.01 愚人节 深圳南山.南海大道