1、当pinpoint上监控的应用出现错误的调用,且错误次数>5(可以自己设置阈值大小,默认是5)时,即可报警。
2、只需更改脚本中的webhook和PPURL,并放入计划任务,比如每三分钟执行一次。
3、脚本运行时,会检查最近五分钟内应用的错误调用数,超过阈值就会报警。
4、环境:python3.6(也可支持python2)
#!/usr/local/bin/python
#Author:zzx
#功能:调用pinpoint接口,监控每个应用的调用错误数,并将告警信息发送到钉钉。
import sys
import os
import requests
import time
import datetime
import json
from dingtalkchatbot.chatbot import DingtalkChatbot #pip install dingtalkchatbot
webhook = "你的钉钉webhook"
PPURL = "http://your_pinpoint_ip:port"
'''获取最近五分钟内的时间戳'''
From_Time = datetime.datetime.now() + datetime.timedelta(seconds=-300)
To_Time = datetime.datetime.now()
From_TimeStamp = int(time.mktime(From_Time.timetuple()))*1000
To_TimeStamp = int(time.mktime(datetime.datetime.now().timetuple()))*1000
"""获取pinpoint中所有服务的基础信息,包括服务名,服务类型等"""
def get_applications():
'''return application dict
'''
applicationListUrl = PPURL + "/applications.pinpoint"
res = requests.get(applicationListUrl)
if res.status_code != 200:
print("请求异常,请检查")
return
return res.json()
#print(res.json()[0])
'''传入服务名,返回该服务的节点数和各节点的节点名'''
def getAgentList(appname):
AgentListUrl = PPURL + "/getAgentList.pinpoint"
param = {
'application':appname
}
res = requests.get(AgentListUrl, params=param)
if res.status_code != 200:
print("请求异常,请检查")
return
return len(res.json().keys()),json.dumps(list(res.json().keys()))
'''获取调用失败次数'''
def update_servermap(appname , from_time=From_TimeStamp,to_time=To_TimeStamp, serviceType='TOMCAT'):
'''更新app上下游关系
:param appname: 应用名称
:param serviceType: 应用类型
:param from_time: 起始时间
:param to_time: 终止时间
:
'''
#https://pinpoint.*****.com/getServerMapData.pinpoint?applicationName=test-app&from=1547721493000&to=1547721553000&callerRange=1&calleeRange=1&serviceTypeName=TOMCAT&_=1547720614229
param = {
'applicationName':appname,
'from':from_time,
'to':to_time,
'callerRange':1,
'calleeRange':1,
'serviceTypeName':serviceType
}
# serverMapUrl = PPURL + "/getServerMapData.pinpoint"
serverMapUrl = "{}{}".format(PPURL, "/getServerMapData.pinpoint")
res = requests.get(serverMapUrl, params=param)
if res.status_code != 200:
print("请求异常,请检查")
return
update_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
links = res.json()["applicationMapData"]["linkDataArray"]
#links包含该app的上下游调用关系链,以及相互之间调用的次数和失败的次数等信息。
#print(links)
#print(len(links))
# totalCount=0
errorCount=0
# slowCount=0
for link in links :
###排除test的应用
if link['sourceInfo']['applicationName'].startswith('test'):
continue
#应用名称、应用类型、下游应用名称、下游应用类型、应用节点数、下游应用节点数、总请求数、 错误请求数、慢请求数(本应用到下一个应用的数量)
# application = link['sourceInfo']['applicationName']
# serviceType = link['sourceInfo']['serviceType']
# to_application = link['targetInfo']['applicationName']
# to_serviceType = link['targetInfo']['serviceType']
# agents = len(link.get('fromAgent',' '))
# to_agents = len(link.get('toAgent',' '))
'''总错误数进行累计'''
# totalCount += link['totalCount']
errorCount += link['errorCount']
# slowCount += link['slowCount']
# return totalCount
return errorCount
'''原生钉钉报警,此脚本中没用到'''
def messages(application_name,service_type,error_count): # 定义信息函数
headers = {'Content-Type': 'application/json;charset=utf-8'} # 头部信息,Zabbix官方文档写法,可以查看zabbix官方文档
text="""
报警策略:ERROR COUNT
报警内容:ERROR COUNT value is {error_count} during the past 5 mins.
服务类型:{service_type}
服务名:{application_name}
""".format(error_count=error_count,service_type=service_type,application_name=application_name)
text_info = { # 编写规则可以查看Zabbix官方文档的Zabbix Api
"msgtype": "text",
"at": {
"atMobiles": [
"13728978429"
],
"isAtAll": False
},
"text": {
"content": text
}
}
print(requests.post(webhook, json.dumps(text_info), headers=headers).content) # 将返回的数据编码成 JSON 字符串
requests.post(webhook, json.dumps(text_info), headers=headers).content # 将返回的数据编码成 JSON 字符串
if __name__ == "__main__":
'''初始化钉钉对象'''
xiaoding = DingtalkChatbot(webhook)
at_mobiles=['13728978429']
'''获取所有服务的app名和服务类型,并存到字典中'''
applicationLists=get_applications()
#print(applicationLists)
'''调试update_servermap函数,需要改动该函数的返回值:totalCount、errotCount、slowCount'''
#count=update_servermap('push-base', from_time=From_TimeStamp,to_time=To_TimeStamp,serviceType='TOMCAT')
#print(count)
'''轮询application,查询每个application在过去五分钟内的总错误数,并通过钉钉报警'''
for app in applicationLists:
application_name = app['applicationName']
service_type = app['serviceType']
pid = os.fork() #使用fork实现并发
if pid == 0:
error_count = update_servermap(application_name, from_time=From_TimeStamp,to_time=To_TimeStamp, serviceType=service_type)
text = """
pinpoint报警\n\n
> 报警策略:ERROR COUNT\n\n
> 报警内容:ERROR COUNT value is {error_count} during the past 5 mins.\n\n
> 服务类型:{service_type}\n\n
> 服务名:{application_name}
""".format(error_count=error_count, service_type=service_type, application_name=application_name)
'''如果总调用错误数超过阈值5(根据实际需求进行设置),则报警'''
if error_count >5:
#messages(application_name,service_type,error_count)
xiaoding.send_markdown(title='pp报警', text=text,at_mobiles=at_mobiles)
exit(0)
希望帮到大家。有什么需要交流的加本人qq:1074060710
参考文章:
https://yq.aliyun.com/articles/690351