altermanager基础设置可以参照: https://editor.csdn.net/md/?articleId=121845743
钉钉群设置
群设置 -> 智能群助手 -> 添加机器人 -> 自定义 -> 添加 -> 保存生成的webhook地址
root@prometheus:~# mkdir data/scripts -p
root@prometheus:~# cd data/scripts/
root@prometheus:~/data/scripts# vim dinngding-keyworlds.sh
#/bin/bash
source /etc/profile
MESSAGE=$1
curl -X "POST" '你生成的Webhook地址' \
-H 'Content-Type:application/json' \
-d '{ "msgtype" : "text",
"text" : {
"content":"'${MESSAGE}'"
}
}'
root@prometheus:~/data/scripts# chmod +x dinngding-keyworlds.sh
root@prometheus:~/data/scripts# bash dinngding-keyworlds.sh "namespace=defalt\npod=pod1\ncpu=90%\n持续时间=8s\nalertname=pod"
{"errcode":0,"errmsg":"ok"}
github地址 : https://github.com/timonwong/prometheus-webhook-dingtalk
root@prometheus:/apps# wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
root@prometheus:/apps# tar xf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
root@prometheus:/apps/prometheus-webhook-dingtalk-1.4.0.linux-amd64# ./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --ding.profile="alertname=你的webhook地址"
level=info ts=2022-02-10T03:49:21.015Z caller=main.go:62 msg="Starting prometheus-webhook-dingtalk" version="(version=1.4.0, branch=HEAD, revision=02fe8265a98ab4caaa78ebbed209d3f06b87b4a6)"
level=info ts=2022-02-10T03:49:21.016Z caller=main.go:63 msg="Build context" (gogo1.13.5,userroot@eb9f8d8f0437,date20191211-03:00:38)=(MISSING)
level=warn ts=2022-02-10T03:49:21.016Z caller=main.go:105 msg="DEPRECATION: Detected one of the following flags: --ding.profile, --ding.timeout, --template.file"
level=warn ts=2022-02-10T03:49:21.016Z caller=main.go:106 msg="DEPRECATION: Now working in compatibility mode, please consider upgrading your configurations"
level=info ts=2022-02-10T03:49:21.016Z caller=main.go:117 component=configuration msg="Loading templates" templates=
ts=2022-02-10T03:49:21.016Z caller=main.go:133 component=configuration msg="Webhook urls for prometheus alertmanager" urls=http://0.0.0.0:8060/dingtalk/alertname/send
level=info ts=2022-02-10T03:49:21.016Z caller=web.go:210 component=web msg="Start listening for connections" address=0.0.0.0:8060
level=info ts=2022-02-10T03:49:21.428Z caller=entry.go:22 component=web http_scheme=http http_proto=HTTP/1.1 http_method=POST remote_addr=10.0.0.61:55076 user_agent=Alertmanager/0.23.0 uri=http://10.0.0.61:8060/dingtalk/alertname/send resp_status=200 resp_bytes_length=2 resp_elapsed_ms=184.460895 msg="request complete"
#测试一下
root@prometheus:~# telnet 10.0.0.61 8060
Trying 10.0.0.61...
Connected to 10.0.0.61.
Escape character is '^]'.
HTTP/1.1 400 Bad Request
Content-Type: text/plain; charset=utf-8
Connection: close
400 Bad RequestConnection closed by foreign host.
root@prometheus:/apps/alertmanager# vim alertmanager.yml
---
#修改接受者
receiver: 'dingding'
receivers:
#添加钉钉
- name: dingding
webhook_configs:
- url:'http://10.0.0.61:8060/dingtalk/altername/send'
send_resolved: true
root@prometheus:/apps/alertmanager# apt install python2
root@prometheus:~# vim data/scripts/dingding-label-sign.py
#!/usr/bin/python2.7
import time
import hmac
import hashlib
import base64
import urllib
timestamp=long(round(time.time())*1000)
secret='你的加签生成的秘钥'
secret_enc=bytes(secret).encode('utf-8')
string_to_sign='{}\n{}'.format(timestamp,secret)
string_to_sign_enc=bytes(string_to_sign).encode('utf-8')
hmac_code=hmac.new(secret_enc,string_to_sign_enc,digestmod=hashlib.sha256).digest()
sign=urllib.quote_plus(base64.b64encode(hmac_code))
print(timestamp)
print(sign)
#生成时间戳和认证
root@prometheus:~# python2.7 data/scripts/dingding-label-sign.py
#测试脚本可用
root@prometheus:~# vim /root/data/scripts/dingding-label-send.sh
#!/bin/bash
source /etc/profile
MESSAGE=$1
secret='你的加签生成的秘钥'
getkey=$(python2.7 /root/data/scripts/dingding-label-sign.py)
timestamp=${getkey:0:13}
sign=$(echo "${getkey:13:100}"|tr -d '\n')
# DateStamp=$(date -d @${getkey:0:10}"+%F%H:%m:%S")
curl -X "POST" "你的webhook地址×tamp=${timestamp}&sign=${sign}" \
-H 'Content-Type:application/json' \
-d '{ "msgtype" : "text",
"text" : {
"content":"'${MESSAGE}'"
}
}'
root@prometheus:~# bash /root/data/scripts/dingding-label-send.sh sss
{"errcode":0,"errmsg":"ok"}
#先获取当前时间戳和认证秘钥
root@prometheus:~# python2.7 /root/data/scripts/dingding-label-sign.py
#启动webhook的dingtalk
root@prometheus:/apps/prometheus-webhook-dingtalk-1.4.0.linux-amd64# ./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --ding.profile="alertname=你的webhook地址×tamp=生成的时间戳&sign=生成的认证秘钥"
level=info ts=2022-02-10T05:22:45.778Z caller=main.go:62 msg="Starting prometheus-webhook-dingtalk" version="(version=1.4.0, branch=HEAD, revision=02fe8265a98ab4caaa78ebbed209d3f06b87b4a6)"
level=info ts=2022-02-10T05:22:45.778Z caller=main.go:63 msg="Build context" (gogo1.13.5,userroot@eb9f8d8f0437,date20191211-03:00:38)=(MISSING)
level=warn ts=2022-02-10T05:22:45.779Z caller=main.go:105 msg="DEPRECATION: Detected one of the following flags: --ding.profile, --ding.timeout, --template.file"
level=warn ts=2022-02-10T05:22:45.779Z caller=main.go:106 msg="DEPRECATION: Now working in compatibility mode, please consider upgrading your configurations"
level=info ts=2022-02-10T05:22:45.779Z caller=main.go:117 component=configuration msg="Loading templates" templates=
ts=2022-02-10T05:22:45.779Z caller=main.go:133 component=configuration msg="Webhook urls for prometheus alertmanager" urls=http://0.0.0.0:8060/dingtalk/alertname/send
level=info ts=2022-02-10T05:22:45.779Z caller=web.go:210 component=web msg="Start listening for connections" address=0.0.0.0:8060
level=info ts=2022-02-10T05:22:46.788Z caller=entry.go:22 component=web http_scheme=http http_proto=HTTP/1.1 http_method=POST remote_addr=10.0.0.61:59396 user_agent=Alertmanager/0.23.0 uri=http://10.0.0.61:8060/dingtalk/alertname/send resp_status=200 resp_bytes_length=2 resp_elapsed_ms=908.904779 msg="request complete"
#修改配置文件
root@prometheus:/apps/alertmanager# vim alertmanager.yml
---
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 2s
repeat_interval: 2m
#receiver: 'web.hook'
#receiver: dingding
receiver: wechat
----
- name: wechat
wechat_configs:
- corp_id: 你的企业ID
to_user: '@all' #发送给所有人
agent_id: 你的应用id
api_secret: 你的应用秘钥
send_resolved: true
root@prometheus:/apps/alertmanager# vim alertmanager.yml
- name: wechat
wechat_configs:
- corp_id: 你的企业ID
#to_user: '@all'
to_party: 1 #指定部门ID
agent_id: 你的应用ID
api_secret: 你的应用secret
send_resolved: true
root@prometheus:/apps/alertmanager# systemctl restart alertmanager
根据消息中的属性信息设置规则,将消息分类发送,如将severity级别为critical的通知信息发送到邮箱,其他发送到微信
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 2s
repeat_interval: 2m
#receiver: 'web.hook'
#receiver: dingding
receiver: wechat
routes: #添加信息路由
- receiver: web.hook #critical级别的信息发送到邮箱
group_wait: 10s
match_re:
severity: critical
默认的消息内容需要调整,而且信息是连接在一起的
root@prometheus:/apps/alertmanager# vim alertmanager-wechat.tmpl
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= 监控告警 =========
告警程序: Alertmanager
告警类型: {{ $alert.Labels.alertname }}
告警级别: {{ $alert.Labels.severity }} 级
告警状态: {{ .Status }}
故障主机: {{ $alert.Labels.instance }} {{ $alert.Labels.device }}
告警主题: {{ .Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
主机标签: {{ range .Labels.SortedPairs }} [{{ .Name }}: {{ .Value | html }} ] {{- end }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= 告警恢复 =========
告警程序: Alertmanager
告警主题: {{ $alert.Annotations.summary }}
告警主机: {{ .Labels.instance }}
告警类型: {{ .Labels.alertname }}
告警级别: {{ $alert.Labels.severity }} 级
告警状态: {{ .Status }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
root@prometheus:/apps/alertmanager# vim alertmanager.yml
---
#添加模板
templates:
- /apps/alertmanager/alertmanager-wechat.tmpl
root@prometheus:/apps/alertmanager# systemctl restart alertmanager
基于告警规则,超过80%就不在发60%的告警,即由60%的表达式触发的告警被抑制了
root@prometheus:/apps/prometheus# vim rules.yml
groups:
- name: altermanager_pod.rules
rules:
- alert: 磁盘容量
expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100)>30 #故意写小
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.mountpoint}} 磁盘分区使用大于30%(目前使用:{{$value}}%)"
summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
- alert: 磁盘容量
expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100)>20 #故意写小
for: 2s
labels:
severity: warning
annotations:
description: "{{$labels.mountpoint}} 磁盘分区使用大于20%(目前使用:{{$value}}%)"
summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
root@prometheus:/apps/prometheus# systemctl restart prometheus.service
root@prometheus:/apps/prometheus# systemctl restart alertmanager.service
先找到要静默的告警事件,然后手动静默指定的事件