一、安装配置alertmanager
1.1、下载安装包
wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
tar -xf alertmanager-0.24.0.linux-amd64.tar.gz -C /opt/
cd /opt/
mv alertmanager-0.24.0.linux-amd64/ alertmanager
[root@monitoring alertmanager]# vim /etc/systemd/system/alertmanager.service
[root@monitoring alertmanager]# cat /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus alertmanager
After=network.target
[Service]
ExecStart=/opt/alertmanager/alertmanager --config.file="/opt/alertmanager/alertmanager.yml"
[Install]
WantedBy=multi-user.target
[root@monitoring alertmanager]#
systemctl daemon-reload
systemctl enable --now alertmanager
1.2、配置邮箱接收告警
[root@monitoring alertmanager]# vim alertmanager.yml
[root@monitoring alertmanager]# cat alertmanager.yml
global:
resolve_timeout: 1m
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: 'your_account@qq.com' #发件邮箱,替换为自己的QQ邮箱
smtp_auth_username: 'your_account@qq.com' #与发件邮箱一致
smtp_auth_password: 'xxxxxxxxxx' #填写邮箱授权码
smtp_hello: '@qq.com'
smtp_require_tls: false
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 2m
repeat_interval: 5m
receiver: 'web.hook'
receivers:
- name: 'web.hook'
email_configs:
- to: 'receiver@example.com' #接收告警的邮箱
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
[root@monitoring alertmanager]# ./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' SUCCESS
Found:
- global config
- route
- 1 inhibit rules
- 1 receivers
- 0 templates
[root@monitoring alertmanager]#
[root@monitoring alertmanager]# systemctl enable --now alertmanager.service
Created symlink /etc/systemd/system/multi-user.target.wants/alertmanager.service → /etc/systemd/system/alertmanager.service.
[root@monitoring alertmanager]# systemctl status alertmanager.service
● alertmanager.service - Prometheus alertmanager
Loaded: loaded (/etc/systemd/system/alertmanager.service; enabled; vendor preset: disabled)
Active: active (running) since Tue 2022-09-27 22:33:06 CST; 1min 22s ago
Main PID: 31820 (alertmanager)
Tasks: 9 (limit: 49440)
Memory: 14.9M
CGroup: /system.slice/alertmanager.service
└─31820 /opt/alertmanager/alertmanager --config.file=/opt/alertmanager/alertmanager.yml
Sep 27 22:33:06 monitoring alertmanager[31820]: ts=2022-09-27T14:33:06.960Z caller=main.go:231 level=info msg="Starting Alertmanager" version="(version=0.24>
Sep 27 22:33:06 monitoring alertmanager[31820]: ts=2022-09-27T14:33:06.960Z caller=main.go:232 level=info build_context="(go=go1.17.8, user=root@265f14f5c6f>
Sep 27 22:33:06 monitoring alertmanager[31820]: ts=2022-09-27T14:33:06.972Z caller=cluster.go:185 level=info component=cluster msg="setting advertise addres>
Sep 27 22:33:06 monitoring alertmanager[31820]: ts=2022-09-27T14:33:06.985Z caller=cluster.go:680 level=info component=cluster msg="Waiting for gossip to se>
Sep 27 22:33:07 monitoring alertmanager[31820]: ts=2022-09-27T14:33:07.135Z caller=coordinator.go:113 level=info component=configuration msg="Loading config>
Sep 27 22:33:07 monitoring alertmanager[31820]: ts=2022-09-27T14:33:07.137Z caller=coordinator.go:126 level=info component=configuration msg="Completed load>
Sep 27 22:33:07 monitoring alertmanager[31820]: ts=2022-09-27T14:33:07.154Z caller=main.go:535 level=info msg=Listening address=:9093
Sep 27 22:33:07 monitoring alertmanager[31820]: ts=2022-09-27T14:33:07.155Z caller=tls_config.go:195 level=info msg="TLS is disabled." http2=false
Sep 27 22:33:08 monitoring alertmanager[31820]: ts=2022-09-27T14:33:08.986Z caller=cluster.go:705 level=info component=cluster msg="gossip not settled" poll>
Sep 27 22:33:16 monitoring alertmanager[31820]: ts=2022-09-27T14:33:16.989Z caller=cluster.go:697 level=info component=cluster msg="gossip settled; proceedi>
[root@monitoring alertmanager]# netstat -tnlp
Active Internet connections (only servers)
Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name
tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN 835/sshd
tcp6 0 0 :::22 :::* LISTEN 835/sshd
tcp6 0 0 :::3000 :::* LISTEN 24712/grafana-serve
tcp6 0 0 :::9115 :::* LISTEN 29832/blackbox_expo
tcp6 0 0 :::9090 :::* LISTEN 30218/prometheus
tcp6 0 0 :::51234 :::* LISTEN 24847/node_exporter
tcp6 0 0 :::9093 :::* LISTEN 31820/alertmanager
tcp6 0 0 :::9094 :::* LISTEN 31820/alertmanager
tcp6 0 0 :::9256 :::* LISTEN 24879/process-expor
[root@monitoring alertmanager]#
配置Prometheus告警规则
mkdir -p /apps/prometheus/rules
cd /apps/prometheus/rules/
vim server_rules.yaml
cat server_rules.yaml
groups:
- name: alertmanager_pod.rules
rules:
- alert: Pod_all_cpu_usage
expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
for: 2m
labels:
severity: critical
service: pods
annotations:
description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
summary: Dev CPU 负载告警
- alert: Pod_all_memory_usage
expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2*1024*1024*1024
for: 2m
labels:
severity: critical
annotations:
description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
summary: Dev Memory 负载告警
- alert: Pod_all_network_receive_usage
expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 50*1024*1024
for: 2m
labels:
severity: critical
annotations:
description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
- alert: pod 内存可用大小
expr: node_memory_MemFree_bytes < 100*1024*1024
for: 2m
labels:
severity: critical
annotations:
description: 容器可用内存小于 100m
Prometheus配置
配置Prometheus文件,alertmanagers服务器的IP和端口,prometheus服务器规则文件的路径:
vim /apps/prometheus/prometheus.yml
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.100.197:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "/apps/prometheus/rules/server_rules.yaml"
[root@monitoring prometheus]# systemctl restart prometheus.service
[root@monitoring prometheus]#
也可以通过命令 ./amtool alert --alertmanager.url=http://172.16.88.20:9093 查看当前告警事件
钉钉告警
在群聊中,群设置 - 智能群助手 - 添加机器人 - 自定义Webhook
安全设置可选关键词,具有关键词的内容才被转发:
创建完成,复制Webhook链接:
编写钉钉认证-关键字测试脚本
钉钉认证-关键字-shell 脚本 [root@monitoring prometheus]# cat /opt/scripts/dingding-keywords.sh
#!/bin/bash
# Send a plain-text message to a DingTalk group robot (keyword security mode).
#
# Usage: dingding-keywords.sh "<message>"
#   $1 - message text. The robot is configured with keyword filtering, so the
#        text presumably has to contain the configured keyword to be accepted.
source /etc/profile
#PHONE=$1
#SUBJECT=$2
MESSAGE=$1

# Escape backslashes, double quotes and newlines so that the message remains a
# valid JSON string once it is embedded in the request body.
escaped=${MESSAGE//\\/\\\\}
escaped=${escaped//\"/\\\"}
escaped=${escaped//$'\n'/\\n}

# The expansion is double-quoted between the two single-quoted fragments;
# without the quotes the shell word-splits the message and curl receives a
# truncated payload whenever the message contains spaces.
/usr/bin/curl -X "POST" 'https://oapi.dingtalk.com/robot/send?access_token=axxxxxxxxxxxxxxxxxxxxxxxxxxxx5d' \
  -H 'Content-Type: application/json' \
  -d '{"msgtype": "text",
    "text": {
      "content": "'"${escaped}"'"
    }
  }'
钉钉认证-关键字-python 脚本[root@monitoring prometheus]# cat /opt/scripts/dingding-keywords.py
#!/usr/bin/python3
"""Send a plain-text message to a DingTalk group robot (keyword security mode).

Usage: dingding-keywords.py "<message>"
"""
import sys
import requests
import json


# DingTalk alert:
def info(msg):
    """Post ``msg`` as a text message to the DingTalk robot webhook.

    ``msg`` is converted with ``str()`` before being sent. The request is
    bounded by a timeout so an unreachable endpoint cannot hang the caller,
    and an HTTP error status raises ``requests.HTTPError`` instead of being
    silently ignored.
    """
    url = 'https://oapi.dingtalk.com/robot/send?access_token=axxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx5d'
    headers = {
        'Content-Type': 'application/json;charset=utf-8'
    }
    formdata = {
        "msgtype": "text",
        "text": {"content": str(msg)}
    }
    #print(formdata)
    response = requests.post(url=url, data=json.dumps(formdata), headers=headers, timeout=10)
    response.raise_for_status()


if __name__ == "__main__":
    # Fail with a usage hint rather than an IndexError when no message is given.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <message>" % sys.argv[0])
    info(sys.argv[1])
测试是否能正常发送消息
bash dingding-keywords.sh "node=172.16.88.20:51234,alertname=node内存可用大小"
若脚本中的 ${MESSAGE} 未用双引号括起来,shell 会按空格对消息进行分词,curl 收到的 JSON 被截断而发送失败,此时 "node=172.16.88.20:51234,alertname=node内存可用大小" 这类消息中不能包含空格;给变量加上双引号即可支持空格
使用python脚本测试不存在该问题
需要提前安装python环境
yum install python38 -y
pip3 install requests
部署webhook-dingtalk
需要再安装prometheus-webhook-dingtalk组件,报警才能顺利转发:
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
tar zxvf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
mv prometheus-webhook-dingtalk-1.4.0.linux-amd64 /apps/webhook-dingtalk
创建prometheus-webhook-dingtalk的systemd服务文件,并将--ding.profile中的地址替换为上述复制的Webhook链接。
vim /etc/systemd/system/webhook-dingtalk.service
[Unit]
Description=Prometheus webhook dingtalk
After=network.target
[Service]
# 注意profile后面不能有单双引号
ExecStart=/apps/webhook-dingtalk/prometheus-webhook-dingtalk \
--ding.profile=alertname=https://oapi.dingtalk.com/robot/send?access_token=abcdef1234567890abcdef1234567890
[Install]
WantedBy=multi-user.target
启动webhook-dingtalk服务(如需开机自启动,请再执行 systemctl enable webhook-dingtalk.service),服务监听8060端口:
systemctl daemon-reload
systemctl restart webhook-dingtalk.service
systemctl status webhook-dingtalk.service
netstat -lntp | grep 8060
tcp6 0 0 :::8060 :::* LISTEN 2295/prometheus-web
配置alertmanager服务
vim /opt/alertmanager/alertmanager.yml
global:
resolve_timeout: 1m
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: 'your_account@qq.com' #发件邮箱,替换为自己的QQ邮箱
smtp_auth_username: 'your_account@qq.com' #与发件邮箱一致
smtp_auth_password: 'yxxxxxxxh'
smtp_hello: '@qq.com'
smtp_require_tls: false
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 2m
repeat_interval: 5m
receiver: 'dingding-webhook'
receivers:
- name: 'dingding-webhook'
webhook_configs:
- url: 'http://localhost:8060/dingtalk/alertname/send'
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
重启alertmanager服务
[root@monitoring prometheus]# systemctl restart alertmanager.service
企业微信告警
打开企业微信网页端,拥有自己的企业后,再创建一个应用:
根据上图的红框,要收集3样东西:
企业ID:ww12345abcde
AgentId:1000002
Secret:xcvnjfsdaUREOIJDLFJ743219742478932
alertmanager配置wechat
在alertmanager配置文件添加wechat_configs:
vim /opt/alertmanager/alertmanager.yml
route:
group_by: ['alertname']
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: 'wechat'
receivers:
- name: 'wechat'
wechat_configs:
- corp_id: 'ww12345abcde'
agent_id: '1000002'
api_secret: 'xcvnjfsdaUREOIJDLFJ743219742478932'
to_user: '@all'
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
重启
systemctl restart alertmanager.service
systemctl restart prometheus.service
告警分类发送
告警分类发送就是将告警根据严重程度,项目组等划分类别,不同类别的告警发送给不同的组织。
prometheus规则
修改prometheus规则文件,依据labels标签定义告警的类别:
vim /apps/prometheus/rules/server_rules.yaml
groups:
  - name: alertmanager_pod.rules
    rules:
      - alert: Pod_all_cpu_usage
        expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
        for: 2m
        labels:
          # 该报警严重程度最高,属于web项目组
          severity: critical
          project: web
        annotations:
          description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
          summary: Dev CPU 负载告警
      - alert: Pod_all_memory_usage
        expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2*1024*1024*1024
        for: 2m
        labels:
          # 该报警严重程度一般,属于blog项目组
          severity: warning
          project: blog
        annotations:
          description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
          summary: Dev Memory 负载告警
alertmanager配置
在alertmanager配置文件修改route告警分发机制:
vim /opt/alertmanager/alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  # 没有匹配子路由的将发送到默认接收者
  receiver: 'default-receiver'
  routes:
    # 子路由一:项目为web或blog的严重警告发送到钉钉
    - receiver: 'dingtalk'
      group_wait: 10s
      match_re:
        severity: critical
        project: web|blog
    # 子路由二:项目为blog的一般警告发送到企业微信
    - receiver: 'wechat'
      group_wait: 20s
      match_re:
        severity: warning
        project: blog
receivers:
  # 默认接收者
  - name: 'default-receiver'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'
  # 钉钉接收者
  - name: 'dingtalk'
    webhook_configs:
      - url: 'http://192.168.100.197:8060/dingtalk/alertname/send'
  # 企业微信接收者
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'ww12345abcde'
        agent_id: '1000002'
        api_secret: 'xcvnjfsdaUREOIJDLFJ743219742478932'
        to_party: '2'
        send_resolved: true
# 抑制规则
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
重启两个服务,即可实现告警分类发送:
systemctl restart alertmanager.service
systemctl restart prometheus.service
告警模板使用
Alertmanager的通知模板基于Go模板系统。支持变量,HTML格式。不同的接受者可以使用不一样的模板。
tmpl模板文件
创建模板文件,包含email和企业微信的报警模板,文件名自定义:
vim /apps/alertmanager/alert.tmpl
{{ define "email.default.html" }}
{{ range $alert := .Alerts }}
=== email 监控报警 ===
告警状态:{{ .Status }}
告警级别:{{ $alert.Labels.alertname }}
告警应用:{{ $alert.Labels.instance }}
告警主题: {{ $alert.Annotations.value }}
告警详情: {{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
===========end============
{{ end }}
{{ end }}
{{ define "wechat.default.message" }}
{{ range $alert := .Alerts }}
=== wechat 监控报警 ===
告警级别:{{ $alert.Labels.alertname }}
故障主机: {{ $alert.Labels.instance }}
===========end============
{{ end }}
{{ end }}
alertmanager配置
alertmanager配置在原有基础上添加templates选项,指定上述文件路径:
vim /opt/alertmanager/alertmanager.yml
...
# 指定模板文件路径
templates:
  - '/apps/alertmanager/alert.tmpl'
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 2m
  repeat_interval: 5m
  # 接收者email,则使用email模板
  receiver: 'email'
...