简单实操,可直接拷贝命令执行
前提条件:
centos7 10.11.7.95
关闭selinux
vi /etc/sysconfig/selinux
SELINUX=disabled
setenforce 0
关闭防火墙或者开放对应端口
systemctl stop firewalld     # 方式一:直接关闭防火墙
systemctl start firewalld    # 方式二:保持防火墙运行,并按下文开放对应端口
需要开放端口
9090/tcp 3000/tcp 9093/tcp 8060/tcp
9090 prometheus
3000 grafana
9093 alertmanager
8060 prometheus-ding
firewall-cmd --zone=public --list-ports                       # 查看已开放的端口
firewall-cmd --zone=public --query-port=80/tcp                # 查询指定端口是否已开放
firewall-cmd --zone=public --add-port=3306/tcp --permanent    # 添加端口(示例,请替换为上面需要开放的端口)
firewall-cmd --zone=public --remove-port=80/tcp --permanent   # 删除端口
firewall-cmd --reload
systemctl reload firewalld.service
一、安装Prometheus平台
从 https://prometheus.io/download/ 下载相应版本 并 安装
访问地址:http://10.11.7.95:9090/
cd /home/ && mkdir package && cd package
下载对应安装包:
wget https://github.com/prometheus/prometheus/releases/download/v2.37.0/prometheus-2.37.0.linux-amd64.tar.gz
解压至指定文件夹
tar -zxvf /home/package/prometheus-2.37.0.linux-amd64.tar.gz -C /usr/local/
创建软链
ln -s /usr/local/prometheus-2.37.0.linux-amd64/ /usr/local/prometheus
配置prometheus开机自启
vi /usr/lib/systemd/system/prometheus.service
写入以下信息:
[Unit]
Description=https://prometheus.io
[Service]
Restart=on-failure
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/usr/local/prometheus/data
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl start prometheus
systemctl status prometheus
systemctl enable prometheus
二、搭建grafana平台
https://grafana.com/grafana/download
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.0.2-1.x86_64.rpm
yum -y install grafana-enterprise-9.0.2-1.x86_64.rpm
启动-并开机自启
systemctl start grafana-server
systemctl enable grafana-server
访问地址:http://10.11.7.95:3000/login ,默认账号/密码为 admin/admin,登录后修改密码
data sources添加数据源:http://10.11.7.95:9090
三、安装node_exporter
四、节点添加入监控--展示
cd /usr/local/prometheus/
cp /usr/local/prometheus/prometheus.yml /usr/local/prometheus/prometheus.yml.bak
添加对应的需监控信息
vi /usr/local/prometheus/prometheus.yml
检测填写是否正确
./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: prometheus.yml is valid prometheus config file syntax
重启prometheus
systemctl restart prometheus
导入模板id :16098
五、安装alertmanager报警
cd /home/package
wget https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz
解压至指定文件夹
tar -zxvf /home/package/alertmanager-0.21.0.linux-amd64.tar.gz -C /usr/local/
软连接
ln -s /usr/local/alertmanager-0.21.0.linux-amd64/ /usr/local/alertmanager
cd /usr/local/alertmanager && cp alertmanager.yml alertmanager.yml.bak
vi alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.126.com:25' # smtp地址
  smtp_from: 'xxxxxx@126.com' # 谁发邮件
  smtp_auth_username: 'xxxxxx@126.com' # 邮箱用户
  smtp_auth_password: 'XXXXXXXXXXXXXXXX' # 邮箱客户端授权密码
  smtp_require_tls: false
templates: # 指定邮件模板的路径,可以使用相对路径,template/*.tmp的方式
  - '/usr/local/alertmanager/template/*.tmp'
route: # route用来设置报警的分发策略
  group_by: ["alertname"] # 分组名
  group_wait: 30s # 当收到告警的时候,等待三十秒看是否还有告警,如果有就一起发出去
  group_interval: 30s # 发送警告间隔时间
  repeat_interval: 20m # 重复报警的间隔时间
  receiver: Node_warning # 设置默认接收人,如果想分组接收,把下面这段的注释去掉
receivers: # 定义接收者,将告警发送给谁
  - name: 'Node_warning'
    email_configs:
      - send_resolved: true
        to: 'xxxxxx@example.com'
        html: '{{ template "email.html" . }}' # 指定使用模板,如果不指定,还是会加载默认的模板的
        headers: { Subject: "[WARN]告警" } # 配置邮件主题
    webhook_configs:
      - url: http://127.0.0.1:8060/dingtalk/webhook/send
        #警报被解决之后是否通知 消息模板/usr/local/prometheus-webhook-dingtalk/config.yml
        send_resolved: true
################结束###############
webhook_configs为钉钉消息
mkdir template && vi template/email.tmp
{{ define "email.html" }}
{{ range .Alerts }}
========start==========
告警程序: prometheus_alert
告警级别: {{ .Labels.severity }}
告警类型: {{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
告警详情: {{ .Annotations.description }}
触发时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========end==========
{{ end }}
{{ end }}
################结束###############
检测配置是否正确
./amtool check-config alertmanager.yml
配置alertmanager开机自启动
vi /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=https://prometheus.io
[Service]
Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
[Install]
WantedBy=multi-user.target
启动 && 自启动
systemctl daemon-reload
systemctl start alertmanager
systemctl enable alertmanager
访问地址http://10.11.7.95:9093/#/alerts
在grafana的 Alerting - Admin 中添加alertmanager地址
在grafana的 Alerting - Contact points 中添加Alertmanager预警
在grafana的 Alerting - Notification policies 中使用Alertmanager预警
六、配置钉钉预警
cd /home/package
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
tar -zxvf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/prometheus-webhook-dingtalk-1.4.0.linux-amd64/ /usr/local/prometheus-webhook-dingtalk
cd /usr/local/prometheus-webhook-dingtalk
修改相关配置
vi config.yml
# Request timeout
timeout: 5s
## Customizable templates path
# templates:
#   - contrib/templates/legacy/template.tmpl
#模板文件
templates:
  - /usr/local/prometheus-webhook-dingtalk/template/*.tmp
## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'
## Targets, previously was known as "profiles"
targets:
  webhook:
    url: https://oapi.dingtalk.com/robot/send?access_token=**************
    message:
      # Use legacy template
      title: '{{ template "ding.link.title" . }}'
      text: '{{ template "ding.link.content" . }}'
###############结束#############
创建模板文件
mkdir template && vi template/template.tmp
{{ define "__subject" }}[Linux 基础监控告警:{{ .Alerts.Firing | len }}] {{ end }}
{{ define "__text_list" }}{{ range . }}
{{ range .Labels.SortedPairs }}
{{ if eq .Name "instance" }}> 实例: {{ .Value | html }}{{ end }}
{{ end }}
{{ range .Labels.SortedPairs }}
{{ if eq .Name "severity" }}> 告警级别: {{ .Value | html }}{{ end }}
{{ if eq .Name "hostname" }}> 主机名称: {{ .Value | html }}{{ end }}
{{ end }}
{{ range .Annotations.SortedPairs }}
{{ if eq .Name "description" }}> 告警详情: {{ .Value | html }}{{ end }}
{{ end }}
触发时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{"============================"}}
{{ end }}{{ end }}
{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{ define "ding.link.content" }}
{{ if gt (len .Alerts.Firing) 0 }}#### [{{ .Alerts.Firing | len }}]【Linux 报警触发】
{{ template "__text_list" .Alerts.Firing }}{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}#### [{{ .Alerts.Resolved | len }}]【Linux 报警恢复】
{{ template "__text_list" .Alerts.Resolved }}{{ end }}
{{ end }}
配置开机自启动服务
vi /usr/lib/systemd/system/prometheus-webhook-dingtalk.service
[Unit]
Description=prometheus-webhook-dingtalk
Wants=network-online.target
After=network-online.target
[Service]
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml
[Install]
WantedBy=default.target
启动 && 开机自启动
systemctl daemon-reload
systemctl start prometheus-webhook-dingtalk
systemctl enable prometheus-webhook-dingtalk
七、编写prometheus告警规则
cd /usr/local/prometheus && mkdir rules && cd rules
参考
https://www.bbsmax.com/A/1O5EQv7G57/
修改对应位置
vi /usr/local/prometheus/prometheus.yml
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"
重启服务
systemctl restart prometheus
到此时,钉钉已经能推送消息