1.Grafana从Prometheus拉取数据图表化展示监控结果
2.Prometheus根据告警规则生成告警并推送至AlertManager,经分组、抑制处理后最终发送到邮箱
Prometheus 2.36.2
Influxdb 1.7.8
Grafana 7.0.6
Consul 1.6.1
Consul_Exporter 0.8.0
Node_Exporter 1.3.1
AlertManager 0.24.0
BlackBox_Exporter 0.21.1
CentOS 7.5
Prometheus官网下载地址:https://prometheus.io/download/
自己整理的安装包可用天翼云盘下载:https://cloud.189.cn/t/3aMBJzJNFZje (访问码:9j3c)
#1.1解压prometheus
tar -xvf prometheus-2.36.2.linux-amd64.tar.gz -C /usr/local/prometheus
#1.2编辑服务开机启动文件
#--web.enable-admin-api 该参数用于开启管理API,以便通过API查看、删除Prometheus采集的数据
vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=prometheus
After=network.target
[Service]
User=prometheus
Group=prometheus
WorkingDirectory=/usr/local/prometheus
ExecStart=/usr/local/prometheus/prometheus --web.enable-admin-api
[Install]
WantedBy=multi-user.target
#1.3编辑配置文件,配置数据保存至influxdb
#追加到配置文件末尾即可
vim /usr/local/prometheus/prometheus.yml
# Data save to influxdb
remote_write:
- url: "http://localhost:8086/api/v1/prom/write?db=prometheus"
remote_read:
- url: "http://localhost:8086/api/v1/prom/read?db=prometheus"
#1.4influxdb创建prometheus库
create database prometheus;
#2.安装grafana
yum install /root/grafana-7.0.6-1.x86_64.rpm
#3.1解压Node_Exporter
tar -xvf node_exporter-1.3.1.linux-amd64.tar.gz -C /usr/local/node_exporter
#3.2编辑服务开机启动文件
vim /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
After=network.target
[Service]
User=prometheus
Group=prometheus
ExecStart=/usr/local/node_exporter/node_exporter --web.listen-address=:20001 --collector.systemd --collector.systemd.unit-whitelist=(sshd|nginx).service --collector.processes
[Install]
WantedBy=multi-user.target
#4.1consul解压
unzip /root/consul_1.6.1_linux_amd64.zip -d /usr/local/consul
#4.2编辑服务开机启动文件
vim /usr/lib/systemd/system/consul.service
[Unit]
Description=consul
After=network.target
[Service]
ExecStart=/usr/local/consul/consul agent -dev -client 0.0.0.0 -ui
[Install]
WantedBy=multi-user.target
#4.3prometheus配置文件增加配置
#在scrape_configs块增加配置
vim /usr/local/prometheus/prometheus.yml
scrape_configs:
# node metrics 8500是consul监听的端口,所有的服务注册到consul上
- job_name: "Client"
consul_sd_configs:
- server: "192.168.8.140:8500"
services: []
#5.1consul_exporter解压
tar -xvf consul_exporter-0.8.0.linux-amd64.tar.gz -C /usr/local/consul_exporter
#5.2编辑服务开机启动文件
vim /usr/lib/systemd/system/consul_exporter.service
[Unit]
Description=consul_exporter
After=network.target
[Service]
ExecStart=/usr/local/consul_exporter/consul_exporter
[Install]
WantedBy=multi-user.target
#6.1alertmanager解压
tar -xvf alertmanager-0.24.0.linux-amd64.tar.gz -C /usr/local/alertmanager
#6.2编辑服务开机启动文件
vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
After=network.target
[Service]
WorkingDirectory=/usr/local/alertmanager
ExecStart=/usr/local/alertmanager/alertmanager
[Install]
WantedBy=multi-user.target
#6.3prometheus配置文件增加配置
#这段配置是本来就有的,直接在相应地方修改就行
vim /usr/local/prometheus/prometheus.yml
global:
scrape_interval: 1s # 这个配置默认1分钟,我为了看效果,修改成了一秒,会导致发送很多告警
evaluation_interval: 1s # 加载执行报警规则间隔,默认1分钟,我为了看效果,修改成了一秒
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.8.140:9093 #alertmanager的地址
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "rules/*.yml" # rules是在安装目录下创建的文件夹,存放定义的告警规则
# - "second_rules.yml"
#6.4创建rules目录并定义告警规则
mkdir /usr/local/prometheus/rules
#annotations下定义的summary,recover等变量都可以在alertmanager的模板文件引用到,
#同时也可自定义变量,recover,recover_description都是我自定义的变量
#$labels.instance是prometheus预定义的变量,可通过官网文档查看
vim /usr/local/prometheus/rules/base.yml
groups:
- name: node-up
rules:
- alert: node-up
expr: up{job="Client"} == 0
for: 1s
labels:
severity: 1
team: node
annotations:
summary: "{{ $labels.instance }} 停止运行"
recover: "{{ $labels.instance }} 恢复运行"
description: "{{ $labels.instance }} 异常"
recover_description: "{{ $labels.instance }} 恢复正常,请确认!"
#6.5创建邮件模板
#创建模板存放目录
mkdir /usr/local/alertmanager/data/template
#编辑模板文件
#在渲染邮件内容时,根据if判断选择告警通知还是告警恢复模板
#email.from email.to填写自己真实的发件人收件人邮箱地址
vim /usr/local/alertmanager/data/template/email.template
{{ define "email.from" }}test@189.cn{{ end }}
{{ define "email.to" }}test@qq.com{{ end }}
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts }}
<h2>告警通知</h2>
=========start==========
告警程序: PrometheusAlert
告警时间: {{ .StartsAt.Local }}
告警级别: {{ .Labels.severity }} 级
告警类型: {{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
告警详情: {{ .Annotations.description }}
=========end==========
{{ end }}
{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts }}
<h2>告警恢复</h2>
=========start==========
告警程序: PrometheusAlert
告警时间: {{ .StartsAt.Local }}
恢复时间: {{ .EndsAt.Local }}
告警级别: {{ .Labels.severity }} 级
告警类型: {{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.recover }}
告警详情: {{ .Annotations.recover_description }}
=========end==========
{{ end }}
{{ end -}}
#6.6编辑alertmanager配置文件
# 根据alertname聚合同类邮件
vim /usr/local/alertmanager/alertmanager.yml
global:
resolve_timeout: 3s
smtp_smarthost: 'smtp.189.cn:465' #配置邮箱服务器地址
smtp_from: 'test@189.cn' #发件人邮箱,请替换为自己的真实地址
smtp_auth_username: 'test@189.cn' #配置发件人地址
smtp_auth_password: 'Vmnifonm197.' #密码
smtp_require_tls: false
templates:
- '/usr/local/alertmanager/data/template/email.template' # 邮件模板
route:
group_by: ['alertname']
group_wait: 1s
group_interval: 1s
repeat_interval: 30s # 邮件重复发送间隔
receiver: 'mail'
receivers:
- name: 'mail'
email_configs:
- to: '{{ template "email.to" . }}' # 邮件模板里定义的变量
html: '{{ template "email.to.html" . }}'
#insecure_skip_verify: true
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
#7.1blackbox_exporter解压
tar -xvf blackbox_exporter-0.21.1.linux-amd64.tar.gz -C /usr/local/blackbox_exporter
#7.2编辑服务开机启动文件
vim /usr/lib/systemd/system/blackbox_exporter.service
[Unit]
Description=blackbox_exporter
After=network.target
[Service]
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
[Install]
WantedBy=multi-user.target
#7.3prometheus配置文件增加配置
#在scrape_configs下增加配置
vim /usr/local/prometheus/prometheus.yml
scrape_configs:
# blackbox job
- job_name: "Check"
metrics_path: /probe #这项配置固定的不能改
params:
module: [http_2xx]
consul_sd_configs:
- server: "192.168.8.140:8500"
services: []
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 192.168.8.140:9115
#开机启动并立刻启动
systemctl enable --now node_exporter
systemctl enable --now grafana-server
systemctl enable --now prometheus
systemctl enable --now consul
systemctl enable --now consul_exporter
systemctl enable --now alertmanager
systemctl enable --now blackbox_exporter
#将node_exporter、consul_exporter、blackbox_exporter服务注册到consul
cd /usr/local/consul
./consul services register -id=node-1 -name=node-1 -address=192.168.8.140 -port=20001 -tag=node_exporter
./consul services register -id=consul_exporter-1 -name=consul_exporter-1 -address=192.168.8.140 -port=9107 -tag=consul_exporter
./consul services register -id=blackbox_exporter-1 -name=blackbox_exporter-1 -address=192.168.8.140 -port=9115 -tag=blackbox_exporter
网址:http://192.168.8.140:9090
1.查看运行状态版本信息
可以看到运行目录,Prometheus版本,Go版本,以及配置的Alertmanager地址信息
2.查看配置的监控节点
3.查看配置的告警规则
4.查看告警页面
1.注意Grafana版本必须是7.0.6,其他版本一些图可能显示不出来
2.创建数据源
3.导入dashboard,开箱即用的node_exporter的图表,ID是:11074,输入ID从Grafana网站下载
4.最终效果
监听端口默认是8500
地址:http://192.168.8.140:8500/ui/dc1/services
地址:http://192.168.8.140:20001/metrics
其实就是Prometheus抓取数据的地址