day03Prometheus概述部署Prometheus服务器环境说明:配置时间安装Prometheus服务器添加被监控端部署通用的监控exporterGrafana概述部署Grafana展示node1的监控信息监控MySQL数据库配置MySQL配置mysql exporter配置mysql exporter配置prometheus监控mysql自动发现机制概述基于文件自动发现修改Prometheus使用自动发现配置web2接受Prometheus监控Alertmanager概述Alertmanager特性部署Alertmanager部署配置文件Prometheus与Alertmanager对接配置Alertmanager通过邮件发送告警
# 1. 查看时区
[root@prometheus ~]# timedatectl
Local time: Sun 2023-01-01 11:15:11 CST
Universal time: Sun 2023-01-01 03:15:11 UTC
RTC time: Sun 2023-01-01 03:15:11
Time zone: Asia/Shanghai (CST, +0800)
System clock synchronized: no
NTP service: inactive
RTC in local TZ: no
# 2. 如果时区不正确,则改为正确的时区
[root@prometheus ~]# timedatectl set-timezone Asia/Shanghai
# 3. 查看时间
[root@prometheus ~]# date
# 4. 如果时间不正确,则改为正确的时间
[root@prometheus ~]# date -s "年月日 时:分:秒"
[root@prometheus ~]# cd prometheus_soft/
[root@prometheus prometheus_soft]# tar xf prometheus-2.37.5.linux-amd64.tar.gz
[root@prometheus prometheus_soft]# mv prometheus-2.37.5.linux-amd64 /usr/local/prometheus
配置文件
global
、rule_files
和scrape_configs
。global
块控制 Prometheus 服务器的全局配置。我们有两个选择。第一个,scrape_interval
控制 Prometheus 抓取目标的频率。您可以为单个目标覆盖它。在这种情况下,全局设置是每 15 秒抓取一次。该evaluation_interval
选项控制 Prometheus 评估规则的频率。Prometheus 使用规则来创建新的时间序列并生成警报。rule_files
块指定我们希望 Prometheus 服务器加载的任何规则的位置。现在我们还没有规则。scrape_configs
控制 Prometheus 监控的资源。由于 Prometheus 还将有关自身的数据公开为 HTTP 端点,因此它可以抓取和监控自身的健康状况。在默认配置中,有一个名为 的作业prometheus
,用于抓取 Prometheus 服务器公开的时间序列数据。该作业包含一个单一的、静态配置的目标,即localhost的9090
端口。Prometheus期望度量在/metrics路径上的目标上可用,所以这个默认作业是通过 URL 抓取的:http://localhost:9090/metrics。编写服务启动文件并启动服务
[root@prometheus ~]# vim /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus Monitoring System
After=network.target
[Service]
ExecStart=/usr/local/prometheus/prometheus \
--config.file=/usr/local/prometheus/prometheus.yml \
--storage.tsdb.path=/usr/local/prometheus/data/
[Install]
WantedBy=multi-user.target
# 启动服务
[root@prometheus prometheus_soft]# systemctl daemon-reload
[root@prometheus prometheus_soft]# systemctl enable prometheus.service --now
[root@prometheus prometheus_soft]# ss -tlnp | grep :9090
LISTEN 0 128 *:9090 *:* users:(("prometheus",pid=4396,fd=7))
监控方式:
被监控端根据自身运行的服务,可以运行不同的exporter(被监控端安装的、可以与Prometheus通信,实现数据传递的软件)
exporter列表:Exporters and integrations | Prometheus
# 1. 拷贝node_exporter到web1
[root@prometheus ~]# scp prometheus_soft/node_exporter-1.5.0.linux-amd64.tar.gz 192.168.88.100:/root/
# 2. 解压即部署
[root@web1 ~]# tar xf node_exporter-1.5.0.linux-amd64.tar.gz
[root@web1 ~]# mv node_exporter-1.5.0.linux-amd64 /usr/local/node_exporter
# 3. 创建服务文件,并启动服务
[root@web1 ~]# vim /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
[root@web1 ~]# systemctl daemon-reload
[root@web1 ~]# systemctl enable node_exporter.service --now
[root@web1 ~]# ss -tlnp | grep :9100
LISTEN 0 128 *:9100 *:* users:(("node_exporter",pid=7371,fd=3))
# 1. 修改配置文件,追加以下内容。特别注意缩进
[root@prometheus ~]# vim /usr/local/prometheus/prometheus.yml
...略...
- job_name: "web1"
static_configs:
- targets: ["192.168.88.100:9100"]
# 2. 重启服务
[root@prometheus ~]# systemctl restart prometheus.service
[root@prometheus ~]# yum install -y prometheus_soft/grafana-enterprise-9.3.2-1.x86_64.rpm
[root@prometheus ~]# systemctl enable grafana-server.service --now
添加仪表盘
查看仪表盘
[root@web1 ~]# yum install -y mysql-server
[root@web1 ~]# systemctl enable mysqld --now
[root@web1 ~]# mysql
mysql> create user dbuser1@localhost identified by '123456';
mysql> grant all privileges on *.* to dbuser1@localhost;
mysql> quit
# 1. 安装
[root@prometheus ~]# scp prometheus_soft/mysqld_exporter-0.14.0.linux-amd64.tar.gz 192.168.88.100:/root/
[root@web1 ~]# tar xf mysqld_exporter-0.14.0.linux-amd64.tar.gz
[root@web1 ~]# mv mysqld_exporter-0.14.0.linux-amd64 /usr/local/mysqld_exporter
# 2. 编写用于连接mysql服务的配置文件
[root@web1 ~]# vim /usr/local/mysqld_exporter/.my.cnf
[client]
host=127.0.0.1
port=3306
user=dbuser1
password=123456
# 3. 创建service文件
[root@web1 ~]# vim /usr/lib/systemd/system/mysqld_exporter.service
[Unit]
Description=mysqld_exporter
After=network.target
[Service]
ExecStart=/usr/local/mysqld_exporter/mysqld_exporter \
--config.my-cnf=/usr/local/mysqld_exporter/.my.cnf
[Install]
WantedBy=multi-user.target
[root@web1 ~]# systemctl daemon-reload
[root@web1 ~]# systemctl enable mysqld_exporter.service --now
# 1. 在配置文件中追加内容
[root@prometheus ~]# vim /usr/local/prometheus/prometheus.yml
...略...
- job_name: "mysql"
static_configs:
- targets: ["192.168.88.100:9104"]
# 2. 重启服务
[root@prometheus ~]# systemctl restart prometheus.service
file_sd_configs
基于文件自动发现、基于K8S自动发现、基于openstack自动发现、基于consul自动发现等。file_sd_configs
实现文件级别的自动发现
# 1. 备份现有配置文件
[root@prometheus ~]# cp /usr/local/prometheus/prometheus.yml ~
# 2. 修改配置文件,删除静态配置,添加自动发现配置
[root@prometheus ~]# vim /usr/local/prometheus/prometheus.yml
# 将scrape_configs及以下内容修改为:
21 scrape_configs:
22 - job_name: "prometheus"
23 file_sd_configs:
24 - refresh_interval: 120s
25 files:
26 - /usr/local/prometheus/sd_config/*.yml
# 3. 重启服务
[root@prometheus ~]# systemctl restart prometheus.service
[root@prometheus ~]# mkdir /usr/local/prometheus/sd_config
[root@prometheus ~]# vim /usr/local/prometheus/sd_config/discovery.yml
- targets:
- 192.168.88.5:9090
- 192.168.88.100:9100
- 192.168.88.100:9104
[root@web1 ~]# scp -r /usr/local/node_exporter 192.168.88.200:/usr/local/
[root@web1 ~]# scp /usr/lib/systemd/system/node_exporter.service 192.168.88.200:/usr/lib/systemd/system/
[root@web2 ~]# systemctl daemon-reload
[root@web2 ~]# systemctl enable node_exporter.service --now
[root@prometheus ~]# vim /usr/local/prometheus/sd_config/discovery.yml
- targets:
- 192.168.88.5:9090
- 192.168.88.100:9100
- 192.168.88.100:9104
- 192.168.88.200:9100
设置告警和通知的主要步骤是:
在Prometheus中一条告警规则主要由以下几部分组成:
# 1. 解压
[root@prometheus ~]# cd prometheus_soft/
[root@prometheus prometheus_soft]# tar xf alertmanager-0.25.0.linux-amd64.tar.gz
[root@prometheus prometheus_soft]# mv alertmanager-0.25.0.linux-amd64 /usr/local/alertmanager
# 2. 编写服务文件并启动
[root@prometheus prometheus_soft]# vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager System
[Service]
ExecStart=/usr/local/alertmanager/alertmanager \
--config.file=/usr/local/alertmanager/alertmanager.yml
[Install]
WantedBy=multi-user.target
[root@prometheus ~]# systemctl daemon-reload
[root@prometheus ~]# systemctl enable alertmanager.service --now
Alertmanager的配置主要包含两个部分:路由(route)以及接收器(receivers)。所有的告警信息都会从配置中的顶级路由(route)进入路由树,根据路由规则将告警信息发送给相应的接收器。
在Alertmanager中可以定义一组接收器,比如可以按照角色(比如系统运维,数据库管理员)来划分多个接收器。接收器可以关联邮件,Slack以及其它方式接收告警信息。
目前配置文件中只设置了一个顶级路由route并且定义的接收器为default-receiver。因此,所有的告警都会发送给default-receiver。
因此在Alertmanager配置中一般会包含以下几个主要部分:
[root@prometheus ~]# vim /usr/local/prometheus/prometheus.yml
...略...
8 alerting:
9 alertmanagers:
10 - static_configs:
11 - targets:
12 - localhost:9093
...略...
[root@prometheus ~]# systemctl restart prometheus.service
# 1. 备份配置文件
[root@prometheus ~]# cp /usr/local/alertmanager/alertmanager.yml ~
# 2. 修改配置文件
[root@prometheus ~]# vim /usr/local/alertmanager/alertmanager.yml
global:
smtp_from: '[email protected]' # 发件人地址
smtp_smarthost: 'localhost:25' # 邮件服务器地址
smtp_require_tls: false # 是否使用TLS安全连接
route:
group_by: ['alertname']
group_wait: 30s
group_interval: 5m
repeat_interval: 1h
receiver: 'default-receiver' # 接收器
receivers:
- name: 'default-receiver' # 配置接收器为邮件
email_configs:
- to: '[email protected]'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
# 3. 定义告警规则
[root@prometheus ~]# mkdir /usr/local/prometheus/rules
[root@prometheus ~]# vim /usr/local/prometheus/rules/hoststats-alert.rules
groups:
- name: example
rules:
- alert: InstanceDown
expr: up == 0
for: 5m
labels:
severity: warn
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- alert: hostMemUsageAlert
expr: (node_memory_MemTotal - node_memory_MemAvailable)/node_memory_MemTotal > 0.85
for: 1m
labels:
severity: warn
annotations:
summary: "Instance {{ $labels.instance }} MEM usgae high"
# 4. 在Prometheus中声明规则文件位置
[root@prometheus ~]# vim /usr/local/prometheus/prometheus.yml
...略...
15 rule_files:
16 - /usr/local/prometheus/rules/*.rules
...略...
# 5. 重启服务
[root@prometheus ~]# systemctl restart alertmanager.service
[root@prometheus ~]# systemctl restart prometheus.service
# 6. 安装并启动邮件服务
[root@prometheus ~]# yum install -y postfix mailx
[root@prometheus ~]# systemctl enable postfix --now
# 1. 将web1关机
[root@web1 ~]# shutdown -h now
# 2. 查看邮件
[root@prometheus ~]# mail
>N 1 [email protected] Sun Jan 1 18:59 227/10404 "[FIRING:1] InstanceDown (192.168.88.200:9100 prometheus warn)"
将告警邮件内容从【】这一行到【