此文章只是记录一下搭建一套prometheus以及微信告警的步骤,没有详细讲解原理。
下面文字包括的组件比较多,可以选择性安装。
- 安装node_exporter监控服务器基础信息
1.1 下载node_exporter并安装
# 下载地址https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz
# 创建主目录,并上传安装包
mkdir /opt/monitor
cd /opt/monitor
tar -xzvf node_exporter-1.0.1.linux-amd64.tar.gz
mv node_exporter-1.0.1.linux-amd64 node_exporter
# 默认端口是9100,需要考虑是否端口冲突
cd /opt/monitor/node_exporter
nohup ./node_exporter --web.listen-address=:9100 &
1.2 测试node_exporter是否可以采集到数据
# 浏览器访问,或者以下curl命令
curl http://xx.xx.xx.xx:9100/metrics
1.3 将node_exporter服务化,方便重启,以及设置自启动
# 服务化,端口可更改
echo '[Unit]
Description=node_exporter
Documentation=node_exporter
[Service]
User=finance
Group=finance
ExecStart=/opt/monitor/node_exporter/node_exporter --web.listen-address=:9100
[Install]
WantedBy=multi-user.target' >/usr/lib/systemd/system/node_exporter.service
# 设置自启动
systemctl enable --now node_exporter
- 安装mysqld_exporter监控mysql
2.1 下载mysqld_exporter并安装
# 下载地址https://github.com/prometheus/mysqld_exporter/releases/download/v0.12.1/mysqld_exporter-0.12.1.linux-amd64.tar.gz
# 安装上传安装包并解压
cd /opt/monitor
tar -xzvf mysqld_exporter-0.12.1.linux-amd64.tar.gz
mv mysqld_exporter-0.12.1.linux-amd64 mysqld_exporter
# 需要在监控的目标数据库创建账号用于收集数据库监控指标
CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'pwd4test';
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'localhost';
flush privileges;
# 创建mysqld_exporter配置文件
echo '[client]
user=exporter
password=pwd4test' > /opt/monitor/mysqld_exporter/.my.cnf
# 启动mysqld_exporter
cd mysqld_exporter
nohup ./mysqld_exporter --config.my-cnf=.my.cnf --web.listen-address=:9104 &
2.2 测试验证
# 可以浏览器访问,也可以curl
curl http://xx.xx.xx.xx:9104/metrics
2.3 服务化
# 服务化,端口可更改
echo '[Unit]
Description=mysqld_exporter
Documentation=mysqld_exporter
[Service]
User=finance
Group=finance
ExecStart=/opt/monitor/mysqld_exporter/mysqld_exporter --config.my-cnf=/opt/monitor/mysqld_exporter/.my.cnf --web.listen-address=:9104
[Install]
WantedBy=multi-user.target' > /usr/lib/systemd/system/mysqld_exporter.service
# 配置自启动
systemctl daemon-reload
systemctl enable --now mysqld_exporter
- 安装process_exporter监控指定进程
3.1 下载process_exporter并安装
# 下载地址https://github.com/ncabatoff/process-exporter/releases/download/v0.7.5/process-exporter-0.7.5.linux-amd64.tar.gz
# 解压
cd /opt/monitor
tar -xzvf process-exporter-0.7.5.linux-amd64.tar.gz
mv process-exporter-0.7.5.linux-amd64 process-exporter
# 创建配置文件,以下实例为监控redis_exporter进程
cd process-exporter
vi process-name.yml
process_names:
- name: "{{.Matches}}"
cmdline:
- 'redis_exporter'
# 启动
nohup ./process-exporter -config.path process-name.yml --web.listen-address=:9256 &
3.2 测试验证
# 可以浏览器访问,或者curl
curl http://xx.xx.xx.xx:9256/metrics
3.3 服务化
# 服务化,端口适当修改
echo '[Unit]
Description=process_exporter
Documentation=process_exporter
[Service]
User=finance
Group=finance
ExecStart=/opt/monitor/process-exporter/process-exporter -config.path /opt/monitor/process-exporter/process-name.yml --web.listen-address=:9256
[Install]
WantedBy=multi-user.target' > /usr/lib/systemd/system/process_exporter.service
# 设置服务自启动
systemctl enable --now process_exporter
- 安装redis_exporter监控redis集群
4.1 下载安装redis_exporter
# 下载地址https://github.com/oliver006/redis_exporter/releases/download/v1.3.5/redis_exporter-v1.3.5.linux-amd64.tar.gz
# 解压安装
cd /opt/monitor
tar -xzvf redis_exporter-v1.3.5.linux-amd64.tar.gz
mv redis_exporter-v1.3.5.linux-amd64 redis_exporter
# redis如果是单节点,以下命令就可以监控单节点
# 如果是集群,那么还需要配合prometheus.yml来实现,不过此处只需要监控一个节点就行
cd redis_exporter
nohup ./redis_exporter --web.listen-address=:9121 -redis.addr localhost:7000 -redis.password pwd4test &
4.2 测试验证
# 浏览器访问,或者curl
curl http://xx.xx.xx.xx:9121/metrics
4.3 服务化
#服务化,注意适当修改端口
echo '[Unit]
Description=redis_exporter
Documentation=redis_exporter
[Service]
User=finance
Group=finance
ExecStart=/opt/monitor/redis_exporter/redis_exporter --web.listen-address=:9121 -redis.addr localhost:7000 -redis.password pwd4test
[Install]
WantedBy=multi-user.target' > /usr/lib/systemd/system/redis_exporter.service
#配置自启动
systemctl enable --now redis_exporter
- 安装配置alertmanager用于实现告警(此处只配置了微信告警)
# 此步骤之前,首先需要申请企业微信并拿到以下信息,可以参考[这篇文章](https://blog.csdn.net/qq_25934401/article/details/83088344)
#1. 访问[企业微信网站](https://work.weixin.qq.com/),注册企业微信
#2. 访问应用管理,创建“第三方应用”,并填写信息
#3. 点击应用,查询到以下信息:
#AgentId
#Secret
#DepartmentId
#Company ID
#4. allow users里面,添加用户,这样才能接收到消息
#5. 要在微信中接收企业微信告警消息,分两步:
#5.1 扫描企业二维码加入企业,并且在通讯录把账号加入对应部门
#5.2 扫描“微工作台”二维码并关注
5.1 安装配置alertmanager
# 下载安装包 https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz
# 解压安装
cd /opt/monitor
tar -xzvf alertmanager-0.21.0.linux-amd64.tar.gz
mv alertmanager-0.21.0.linux-amd64 alertmanager
cd /opt/monitor/alertmanager
cp alertmanager.yml alertmanager.yml.bk
# 修改配置文件
vim alertmanager.yml
# 以下仅为实例
/opt/monitor/alertmanager/alertmanager.yml
文件内容
global:
resolve_timeout: 5m
wechat_api_corp_id: 'wwf9ad187705xxxx'
wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
wechat_api_secret: 'yytUedAXmHOOdZODZFeI_QDuzYOX8PGRouxxxx'
templates:
- '/opt/monitor/alertmanager/template/wechat.tmpl'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 20m
receiver: 'wechat'
receivers:
- name: 'wechat'
wechat_configs:
- send_resolved: true
to_party: '4'
agent_id: 1000005
corp_id: 'wwf9ad18770xxxx'
api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
api_secret: 'yytUedAXmHOOdZODZFeI_QDuzYOX8PGRou6Gkg8xxxx'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
/opt/monitor/alertmanager/template/wechat.tmpl
文件内容
{{ define "wechat.default.message" }}
{{ range .Alerts }}
告警主题: {{ .Annotations.summary }}
========start=========
告警程序: prometheus_alert
告警级别: {{ .Labels.severity }}
告警类型: {{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警详情: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
=========end===========
{{ end }}
{{ end }}
~
5.2 启动并验证
# 启动
nohup ./alertmanager --web.listen-address=:9093 &
# 浏览器访问,或者curl
curl http://xx.xx.xx.xx:9093/metrics
5.3 服务化
# 服务化,注意适当修改端口
echo '[Unit]
Description=alertmanager
Documentation=alertmanager
[Service]
User=finance
Group=finance
ExecStart=/opt/monitor/alertmanager/alertmanager --config.file=/opt/monitor/alertmanager/alertmanager.yml --web.listen-address=:9093
[Install]
WantedBy=multi-user.target' > /usr/lib/systemd/system/alertmanager.service
# 设置自启动
systemctl enable --now alertmanager
- 安装配置Prometheus服务端
6.1 下载安装prometheus
# 下载地址https://github.com/prometheus/prometheus/releases/download/v2.23.0/prometheus-2.23.0.linux-amd64.tar.gz
# 解压安装配置
cd /opt/monitor
tar -xzvf prometheus-2.23.0.linux-amd64.tar.gz
mv prometheus-2.23.0.linux-amd64 prometheus
cd prometheus
# 修改prometheus.yml主配置文件,参考下面文件内容
# 创建存放rule的目录
mkdir rules
# 创建rule文件来规定告警项目,参考下面文件内容
/opt/monitor/prometheus/prometheus.yml
内容如下,仅供参考
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
# - targets: ["localhost:9093"]
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "rules/nodes.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
- job_name: "node"
static_configs:
- targets: ["localhost:9100"]
- job_name: "mysql"
static_configs:
- targets: ["localhost:9104"]
- job_name: 'redis_exporter_targets'
static_configs:
- targets:
- redis://localhost:7000
- redis://localhost:7001
- redis://localhost:7002
- redis://localhost:7003
- redis://localhost:7004
- redis://localhost:7005
metrics_path: /scrape
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: localhost:9121
## config for scraping the exporter itself
- job_name: 'redis'
static_configs:
- targets:
- localhost:9121
- job_name: 'process'
static_configs:
- targets: ['localhost:9256']
/opt/monitor/prometheus/rules/nodes.yml
内容如下,仅供参考
groups:
- name: node
rules:
- alert: "服务器挂了--测试信息请忽略"
expr: up{job="prometheus"} == 0 or up{job="node"} == 0
for: 1m
labels:
severity: warning
annotations:
summary: "服务器{{ $labels.instance }} 挂了"
description: "当前服务器无法连接,服务器挂掉了,或者node_exporter挂了,请立即查看!"
- name: process
rules:
- alert: "指定监控进程挂了"
expr: namedprocess_namegroup_num_threads == 0
for: 1m
labels:
severity: warning
annotations:
summary: "指定监控进程{{ $labels.groupname }} 挂了"
description: "指定监控进程{{ $labels.groupname }} 挂了,请立即查看相关应用是否正常启动!"
- name: redis
rules:
- alert: "Redis挂了 -- 测试信息请忽略"
expr: redis_up == 0
for: 1m
labels:
severity: error
annotations:
summary: "Redis down (instance {{ $labels.instance }})"
description: "Redis 挂了,请立即查看。 VALUE = {{ $value }}\n LABELS: {{ $labels }}"
6.2 启动并验证
# 启动,开启web.enable-lifecycle,方便动态加载配置
nohup ./prometheus --web.enable-lifecycle &
# 浏览器验证,或者curl
curl http://xx.xx.xx.xx:9090/
6.3 服务化
# 服务化,适当修改路径,以及端口
echo '[Unit]
Description=Prometheus Monitoring System
Documentation=Prometheus Monitoring System
[Service]
User=finance
Group=finance
ExecStart=/opt/monitor/prometheus/prometheus \
--config.file=/opt/monitor/prometheus/prometheus.yml --web.enable-admin-api \
--storage.tsdb.retention=90d --web.enable-lifecycle --web.listen-address=:9090 --storage.tsdb.path=/opt/monitor/prometheus/data
[Install]
WantedBy=multi-user.target' > /usr/lib/systemd/system/prometheus.service
#配置自启动
systemctl enable --now prometheus
6.4 动态修改配置命令
curl -X POST http://localhost:9090/-/reload
- 安装配置grafana用于监控信息展示
7.1 下载安装配置grafana
# 下载
cd /opt/monitor
wget https://dl.grafana.com/oss/release/grafana-7.3.5-1.x86_64.rpm
# 安装之后默认配置信息如下:
Installs binary to /usr/sbin/grafana-server
Copies init.d script to /etc/init.d/grafana-server
Installs default file (environment vars) to /etc/sysconfig/grafana-server
Copies configuration file to /etc/grafana/grafana.ini
Installs systemd service (if systemd is available) name grafana-server.service
The default configuration uses a log file at /var/log/grafana/grafana.log
The default configuration specifies an sqlite3 database at /var/lib/grafana/grafana.db
#安装grafana
cd /opt/monitor
yum install initscripts urw-fonts wget
rpm -Uvh grafana-7.3.5-1.x86_64.rpm
7.2 启动以及验证
# 启动
systemctl daemon-reload
systemctl start grafana-server
systemctl status grafana-server
# 浏览器验证,设置密码
http://xx.xx.xx.xx:3000/login
7.3 配置添加prometheus数据源
Configuration -> Add data source -> Prometheus
URL: http://10.247.121.6:9090
Save & Test
7.4 导入展示监控信息模板
# node_exporter
https://grafana.com/dashboards/9276
# mysqld_exporter
https://grafana.com/dashboards/7362
# process_exporter
# redis_exporter
https://grafana.com/dashboards/763