Prometheus是一个开源的系统监视和警报工具包,自2012成立以来,许多公司和组织采用了Prometheus。它现在是一个独立的开源项目,并独立于任何公司维护。
在2016年,Prometheus加入云计算基金会作为Kubernetes之后的第二托管项目。
wget https://github.com/prometheus/prometheus/releases/download/v2.5.0/prometheus-2.5.0.linux-amd64.tar.gz
groupadd prometheus
useradd -g prometheus prometheus -d /app/prometheus
tar -xvf prometheus-2.5.0.linux-amd64.tar.gz
cd prometheus-2.5.0.linux-amd64/
mv * /app/prometheus/
cd /app/prometheus/
mkdir {data,cfg,logs,bin} -p
mv prometheus promtool bin/
mv prometheus.yml cfg/
chown -R prometheus.prometheus *
vim /etc/profile
PATH=/app/prometheus/bin:$PATH:$HOME/bin
source /etc/profile
# promtool check config /app/prometheus/config/prometheus.yml
Checking /app/prometheus/config/prometheus.yml
SUCCESS: 0 rule files found
vim /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/app/prometheus/bin/prometheus --config.file=/app/prometheus/cfg/prometheus.yml --storage.tsdb.path=/app/prometheus/data
Restart=on-failure
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable prometheus.service
systemctl start prometheus.service
# systemctl status prometheus.service
● prometheus.service - Prometheus
Loaded: loaded (/etc/systemd/system/prometheus.service; enabled; vendor preset: disabled)
Active: active (running) since 日 2018-12-09 22:21:52 CST; 4min 59s ago
Docs: https://prometheus.io/
Main PID: 1308 (prometheus)
CGroup: /system.slice/prometheus.service
└─1308 /app/prometheus/bin/prometheus --config.file=/app/prometheus/cfg/prometheus.yml --storage.tsdb.path=/app/prometheus/data
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190312051Z caller=main.go:245 build_context="(go=go1.11.1, user=root@578ab108d0b9, date=20...-11:40:44)"
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190327105Z caller=main.go:246 host_details="(Linux 3.10.0-862.el7.x86_64 #1 SMP Fri Apr 20...us (none))"
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190342191Z caller=main.go:247 fd_limits="(soft=1024, hard=4096)"
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.190351846Z caller=main.go:248 vm_limits="(soft=unlimited, hard=unlimited)"
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.192559162Z caller=main.go:562 msg="Starting TSDB ..."
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204059097Z caller=main.go:572 msg="TSDB started"
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204101343Z caller=main.go:632 msg="Loading configuration file" filename=/app/prometheus/cf...metheus.yml
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204905309Z caller=main.go:658 msg="Completed loading of configuration file" filename=/app/...metheus.yml
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.204919014Z caller=main.go:531 msg="Server is ready to receive web requests."
12月 09 22:21:52 qas-prometheus prometheus[1308]: level=info ts=2018-12-09T14:21:52.20493548Z caller=web.go:399 component=web msg="Start listening for connections" address=0.0.0.0:9090
Hint: Some lines were ellipsized, use -l to show in full.
wget https://github.com/prometheus/node_exporter/releases/download/v0.17.0/node_exporter-0.17.0.linux-amd64.tar.gz
tar -xvf node_exporter-0.17.0.linux-amd64.tar.gz -C /app/prometheus/
cd /app/prometheus/
mv node_exporter-0.17.0.linux-amd64 node_exporter
groupadd prometheus
useradd -g prometheus prometheus -d /app/prometheus
chown -R prometheus.prometheus node_exporter
# vim /usr/lib/systemd/system/node_exporter.service
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/app/prometheus/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable node_exporter.service
systemctl start node_exporter.service
]# systemctl status node_exporter.service
● node_exporter.service - node_exporter
Loaded: loaded (/usr/lib/systemd/system/node_exporter.service; enabled; vendor preset: disabled)
Active: active (running) since 日 2018-12-09 22:45:10 CST; 4min 8s ago
Docs: https://prometheus.io/
Main PID: 1515 (node_exporter)
CGroup: /system.slice/node_exporter.service
└─1515 /app/prometheus/node_exporter/node_exporter
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - sockstat" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - stat" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - textfile" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - time" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - timex" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - uname" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - vmstat" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - xfs" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg=" - zfs" source="node_exporter.go:97"
12月 09 22:45:10 qas-prometheus node_exporter[1515]: time="2018-12-09T22:45:10+08:00" level=info msg="Listening on :9100" source="node_exporter.go:111"
访问:http://172.16.8.253:9100/metrics,查看从exporter具体能抓到的数据.如下:
wget https://dl.grafana.com/oss/release/grafana-5.4.0-1.x86_64.rpm
yum localinstall grafana-5.4.0-1.x86_64.rpm
systemctl daemon-reload
systemctl enable grafana-server.service
systemctl start grafana-server.service
默认账号/密码:admin/admin
http://172.16.9.253:3000
在登陆首页,点击"Configuration-Data Sources"按钮,跳转到添加数据源页面,配置如下:
Name: prometheus
Type: prometheus
URL: http://172.16.9.253:9090/
Access: Server
取消Default的勾选,其余默认,点击"Add",如下:
在"Dashboards"页签下"import"自带的模版,如下:
tar -xvf alertmanager-0.15.3.linux-amd64.tar.gz -C /app/prometheus/
cd /app/prometheus/
mv alertmanager-0.15.3.linux-amd64 alertmanager
cd alertmanager/
mkdir {bin,cfg,data}
mv alertmanager amtool bin/
mv alertmanager.yml cfg/
chown -R prometheus.prometheus *
# vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/app/prometheus/alertmanager/bin/alertmanager \
--config.file=/app/prometheus/alertmanager/cfg/alertmanager.yml \
--web.listen-address=172.16.9.201:9093 \
--cluster.listen-address=0.0.0.0:8001 \
--storage.path=/app/prometheus/alertmanager/data \
--log.level=info
Restart=on-failure
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl enable alertmanager.service
systemctl start alertmanager.service
# tail -f /var/log/messages
Dec 11 10:51:11 prometheus-node2 systemd: Stopping alertmanager...
Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.118658711Z caller=main.go:426 msg="Received SIGTERM, exiting gracefully..."
Dec 11 10:51:11 prometheus-node2 systemd: Started alertmanager.
Dec 11 10:51:11 prometheus-node2 systemd: Starting alertmanager...
Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.156033311Z caller=main.go:174 msg="Starting Alertmanager" version="(version=0.15.3, branch=HEAD, revision=d4a7697cc90f8bce62efe7c44b63b542578ec0a1)"
Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.156186095Z caller=main.go:175 build_context="(go=go1.11.2, user=root@4ecc17c53d26, date=20181109-15:40:48)"
Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.179081721Z caller=cluster.go:155 component=cluster msg="setting advertise address explicitly" addr=172.16.9.202 port=8001
Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.182933235Z caller=main.go:322 msg="Loading configuration file" file=/app/prometheus/alertmanager/cfg/alertmanager.yml
Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.1953798Z caller=main.go:398 msg=Listening address=172.16.9.202:9093
Dec 11 10:51:11 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:11.203980995Z caller=cluster.go:570 component=cluster msg="Waiting for gossip to settle..." interval=2s
Dec 11 10:51:13 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:13.205051348Z caller=cluster.go:595 component=cluster msg="gossip not settled" polls=0 before=0 now=1 elapsed=2.000725532s
Dec 11 10:51:21 prometheus-node2 alertmanager: level=info ts=2018-12-11T02:51:21.208105947Z caller=cluster.go:587 component=cluster msg="gossip settled; proceeding" elapsed=10.003795489s
vim alertmanager.yml
global:
resolve_timeout: 5m
smtp_smarthost: 'smtp.163.com:25' # 邮箱smtp服务器代理
smtp_from: '[email protected]' # 发送邮箱名称
smtp_auth_username: '[email protected]' # 邮箱名称
smtp_auth_password: 'xxxxx' # 邮箱密码或授权码
smtp_require_tls: false
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'mail'
receivers:
- name: 'mail'
email_configs:
- to: '[email protected]'
### rules配置告警规则
```
vim qas.yml
groups:
- name: 主机状态-监控告警
rules:
- alert: 主机状态
expr: up == 0
for: 1m
labels:
status: 非常严重
annotations:
summary: "{{$labels.instance}}:服务器宕机"
description: "{{$labels.instance}}:服务器延时超过5分钟"
- alert: CPU使用情况
expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
for: 1m
labels:
status: 一般告警
annotations:
summary: "{{$labels.mountpoint}} CPU使用率过高!"
description: "{{$labels.mountpoint }} CPU使用大于60%(目前使用:{{$value}}%)"
- alert: 内存使用
expr: 100 -(node_memory_MemTotal_bytes -node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes ) / node_memory_MemTotal_bytes * 100> 80
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 内存使用率过高!"
description: "{{$labels.mountpoint }} 内存使用大于80%(目前使用:{{$value}}%)"
- alert: IO性能
expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
description: "{{$labels.mountpoint }} 流入磁盘IO大于60%(目前使用:{{$value}})"
- alert: 网络
expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 流入网络带宽过高!"
description: "{{$labels.mountpoint }}流入网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"
- alert: 网络
expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 流出网络带宽过高!"
description: "{{$labels.mountpoint }}流出网络带宽持续2分钟高于100M. RX带宽使用率{{$value}}"
- alert: TCP会话
expr: node_netstat_Tcp_CurrEstab > 1000
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000%(目前使用:{{$value}}%)"
- alert: 磁盘容量
expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
for: 1m
labels:
status: 严重告警
annotations:
summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
![](https://s1.51cto.com/images/blog/201906/12/40ef2698ad7ba396dd77678e3e58cbbf.jpg?x-oss-process=image/watermark,size_16,text_QDUxQ1RP5Y2a5a6i,color_FFFFFF,t_100,g_se,x_10,y_10,shadow_90,type_ZmFuZ3poZW5naGVpdGk=)
### 已经收到邮件内容告警
![](https://s1.51cto.com/images/blog/201906/12/201c5fe81bdd31ad4277788cb3847b94.jpg?x-oss-process=image/watermark,size_16,text_QDUxQ1RP5Y2a5a6i,color_FFFFFF,t_100,g_se,x_10,y_10,shadow_90,type_ZmFuZ3poZW5naGVpdGk=)
转载于:https://blog.51cto.com/10880347/2328092