安装包
alertmanager-0.23.0.linux-amd64.tar.gz
node_exporter-1.3.1.linux-amd64.tar.gz
prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
服务端口
Prometheus |
9090 |
node_exporter |
9100 |
alertmanager |
9093 |
prometheus-webhook-dingtalk |
8060 |
#修改配置文件之前先备份
systemctl stop firewalld
setenforce 0
/usr/local
curl -O https://storage.googleapis.com/golang/go1.8.3.linux-amd64.tar.gz 或者 wget -c https://storage.googleapis.com/golang/go1.8.3.linux-amd64.tar.gz
tar -C /usr/local -zxvf go1.8.3.linux-amd64.tar.gz
vim /etc/profile #修改配置文件
export PATH=$PATH:/usr/local/go/bin #文件末添加
source /etc/profile #保存配置文件
go version #验证go环境是否安装成功
/usr/local
curl -O https://blockchain-sre.oss-cn-hangzhou.aliyuncs.com/prometheus-2.31.1.linux-amd64.tar.gz 或者 wget -c https://blockchain-sre.oss-cn-hangzhou.aliyuncs.com/prometheus-2.31.1.linux-amd64.tar.gz
tar -C /usr/local -zxvf prometheus-2.31.1.linux-amd64.tar.gz
cd /usr/local
mv prometheus-2.31.1.linux-amd64 prometheus #为方便进入目录,修改目录名为prometheus
cd
useradd -M -s /sbin/nologin prometheus
mkdir -p /data/prometheus
chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus # 修改权限 新增配置文件之后最好也执行一下这步
cd /usr/local/prometheus
mkdir bin
mv promtool bin
vim /etc/profile
export PATH=$PATH:/sbin:/usr/bin:/usr/sbin
export PATH=$PATH:/usr/local/go/bin
export PATH=/usr/local/prometheus/bin:$PATH:$HOME/bin
source /etc/profile
/usr/local/prometheus/prometheus.yml
cd /usr/local/prometheus
cp prometheus.yml prometheus.yml.bak #修改配置文件前先进行备份
vim prometheus.yml
修改添加
# Main Prometheus configuration (/usr/local/prometheus/prometheus.yml).
# Indentation is significant: every key below must stay nested as shown,
# otherwise Prometheus rejects the file.

# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "node_exporter"
    static_configs:
      - targets: ["172.19.88.86:9100"]
  - job_name: "node2_exporter"
    static_configs:
      - targets: ["localhost:9100"]
  - job_name: "alertmanager"
    static_configs:
      - targets: ["localhost:9093"]
  # - job_name: 'consul'
  #   consul_sd_configs:
  #     - server: 'localhost:8500'
  #       services: ['test']
  # - job_name: 'blackbox'
  #   metrics_path: /probe
  #   params:
  #     module: [http_2xx]  # Look for a HTTP 200 response.
  #   file_sd_configs:
  #     - refresh_interval: 1m
  #       files:
  #         - "/usr/local/prometheus/conf/blackbox*.yml"
  #   relabel_configs:
  #     - source_labels: [__address__]
  #       target_label: __param_target
  #     - source_labels: [__param_target]
  #       target_label: instance
  #     - target_label: __address__
  #       replacement: localhost:9115  # The blackbox exporter's real hostname:port.
#检查配置文件
promtool check config /usr/local/prometheus/prometheus.yml
/usr/lib/systemd/system/prometheus.service
cat >> /usr/lib/systemd/system/prometheus.service <
/usr/local
# Download node_exporter 1.3.1 into the current directory.
# -L makes curl follow the HTTP redirect that GitHub release URLs return;
# with plain -O the saved file is the redirect response, not the tarball.
curl -LO https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz
# Alternative download command (wget follows redirects by default):
# wget -c https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz
exit
scp C:\Users\wangdachu\Desktop\node_exporter-1.3.1.linux-amd64.tar.gz root@<目标主机IP>:/root
<按提示输入 root 密码;请勿在文档中明文记录密码,建议改用 SSH 密钥登录>
ssh root@<目标主机IP>
tar -C /usr/local -zxvf node_exporter-1.3.1.linux-amd64.tar.gz
cd /usr/local
mv node_exporter-1.3.1.linux-amd64 node_exporter
chown -R root:root /usr/local/node_exporter
/usr/lib/systemd/system/node_exporter.service
cat >> /usr/lib/systemd/system/node_exporter.service <
/usr/lib/systemd/system/node_exporter.service
tar -C /usr/local -zxvf node_exporter-1.3.1.linux-amd64.tar.gz
cd /usr/local
mv node_exporter-1.3.1.linux-amd64 node_exporter
chown -R prometheus:prometheus /usr/local/node_exporter
cat >> /usr/lib/systemd/system/node_exporter.service < Graph -> 键入up -> Execute 查看监控状态
# Create the directory that the cron job below writes *.prom files into.
# The name must be exactly "node_exporter" so it matches the paths used in the
# cron line (/var/lib/node_exporter/textfile_collector).
cd /var/lib
mkdir -p node_exporter/textfile_collector
# Create the cron definition file and paste the single schedule line below into it.
vim /etc/cron.d/directory_size
*/5 * * * * root du -sb /var/log /var/cache/apt /var/lib/prometheus | sed -ne 's/^\([0-9]\+\)\t\(.*\)$/node_directory_size_bytes{directory="\2"} \1/p' > /var/lib/node_exporter/textfile_collector/directory_size.prom.$$ && mv /var/lib/node_exporter/textfile_collector/directory_size.prom.$$ /var/lib/node_exporter/textfile_collector/directory_size.prom
# Files under /etc/cron.d are loaded by cron automatically and use the system
# crontab format, which has a user field ("root") before the command.
# Do NOT install this file with "crontab -u root <file>": that replaces root's
# personal crontab, and in that format "root" would be run as the command name.
# Just display the file to confirm its content:
cat /etc/cron.d/directory_size
修改node_exporter自启动脚本
cat /usr/lib/systemd/system/node_exporter.service
# systemd unit that runs node_exporter as the "prometheus" user and restarts it on failure.
[Unit]
Description=node_export
Documentation=https://github.com/prometheus/node_exporter
After=network.target

[Service]
Type=simple
User=prometheus
# --collector.textfile.directory points at the directory the cron job writes
# its *.prom files to. Keep this comment on its own line: systemd does not
# support trailing comments, so "#..." after the command on ExecStart= would
# be passed to node_exporter as an extra argument.
ExecStart=/usr/local/node_exporter/node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
Restart=on-failure

[Install]
WantedBy=multi-user.target
表达式:node_directory_size_bytes{directory="/var/lib/prometheus"}
查看监控图形
/usr/local
# Download Alertmanager 0.23.0 into the current directory.
# -L makes curl follow the HTTP redirect that GitHub release URLs return.
curl -LO https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
# Alternative download command (wget follows redirects by default):
# wget -c https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
# Unpack under /usr/local and rename to a version-independent directory.
# Absolute paths are used for mv because tar -C unpacks into /usr/local,
# which is not necessarily the current working directory.
tar -C /usr/local -zxvf alertmanager-0.23.0.linux-amd64.tar.gz
mv /usr/local/alertmanager-0.23.0.linux-amd64 /usr/local/alertmanager
# Create the data directory and hand the whole tree to the prometheus user.
mkdir /usr/local/alertmanager/data
chown -R prometheus:prometheus /usr/local/alertmanager
/usr/local/alertmanager/alertmanager.yml
cp alertmanager.yml alertmanager.yml.bak #备份配置文件
# Alertmanager configuration (/usr/local/alertmanager/alertmanager.yml).
# Indentation is significant: keep every key nested as shown.

# Global settings
global:
  resolve_timeout: 5m  # Resolve timeout; the default is 5 minutes

# Routing tree
route:
  group_by: [alertname]  # Label(s) used to group alerts
  receiver: ops_notify  # Default receiver
  group_wait: 30s  # How long to wait before sending the first notification for a new group
  group_interval: 60s  # How long to wait before sending a notification about new alerts in a group
  repeat_interval: 1h  # How long to wait before re-sending a notification; set to 1h here
  routes:
    - receiver: ops_notify  # Basic alert notifications
      group_wait: 10s
      match_re:
        alertname: 实例存活告警|磁盘使用率告警  # Matched against the alert names defined in the rule file
    - receiver: info_notify  # Informational alert notifications
      group_wait: 10s
      match_re:
        alertname: 内存使用率告警|CPU使用率告警

receivers:
  # Receiver for basic alerts
  - name: ops_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook2/send  # URL of prometheus-webhook-dingtalk
        send_resolved: true  # Also notify when an alert is resolved
  # Receiver for informational alerts
  - name: info_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook2/send  # URL of prometheus-webhook-dingtalk
        send_resolved: true

# An inhibition rule mutes alerts matching the target matchers while an alert
# matching the source matchers is firing. Both alerts must have identical
# values for the labels listed in `equal`.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
/usr/lib/systemd/system/alertmanager.service
cat >> /usr/lib/systemd/system/alertmanager.service <
/usr/local
# Download prometheus-webhook-dingtalk 2.0.0 into the current directory.
# -L makes curl follow the HTTP redirect that GitHub release URLs return.
curl -LO https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.0.0/prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
# Alternative download command (wget follows redirects by default):
# wget -c https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.0.0/prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
# Unpack under /usr/local and rename to a version-independent directory.
# Absolute paths are used for mv because tar -C unpacks into /usr/local,
# which is not necessarily the current working directory.
tar -C /usr/local -zxvf prometheus-webhook-dingtalk-2.0.0.linux-amd64.tar.gz
mv /usr/local/prometheus-webhook-dingtalk-2.0.0.linux-amd64 /usr/local/prometheus-webhook-dingtalk
# 修改配置文件 /usr/local/prometheus-webhook-dingtalk/config.yml
cp config.yml config.yml.bak
# prometheus-webhook-dingtalk targets. The target name (webhook2) is the path
# segment Alertmanager posts to: /dingtalk/webhook2/send.
targets:
  webhook2:
    # Replace <YOUR_ACCESS_TOKEN> with the access token from your DingTalk
    # robot's webhook address. Never record a real token in documentation.
    url: 'https://oapi.dingtalk.com/robot/send?access_token=<YOUR_ACCESS_TOKEN>'
    # secret for signature
    secret: SEC000000000000000000000
剩余注释#到message
/usr/lib/systemd/system/prometheus-webhook-dingtalk.service
cat >> /usr/lib/systemd/system/prometheus-webhook-dingtalk.service <
# Send a test message directly to the DingTalk robot to verify the webhook.
# Replace <YOUR_ACCESS_TOKEN> with the token from your robot's webhook address;
# do not record real tokens in documentation.
# The URL is quoted so the shell does not treat "?" as a glob character.
curl -H "Content-Type: application/json" -d '{"msgtype":"text","text":{"content":"prometheus alert test"}}' 'https://oapi.dingtalk.com/robot/send?access_token=<YOUR_ACCESS_TOKEN>'
查看prometheus-webhook-dingtalk的url地址,alertmanager会将通知向这个地址发送
journalctl -u prometheus-webhook-dingtalk -f
可以看到url urls=http://localhost:8060/dingtalk/webhook2/send
/usr/local/prometheus/first_rules.yml
# Write the Prometheus alerting rules file.
# - The heredoc delimiter is quoted ('EOF') so the shell does not expand
#   $labels / $value inside the Prometheus templates below.
# - '>' is used instead of '>>' so that re-running this command cannot append
#   a second top-level "groups:" key to the file.
cat > /usr/local/prometheus/first_rules.yml << 'EOF'
groups:
  # Instance liveness alert
  - name: 实例存活告警规则
    rules:
      - alert: 实例存活告警
        expr: up == 0
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
  # Memory usage alert
  - name: 内存报警规则
    rules:
      - alert: 内存使用率告警
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          description: "服务器: 内存使用超过80%!(当前值: {{ $value }}%)"
  # Disk usage alert
  - name: 磁盘报警规则
    rules:
      - alert: 磁盘使用率告警
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 30
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          description: "服务器: 磁盘设备: 使用超过30%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
  # CPU usage alert
  - name: CPU报警规则
    rules:
      - alert: CPU使用率告警
        expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 30
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          description: "服务器: CPU使用超过30%!(当前值: {{ $value }}%)"
  # Directory size alert (copy one of the rules above as a template to avoid mistakes).
  # Note the space after "alert:" - without it YAML reads the line as a plain
  # string instead of a key/value pair and the rule is invalid.
  - name: 目录报警规则
    rules:
      - alert: 目录大小告警
        expr: node_directory_size_bytes > 10
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          description: "服务器: 目录大小超过10!(当前值: {{ $value }})"
EOF
#检查规则配置文件语法是否正确
promtool check rules /usr/local/prometheus/first_rules.yml
/usr/local/prometheus-webhook-dingtalk/contrib/templates/legacy/template.tmpl
# Download Grafana 7.1.3 into the prometheus-1/ working directory.
cd prometheus-1/
curl -O https://dl.grafana.com/oss/release/grafana-7.1.3.linux-amd64.tar.gz
# Alternative download command:
# wget -c https://dl.grafana.com/oss/release/grafana-7.1.3.linux-amd64.tar.gz
# Unpack under /usr/local and rename to a version-independent directory.
# Absolute paths are required for mv: the archive is unpacked into /usr/local,
# while the current directory is still prometheus-1/.
tar -C /usr/local -zxvf grafana-7.1.3.linux-amd64.tar.gz
mv /usr/local/grafana-7.1.3 /usr/local/grafana
# Create data and log directories and hand the tree to the prometheus user.
mkdir /usr/local/grafana/{data,log}
chown -R prometheus:prometheus /usr/local/grafana
/usr/local/grafana/conf/
cd /usr/local/grafana/conf/
cp defaults.ini grafana.ini
vim grafana.ini
# logs = data/log
logs = log
/usr/lib/systemd/system/grafana-server.service
cat >> /usr/lib/systemd/system/grafana-server.service <