环境:ansible主机一台:10.1.234.11
node主机3台:node1 10.1.234.110 Prometheus+grafana+node_exporter+pushgateway+alertmanager
node2 10.1.234.111 node_exporter
node3 10.1.234.112 node_exporter
所有搭建操作都在 ansible 主机上完成,其他主机无需登录。目录结构如下:
[root@ansible-11 promethes]# tree
.
├── Alertmanager.yaml
├── Blackbox.yaml
├── conf
│ ├── alertmanager.back.yml
│ ├── alertmanager.service
│ ├── alertmanager.yml
│ ├── blackbox_exporter.service
│ ├── node_exporter.service
│ ├── prometheus.service
│ └── pushgateway.service
├── Grafana.yaml
├── hosts
├── inster_promethes.sh
├── node_exporter.yaml
├── pkg
│ ├── alertmanager-0.20.0.linux-amd64.tar.gz
│ ├── blackbox_exporter-0.16.0.linux-amd64.tar.gz
│ ├── grafana-6.1.3-1.x86_64.rpm
│ ├── node_exporter-1.0.0-rc.1.linux-amd64.tar.gz
│ ├── prometheus-2.8.1.linux-amd64.tar.gz
│ ├── pushgateway-0.4.0.linux-amd64
│ │ ├── LICENSE
│ │ ├── NOTICE
│ │ └── pushgateway
│ └── pushgateway-0.4.0.linux-amd64.tar.gz
├── Prometheus.yaml
├── Pushgateway.yaml
├── reload_Promethes
│ ├── hosts
│ ├── node.yml
│ ├── prometheus.yml
│ ├── prometheus.ymlback
│ └── reload_promethes.yaml
└── template_file
├── blackbox-exporter_rev1.json
├── MySQL_Overview-1589503416459.json
├── Node_Exporter_0.16_0.17_for_Prometheus-1589503429385.json
└── Redis_Dashboard_for_Prometheus_Redis_Exporter_1.x-1589503441940.json
[root@ansible-11 promethes]# cat Alertmanager.yaml
---
# Installs Alertmanager 0.20.0 on the [master] group and runs it under systemd.
# NOTE(review): the original listing had lost its indentation and showed an
# empty `vars:` key directly above `remote_user`; remote_user is assumed to be
# the play-level keyword - confirm.
- hosts: master
  remote_user: root
  gather_facts: false
  tasks:
    - name: 分发alertmanager二进制包
      unarchive:
        src: pkg/alertmanager-0.20.0.linux-amd64.tar.gz
        dest: /tmp
        # Nothing to extract once the release is already installed.
        creates: /usr/local/prometheus/alertmanager
    - name: 创建文件夹
      file:
        path: /usr/local/prometheus
        state: directory
    - name: 创建数据目录
      file:
        path: /data/prometheus/alertmanager/data
        state: directory
    - name: 创建用户
      user:
        name: prometheus
        state: present
    - name: 文件重命名
      # Guarded with `creates`: on a re-run a bare `mv` would move the newly
      # extracted directory *into* the existing install directory.
      shell: mv /tmp/alertmanager-0.20.0.linux-amd64 /usr/local/prometheus/alertmanager
      args:
        creates: /usr/local/prometheus/alertmanager
    # The configuration is copied before the ownership fix and before the
    # service starts, so a single restart is enough.
    - name: 拷贝配置文件到master
      copy:
        src: conf/alertmanager.yml
        dest: /usr/local/prometheus/alertmanager/alertmanager.yml
    - name: 把文件划到组
      shell: chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus
    - name: 拷贝alertmanager.service启动文件
      copy:
        src: conf/alertmanager.service
        dest: /usr/lib/systemd/system/alertmanager.service
    - name: 启动服务并设置开机自启
      systemd:
        name: alertmanager
        state: restarted
        enabled: yes
        # Pick up a changed unit file before restarting.
        daemon_reload: yes
    - name: 查看状态并将结果注入到alertmanager变量
      shell: ss -nutlp |grep 9093
      register: alertmanager
      # Read-only check; retried because the port may not be bound the
      # instant the restart returns.
      changed_when: false
      retries: 5
      delay: 3
      until: alertmanager.rc == 0
    - name: 将结果输出到控制台
      debug:
        var: alertmanager.stdout_lines
[root@ansible-11 promethes]# cat Blackbox.yaml
---
# Installs blackbox_exporter 0.16.0 on the [master] group and runs it under
# systemd.
# NOTE(review): the original listing had lost its indentation and showed an
# empty `vars:` key directly above `remote_user`; remote_user is assumed to be
# the play-level keyword - confirm.
- hosts: master
  remote_user: root
  gather_facts: false
  tasks:
    - name: 分发blackbox_exporter二进制包
      unarchive:
        src: pkg/blackbox_exporter-0.16.0.linux-amd64.tar.gz
        dest: /tmp
        # Nothing to extract once the release is already installed.
        creates: /usr/local/prometheus/blackbox_exporter
    - name: 创建文件夹
      file:
        path: /usr/local/prometheus
        state: directory
    - name: 创建用户
      user:
        name: prometheus
        state: present
    - name: 文件重命名
      # Guarded with `creates`: on a re-run a bare `mv` would move the newly
      # extracted directory *into* the existing install directory.
      shell: mv /tmp/blackbox_exporter-0.16.0.linux-amd64 /usr/local/prometheus/blackbox_exporter
      args:
        creates: /usr/local/prometheus/blackbox_exporter
    - name: 把文件划到组
      shell: chown -R prometheus:prometheus /usr/local/prometheus
    - name: 拷贝blackbox_exporter.service启动文件
      copy:
        src: conf/blackbox_exporter.service
        dest: /usr/lib/systemd/system/blackbox_exporter.service
    - name: 启动服务并设置开机自启
      systemd:
        name: blackbox_exporter
        state: restarted
        enabled: yes
        # Pick up a changed unit file before restarting.
        daemon_reload: yes
    - name: 查看状态并将结果注入到blackbox_exporter变量
      shell: ss -nutlp |grep 9115
      register: blackbox_exporter
      # Read-only check; retried because the port may not be bound the
      # instant the restart returns.
      changed_when: false
      retries: 5
      delay: 3
      until: blackbox_exporter.rc == 0
    - name: 将结果输出到控制台
      debug:
        var: blackbox_exporter.stdout_lines
[root@ansible-11 promethes]# cat Grafana.yaml
---
# Installs Grafana 6.1.3 from a local RPM on the [master] group, adds the
# pie-chart panel plugin and leaves grafana-server enabled and running.
# NOTE(review): the original listing had lost its indentation and showed an
# empty `vars:` key directly above `remote_user`; remote_user is assumed to be
# the play-level keyword - confirm.
- hosts: master
  remote_user: root
  gather_facts: false
  tasks:
    - name: 拷贝grafana rpm文件
      copy:
        src: pkg/grafana-6.1.3-1.x86_64.rpm
        dest: /root/grafana-6.1.3-1.x86_64.rpm
    - name: 安装grafana
      yum:
        # Absolute path to the package copied above; a bare file name only
        # resolves when the module happens to run with /root as its working
        # directory.
        name: /root/grafana-6.1.3-1.x86_64.rpm
        state: present
    - name: 启动服务并设置开机自启
      systemd:
        name: grafana-server
        state: restarted
        enabled: yes
    - name: 安装grafana所在server安装饼图插件
      shell: grafana-cli plugins install grafana-piechart-panel
    - name: 重启服务
      # Restart so the newly installed plugin is loaded.
      systemd:
        name: grafana-server
        state: restarted
    - name: 查看状态并将结果注入到grafana变量
      shell: ss -nutlp |grep 3000
      register: grafana
      # Read-only check; retried because the port may not be bound the
      # instant the restart returns.
      changed_when: false
      retries: 5
      delay: 3
      until: grafana.rc == 0
    - name: 将结果输出到控制台
      debug:
        var: grafana.stdout_lines
[root@ansible-11 promethes]# cat hosts
# Ansible inventory used with `ansible-playbook -i hosts ...`.
# Group targeted by the plays that declare `hosts: master`.
[master]
10.1.234.110
# Group targeted by the play that declares `hosts: node`; the master host is
# listed here as well.
[node]
10.1.234.110
10.1.234.111
10.1.234.112
[root@ansible-11 promethes]# cat node_exporter.yaml
---
# Installs node_exporter 1.0.0-rc.1 on every host of the [node] group and runs
# it under systemd.
# NOTE(review): the original listing had lost its indentation and showed an
# empty `vars:` key directly above `remote_user`; remote_user is assumed to be
# the play-level keyword - confirm.
- hosts: node
  remote_user: root
  gather_facts: false
  tasks:
    - name: 分发node_exporter二进制包
      unarchive:
        src: pkg/node_exporter-1.0.0-rc.1.linux-amd64.tar.gz
        dest: /tmp
        # Nothing to extract once the release is already installed.
        creates: /usr/local/prometheus/node_exporter
    - name: 创建文件夹
      file:
        path: /usr/local/prometheus
        state: directory
    - name: 创建用户
      user:
        name: prometheus
        state: present
    - name: 文件重命名
      # Guarded with `creates`: on a re-run a bare `mv` would move the newly
      # extracted directory *into* the existing install directory.
      shell: mv /tmp/node_exporter-1.0.0-rc.1.linux-amd64 /usr/local/prometheus/node_exporter
      args:
        creates: /usr/local/prometheus/node_exporter
    - name: 把文件划到组
      shell: chown -R prometheus:prometheus /usr/local/prometheus
    - name: 拷贝node_exporter.service启动文件
      copy:
        src: conf/node_exporter.service
        dest: /usr/lib/systemd/system/node_exporter.service
    - name: 启动服务并设置开机自启
      systemd:
        name: node_exporter
        state: restarted
        enabled: yes
        # Pick up a changed unit file before restarting.
        daemon_reload: yes
    - name: 查看状态并将结果注入到node_exporter变量
      shell: ss -nutlp |grep 9100
      register: node_exporter
      # Read-only check; retried because the port may not be bound the
      # instant the restart returns.
      changed_when: false
      retries: 5
      delay: 3
      until: node_exporter.rc == 0
    - name: 将结果输出到控制台
      debug:
        var: node_exporter.stdout_lines
[root@ansible-11 promethes]# cat Prometheus.yaml
---
# Installs Prometheus 2.8.1 on the [master] group and runs it under systemd.
# NOTE(review): the original listing had lost its indentation and showed an
# empty `vars:` key directly above `remote_user`; remote_user is assumed to be
# the play-level keyword - confirm.
- hosts: master
  remote_user: root
  gather_facts: false
  tasks:
    - name: 分发prometheus二进制包
      unarchive:
        src: pkg/prometheus-2.8.1.linux-amd64.tar.gz
        dest: /tmp
        # Nothing to extract once the release is already installed.
        creates: /usr/local/prometheus/prometheus
    - name: 创建文件夹
      file:
        path: /usr/local/prometheus
        state: directory
    - name: 创建数据目录
      file:
        path: /data/prometheus/data
        state: directory
    - name: 文件重命名
      # Guarded with `creates`: on a re-run a bare `mv` would move the newly
      # extracted directory *into* the existing install directory.
      shell: mv /tmp/prometheus-2.8.1.linux-amd64 /usr/local/prometheus/prometheus
      args:
        creates: /usr/local/prometheus/prometheus
    - name: 创建用户
      user:
        name: prometheus
        state: present
    - name: 把文件划到组
      shell: chown -R prometheus:prometheus /usr/local/prometheus /data/prometheus
    - name: 拷贝prometheus.service 启动文件
      copy:
        src: conf/prometheus.service
        dest: /usr/lib/systemd/system/prometheus.service
    - name: 启动服务并设置开机自启
      systemd:
        name: prometheus
        state: restarted
        enabled: yes
        # Pick up a changed unit file before restarting.
        daemon_reload: yes
    - name: 查看状态并将结果注入到prometheus变量
      shell: ss -nutlp |grep 9090
      register: prometheus
      # Read-only check; retried because the port may not be bound the
      # instant the restart returns.
      changed_when: false
      retries: 5
      delay: 3
      until: prometheus.rc == 0
    - name: 将结果输出到控制台
      debug:
        var: prometheus.stdout_lines
[root@ansible-11 promethes]# cat Pushgateway.yaml
---
# Installs Pushgateway 0.4.0 on the [master] group and runs it under systemd.
# NOTE(review): the original listing had lost its indentation and showed an
# empty `vars:` key directly above `remote_user`; remote_user is assumed to be
# the play-level keyword - confirm.
- hosts: master
  remote_user: root
  gather_facts: false
  tasks:
    - name: 分发pushgateway二进制包
      unarchive:
        src: pkg/pushgateway-0.4.0.linux-amd64.tar.gz
        dest: /tmp
        # Nothing to extract once the release is already installed.
        creates: /usr/local/prometheus/pushgateway
    - name: 创建文件夹
      file:
        path: /usr/local/prometheus
        state: directory
    - name: 文件重命名
      # Guarded with `creates`: on a re-run a bare `mv` would move the newly
      # extracted directory *into* the existing install directory.
      shell: mv /tmp/pushgateway-0.4.0.linux-amd64 /usr/local/prometheus/pushgateway
      args:
        creates: /usr/local/prometheus/pushgateway
    - name: 创建用户
      user:
        name: prometheus
        state: present
    - name: 把文件划到组
      shell: chown -R prometheus:prometheus /usr/local/prometheus
    # The task name now matches the unit file that is actually copied.
    - name: 拷贝pushgateway.service 启动文件
      copy:
        src: conf/pushgateway.service
        dest: /usr/lib/systemd/system/pushgateway.service
    - name: 启动服务并设置开机自启
      systemd:
        name: pushgateway
        state: restarted
        enabled: yes
        # Pick up a changed unit file before restarting.
        daemon_reload: yes
    - name: 查看状态并将结果注入到pushgateway变量
      shell: ss -nutlp |grep 9091
      register: pushgateway
      # Read-only check; retried because the port may not be bound the
      # instant the restart returns.
      changed_when: false
      retries: 5
      delay: 3
      until: pushgateway.rc == 0
    - name: 将结果输出到控制台
      debug:
        var: pushgateway.stdout_lines
[root@ansible-11 conf]# ls
alertmanager.back.yml alertmanager.service alertmanager.yml blackbox_exporter.service node_exporter.service prometheus.service pushgateway.service
[root@ansible-11 reload_Promethes]# cat reload_promethes.yaml
---
# Pushes the Prometheus main configuration and the node alert rules to the
# [master] group, restarts Prometheus and prints what listens on port 9090.
- hosts: master
  gather_facts: false
  tasks:
    - name: 更新配置文件
      copy:
        src: prometheus.yml
        dest: /usr/local/prometheus/prometheus/prometheus.yml

    - name: 创建规则文件
      file:
        path: /usr/local/prometheus/prometheus/rules
        state: directory

    - name: 拷贝规则文件
      copy:
        src: node.yml
        dest: /usr/local/prometheus/prometheus/rules/node.yml

    # Hand everything under the install prefix to the prometheus account.
    - name: 把文件划到组
      shell: chown -R prometheus:prometheus /usr/local/prometheus

    - name: 重动服务
      systemd:
        name: prometheus
        state: restarted

    # The task fails when nothing matches, because grep then exits non-zero.
    - name: 查看状态并将结果注入到prometheus变量
      shell: ss -nutlp |grep 9090
      register: prometheus

    - name: 将结果输出到控制台
      debug:
        var: prometheus.stdout_lines
[root@ansible-11 reload_Promethes]# cat prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/node.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node-1'
    static_configs:
      - targets: ['10.1.234.110:9100']

  - job_name: 'node-2'
    static_configs:
      - targets: ['10.1.234.111:9100']

  - job_name: 'node-3'
    static_configs:
      - targets: ['10.1.234.112:9100']

  - job_name: 'pushgateway'
    # Keep the `job`/`instance` labels carried by pushed metrics. Without this
    # Prometheus overwrites them with the Pushgateway's own target labels and
    # the pushed values only survive as `exported_job`/`exported_instance`.
    # NOTE(review): queries or dashboards that already select on
    # `exported_instance` must be switched to `instance` when this is applied.
    honor_labels: true
    static_configs:
      - targets: ['10.1.234.110:9091']
[root@ansible-11 reload_Promethes]# cat node.yml
# Prometheus alerting rules; a file is a list of rule groups.
groups:
  # Name of this rule group.
  - name: general.rules
    # Rules that belong to the group.
    rules:
      # Alert name.
      - alert: NodeFilesystemUsage_disk
        # Percentage of the "/" filesystem (ext4 or xfs) that is in use:
        # 100 minus the free percentage. The rule matches above 80.
        expr: 100 - (node_filesystem_free_bytes{mountpoint="/",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        # The expression has to keep matching for one minute before the alert
        # fires.
        for: 1m
        # Extra labels attached to the alert.
        labels:
          # Severity level of the alert.
          severity: warning
        # Text attached to the notification; {{ ... }} is expanded from the
        # labels and the value of the alerting series.
        annotations:
          # Short summary line.
          summary: "Instance {{ $labels.instance }} :{{ $labels.mountpoint }} 分区使用率过高"
          # Detailed description including the current value.
          description: "{{ $labels.instance }} : {{ $labels.job }} :{{ $labels.mountpoint }} 这个分区使用大于百分之80% (当前值:{{ $value }})"
[root@ansible-11 promethes]# cat inster_promethes.sh
#!/bin/bash
#
# Interactive installer for the Prometheus stack.
#
# Asks for three host addresses, substitutes them for the sample addresses in
# every text file below the current directory, then runs the playbooks in
# order. Run it from the directory that holds the playbooks, because all paths
# are relative.
#
# Exits non-zero as soon as an address is not valid IPv4 or a playbook fails.
set -euo pipefail

# Sample addresses currently present in the playbooks, inventories and configs.
# The recursive rewrite below also matches this script, so after a run these
# three literals hold the addresses that were entered and a later run can
# replace them again.
readonly old_master='10.1.234.110'
readonly old_node1='10.1.234.111'
readonly old_node2='10.1.234.112'

# Prints a message on stderr and aborts.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Returns 0 when $1 is a dotted-quad IPv4 address with every octet <= 255.
is_ipv4() {
  local octet
  [[ $1 =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]] || return 1
  for octet in "${BASH_REMATCH[@]:1}"; do
    # 10# forces base 10 so that an octet such as "08" is not read as octal.
    (( 10#$octet <= 255 )) || return 1
  done
}

# Replaces the three sample addresses with $1, $2 and $3 in every text file
# below the current directory that mentions one of them.
rewrite_addresses() {
  local -r new_master=$1 new_node1=$2 new_node2=$3
  # Intermediate tokens are assembled at run time so that their full text
  # never appears literally in this file (which is rewritten as well).
  local -r tag='@@PROM_ADDR_'
  local -r re_master=${old_master//./\\.}
  local -r re_node1=${old_node1//./\\.}
  local -r re_node2=${old_node2//./\\.}
  local -a targets=()

  # -l: file names only, -I: skip binaries (the tarballs under pkg/),
  # -F: treat the addresses as fixed strings.
  mapfile -t targets < <(
    grep -rlIF -e "$old_master" -e "$old_node1" -e "$old_node2" . || true
  )
  (( ${#targets[@]} > 0 )) || return 0

  # Two phases per line (old -> token, token -> new) so that a new address
  # equal to one of the other old addresses is not replaced a second time.
  sed -i \
    -e "s#\\b${re_master}\\b#${tag}1@@#g" \
    -e "s#\\b${re_node1}\\b#${tag}2@@#g" \
    -e "s#\\b${re_node2}\\b#${tag}3@@#g" \
    -e "s#${tag}1@@#${new_master}#g" \
    -e "s#${tag}2@@#${new_node1}#g" \
    -e "s#${tag}3@@#${new_node2}#g" \
    -- "${targets[@]}"
}

echo '--------我只用了3台搭建,增加node节点请自己加---------'
read -r -p '请输入promethesmaster的ip地址:' new_master
read -r -p '请输入promethes node1的ip地址:' new_node1
read -r -p '请输入promethes node2的ip地址:' new_node2

# An empty or malformed answer would otherwise blank out or corrupt every
# address in the configuration files.
for addr in "$new_master" "$new_node1" "$new_node2"; do
  is_ipv4 "$addr" || die "invalid IPv4 address: '${addr}'"
done

rewrite_addresses "$new_master" "$new_node1" "$new_node2"

# set -e stops at the first playbook that fails, so the success messages
# below are printed only when every step succeeded.
for playbook in Prometheus.yaml node_exporter.yaml Grafana.yaml \
  Alertmanager.yaml Pushgateway.yaml; do
  ansible-playbook -i hosts "$playbook"
done
echo 'promethes安装完成'

# The reload playbook uses the inventory inside its own directory.
cd reload_Promethes
ansible-playbook -i hosts reload_promethes.yaml
echo '规则添加完成,请导入模板'
echo '-----------修改报警的微信不要用我的--------------'
##########################################################################
在 promethes 目录下直接执行 bash inster_promethes.sh(该文件是 shell 脚本,不是 playbook,不能用 ansible-playbook 运行),大概 4-5 分钟安装完成;报警已测试通过。
看起来有点繁琐,后面再改进
###################################################################################
企业微信告警配置
####################################################
放入周期性计划任务
[root@test_dc_rpdns_com ~]# crontab -l
# Shell used to run the jobs below.
SHELL=/bin/sh
# Command search path for the jobs below.
PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
# Disabled job (commented out); its schedule is minute 0 of every hour.
#0 */1 * * * /win/sh/rm_laji_log_data.sh
# Every 5 minutes: /root/node_exporter_shell.sh (script not shown here).
*/5 * * * * /root/node_exporter_shell.sh
# Every 5 minutes: /root/port.sh.
*/5 * * * * /root/port.sh
[root@test_dc_rpdns_com ~]# cat port.sh
#!/bin/bash
#
# Pushes one gauge per monitored port (node_port_<port>) to the Pushgateway.
# The value is the number of listening TCP/UDP sockets bound to that local
# port, so 0 means nothing is listening. Intended to be run from cron.

# Short host name; it becomes the `instance` grouping label in the push URL.
instance_name=$(hostname -f | cut -d'.' -f1)

# Without a distinguishing name the pushed samples cannot be attributed to a
# host. An empty name (hostname lookup failed) is rejected as well.
if [[ -z "$instance_name" || "$instance_name" == "localhost" ]]; then
  echo "Hostname must not localhost" >&2
  exit 1
fi

readonly pushgateway_url="http://10.1.234.110:9091/metrics/job/pushgateway/instance/${instance_name}"
readonly -a ports=(8500 9201 5601)

#######################################
# Counts listening sockets whose local port is exactly the given port.
# Arguments: $1 - port number
# Outputs:   the count on stdout (0 when nothing listens)
#######################################
count_listeners() {
  # Column 5 of `ss -nutl` is "local-address:port". Comparing only its tail
  # avoids counting lines that merely contain the digits elsewhere (another
  # port such as 18500, or a queue size).
  ss -nutl | awk -v suffix=":$1" '
    NR > 1 && substr($5, length($5) - length(suffix) + 1) == suffix { n++ }
    END { print n + 0 }'
}

# One "name value" line per port, sent in a single request. A POST replaces
# only the metrics whose names appear in the body, so this is equivalent to
# pushing each metric separately.
payload=''
for port in "${ports[@]}"; do
  payload+="node_port_${port} $(count_listeners "$port")"$'\n'
done

# -f: treat HTTP errors as failures; -sS: no progress meter but keep error
# messages; --max-time: keep a hung request from piling up cron runs.
printf '%s' "$payload" \
  | curl -fsS --max-time 10 --data-binary @- "$pushgateway_url"