目录
资源列表
一、基础环境
关闭防火墙
关闭selinux
修改主机名
节点添加主机名与IP对应关系
二、部署Prometheus
下载安装包
解压
创建用于运行Prometheus的组和用户
创建Prometheus数据存储目录
给Prometheus主目录赋用户Prometheus权限
修改配置文件
启动
三、部署Grafana
下载安装包
创建grafana用户及数据存放目录
修改配置文件
启动
四、node节点部署node_exporter
下载安装包
新建一个目录专门安装各种exporter
启动
五、配置Grafana
六、部署AlertManager
获取并安装软件包
解压
修改配置文件
启动
七、Prometheus和Alertmanager对接
修改Prometheus的配置文件
重启Prometheus
今天给大家分享的是二进制部署Prometheus并结合Grafana进行展示
| 操作系统 | 配置 | 主机名 | IP |
| --- | --- | --- | --- |
| CentOS7.3 | 2C2G | prometheus | 192.168.207.131 |
| CentOS7.3 | 2C2G | grafana | 192.168.207.165 |
| CentOS7.3 | 2C2G | node | 192.168.207.166 |
| CentOS7.3 | 2C2G | alertmanager | 192.168.207.167 |
# Stop the firewall now and keep it from starting again at boot.
systemctl stop firewalld
systemctl disable firewalld
# Persistently disable SELinux.
# The pattern must be `^SELINUX=.*` (replace the whole value). The previous
# `^SELINUX=*` matched "SELINUX" followed by zero or more "=", which produced
# "SELINUX=disabledenforcing" and also rewrote the SELINUXTYPE= line.
sed -i 's/^SELINUX=.*/SELINUX=disabled/' /etc/selinux/config
# Switch the running system to permissive mode so no reboot is needed right away.
setenforce 0
# Run exactly ONE of the following commands on each machine, picking the name
# that matches the machine's role in the resource list (prometheus / grafana /
# node / alertmanager). Running all four on a single host would leave it named
# "alertmanager".
hostnamectl set-hostname prometheus
hostnamectl set-hostname grafana
hostnamectl set-hostname node
hostnamectl set-hostname alertmanager
# Append the name-to-address mapping of all four nodes to /etc/hosts so the
# machines can reach each other by host name.
printf '%s\n' \
  '192.168.207.131 prometheus' \
  '192.168.207.165 grafana' \
  '192.168.207.166 node' \
  '192.168.207.167 alertmanager' \
  >> /etc/hosts
# Download the Prometheus 2.26.0 release tarball from GitHub.
wget https://github.com/prometheus/prometheus/releases/download/v2.26.0/prometheus-2.26.0.linux-amd64.tar.gz
# Unpack into /usr/local and drop the version suffix from the directory name.
tar -zxvf prometheus-2.26.0.linux-amd64.tar.gz -C /usr/local
mv /usr/local/prometheus-2.26.0.linux-amd64/ /usr/local/prometheus
# Create a dedicated group and a no-login user for Prometheus.
groupadd prometheus
useradd -g prometheus -s /sbin/nologin prometheus
# Create the data directory. Set both owner and group so its ownership
# matches the install directory handled on the next line.
mkdir -p /var/lib/prometheus
chown -R prometheus:prometheus /var/lib/prometheus
chown -R prometheus:prometheus /usr/local/prometheus/
# Keep a pristine copy of the shipped configuration before editing it.
cp /usr/local/prometheus/prometheus.yml{,.bak}
############################################################################################
[root@prometheus prometheus]# cat /usr/local/prometheus/prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']

  # Scrape node_exporter on the "node" host every 10 seconds and label the
  # resulting series with instance="node" instead of the address.
  - job_name: 'node'
    scrape_interval: 10s
    static_configs:
    - targets: ['192.168.207.166:9100']
      labels:
        instance: node
#################################################################################################
# Start Prometheus in the foreground with the configuration file shown above.
# NOTE(review): no --storage.tsdb.path is passed here, so the /var/lib/prometheus
# directory created earlier does not appear to be used; confirm the intended data path.
/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml
# Fetch and unpack Grafana 7.5.6, then install it as /usr/local/grafana.
wget https://dl.grafana.com/oss/release/grafana-7.5.6.linux-amd64.tar.gz
tar -xzvf grafana-7.5.6.linux-amd64.tar.gz
mv grafana-7.5.6 /usr/local/grafana
# Service account without a login shell and without a home directory.
useradd -M -s /sbin/nologin grafana
# -p creates /data/grafana together with the nested provisioning directory.
mkdir -p /data/grafana/conf/provisioning
# Hand both the install tree and the data tree to the grafana user.
chown -R grafana:grafana /usr/local/grafana/ /data/grafana/
vi /usr/local/grafana/conf/defaults.ini
# 修改以下四个地方
data = /data/grafana/data
logs = /data/grafana/log
plugins = /data/grafana/plugins
provisioning = /data/grafana/conf/provisioning #通过配置的方式进行datasource和dashboard的配置
# Start Grafana in the foreground; -homepath points at the installation directory.
/usr/local/grafana/bin/grafana-server -homepath /usr/local/grafana/
# grafana默认的账户密码是admin/admin,监听端口是3000
# Download and unpack node_exporter 1.1.2, then drop the version suffix from the directory name.
wget https://github.com/prometheus/node_exporter/releases/download/v1.1.2/node_exporter-1.1.2.linux-amd64.tar.gz
tar zxvf node_exporter-1.1.2.linux-amd64.tar.gz
mv node_exporter-1.1.2.linux-amd64 node_exporter
# Keep all exporters together under one dedicated directory.
mkdir -p /usr/local/prometheus_exporter
mv node_exporter /usr/local/prometheus_exporter/
# Start node_exporter in the foreground. The 'node' scrape job in
# prometheus.yml targets this host at 192.168.207.166:9100.
/usr/local/prometheus_exporter/node_exporter/node_exporter
登录grafana,首先配置数据源
配置好数据源以后可以添加模板
# 监控Linux服务器推荐模板id
8919
9276
# Download and unpack Alertmanager 0.21.0, then install it as /usr/local/alertmanager.
wget https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz
tar zxvf alertmanager-0.21.0.linux-amd64.tar.gz
mv alertmanager-0.21.0.linux-amd64 alertmanager
mv alertmanager /usr/local/
# Back up the shipped configuration (as alertmanager.yml_bak) before editing it.
cp /usr/local/alertmanager/alertmanager.yml{,_bak}
# Edit the configuration interactively; the resulting content is shown below.
vi /usr/local/alertmanager/alertmanager.yml
#################################################################################
[root@alertmanager ~]# cat /usr/local/alertmanager/alertmanager.yml
# Global settings. resolve_timeout: an alert is treated as resolved after 5 minutes.
# NOTE: the e-mail addresses in the original listing were lost to web-page
# e-mail obfuscation; the values below are placeholders. Replace them with
# your own sender account and recipient address.
global:
  resolve_timeout: 5m
  # Sender address used in notification e-mails
  smtp_from: 'your_account@163.com'
  # Address and port of the SMTP server
  smtp_smarthost: 'smtp.163.com:25'
  # User name for SMTP authentication
  smtp_auth_username: 'your_account@163.com'
  # Password for SMTP authentication
  smtp_auth_password: 'AAAAAAAAAAAAAAAAAAAAA'

# Alert routing
route:
  # Group alerts by alert name
  group_by: ['alertname']
  # Wait 5 seconds so that alerts of the same group are handled together
  group_wait: 5s
  # Waiting time between two notifications for the same group
  group_interval: 5s
  # Interval after which a notification is sent again
  repeat_interval: 5m
  # The default receiver is 'email'
  receiver: 'email'

# Alert receivers
receivers:
- name: 'email'
  email_configs:
  # Recipient address
  - to: 'receiver@example.com'
    # Also send a notification when the alert is resolved
    send_resolved: true

# Inhibition rules
inhibit_rules:
# When a source alert has severity 'critical', a target alert has severity
# 'warning', and the labels listed under 'equal' match, the target is inhibited.
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
# Inhibition rules define conditions under which some alerts suppress the
# notifications of other alerts. This is typically used to avoid unnecessary
# or duplicate notifications in specific situations.
#################################################################################
# Start Alertmanager in the foreground with the configuration file shown above.
/usr/local/alertmanager/alertmanager --config.file /usr/local/alertmanager/alertmanager.yml
# 取消- alertmanager:9093注释,在rule_files下添加规则文件
###########################################################################
[root@prometheus ~]# head -16 /usr/local/prometheus/prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/data/prometheus/rules/*_rules.yml"
#############################################################################
# Create the alerting rules: first the directory that holds the rule files
mkdir -p /data/prometheus/rules/
##########################################################################
[root@prometheus ~]# cat /data/prometheus/rules/disk_rules.yml
# groups: list of alerting rule groups
groups:
# name: name of this rule group
- name: general.rules
  # rules: the rules that belong to this group
  rules:
  # alert: alert name. Fires when the root filesystem is more than 80% used
  - alert: NodeFilesystemUsage
    # expr: expression. Used percentage of the "/" filesystem (ext4/xfs); triggers above 80
    expr: 100 - (node_filesystem_free_bytes{mountpoint="/",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
    # for: duration. The expression must stay true for one minute before the
    # alert fires; 0 means no waiting period
    for: 1m
    # labels: labels attached to the alert, used here to set its level
    labels:
      # severity: alert level
      severity: warning
    # annotations: descriptive text carried in the notification
    annotations:
      # The templates reference the alert's labels and current value
      summary: "Instance {{ $labels.instance }} :{{ $labels.mountpoint }} 分区使用率过高" # custom summary
      description: "{{ $labels.instance }} : {{ $labels.job }} :{{ $labels.mountpoint }} 这个分区使用大于百分之80% (当前值:{{ $value }})" # custom detailed description
################################################################################
[root@prometheus ~]# cat /data/prometheus/rules/up_rules.yml
# Alert when any scrape target has been unreachable (up == 0) for one minute.
groups:
- name: general.rules
  rules:
  - alert: InstanceDown
    expr: up == 0
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "Instance {{ $labels.instance }} 停止工作" # custom summary
      # The text says "one minute" to agree with `for: 1m` above
      # (it previously claimed five minutes).
      description: "{{ $labels.instance }} : {{ $labels.job }} 已经停止一分钟以上了" # custom detailed description
################################################################################
# Host-level alerts for filesystem, memory and CPU usage.
# NOTE(review): the target file name is not shown in the article; to be loaded
# it must match the rule_files glob /data/prometheus/rules/*_rules.yml
# (presumably something like node_rules.yml; confirm). The NodeFilesystemUsage
# alert here overlaps with the one in disk_rules.yml.
groups:
- name: node.rules
  rules:
  # Any ext4/xfs filesystem more than 80% used for one minute.
  - alert: NodeFilesystemUsage
    expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
      description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"
  # Memory usage above 80%, counting free + cached + buffers as available.
  - alert: NodeMemoryUsage
    expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} 内存使用率过高"
      description: "{{ $labels.instance }}内存使用大于80% (当前值: {{ $value }})"
  # CPU usage above 80%. The expression is written as "100 - idle%" so that
  # {{ $value }} in the description is the usage percentage. The earlier form
  # (idle% < 20) fired under the same condition but reported the idle value.
  - alert: NodeCPUUsage
    expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} CPU使用率过高"
      description: "{{ $labels.instance }}CPU使用大于80% (当前值: {{ $value }})"
##########################################################################################
# Restart Prometheus so the new alerting and rule_files settings are loaded.
# This is the same start command as before; a Prometheus process that is still
# running must be stopped first (e.g. Ctrl-C if it runs in the foreground).
/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml