There are currently three Ceph clusters, and we need a highly available monitoring and alerting system that covers all three. Since Prometheus supports monitoring and alerting for Ceph, this article uses Prometheus + Alertmanager to build a relatively robust monitoring and alerting system.
bj03, bj04 and k8s-test each run an independent Prometheus instance, and all three scrape the full set of metrics;
if one Prometheus instance goes down, Grafana can still query the metrics from another Prometheus;
bj03, bj04 and k8s-test also run three Alertmanager instances that are aware of each other, and all of them receive alerts from the three Prometheus instances;
Alertmanager implements the gossip protocol; with the appropriate startup flags, the Alertmanager cluster will not send duplicate notifications for the same alert.
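For reference, a minimal sketch of the startup flags behind this deduplication (the binary path, port 8001 and the peer placeholders are assumptions; the actual values used in this article appear in the installation steps below):

alertmanager \
  --config.file=/path/to/alert_config.yml \
  --cluster.listen-address=:8001 \
  --cluster.peer=<other-alertmanager-1>:8001 \
  --cluster.peer=<other-alertmanager-2>:8001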
Based on the Prometheus and Alertmanager deployment described above, the architecture of the monitoring and alerting system is shown in the following diagram:
Without further ado, here is a flow diagram:
As shown in the diagram above:
1. Prometheus discovers ceph_exporter and the hosts it runs on via the scrape_configs section of its configuration file prometheus.yml;
2. Prometheus scrapes metric data from ceph_exporter;
3. Prometheus stores the scraped metrics in its built-in time series database and sends alerts that match the alerting rules to Alertmanager;
4. Alertmanager defines the routing rules and the receiver endpoints; it groups the alerts and forwards them to a webhook;
5. The webhook calls back the alert-center API; at this point the alert has reached the alert center (see the webhook implementation reference);
6. Based on the settings configured in the management platform, the alert center delivers the alerts to team members via V messages, SMS and other channels.
7. Grafana also supports Prometheus as a data source; it only needs to be configured in Grafana (see the provisioning sketch below and the Grafana configuration reference article).
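For step 7, a minimal sketch of provisioning the Prometheus data source from a file, which Grafana 5.x supports (the data source name, URL and file path are assumptions; the data source can equally be added in the Grafana UI):

# /etc/grafana/provisioning/datasources/prometheus.yml (hypothetical path)
apiVersion: 1
datasources:
  - name: Prometheus-ceph          # hypothetical name
    type: prometheus
    access: proxy
    url: http://10.xx.xx.xx:9090   # one of the Prometheus instances
    isDefault: true

Restart grafana-server after adding the file.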
The stack can be installed either with containers or directly on the host; the container-based steps are shown first, followed by the RPM-based installation.
#ops on 10.xxx.xxx.xxx
wget http://static.zybuluo.com/zphj1987/jiwx305b8q1hwc5uulo0z7ft/ceph_exporter-2.0.0-1.x86_64.rpm
rpm -ivh ceph_exporter-2.0.0-1.x86_64.rpm
systemctl start ceph_exporter
systemctl status ceph_exporter
#ops on 10.xxx.xxx.xxx
docker pull prom/prometheus:v2.3.2
docker pull prom/alertmanager:v0.16.0
docker pull docker.io/grafana/grafana:5.2.1
mkdir -p /etc/prometheus
cat /etc/prometheus/alert_config.yml
cat /etc/prometheus/alert_rules_szsk_04_17.yml
cat /etc/prometheus/prometheus_sz02_04_17.yml
docker run -d --name alertmager_sz02ceph -p 9096:9093 -v /etc/prometheus/alert_config.yml:/etc/alertmanager/config.yml prom/alertmanager:v0.16.0
docker run -d --name promethues_sz02ceph -p 9191:9090 -v /etc/prometheus/prometheus_sz02_04_17.yml:/etc/prometheus/prometheus.yml -v /etc/prometheus/alert_rules_sz02_04_17.yml:/etc/prometheus/alert_rules.yml prom/prometheus:v2.3.2
docker run -d --name=grafana -p 3000:3000 docker.io/grafana/grafana:5.2.1
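A quick sanity check once the containers are up, assuming the port mappings above (9191 for Prometheus, 9096 for Alertmanager, 3000 for Grafana) and ceph_exporter on its default port 9128:

curl -s http://localhost:9128/metrics | head   # ceph_exporter is exposing metrics
curl -s http://localhost:9191/-/healthy        # Prometheus liveness endpoint
curl -s http://localhost:9096/-/healthy        # Alertmanager liveness endpoint
curl -s http://localhost:3000/api/health       # Grafana health API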
wget http://static.zybuluo.com/zphj1987/jiwx305b8q1hwc5uulo0z7ft/ceph_exporter-2.0.0-1.x86_64.rpm
rpm -qpl ceph_exporter-2.0.0-1.x86_64.rpm
rpm -ivh ceph_exporter-2.0.0-1.x86_64.rpm
systemctl status ceph_exporter
systemctl start ceph_exporter
systemctl enable ceph_exporter
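Optionally confirm that the exporter is serving metrics before pointing Prometheus at it (9128 is the ceph_exporter port used throughout this article; ceph_health_status is one of the metrics referenced by the alert rules below):

curl -s http://localhost:9128/metrics | grep ceph_health_status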
wget http://static.zybuluo.com/zphj1987/7ro7up6r03kx52rkwy1qjuwm/prometheus-2.3.2-1.x86_64.rpm
rpm -qpl prometheus-2.3.2-1.x86_64.rpm
rpm -ivh prometheus-2.3.2-1.x86_64.rpm
vim /usr/lib/systemd/system/prometheus.service
--config.file=.../prometheus_xxx.yml
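A sketch of what the [Service] section might look like after editing, assuming the RPM installs the binary as /usr/bin/prometheus (verify with the rpm -qpl output above) and that data is kept under /var/lib/prometheus:

[Service]
ExecStart=/usr/bin/prometheus \
  --config.file=/etc/prometheus/prometheus_xxx.yml \
  --storage.tsdb.path=/var/lib/prometheus \
  --web.listen-address=:9090
Restart=on-failure

Run systemctl daemon-reload after changing the unit file.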
systemctl status prometheus
systemctl start prometheus
systemctl enable prometheus
netstat -tunlp|grep 9090
wget --content-disposition https://packagecloud.io/prometheus-rpm/release/packages/el/7/alertmanager-0.16.0-1.el7.centos.x86_64.rpm/download.rpm
### Note: Alertmanager 0.16 is used here; the earlier 0.13 release does not recognize the --cluster.listen-address and related flags needed for Alertmanager high availability
rpm -qpl alertmanager-0.16.0-1.el7.centos.x86_64.rpm
rpm -ivh alertmanager-0.16.0-1.el7.centos.x86_64.rpm
vim /usr/lib/systemd/system/alertmanager.service
--config.file=.../alert_config.yml \
--web.listen-address=:9096 \
--cluster.listen-address=:8001 \
--cluster.peer=[the other alertmanager ip:port]
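Putting these flags together, the ExecStart line might look like the sketch below (binary path, config path and peer addresses are placeholders; --cluster.peer is repeated once for each of the other Alertmanager nodes):

ExecStart=/usr/bin/alertmanager \
  --config.file=/path/to/alert_config.yml \
  --web.listen-address=:9096 \
  --cluster.listen-address=:8001 \
  --cluster.peer=<other-alertmanager-1>:8001 \
  --cluster.peer=<other-alertmanager-2>:8001

Remember to run systemctl daemon-reload before starting the service.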
systemctl status alertmanager
systemctl start alertmanager
systemctl enable alertmanager
netstat -tunlp | grep 9096
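To confirm that the gossip mesh has actually formed, the Status page of the Alertmanager web UI (http://<host>:9096/#/status) should list all cluster peers; the status API should return the same information:

curl -s http://localhost:9096/api/v1/status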
wget https://s3-us-west-2.amazonaws.com/grafana-releases/release/grafana-5.2.1-1.x86_64.rpm
yum install grafana-5.2.1-1.x86_64.rpm
systemctl start grafana-server.service
netstat -tunlp|grep grafana
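Optionally enable Grafana at boot and verify that it is serving:

systemctl enable grafana-server.service
curl -s http://localhost:3000/api/health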
# my global config
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 10.1xx.xxx.xxx:9093
      - 10.1xx.xxx.xxx:9093
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - alert_rules.yml
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'ceph-exporter'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['10.xx.xx.xx:9128']
    - targets: ['10.xx.xx.xx:9128']
    - targets: ['10.xx.xx.xx:9128']

  # The 'ceph-exporter-alias' job shows the cluster name instead of host:port via labels.
  # See https://zhuanlan.zhihu.com/p/77020680 and
  # https://github.com/prometheus/prometheus/blob/release-2.18/config/testdata/conf.good.yml
  - job_name: 'ceph-exporter-alias'
    file_sd_configs:
    - refresh_interval: 10s
      files:
      - '/etc/prometheus/ceph_exporter.yml'
    relabel_configs:
    - source_labels:
      - '__address__'
      regex: '(.*)'
      target_label: '__address__'
      action: replace
      replacement: '${1}'
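The file_sd job above reads its targets from /etc/prometheus/ceph_exporter.yml; a hypothetical sketch of that file is shown here, where the cluster label is an assumption used to tell the three clusters apart (the IPs stay masked as in the rest of this article):

- targets: ['10.xx.xx.xx:9128']
  labels:
    cluster: bj03
- targets: ['10.xx.xx.xx:9128']
  labels:
    cluster: bj04
- targets: ['10.xx.xx.xx:9128']
  labels:
    cluster: k8s-test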
groups:
- name: ceph.rules
  rules:
  - alert: CephTargetDown
    # the job names in this article's scrape_configs are 'ceph-exporter' and 'ceph-exporter-alias'
    expr: up{job=~"ceph-exporter.*"} == 0
    for: 10m
    labels:
      severity: critical
    annotations:
      description: CEPH target down for more than 10m, please check - it could be either an exporter crash or a whole cluster crash
      summary: CEPH exporter down
  - alert: CephErrorState
    expr: ceph_health_status > 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Ceph is in Error state longer than 5m, please check status of pools and OSDs
      summary: CEPH in ERROR
  - alert: OsdDown
    expr: ceph_osd_up == 0
    for: 30m
    labels:
      severity: warning
    annotations:
      description: OSD is down longer than 30 min, please check its status
      summary: OSD down
  - alert: OsdApplyLatencyTooHigh
    expr: ceph_osd_perf_apply_latency_seconds > 10
    for: 90s
    labels:
      severity: warning
    annotations:
      description: OSD latency for {{ $labels.osd }} is too high. Please check whether it is stuck in a weird state
      summary: OSD latency too high {{ $labels.osd }}
  - alert: MonitorClockSkewTooHigh
    expr: abs(ceph_monitor_clock_skew_seconds) > 0.1
    for: 60s
    labels:
      severity: warning
    annotations:
      description: Monitor clock skew detected on {{ $labels.monitor }} - please check ntp and hardware clock settings
      summary: Clock skew detected on {{ $labels.monitor }}
  - alert: MonitorAvailableStorage
    expr: ceph_monitor_avail_percent < 30
    for: 60s
    labels:
      severity: warning
    annotations:
      description: Monitor storage for {{ $labels.monitor }} less than 30% - please check why it is so low
      summary: Monitor storage for {{ $labels.monitor }} less than 30%
  - alert: MonitorAvailableStorage
    expr: ceph_monitor_avail_percent < 15
    for: 60s
    labels:
      severity: critical
    annotations:
      description: Monitor storage for {{ $labels.monitor }} less than 15% - please check why it is so low
      summary: Monitor storage for {{ $labels.monitor }} less than 15%
  - alert: CephOSDUtilization
    expr: ceph_osd_utilization > 90
    for: 60s
    labels:
      severity: critical
    annotations:
      description: OSD utilization for {{ $labels.osd }} is higher than 90%. Please check why it is so high, reweight or add storage
      summary: OSD {{ $labels.osd }} is going out of space
  - alert: CephPgDown
    expr: ceph_pg_down > 0
    for: 3m
    labels:
      severity: critical
    annotations:
      description: Some groups are down (unavailable) for too long on {{ $labels.cluster }}. Please ensure that all the data are available
      summary: PG DOWN [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgIncomplete
    expr: ceph_pg_incomplete > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      description: Some groups are incomplete (unavailable) for too long on {{ $labels.cluster }}. Please ensure that all the data are available
      summary: PG INCOMPLETE [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgInconsistent
    expr: ceph_pg_inconsistent > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      description: Some groups are inconsistent for too long on {{ $labels.cluster }}. Data is available but inconsistent across nodes
      summary: PG INCONSISTENT [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgActivating
    expr: ceph_pg_activating > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Some groups are activating for too long on {{ $labels.cluster }}. Those PGs are unavailable for too long!
      summary: PG ACTIVATING [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgBackfillTooFull
    expr: ceph_pg_backfill_toofull > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Some groups are located on a full OSD on cluster {{ $labels.cluster }}. Those PGs can become unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
      summary: PG TOO FULL [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgUnavailable
    expr: ceph_pg_total - ceph_pg_active > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Some groups are unavailable on {{ $labels.cluster }}. Please check their detailed status and current configuration.
      summary: PG UNAVAILABLE [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephOsdReweighted
    expr: ceph_osd_weight < 1
    for: 1h
    labels:
      severity: warning
    annotations:
      description: OSD {{ $labels.ceph_daemon }} on cluster {{ $labels.cluster }} has been reweighted for too long. Please either create a silence or fix the issue
      summary: OSD {{ $labels.ceph_daemon }} on {{ $labels.cluster }} reweighted - {{ $value }}
  - alert: CephAvailableBytesNotEnough
    expr: ceph_cluster_available_bytes / ceph_cluster_capacity_bytes < 0.3
    for: 1m
    labels:
      severity: warning
    annotations:
      description: Ceph cluster {{ $labels.cluster }} does not have enough available bytes. Please check the cluster's available capacity.
      summary: Ceph cluster {{ $labels.cluster }} available bytes [{{ $value }}].
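Before (re)loading Prometheus, the rule file can be validated with promtool, which ships with Prometheus 2.x (the path below is an assumption; point it at wherever the file is actually kept):

promtool check rules /etc/prometheus/alert_rules.yml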
global:

# The directory from which notification templates are read.
templates:
- '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' to send the initial notification.
  # This way ensures that you get multiple alerts for the same group that start
  # firing shortly after another are batched together on the first
  # notification.
  group_wait: 30s

  # When the first notification was sent, wait 'group_interval' to send a batch
  # of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' to
  # resend them.
  repeat_interval: 30m

  # A default receiver
  receiver: 'team-ceph-ops-mails'

  # All the above attributes are inherited by all child routes and can be
  # overwritten on each.

  # The child route trees.
  #routes:
  #- receiver: 'caas'
  #  match:
  #    alertname: 'PodCpuUsage'
  routes:
  - match_re:
      alertname: ^ceph.*
    receiver: team-ceph-ops-mails
  - match_re:
      alertname: ^skidc.*
    receiver: team-skidc-ops-mails

receivers:
- name: 'team-skidc-ops-mails'
  webhook_configs:
  - url: http://10.xx.xx.xx:8101/sendmms
  - url: http://10.xx.xx.xx:8101/sendmsg
- name: 'team-ceph-ops-mails'
  webhook_configs:
  - url: http://10.xx.xx.xx:8106/webhook/sendMsg
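The Alertmanager configuration can be validated in the same way with amtool, which ships with Alertmanager (the path is again an assumption), and a running instance can be asked to reload it with SIGHUP:

amtool check-config /etc/alertmanager/alert_config.yml
kill -HUP $(pidof alertmanager)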
References:
[1] https://ceph.io/planet/快速构建ceph可视化监控系统/