Prometheus introduction:
Prometheus is an open-source systems monitoring and alerting toolkit. It has been accepted into the CNCF, becoming the second project hosted there after Kubernetes. It is commonly deployed alongside Kubernetes clusters for monitoring, supports a wide range of exporters for collecting metrics as well as a Pushgateway for pushing them, and its performance is sufficient for clusters of more than ten thousand machines.
Prometheus components:
prometheus: the core component; scrapes and stores metrics and evaluates alerting rules
alertmanager: receives alerts fired by Prometheus and sends out notifications
node_exporter: exposes host metrics such as CPU, memory and network
blackbox_exporter: black-box probing, e.g. HTTP status codes, host liveness and port checks
mysqld_exporter: exposes MySQL metrics
redis_exporter: exposes Redis metrics
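A quick way to confirm an exporter is serving data is to hit its metrics endpoint. This is only a sanity-check sketch; it assumes the node-exporter container from the compose file below is already running on this host and published on its default port 9100:
curl -s http://localhost:9100/metrics | grep '^node_cpu_seconds_total' | head -n 3   # should print a few CPU samples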
Setting it up with docker-compose:
mkdir -p /docker/{prometheus,prometheus/data,alertmanager,alertmanager/template,grafana,blackbox_exporter}
cd /docker
Create the Prometheus configuration file (docker-compose below mounts it into the container as /etc/prometheus/prometheus.yml):
vi /docker/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- docker_alertmanager_1:9093
rule_files:
- "*rules.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['192.1.1.12:9090']
- job_name: 'node'
static_configs:
- targets: ['192.1.1.12:9100','192.168.88.11:9100']
- job_name: 'alertmanager'
static_configs:
- targets: ['192.1.1.12:9093']
- job_name: 'cadvisor'
static_configs:
- targets: ['192.168.88.13:8080','192.168.88.11:8080']
- job_name: 'mysql-exporter'
#scrape_interval: 5s
static_configs:
- targets: ['192.168.88.13:9104']
- job_name: 'redis-exporter'
#scrape_interval: 5s
static_configs:
- targets: ['192.168.88.13:9121']
  # the redis targets in the job below are for a (6-node) Redis cluster; remove this job if you are not running a cluster
- job_name: 'redis_exporter_targets'
static_configs:
- targets:
- redis://172.16.8.58:6379
    metrics_path: /scrape   # redis_exporter's multi-target scrape endpoint
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 172.16.0.46:9121
  - job_name: 'http_status'        # job name
    metrics_path: /probe           # path the metrics are fetched from
    params:
      module: [http_2xx]           # module name defined in the blackbox_exporter config
    file_sd_configs:               # there are many probe targets, so they live in a separate file (described later)
      - files:
          - '/etc/prometheus/job-web.yml'
        refresh_interval: 30s      # re-read the file every 30s, so new targets are picked up without a restart
    relabel_configs:
      - source_labels: [__address__]    # the target's address, e.g. https://baidu.com when probing Baidu
        target_label: __param_target    # copy __address__ into the URL parameter "target", i.e. target=https://baidu.com
      - source_labels: [__param_target]
        target_label: instance          # copy __param_target into the instance label
      - target_label: __address__
        replacement: docker_blackbox_exporter_1:9115   # Prometheus does not request the site itself; it asks blackbox_exporter to probe it, so the scrape address is rewritten to the exporter's address
- job_name: 'rocketmq-exporter'
#scrape_interval: 5s
static_configs:
      - targets: ['<rocketmq-exporter-IP>:5557']
labels:
project: baidu
        instance: 172.30.0.150:9876   # extra label added to make searching easier
app: rocketmq
env: pro
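Once the stack is running, the scrape configuration can be verified through the Prometheus HTTP API. A small sketch, using the same IP as the prometheus job above:
curl -s http://192.1.1.12:9090/api/v1/targets | grep -o '"health":"[a-z]*"' | sort | uniq -c   # every target should report "up"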
vi /docker/prometheus/rules.yml    # YAML file, mind the indentation
groups:
- name: node-alert
rules:
- alert: NodeDown
expr: up{job="node"} == 0
for: 1m
labels:
severity: critical
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} down"
description: "Instance: {{ $labels.instance }} 已经宕机 1分钟"
value: "{{ $value }}"
- alert: NodeCpuHigh
expr: (1 - avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m]))) * 100 > 85
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} cpu使用率过高"
description: "CPU 使用率超过 80%"
value: "{{ $value }}"
- alert: NodeCpuIowaitHigh
expr: avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="iowait"}[5m])) * 100 > 80
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} cpu iowait 使用率过高"
description: "CPU iowait 使用率超过 50%"
value: "{{ $value }}"
- alert: NodeLoad5High
expr: node_load5 > (count by (instance) (node_cpu_seconds_total{job="node",mode='system'})) * 1.2
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} load(5m) 过高"
description: "Load(5m) 过高,超出cpu核数 1.2倍"
value: "{{ $value }}"
- alert: NodeMemoryHigh
expr: (1 - node_memory_MemAvailable_bytes{job="node"} / node_memory_MemTotal_bytes{job="node"}) * 100 > 90
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} memory 使用率过高"
description: "Memory 使用率超过 90%"
value: "{{ $value }}"
- alert: NodeDiskRootHigh
expr: (1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/"} / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/"}) * 100 > 90
for: 10m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk(/ 分区) 使用率过高"
description: "Disk(/ 分区) 使用率超过 90%"
value: "{{ $value }}"
- alert: NodeDiskBootHigh
expr: (1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/boot"}) * 100 > 80
for: 10m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk(/boot 分区) 使用率过高"
description: "Disk(/boot 分区) 使用率超过 80%"
value: "{{ $value }}"
- alert: NodeDiskReadHigh
expr: irate(node_disk_read_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk 读取字节数 速率过高"
description: "Disk 读取字节数 速率超过 20 MB/s"
value: "{{ $value }}"
- alert: NodeDiskWriteHigh
expr: irate(node_disk_written_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk 写入字节数 速率过高"
description: "Disk 写入字节数 速率超过 20 MB/s"
value: "{{ $value }}"
- alert: NodeDiskReadRateCountHigh
expr: irate(node_disk_reads_completed_total{job="node"}[5m]) > 3000
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk iops 每秒读取速率过高"
description: "Disk iops 每秒读取速率超过 3000 iops"
value: "{{ $value }}"
- alert: NodeDiskWriteRateCountHigh
expr: irate(node_disk_writes_completed_total{job="node"}[5m]) > 3000
for: 5m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk iops 每秒写入速率过高"
description: "Disk iops 每秒写入速率超过 3000 iops"
value: "{{ $value }}"
- alert: NodeInodeRootUsedPercentHigh
expr: (1 - node_filesystem_files_free{job="node",fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{job="node",fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80
for: 10m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk(/ 分区) inode 使用率过高"
description: "Disk (/ 分区) inode 使用率超过 80%"
value: "{{ $value }}"
- alert: NodeInodeBootUsedPercentHigh
expr: (1 - node_filesystem_files_free{job="node",fstype=~"ext4|xfs",mountpoint="/boot"} / node_filesystem_files{job="node",fstype=~"ext4|xfs",mountpoint="/boot"}) * 100 > 80
for: 10m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} disk(/boot 分区) inode 使用率过高"
description: "Disk (/boot 分区) inode 使用率超过 80%"
value: "{{ $value }}"
- alert: NodeFilefdAllocatedPercentHigh
expr: node_filefd_allocated{job="node"} / node_filefd_maximum{job="node"} * 100 > 80
for: 10m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} filefd 打开百分比过高"
description: "Filefd 打开百分比 超过 80%"
value: "{{ $value }}"
- alert: NodeNetworkNetinBitRateHigh
expr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
for: 3m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} network 接收比特数 速率过高"
description: "Network 接收比特数 速率超过 20MB/s"
value: "{{ $value }}"
- alert: NodeNetworkNetoutBitRateHigh
expr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
for: 3m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} network 发送比特数 速率过高"
description: "Network 发送比特数 速率超过 20MB/s"
value: "{{ $value }}"
- alert: NodeNetworkNetinPacketErrorRateHigh
expr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
for: 3m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} 接收错误包 速率过高"
description: "Network 接收错误包 速率超过 15个/秒"
value: "{{ $value }}"
- alert: NodeNetworkNetoutPacketErrorRateHigh
    expr: avg by (instance) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
for: 3m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} 发送错误包 速率过高"
description: "Network 发送错误包 速率超过 15个/秒"
value: "{{ $value }}"
- alert: NodeProcessBlockedHigh
expr: node_procs_blocked{job="node"} > 10
for: 10m
labels:
severity: warning
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} 当前被阻塞的任务的数量过多"
description: "Process 当前被阻塞的任务的数量超过 10个"
value: "{{ $value }}"
- alert: NodeTimeOffsetHigh
expr: abs(node_timex_offset_seconds{job="node"}) > 3 * 60
for: 2m
labels:
severity: info
instance: "{{ $labels.instance }}"
annotations:
summary: "instance: {{ $labels.instance }} 时间偏差过大"
description: "Time 节点的时间偏差超过 3m"
value: "{{ $value }}"
  - alert: WebProbeFailed
    expr: probe_http_status_code{not_200!="yes"} != 200
    for: 30s
    annotations:
      summary: "Web probe failed for {{ $labels.instance }}"
    labels:
      severity: critical
  - alert: WebResponseTimeOver3s
    expr: probe_duration_seconds >= 3
    for: 30s
    annotations:
      summary: "Web response time above 3s for {{ $labels.instance }}"
    labels:
      severity: warning
  - alert: SSLCertExpiresIn30Days
    expr: probe_ssl_earliest_cert_expiry - time() < 3600*24*30
    annotations:
      summary: "The certificate for {{ $labels.instance }} expires in less than 30 days"
    labels:
      severity: info
  - alert: SSLCertExpiresIn7Days
    expr: probe_ssl_earliest_cert_expiry - time() < 3600*24*7
    annotations:
      summary: "The certificate for {{ $labels.instance }} expires in less than 7 days"
    labels:
      severity: critical
  - alert: SSLCertExpiresIn1Day
    expr: probe_ssl_earliest_cert_expiry - time() < 3600*24*1
    annotations:
      summary: "The certificate for {{ $labels.instance }} expires in less than 1 day"
    labels:
      severity: critical
#groups:
# - name: mysql.rules
# rules:
- alert: MysqlDown
    expr: up{job="mysql-exporter"} == 0
for: 0m
labels:
severity: critical
annotations:
title: 'MySQL down'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL instance is down"
- alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 0m
labels:
severity: info
annotations:
title: 'MySQL Restarted'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL has just been restarted, less than one minute ago"
- alert: MysqlTooManyConnections(>80%)
expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL too many connections (> 80%)'
description: "Mysql实例: 【{{ $labels.instance }}】, More than 80% of MySQL connections are in use, Current Value: {{ $value }}%"
- alert: MysqlThreadsRunningHigh
expr: mysql_global_status_threads_running > 40
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL Threads_Running High'
description: "Mysql实例: 【{{ $labels.instance }}】, Threads_Running above the threshold(40), Current Value: {{ $value }}"
- alert: MysqlQpsHigh
expr: sum by (instance) (rate(mysql_global_status_queries[2m])) > 500
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL QPS High'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL QPS above 500"
- alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL slow queries'
description: "Mysql实例: 【{{ $labels.instance }}】, has some new slow query."
- alert: MysqlTooManyAbortedConnections
expr: round(increase(mysql_global_status_aborted_connects[5m])) > 20
for: 2m
labels:
severity: warning
annotations:
      title: 'MySQL too many aborted connections in 5 minutes'
      description: "MySQL instance {{ $labels.instance }}: {{ $value }} aborted connections within 5 minutes"
- alert: MysqlTooManyAbortedClients
expr: round(increase(mysql_global_status_aborted_clients[120m])) > 5000
for: 2m
labels:
severity: warning
annotations:
title: 'MySQL too many Aborted connections in 2 hours'
description: "Mysql实例: 【{{ $labels.instance }}】, {{ $value }} Aborted Clients within 2 hours"
- alert: MysqlSlaveIoThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
for: 0m
labels:
severity: critical
annotations:
title: 'MySQL Slave IO thread not running'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave IO thread not running"
- alert: MysqlSlaveSqlThreadNotRunning
expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
for: 0m
labels:
severity: critical
annotations:
title: 'MySQL Slave SQL thread not running'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave SQL thread not running"
- alert: MysqlSlaveReplicationLag
expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
for: 1m
labels:
severity: critical
annotations:
title: 'MySQL Slave replication lag'
description: "Mysql实例: 【{{ $labels.instance }}】, MySQL replication lag"
- alert: MysqlInnodbLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
for: 0m
labels:
severity: warning
annotations:
title: 'MySQL InnoDB log waits'
description: "Mysql实例: 【{{ $labels.instance }}】, innodb log writes stalling"
#groups:
# - name: Docker containers monitoring
# rules:
- alert: ContainerKilled
expr: time() - container_last_seen > 60
for: 5m
labels:
severity: warning
annotations:
summary: "Container killed (instance {{ $labels.instance }})"
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerCpuUsage
expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container CPU usage (instance {{ $labels.instance }})"
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerMemoryUsage
expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Memory usage (instance {{ $labels.instance }})"
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerVolumeUsage
expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Volume usage (instance {{ $labels.instance }})"
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerVolumeIoUsage
expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Container Volume IO usage (instance {{ $labels.instance }})"
description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "Container high throttle rate (instance {{ $labels.instance }})"
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
  - alert: PgbouncerActiveConnections
expr: pgbouncer_pools_server_active_connections > 200
for: 5m
labels:
severity: warning
annotations:
summary: "PGBouncer active connectinos (instance {{ $labels.instance }})"
description: "PGBouncer pools are filling up\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PgbouncerErrors
expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "PGBouncer errors (instance {{ $labels.instance }})"
description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: PgbouncerMaxConnections
expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "PGBouncer max connections (instance {{ $labels.instance }})"
description: "The number of PGBouncer client connections has reached max_client_conn.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: SidekiqQueueSize
expr: sidekiq_queue_size{} > 100
for: 5m
labels:
severity: warning
annotations:
summary: "Sidekiq queue size (instance {{ $labels.instance }})"
description: "Sidekiq queue {{ $labels.name }} is growing\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: SidekiqSchedulingLatencyTooHigh
expr: max(sidekiq_queue_latency) > 120
for: 5m
labels:
severity: critical
annotations:
summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
description: "Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ConsulServiceHealthcheckFailed
expr: consul_catalog_service_node_healthy == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ConsulMissingMasterNode
expr: consul_raft_peers < 3
for: 5m
labels:
severity: critical
annotations:
summary: "Consul missing master node (instance {{ $labels.instance }})"
description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ConsulAgentUnhealthy
expr: consul_health_node_status{status="critical"} == 1
for: 5m
labels:
severity: critical
annotations:
summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
#groups:
#- name: Redis
# rules:
- alert: RedisDown
expr: redis_up == 0
for: 5m
labels:
severity: error
annotations:
summary: "Redis down (instance {{ $labels.instance }})"
description: "Redis 挂了啊,mmp\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: MissingBackup
expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
for: 5m
labels:
severity: error
annotations:
summary: "Missing backup (instance {{ $labels.instance }})"
description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: OutOfMemory
expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
for: 5m
labels:
severity: warning
annotations:
summary: "Out of memory (instance {{ $labels.instance }})"
description: "Redis is running out of memory (> 90%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: ReplicationBroken
expr: delta(redis_connected_slaves[1m]) < 0
for: 5m
labels:
severity: error
annotations:
summary: "Replication broken (instance {{ $labels.instance }})"
description: "Redis instance lost a slave\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: TooManyConnections
expr: redis_connected_clients > 10
for: 1m
labels:
severity: warning
annotations:
summary: "Too many connections (instance {{ $labels.instance }})"
description: "Redis instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: NotEnoughConnections
expr: redis_connected_clients < 5
for: 5m
labels:
severity: warning
annotations:
summary: "Not enough connections (instance {{ $labels.instance }})"
description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
- alert: RejectedConnections
expr: increase(redis_rejected_connections_total[1m]) > 0
for: 5m
labels:
severity: error
annotations:
summary: "Rejected connections (instance {{ $labels.instance }})"
description: "Some connections to Redis has been rejected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# rules_rocketmq.yml
#groups:
#- name: rocketmq
# rules:
  - alert: RocketMQExporterDown
    expr: up{job="rocketmq-exporter"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: RocketMQ exporter {{ $labels.instance }} is down
  - alert: RocketMQMessageBacklog
    expr: (sum(irate(rocketmq_producer_offset[1m])) by (topic) - on(topic) group_right sum(irate(rocketmq_consumer_offset[1m])) by (group,topic)) > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: RocketMQ backlog for group={{ $labels.group }} topic={{ $labels.topic }} is {{ .Value }}
  - alert: GroupGetLatencyByStoretime
    expr: rocketmq_group_get_latency_by_storetime/1000 > 5 and rate(rocketmq_group_get_latency_by_storetime[5m]) > 0
    for: 3m
    labels:
      severity: warning
    annotations:
      description: 'consumer {{$labels.group}} on {{$labels.broker}}, {{$labels.topic}} consume time lags behind message store time (lag is {{$value}}).'
      summary: Consumer group consume latency is too high
  - alert: RocketMQClusterProduceHigh
    expr: sum(rocketmq_producer_tps) by (cluster) >= 20
    for: 3m
    labels:
      severity: warning
    annotations:
      description: '{{$labels.cluster}} sending TPS too high. Current TPS = {{ .Value }}'
      summary: Cluster send TPS too high (>= 20)
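YAML indentation mistakes are the easiest way to break a rule file, so it is worth validating it before loading. A sketch using the promtool binary bundled in the prom/prometheus image (a locally installed promtool works the same way):
docker run --rm -v /docker/prometheus:/cfg --entrypoint promtool prom/prometheus:latest check rules /cfg/rules.yml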
vi /docker/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m              # how long to wait before marking an alert as resolved (default 5m)
  # SMTP settings for e-mail notifications
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: '######'     # QQ mail authorization token
  smtp_require_tls: false
# notification templates
templates:
  - 'template/*.tmpl'              # template path
# routing
route:
  group_by: ['alertname']          # label used to group alerts
  group_wait: 10s                  # how long to wait before sending the first notification for a group
  group_interval: 10s              # how long to wait before sending notifications about new alerts added to a group
  repeat_interval: 1m              # how long to wait before re-sending a notification
  receiver: 'mail'                 # default receiver
# receivers
receivers:
  - name: 'mail'                   # receiver name
    email_configs:
      - to: '{{ template "email.to" . }}'          # alert recipients
        html: '{{ template "email.to.html" . }}'   # e-mail body template
        send_resolved: true
# inhibition rules
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
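Like the rule file, this configuration can be validated before (re)starting the stack. A sketch using the amtool binary bundled in the prom/alertmanager image:
docker run --rm -v /docker/alertmanager:/cfg --entrypoint amtool prom/alertmanager:latest check-config /cfg/alertmanager.yml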
vi /docker/alertmanager/template/em.tmpl
{{ define "email.from" }}[email protected]{{ end }}
{{ define "email.to" }}[email protected],[email protected]{{ end }}
{{ define "email.to.html" }}
{{ range .Alerts }}
=========start==========
        Alerting program: prometheus_alert
        Severity: {{ .Labels.severity }}
        Alert name: {{ .Labels.alertname }}
        Host: {{ .Labels.instance }}
        Summary: {{ .Annotations.summary }}
        Description: {{ .Annotations.description }}
        Fired at (UTC+8): {{ (.StartsAt.Add 28800e9) }}
=========end==========
{{ end }}
{{ end }}
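To see the rendered email without waiting for a real incident, a test alert can be pushed by hand. A sketch, assuming the Alertmanager from this guide is reachable on 192.1.1.12:9093 (the alert name and labels here are made up for the test):
docker run --rm --entrypoint amtool prom/alertmanager:latest alert add TestAlert severity=warning instance=test:9100 --annotation=summary="test notification" --alertmanager.url=http://192.1.1.12:9093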
vi /docker/blackbox_exporter/config.yml
modules:
http_2xx:
prober: http
http_post_2xx:
prober: http
http:
method: POST
tcp_connect:
prober: tcp
pop3s_banner:
prober: tcp
tcp:
query_response:
- expect: "^+OK"
tls: true
tls_config:
insecure_skip_verify: false
grpc:
prober: grpc
grpc:
tls: true
preferred_ip_protocol: "ip4"
grpc_plain:
prober: grpc
grpc:
tls: false
service: "service1"
ssh_banner:
prober: tcp
tcp:
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
irc_banner:
prober: tcp
tcp:
query_response:
- send: "NICK prober"
- send: "USER prober prober prober :prober"
- expect: "PING :([^ ]+)"
send: "PONG ${1}"
- expect: "^:[^ ]+ 001"
icmp:
prober: icmp
icmp_ttl5:
prober: icmp
timeout: 5s
icmp:
ttl: 5
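These modules are also handy for checking what the http_status job's relabeling ultimately requests: Prometheus calls the exporter's /probe endpoint with module and target parameters. A sketch, assuming the exporter is published on this host's port 9115 as in the compose file below:
curl -s 'http://localhost:9115/probe?module=http_2xx&target=https://www.baidu.com' | grep probe_success   # 1 means the probe succeeded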
Create the web-probe targets file referenced by the http_status job:
vi /docker/prometheus/job-web.yml
---
- targets:
- https://www.baidu.com/
labels:
env: pro
app: web
    project: baidu
    desc: Baidu production
- targets:
- https://blog.csdn.net/
labels:
env: test
app: web
project: CSDN
    desc: just a test
    not_200: yes    # custom label marking targets that legitimately do not return a 200 status code
vi /docker/docker-compose.yml
version: '3.7'
services:
node-exporter:
image: prom/node-exporter:latest
restart: always
ports:
- "9100:9100"
networks:
- prom
# dingtalk:
# image: timonwong/prometheus-webhook-dingtalk:latest
# restart: always
# volumes:
# - type: bind
# source: ./alertmanager/config.yml
# target: /etc/prometheus-webhook-dingtalk/config.yml
# read_only: true
# ports:
# - "8060:8060"
# networks:
# - prom
alertmanager:
#depends_on:
# - dingtalk
image: prom/alertmanager:latest
restart: always
volumes:
- type: bind
source: ./alertmanager/alertmanager.yml
target: /etc/alertmanager/alertmanager.yml
read_only: true
- type: volume
source: alertmanager
target: /etc/alertmanager
ports:
- "9093:9093"
- "9094:9094"
networks:
- prom
prometheus:
depends_on:
- alertmanager
image: prom/prometheus:latest
restart: always
command:
- --config.file=/etc/prometheus/prometheus.yml
- --web.enable-lifecycle
volumes:
- type: bind
source: ./prometheus/prometheus.yml
target: /etc/prometheus/prometheus.yml
read_only: true
- type: bind
source: ./prometheus/rules.yml
target: /etc/prometheus/rules.yml
read_only: true
      - type: bind
        source: ./prometheus/job-web.yml
        target: /etc/prometheus/job-web.yml
        read_only: true
- type: volume
source: prometheus
target: /prometheus
ports:
- "9090:9090"
networks:
- prom
grafana:
depends_on:
- prometheus
image: grafana/grafana:latest
restart: always
volumes:
- type: volume
source: grafana
target: /var/lib/grafana
ports:
- "3000:3000"
networks:
- prom
cadvisor:
image: google/cadvisor:latest
#container_name: cadvisor
hostname: cadvisor
restart: always
volumes:
- /:/rootfs:ro
- /var/run:/var/run:rw
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
ports:
- "8080:8080"
networks:
- prom
privileged: true
mysqld-exporter:
image: prom/mysqld-exporter
hostname: mysqld-exporter
restart: always
ports:
- "9104:9104"
environment:
      - DATA_SOURCE_NAME=root:12345@(10.211.122.9:3306)/   # username:password@(ip:port)
networks:
- prom
redis-exporter:
image: oliver006/redis_exporter
#container_name: mysqld-exporter
hostname: redis-exporter
restart: always
ports:
- "9121:9121"
command:
- --redis.addr=redis://10.211.122.9:6379
- --redis.password=123456
networks:
- prom
blackbox_exporter:
#container_name: blackbox_exporter
image: prom/blackbox-exporter:master
restart: always
volumes:
- /docker/blackbox_exporter/config.yml:/etc/blackbox_exporter/config.yml
    ports:
      - 9115:9115
    networks:
      - prom
rocketmq-exporter:
image: sawyerlan/rocketmq-exporter
#container_name: mysqld-exporter
    hostname: rocketmq-exporter
restart: always
ports:
- "5557:5557"
command:
#- --rocketmq.config.namesrvAddr=172.30.0.150:9876;172.30.0.151:9876
- --rocketmq.config.namesrvAddr=172.30.0.150:9876
networks:
- prom
volumes:
prometheus:
driver: local
driver_opts:
type: none
o: bind
device: /docker/prometheus/data
grafana:
driver: local
driver_opts:
type: none
o: bind
device: /docker/grafana
alertmanager:
driver: local
driver_opts:
type: none
o: bind
device: /docker/alertmanager
networks:
prom:
driver: bridge
Run docker-compose to bring the stack up (a minimal start-up sequence is sketched below). Once it is running, Prometheus can be hot-reloaded after prometheus.yml or rules.yml changes (enabled by --web.enable-lifecycle):
curl -X POST http://192.1.1.12:9090/-/reload
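A minimal start-up sequence, assuming the file above was saved as /docker/docker-compose.yml:
cd /docker
docker-compose up -d     # create and start all services in the background
docker-compose ps        # every container should show State: Up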
Install script (prometheusinstall.sh): https://www.aliyundrive.com/s/X5qiSZMeMDk
Grafana dashboard IDs:
blackbox_exporter (black-box probing): 14928, 14603
rocketmq_exporter: 14612, 14883
node_exporter: 8919
Docker containers: 13631
MySQL: 14934 (the "Buffer Pool Size of Total RAM" panel may show no data)