# Captured with: [www@c02-jenkins rule]$ cat node/node.yaml
---
# PrometheusRule "node-resources": host-level alerts (CPU, load, memory,
# inodes, disk space, disk I/O, network I/O) for Kubernetes nodes.
#
# Conventions used by every rule below:
#   * The same alert name is repeated once per severity tier; the tier is
#     carried in the `level` label (p4 = lowest threshold ... p1 = highest).
#   * Thresholds are lower bounds only, so when a higher tier fires the lower
#     tiers of the same alert are firing as well.
#     NOTE(review): presumably de-duplicated by Alertmanager inhibit rules
#     keyed on `level` - confirm.
#   * `for: 1m` - the expression must stay true for one minute before firing.
#   * The `value` annotation exposes the measured value to notifications.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # Explicit empty mapping instead of a bare `annotations:` (which is null).
  annotations: {}
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector - confirm.
    prometheus: k8s
    role: alert-rules
  name: node-resources
  namespace: kubesphere-monitoring-system
spec:
  groups:
    - name: "宿主机资源"
      rules:
        # ------------------------------------------------------------------
        # CPU utilisation (%): 2m rate of container_cpu_usage_seconds_total
        # for id="/" divided by machine_cpu_cores, per instance.
        # Tiers: p4 > 60, p3 > 65, p2 > 70, p1 > 75.
        # ------------------------------------------------------------------
        - alert: "CPU使用率"
          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 60
          for: 1m
          labels:
            level: p4
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS CPU使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"
        - alert: "CPU使用率"
          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 65
          for: 1m
          labels:
            level: p3
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS CPU使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"
        - alert: "CPU使用率"
          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 70
          for: 1m
          labels:
            level: p2
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS CPU使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"
        - alert: "CPU使用率"
          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 75
          for: 1m
          labels:
            level: p1
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS CPU使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"

        # ------------------------------------------------------------------
        # Load average compared with the number of idle-mode CPU series
        # (one per CPU) on the same target.
        # Tiers by averaging window: p3 = load1, p2 = load5, p1 = load15.
        # ------------------------------------------------------------------
        - alert: "load使用情况"
          expr: node_load1 > count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})
          for: 1m
          labels:
            level: p3
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS load过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】load过高,请检查!!!"
        - alert: "load使用情况"
          expr: node_load5 > count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})
          for: 1m
          labels:
            level: p2
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS load过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】load过高,请检查!!!"
        - alert: "load使用情况"
          expr: node_load15 > count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})
          for: 1m
          labels:
            level: p1
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS load过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】load过高,请检查!!!"

        # ------------------------------------------------------------------
        # Memory utilisation (%): MemTotal minus (MemFree + Cached + Buffers),
        # relative to MemTotal.
        # Tiers: p4 > 60, p3 > 65, p2 > 70, p1 > 75.
        # ------------------------------------------------------------------
        - alert: "内存使用率"
          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 60
          for: 1m
          labels:
            level: p4
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 内存使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"
        - alert: "内存使用率"
          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 65
          for: 1m
          labels:
            level: p3
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 内存使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"
        - alert: "内存使用率"
          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 70
          for: 1m
          labels:
            level: p2
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 内存使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"
        - alert: "内存使用率"
          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 75
          for: 1m
          labels:
            level: p1
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 内存使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"

        # ------------------------------------------------------------------
        # Inode utilisation (%) per filesystem, restricted to fstype matching
        # "ext.?|xfs". The description also reports the mountpoint label.
        # Tiers: p4 > 60, p3 > 65, p2 > 70, p1 > 75.
        # ------------------------------------------------------------------
        - alert: "Inode使用率"
          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 60
          for: 1m
          labels:
            level: p4
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS Inode使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"
        - alert: "Inode使用率"
          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 65
          for: 1m
          labels:
            level: p3
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS Inode使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"
        - alert: "Inode使用率"
          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 70
          for: 1m
          labels:
            level: p2
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS Inode使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"
        - alert: "Inode使用率"
          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 75
          for: 1m
          labels:
            level: p1
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS Inode使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"

        # ------------------------------------------------------------------
        # Disk space utilisation (%) per filesystem: (size - avail) / size,
        # skipping series whose device label is "rootfs".
        # Tiers: p4 > 65, p3 > 70, p2 > 75, p1 > 80.
        # ------------------------------------------------------------------
        - alert: "磁盘使用率"
          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 65
          for: 1m
          labels:
            level: p4
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"
        - alert: "磁盘使用率"
          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 70
          for: 1m
          labels:
            level: p3
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"
        - alert: "磁盘使用率"
          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 75
          for: 1m
          labels:
            level: p2
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"
        - alert: "磁盘使用率"
          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 80
          for: 1m
          labels:
            level: p1
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘使用率过高"
            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"

        # ------------------------------------------------------------------
        # Disk throughput: 5m rate of bytes read, summed over all disks of an
        # instance. Thresholds are bytes/second:
        #   p4 > 52428800 (50 MiB), p3 > 104857600 (100 MiB),
        #   p2 > 209715200 (200 MiB), p1 > 314572800 (300 MiB).
        # NOTE(review): only node_disk_read_bytes_total is evaluated, so write
        # throughput never triggers these alerts - confirm this is intended.
        # ------------------------------------------------------------------
        - alert: "磁盘I/O使用"
          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 52428800
          for: 1m
          labels:
            level: p4
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到50MB请检查!!!"
        - alert: "磁盘I/O使用"
          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 104857600
          for: 1m
          labels:
            level: p3
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到100MB请检查!!!"
        - alert: "磁盘I/O使用"
          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 209715200
          for: 1m
          labels:
            level: p2
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到200MB请检查!!!"
        - alert: "磁盘I/O使用"
          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 314572800
          for: 1m
          labels:
            level: p1
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 磁盘IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到300MB请检查!!!"

        # ------------------------------------------------------------------
        # Network throughput: 5m rate of bytes received plus bytes transmitted,
        # summed over all non-loopback interfaces of an instance.
        # These rules previously queried node_disk_read_bytes_total (copied
        # from the disk I/O rules), so they alerted on disk reads instead of
        # network traffic. Thresholds are bytes/second:
        #   p4 > 104857600 (100 MiB), p3 > 209715200 (200 MiB),
        #   p2 > 314572800 (300 MiB), p1 > 419430400 (400 MiB).
        # NOTE(review): virtual interfaces (veth, bridges, tunnels) are still
        # included and may count the same traffic more than once - tighten the
        # device filter if that matters for these nodes.
        # ------------------------------------------------------------------
        - alert: "网络I/O使用"
          expr: sum(rate(node_network_receive_bytes_total{device!="lo"}[5m]) + rate(node_network_transmit_bytes_total{device!="lo"}[5m])) by (instance) > 104857600
          for: 1m
          labels:
            level: p4
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 网络IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到100MB请检查!!!"
        - alert: "网络I/O使用"
          expr: sum(rate(node_network_receive_bytes_total{device!="lo"}[5m]) + rate(node_network_transmit_bytes_total{device!="lo"}[5m])) by (instance) > 209715200
          for: 1m
          labels:
            level: p3
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 网络IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到200MB请检查!!!"
        - alert: "网络I/O使用"
          expr: sum(rate(node_network_receive_bytes_total{device!="lo"}[5m]) + rate(node_network_transmit_bytes_total{device!="lo"}[5m])) by (instance) > 314572800
          for: 1m
          labels:
            level: p2
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 网络IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到300MB请检查!!!"
        - alert: "网络I/O使用"
          expr: sum(rate(node_network_receive_bytes_total{device!="lo"}[5m]) + rate(node_network_transmit_bytes_total{device!="lo"}[5m])) by (instance) > 419430400
          for: 1m
          labels:
            level: p1
            resources: node
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes ECS 网络IO使用过高"
            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到400MB请检查!!!"