67.k8s 宿主机告警规则

[www@c02-jenkins rule]$ cat node/node.yaml

# PrometheusRule custom resource (monitoring.coreos.com/v1) holding host-level
# alert rules for Kubernetes nodes.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # Explicit empty mapping: a bare `annotations:` key parses as null.
  annotations: {}
  # NOTE(review): these labels are presumably what the Prometheus instance's
  # rule selector matches on - confirm against the Prometheus CR.
  labels:
    prometheus: k8s
    role: alert-rules
  name: node-resources
  namespace: kubesphere-monitoring-system
spec:
  groups:
    # Single rule group; each rule below carries a `level` label (p4..p1) and
    # `resources: node`.
    - name: '宿主机资源'
      rules:

        - alert: "CPU使用率"

          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 60

          for: 1m

          labels:

            level: p4

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS CPU使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"

        - alert: "CPU使用率"

          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 65

          for: 1m

          labels:

            level: p3

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS CPU使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"

        - alert: "CPU使用率"

          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 70

          for: 1m

          labels:

            level: p2

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS CPU使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"

        - alert: "CPU使用率"

          expr: sum (rate (container_cpu_usage_seconds_total{id="/"}[2m])) by (instance) / sum (machine_cpu_cores) by (instance) * 100 > 75

          for: 1m

          labels:

            level: p1

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS CPU使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】CPU使用率过高,请检查!!!"

        - alert: "load使用情况"

          expr: node_load1 > count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})

          for: 1m

          labels:

            level: p3

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS load过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】load过高,请检查!!!"

        - alert: "load使用情况"

          expr: node_load5 > count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})

          for: 1m

          labels:

            level: p2

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS load过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】load过高,请检查!!!"

        - alert: "load使用情况"

          expr: node_load15 > count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})

          for: 1m

          labels:

            level: p1

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS load过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】load过高,请检查!!!"

        - alert: "内存使用率"

          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 60

          for: 1m

          labels:

            level: p4

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 内存使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"

        - alert: "内存使用率"

          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 65

          for: 1m

          labels:

            level: p3

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 内存使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"

        - alert: "内存使用率"

          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 70

          for: 1m

          labels:

            level: p2

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 内存使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"

        - alert: "内存使用率"

          expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes * 100 > 75

          for: 1m

          labels:

            level: p1

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 内存使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】内存使用率过高,请检查!!!"

        - alert: "Inode使用率"

          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 60

          for: 1m

          labels:

            level: p4

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS Inode使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"

        - alert: "Inode使用率"

          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 65

          for: 1m

          labels:

            level: p3

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS Inode使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"

        - alert: "Inode使用率"

          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 70

          for: 1m

          labels:

            level: p2

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS Inode使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"

        - alert: "Inode使用率"

          expr: (node_filesystem_files{fstype=~"ext.?|xfs"} - node_filesystem_files_free{fstype=~"ext.?|xfs"}) / node_filesystem_files{fstype=~"ext.?|xfs"} * 100 > 75

          for: 1m

          labels:

            level: p1

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS Inode使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】Inode使用率过高,请检查!!!"

        - alert: "磁盘使用率"

          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 65

          for: 1m

          labels:

            level: p4

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"

        - alert: "磁盘使用率"

          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 70

          for: 1m

          labels:

            level: p3

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"

        - alert: "磁盘使用率"

          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 75

          for: 1m

          labels:

            level: p2

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"

        - alert: "磁盘使用率"

          expr: (node_filesystem_size_bytes{device!="rootfs"} - node_filesystem_avail_bytes{device!="rootfs"}) / node_filesystem_size_bytes{device!="rootfs"} * 100 > 80

          for: 1m

          labels:

            level: p1

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘使用率过高"

            description: "Kubernetes node节点【{{ $labels.instance }} 分区{{ $labels.mountpoint }}】磁盘使用率过高,请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 52428800

          for: 1m

          labels:

            level: p4

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到50MB请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 104857600

          for: 1m

          labels:

            level: p3

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到100MB请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 209715200

          for: 1m

          labels:

            level: p2

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到200MB请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 314572800

          for: 1m

          labels:

            level: p1

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 磁盘IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】磁盘IO使用率过高,每秒达到300MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 104857600

          for: 1m

          labels:

            level: p4

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 网络IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到100MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 209715200

          for: 1m

          labels:

            level: p3

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 网络IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到200MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 314572800

          for: 1m

          labels:

            level: p2

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 网络IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到300MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(node_disk_read_bytes_total[5m])) by (instance) > 419430400

          for: 1m

          labels:

            level: p1

            resources: node

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes ECS 网络IO使用过高"

            description: "Kubernetes node节点【{{ $labels.instance }}】网络IO使用率过高,每秒达到400MB请检查!!!"

你可能感兴趣的:(67.k8s 宿主机告警规则)