68.k8s告警规则

[www@c02-jenkins container]$ cat hpa.yaml

# PrometheusRule: HorizontalPodAutoscaler health alerts built on the
# `kube_hpa_*` series exported by kube-state-metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations: null
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector - confirm.
    prometheus: k8s
    role: alert-rules
  name: container-hpa
  namespace: kubesphere-monitoring-system
spec:
  groups:
    - name: "容器伸缩"
      rules:
        # Fires while the HPA reports condition AbleToScale=false.
        - alert: "Pod伸缩异常"
          expr: kube_hpa_status_condition{condition="AbleToScale",status="false"} == 1
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes 伸缩异常"
            # The HPA is identified by its namespace/hpa labels; `instance` on
            # these series normally names the scraped exporter endpoint, not
            # the autoscaler, so it is not useful in the message.
            description: "Kubernetes HPA【{{ $labels.namespace }}/{{ $labels.hpa }}】伸缩异常, 请检查!!!"

        # Fires while the HPA reports condition ScalingActive=false.
        - alert: "HPA收集数据异常"
          expr: kube_hpa_status_condition{condition="ScalingActive",status="false"} == 1
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod HPA收集监控数据异常"
            description: "Kubernetes HPA【{{ $labels.namespace }}/{{ $labels.hpa }}】收集指标异常, 请检查!!!"

        # Fires when the desired replica count has reached the configured maximum.
        - alert: "HPA伸缩能力"
          expr: kube_hpa_status_desired_replicas >= kube_hpa_spec_max_replicas
          for: 1m
          labels:
            level: p2
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod HPA扩容节点已经达到最大"
            description: "Kubernetes HPA【{{ $labels.namespace }}/{{ $labels.hpa }}】扩容节点已经达到最大, 请检查!!!"


[www@c02-jenkins container]$ cat resources.yaml

# PrometheusRule header for the container resource-usage alert group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: container-resources
  namespace: kubesphere-monitoring-system
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector - confirm.
    prometheus: k8s
    role: alert-rules
  annotations: null
spec:
  groups:
    - name: '容器资源'
      rules:

        - alert: "CPU使用率"

          expr: sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m])) by (pod,namespace,instance,container) / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000) by (pod,namespace,instance,container)) * 100 > 60

          for: 1m

          labels:

            level: p4

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod CPU使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU使用率过高,请检查!!!"

        - alert: "CPU使用率"

          expr: sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m])) by (pod,namespace,instance,container) / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000) by (pod,namespace,instance,container)) * 100 > 70

          for: 1m

          labels:

            level: p3

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod CPU使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU使用率过高,请检查!!!"

        - alert: "CPU使用率"

          expr: sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m])) by (pod,namespace,instance,container) / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000) by (pod,namespace,instance,container)) * 100 > 80

          for: 1m

          labels:

            level: p2

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod CPU使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU使用率过高,请检查!!!"

        - alert: "CPU使用率"

          expr: sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m])) by (pod,namespace,instance,container) / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000) by (pod,namespace,instance,container)) * 100 > 90

          for: 1m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod CPU使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU使用率过高,请检查!!!"

        - alert: "内存使用率"

          expr: sum (container_memory_rss{container !="",container!="POD"}) by (pod,namespace,instance,container)/ sum(container_spec_memory_limit_bytes{container !="",container!="POD"}) by (pod,namespace,instance,container) * 100 > 80 

          for: 1m

          labels:

            level: p4

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 内存使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"

        - alert: "内存使用率"

          expr: sum (container_memory_rss{container !="",container!="POD"}) by (pod,namespace,instance,container)/ sum(container_spec_memory_limit_bytes{container !="",container!="POD"}) by (pod,namespace,instance,container) * 100 > 85

          for: 1m

          labels:

            level: p3

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 内存使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"

        - alert: "内存使用率"

          expr: sum (container_memory_rss{container !="",container!="POD"}) by (pod,namespace,instance,container)/ sum(container_spec_memory_limit_bytes{container !="",container!="POD"}) by (pod,namespace,instance,container) * 100 > 90

          for: 1m

          labels:

            level: p2

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 内存使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"

        - alert: "内存使用率"

          expr: sum (container_memory_rss{container !="",container!="POD"}) by (pod,namespace,instance,container)/ sum(container_spec_memory_limit_bytes{container !="",container!="POD"}) by (pod,namespace,instance,container) * 100 > 95

          for: 1m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 内存使用率过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 10485760

          for: 1m

          labels:

            level: p4

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 磁盘IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到10MB请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 20971520

          for: 1m

          labels:

            level: p3

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 磁盘IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到20MB请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 31457280

          for: 1m

          labels:

            level: p2

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 磁盘IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到30MB请检查!!!"

        - alert: "磁盘I/O使用"

          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 41943040

          for: 1m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 磁盘IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到40MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 20971520

          for: 1m

          labels:

            level: p4

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 网络IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到20MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 31457280

          for: 1m

          labels:

            level: p3

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 网络IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到30MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 41943040

          for: 1m

          labels:

            level: p2

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 网络IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到40MB请检查!!!"

        - alert: "网络I/O使用"

          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 51943040

          for: 1m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 网络IO使用过高"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到50MB请检查!!!"


[www@c02-jenkins container]$ cat slos.yaml

# PrometheusRule header for the API / kube-state-metrics error-rate alert group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: kube-slos
  namespace: kubesphere-monitoring-system
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector - confirm.
    prometheus: k8s
    role: alert-rules
  annotations: null
spec:
  groups:
    - name: '接口错误率'
      rules:

        - alert: "KubeAPI错误预算"

          expr: sum(apiserver_request:burnrate3d) > (1 * 0.01) and sum(apiserver_request:burnrate6h) > (1 * 0.01)

          for: 1h

          labels:

            level: p4

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes API错误预算"

            description: "Kubernetes API错误预算较多, 请检查!!!"

        - alert: "KubeAPI错误预算"

          expr: sum(apiserver_request:burnrate1d) > (3 * 0.01) and sum(apiserver_request:burnrate2h) > (3 * 0.01)

          for: 1h

          labels:

            level: p3

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes API错误预算"

            description: "Kubernetes API错误预算较多, 请检查!!!"

        - alert: "KubeAPI错误预算"

          expr: sum(apiserver_request:burnrate6h) > (6 * 0.01) and sum(apiserver_request:burnrate30m) > (6 * 0.01)

          for: 15m

          labels:

            level: p2

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes API错误预算"

            description: "Kubernetes API错误预算较多, 请检查!!!"

        - alert: "KubeAPI错误预算"

          expr: sum(apiserver_request:burnrate1h) > (14.4 * 0.01) and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)

          for: 2m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes API错误预算"

            description: "Kubernetes API错误预算较多, 请检查!!!"

        - alert: "KubeStateMetricsList错误率"

          expr: (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) > 0.01

          for: 2m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes kube-state-metrics List错误率"

            description: "Kubernetes kube-state-metrics List错误率较高, 请检查!!!"

        - alert: "KubeStateMetricsWatch错误率"

          expr: (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) > 0.01

          for: 2m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes kube-state-metrics Watch错误率"

            description: "Kubernetes kube-state-metrics Watch错误率较高, 请检查!!!"


[www@c02-jenkins container]$ cat status.yaml

# PrometheusRule header for the Pod / Deployment status alert group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: container-status
  namespace: kubesphere-monitoring-system
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector - confirm.
    prometheus: k8s
    role: alert-rules
  annotations: null
spec:
  groups:
    - name: '容器状态'
      rules:

        - alert: "Pod异常重启"

          expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m])* 60 * 5 > 0

          for: 1m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 异常重启"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】异常重启, 请检查!!!"

        - alert: "Pod处于准备阶段"

          expr: sum by (pod,namespace,instance,container) (max by(pod,namespace,instance,container) (kube_pod_status_phase{job="kube-state-metrics",phase=~"Pending|Unknown"}) * on(pod,namespace,instance,container) group_left(owner_kind) max by(pod,namespace,instance,container, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0

          for: 2m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 处于Ready阶段"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】启动异常,长时间处于Ready阶段, 请检查!!!"

        - alert: "Pod处于等待状态"

          expr: sum by (namespace, pod, container, instance) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0

          for: 2m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod 处于waiting状态"

            description: "Kubernetes Pod节点【{{ $labels.pod }}】状态异常,长时间处于waiting状态, 请检查!!!"

        - alert: "Deployment部署状态不匹配"

          expr: kube_deployment_status_observed_generation{job="kube-state-metrics"} != kube_deployment_metadata_generation{job="kube-state-metrics"}

          for: 5m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Deployment部署失败"

            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】部署状态异常,这表示部署失败,但尚未回滚, 请检查!!!"

        - alert: "Deployment副本数不匹配"

          expr: (kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_replicas_available{job="kube-state-metrics"}) and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)

          for: 5m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Deployment期望数量不匹配"

            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】与预期的副本数不匹配, 请检查!!!"

        - alert: "Deployment Pod扩缩容"

          expr: changes(kube_deployment_status_replicas_available[2m]) > 0

          for: 1m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Pod进行了扩缩容"

            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】Pod进行了扩缩容, 请检查!!!"

        - alert: "Deployment Pod异常"

          expr: kube_deployment_status_replicas_unavailable > 0

          for: 1m

          labels:

            level: p1

            resources: container

          annotations:

            value: "{{ $value }}"

            summary: "Kubernetes Deployment 存在不可用Pod"

            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】存在不可用Pod, 请检查!!!"

你可能感兴趣的:(68.k8s告警规则)