[www@c02-jenkins container]$ cat hpa.yaml
# PrometheusRule "container-hpa": alerts on HorizontalPodAutoscaler (HPA)
# conditions and replica limits, evaluated from the kube_hpa_* series.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations: null
  labels:
    prometheus: k8s
    role: alert-rules
  name: container-hpa
  namespace: kubesphere-monitoring-system
spec:
  groups:
    - name: "容器伸缩"
      rules:
        # The HPA has reported condition AbleToScale=false for at least 1m.
        - alert: "Pod伸缩异常"
          expr: kube_hpa_status_condition{condition="AbleToScale",status="false"} == 1
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes 伸缩异常"
            # Identify the HPA by namespace + hpa. The `instance` label on
            # these series is presumably the exporter's scrape address, not
            # the HPA (assumes kube_hpa_* carries `namespace` and `hpa`
            # labels -- TODO confirm against the deployed kube-state-metrics).
            description: "Kubernetes 【{{ $labels.namespace }} {{ $labels.hpa }}】伸缩异常重启, 请检查!!!"
        # The HPA has reported condition ScalingActive=false for at least 1m.
        - alert: "HPA收集数据异常"
          expr: kube_hpa_status_condition{condition="ScalingActive",status="false"} == 1
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod HPA收集监控数据异常"
            description: "Kubernetes Pod节点【{{ $labels.namespace }} {{ $labels.hpa }}】收集指标异常重启, 请检查!!!"
        # Desired replicas have been at or above the configured maximum for 1m.
        - alert: "HPA伸缩能力"
          expr: kube_hpa_status_desired_replicas >= kube_hpa_spec_max_replicas
          for: 1m
          labels:
            level: p2
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod HPA扩容节点已经达到最大"
            description: "Kubernetes Pod节点【{{ $labels.namespace }} {{ $labels.hpa }}】扩容节点已经达到最大, 请检查!!!"
[www@c02-jenkins container]$ cat resources.yaml
# PrometheusRule "container-resources": per-container CPU, memory, disk-read and
# network-receive alerts. Each signal is defined four times under the same
# alert name with rising thresholds and levels p4 -> p1. The thresholds are
# lower bounds only, so every tier whose threshold is exceeded fires at once.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations: null
  labels:
    prometheus: k8s
    role: alert-rules
  name: container-resources
  namespace: kubesphere-monitoring-system
spec:
  groups:
    - name: "容器资源"
      rules:
        # --- CPU: usage rate over 2m as a percentage of the CPU quota. ---
        # The quota is divided by 100000, presumably the default CFS period in
        # microseconds -- confirm if the kubelet uses a custom period.
        - alert: "CPU使用率"
          expr: >-
            sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m]))
            by (pod,namespace,instance,container)
            / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000)
            by (pod,namespace,instance,container))
            * 100 > 60
          for: 1m
          labels:
            level: p4
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod CPU使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU使用率过高,请检查!!!"
        - alert: "CPU使用率"
          expr: >-
            sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m]))
            by (pod,namespace,instance,container)
            / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000)
            by (pod,namespace,instance,container))
            * 100 > 70
          for: 1m
          labels:
            level: p3
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod CPU使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU使用率过高,请检查!!!"
        - alert: "CPU使用率"
          expr: >-
            sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m]))
            by (pod,namespace,instance,container)
            / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000)
            by (pod,namespace,instance,container))
            * 100 > 80
          for: 1m
          labels:
            level: p2
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod CPU使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU使用率过高,请检查!!!"
        - alert: "CPU使用率"
          expr: >-
            sum(rate(container_cpu_usage_seconds_total{container !="",container!="POD"}[2m]))
            by (pod,namespace,instance,container)
            / (sum(container_spec_cpu_quota{container !="",container!="POD"}/100000)
            by (pod,namespace,instance,container))
            * 100 > 90
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod CPU使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】CPU use率过高,请检查!!!"
        # --- Memory: RSS as a percentage of the memory limit. ---
        # The `> 0` filter on the limit drops containers whose limit series is
        # 0 (no limit configured); without it the division yields +Inf and
        # every tier fires for those containers.
        - alert: "内存使用率"
          expr: >-
            sum (container_memory_rss{container !="",container!="POD"})
            by (pod,namespace,instance,container)
            / (sum(container_spec_memory_limit_bytes{container !="",container!="POD"})
            by (pod,namespace,instance,container) > 0)
            * 100 > 80
          for: 1m
          labels:
            level: p4
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 内存使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"
        - alert: "内存使用率"
          expr: >-
            sum (container_memory_rss{container !="",container!="POD"})
            by (pod,namespace,instance,container)
            / (sum(container_spec_memory_limit_bytes{container !="",container!="POD"})
            by (pod,namespace,instance,container) > 0)
            * 100 > 85
          for: 1m
          labels:
            level: p3
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 内存使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"
        - alert: "内存使用率"
          expr: >-
            sum (container_memory_rss{container !="",container!="POD"})
            by (pod,namespace,instance,container)
            / (sum(container_spec_memory_limit_bytes{container !="",container!="POD"})
            by (pod,namespace,instance,container) > 0)
            * 100 > 90
          for: 1m
          labels:
            level: p2
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 内存使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"
        - alert: "内存使用率"
          expr: >-
            sum (container_memory_rss{container !="",container!="POD"})
            by (pod,namespace,instance,container)
            / (sum(container_spec_memory_limit_bytes{container !="",container!="POD"})
            by (pod,namespace,instance,container) > 0)
            * 100 > 95
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 内存使用率过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】内存使用率过高,请检查!!!"
        # --- Disk I/O: filesystem read bytes per second (5m rate). ---
        # Only reads are measured. Thresholds are 10/20/30/40 MiB per second
        # (N * 1048576 bytes).
        - alert: "磁盘I/O使用"
          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 10485760
          for: 1m
          labels:
            level: p4
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 磁盘IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到10MB请检查!!!"
        - alert: "磁盘I/O使用"
          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 20971520
          for: 1m
          labels:
            level: p3
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 磁盘IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到20MB请检查!!!"
        - alert: "磁盘I/O使用"
          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 31457280
          for: 1m
          labels:
            level: p2
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 磁盘IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到30MB请检查!!!"
        - alert: "磁盘I/O使用"
          expr: sum(rate(container_fs_reads_bytes_total[5m])) by (pod,namespace,instance,container) > 41943040
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 磁盘IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】磁盘IO使用率过高,每秒达到40MB请检查!!!"
        # --- Network I/O: received bytes per second (5m rate). ---
        # Only receive traffic is measured. Thresholds are 20/30/40/50 MiB per
        # second (N * 1048576 bytes).
        - alert: "网络I/O使用"
          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 20971520
          for: 1m
          labels:
            level: p4
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 网络IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到20MB请检查!!!"
        - alert: "网络I/O使用"
          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 31457280
          for: 1m
          labels:
            level: p3
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 网络IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到30MB请检查!!!"
        - alert: "网络I/O使用"
          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 41943040
          for: 1m
          labels:
            level: p2
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 网络IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到40MB请检查!!!"
        # 50 MiB/s = 50 * 1048576 = 52428800 bytes/s.
        - alert: "网络I/O使用"
          expr: sum(rate(container_network_receive_bytes_total[5m])) by (pod,namespace,instance,container) > 52428800
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 网络IO使用过高"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】网络IO使用率过高,每秒达到50MB请检查!!!"
[www@c02-jenkins container]$ cat slos.yaml
# PrometheusRule "kube-slos": API-server error-budget burn alerts plus
# kube-state-metrics list/watch error-rate alerts.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations: null
  labels:
    prometheus: k8s
    role: alert-rules
  name: kube-slos
  namespace: kubesphere-monitoring-system
spec:
  groups:
    - name: '接口错误率'
      rules:
        # Error-budget burn alerts, one per level. Each rule fires only when
        # BOTH a long-window and a short-window burn rate exceed the same
        # factor * 0.01. The apiserver_request:burnrate* series are not defined
        # in this manifest and must come from recording rules elsewhere.
        #
        # p4: 3d and 6h windows, factor 1, held for 1h.
        - alert: 'KubeAPI错误预算'
          expr: >-
            sum(apiserver_request:burnrate3d) > (1 * 0.01)
            and sum(apiserver_request:burnrate6h) > (1 * 0.01)
          for: 1h
          labels:
            level: p4
            resources: container
          annotations:
            value: '{{ $value }}'
            summary: 'Kubernetes API错误预算'
            description: 'Kubernetes API错误预算较多, 请检查!!!'
        # p3: 1d and 2h windows, factor 3, held for 1h.
        - alert: 'KubeAPI错误预算'
          expr: >-
            sum(apiserver_request:burnrate1d) > (3 * 0.01)
            and sum(apiserver_request:burnrate2h) > (3 * 0.01)
          for: 1h
          labels:
            level: p3
            resources: container
          annotations:
            value: '{{ $value }}'
            summary: 'Kubernetes API错误预算'
            description: 'Kubernetes API错误预算较多, 请检查!!!'
        # p2: 6h and 30m windows, factor 6, held for 15m.
        - alert: 'KubeAPI错误预算'
          expr: >-
            sum(apiserver_request:burnrate6h) > (6 * 0.01)
            and sum(apiserver_request:burnrate30m) > (6 * 0.01)
          for: 15m
          labels:
            level: p2
            resources: container
          annotations:
            value: '{{ $value }}'
            summary: 'Kubernetes API错误预算'
            description: 'Kubernetes API错误预算较多, 请检查!!!'
        # p1: 1h and 5m windows, factor 14.4, held for 2m.
        - alert: 'KubeAPI错误预算'
          expr: >-
            sum(apiserver_request:burnrate1h) > (14.4 * 0.01)
            and sum(apiserver_request:burnrate5m) > (14.4 * 0.01)
          for: 2m
          labels:
            level: p1
            resources: container
          annotations:
            value: '{{ $value }}'
            summary: 'Kubernetes API错误预算'
            description: 'Kubernetes API错误预算较多, 请检查!!!'
        # Share of kube-state-metrics LIST calls with result="error" over 5m
        # is above 1%, held for 2m.
        - alert: 'KubeStateMetricsList错误率'
          expr: >-
            (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
            / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
            > 0.01
          for: 2m
          labels:
            level: p1
            resources: container
          annotations:
            value: '{{ $value }}'
            summary: 'Kubernetes kube-state-metrics List错误率'
            description: 'Kubernetes kube-state-metrics List错误率较高, 请检查!!!'
        # Share of kube-state-metrics WATCH calls with result="error" over 5m
        # is above 1%, held for 2m.
        - alert: 'KubeStateMetricsWatch错误率'
          expr: >-
            (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
            / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
            > 0.01
          for: 2m
          labels:
            level: p1
            resources: container
          annotations:
            value: '{{ $value }}'
            summary: 'Kubernetes kube-state-metrics Watch错误率'
            description: 'Kubernetes kube-state-metrics Watch错误率较高, 请检查!!!'
[www@c02-jenkins container]$ cat status.yaml
# PrometheusRule "container-status": pod restart/phase/waiting alerts and
# Deployment rollout/replica alerts, all at level p1.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  annotations: null
  labels:
    prometheus: k8s
    role: alert-rules
  name: container-status
  namespace: kubesphere-monitoring-system
spec:
  groups:
    - name: "容器状态"
      rules:
        # Restart counter increased within the last 15m. The per-second rate
        # is scaled by 60 * 5 (restarts per 5 minutes) and compared to 0.
        - alert: "Pod异常重启"
          expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m])* 60 * 5 > 0
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 异常重启"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】异常重启, 请检查!!!"
        # Pod has been in phase Pending or Unknown for 2m. Pods owned by a Job
        # are excluded through the join with kube_pod_owner.
        - alert: "Pod处于准备阶段"
          expr: >-
            sum by (pod,namespace,instance,container)
            (max by(pod,namespace,instance,container)
            (kube_pod_status_phase{job="kube-state-metrics",phase=~"Pending|Unknown"})
            * on(pod,namespace,instance,container) group_left(owner_kind)
            max by(pod,namespace,instance,container, owner_kind)
            (kube_pod_owner{owner_kind!="Job"})) > 0
          for: 2m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            # The rule matches Pending/Unknown pods, so the message says
            # non-ready ("非Ready") rather than "Ready".
            summary: "Kubernetes Pod 处于非Ready阶段"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】启动异常,长时间处于非Ready阶段, 请检查!!!"
        # A container has reported a waiting reason for 2m.
        - alert: "Pod处于等待状态"
          expr: sum by (namespace, pod, container, instance) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
          for: 2m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod 处于waiting状态"
            description: "Kubernetes Pod节点【{{ $labels.pod }}】状态异常,长时间处于waiting状态, 请检查!!!"
        # Observed generation has differed from metadata generation for 5m.
        - alert: "Deployment部署状态不匹配"
          expr: >-
            kube_deployment_status_observed_generation{job="kube-state-metrics"}
            != kube_deployment_metadata_generation{job="kube-state-metrics"}
          for: 5m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Deployment部署失败"
            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】部署状态异常,这表示部署失败,但尚未回滚, 请检查!!!"
        # Spec replicas differ from available replicas while the updated
        # replica count has not changed in the last 5m; held for 5m.
        - alert: "Deployment副本数不匹配"
          expr: >-
            (kube_deployment_spec_replicas{job="kube-state-metrics"}
            != kube_deployment_status_replicas_available{job="kube-state-metrics"})
            and (changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) == 0)
          for: 5m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Deployment期望数量不匹配"
            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】与预期的副本数不匹配, 请检查!!!"
        # Available replica count changed at least once in the last 2m.
        - alert: "Deployment Pod扩缩容"
          expr: changes(kube_deployment_status_replicas_available[2m]) > 0
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Pod进行了扩缩容"
            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】Pod进行了扩缩容, 请检查!!!"
        # The Deployment has reported unavailable replicas for 1m.
        - alert: "Deployment Pod异常"
          expr: kube_deployment_status_replicas_unavailable > 0
          for: 1m
          labels:
            level: p1
            resources: container
          annotations:
            value: "{{ $value }}"
            summary: "Kubernetes Deployment 存在不可用Pod"
            description: "Kubernetes 【{{$labels.namespace}} {{ $labels.deployment }}】存在不可用Pod, 请检查!!!"