Table of Contents
kube-apiserver
coredns
etcd
kube-controller-manager
kubelet
kube-scheduler
kube-state-metrics
node
deployment
ingress-nginx
pod
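Each groups: block below is a standalone Prometheus rule file (a single YAML file can hold only one top-level groups: key), organized per component as listed above. A minimal sketch of how Prometheus could load them; the directory and file names are assumptions, not part of this document:

# prometheus.yml excerpt (sketch)
rule_files:
  - /etc/prometheus/rules/*.yml   # e.g. kube-apiserver.yml, coredns.yml, etcd.yml, ...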
groups:
- name: kube-apiserver.rule
rules:
- alert: K8SApiserverDown
expr: up{job="apiserver"} == 0
for: 1m
labels:
severity: P2
cluster: prodidc
annotations:
description: kube-apiserver {{ $labels.instance }} is down; {{ $labels.instance }} isn't reachable or has disappeared from service discovery.
summary: kube-apiserver is down
- alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 10m
labels:
severity: P1
cluster: prodidc
annotations:
description: No API servers are reachable, or all have disappeared from service discovery.
summary: No API servers are reachable
- alert: K8SApiserverUserCPU
expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-apiserver.*",container_name!="POD"}[5m]))by(pod) > 1
for: 5m
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetes apserver {{ $labels.instance }} is user cpu time > 1s. {{ $labels.instance }} isn't reachable"
summary: "kubernetes apserver 负载较高超过1s,当前值为{{$value}}"
- alert: K8SApiserverUserCPU
expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-apiserver.*",container_name!="POD"}[5m]))by(pod) > 5
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetes apserver {{ $labels.instance }} is user cpu time > 5s. {{ $labels.instance }} isn't reachable"
summary: "kubernetes apserver 负载较高超过5s,当前值为{{$value}}"
- alert: K8SApiserverUserCPU
expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-apiserver.*",container_name!="POD"}[5m]))by(pod) > 10
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetes apserver {{ $labels.instance }} is user cpu time > 10s. {{ $labels.instance }} isn't reachable"
summary: "kubernetes apserver 负载较高超过10s,当前值为{{$value}}"
- alert: K8SApiserverUseMemory
expr: sum(container_memory_usage_bytes{pod=~"kube-apiserver.*",container_name!="POD"}/1024/1024)by(pod) > 150
for: 5m
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }} is use memory More than 150MB"
summary: "kubernetes apserver 使用内存超过150MB,当前值为{{$value}}MB"
- alert: K8SApiserverUseMemory
expr: sum(container_memory_usage_bytes{pod=~"kube-apiserver.*",container_name!="POD"}/1024/1024)by(pod) > 300
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }} is use memory More than 300MB"
summary: "kubernetes apserver 使用内存超过300MB,当前值为{{$value}}MB"
- alert: K8SApiserverUseMemory
expr: sum(container_memory_usage_bytes{pod=~"kube-apiserver.*",container_name!="POD"}/1024/1024)by(pod) > 600
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }} is use memory More than 600MB"
summary: "kubernetes apserver 使用内存超过600MB,当前值为{{$value}}MB"
- alert: K8SApiserverApiError
expr: sum(rate(apiserver_request_total{job="apiserver",code=~"[45].."}[5m]))by (resource,subresource,verb) /sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.5
for: 10m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }} API 4xx,5xx too many"
summary: "kubernetes apserver 4xx,5xx错误很多,请检查"
- alert: K8SApiserverWorkerQueue
expr: sum(apiserver_current_inflight_requests{job="apiserver"}) > 200
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }} worker queue accumulation"
summary: "kubernetes apserver 待处理的请求数量 > {{$value}} "
- alert: K8SApiserverWorkerQueue
expr: sum(apiserver_current_inflight_requests{job="apiserver"}) > 400
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }} worker queue accumulation"
summary: "kubernetes apserver 待处理的请求数量 > {{$value}} "
- alert: K8SApiserverQueueWait
expr: histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{ job="apiserver"}[5m])) by (le)) > 1
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }} worker queue wite"
summary: "kubernetes apserver 工作队列中停留的时间延时较大 > {{$value}} "
- alert: K8SApiserverWorkerAddCount
expr: sum(rate(workqueue_adds_total{job="apiserver"}[5m])) > 100
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess apserver {{ $labels.instance }}"
summary: "kubernetes apserver 工作队列处理的添加总数,可能有堆积请检查 > {{$value}} "
# SSL certificates
groups:
- name: kubernetes.rules
rules:
- alert: K8sCertificateExpiringSoon
expr: sum by (job) (rate(apiserver_client_certificate_expiration_seconds_bucket{le="1296000"}[1m])) > 0
labels:
severity: P2
cluster: prodidc
annotations:
description: Kubernetes API client certificate is expiring soon (less than 15 days; the le="1296000" bucket is 15 days in seconds)
summary: Kubernetes API client certificate expires within 15 days
- alert: K8sCertificateExpirationNotice
expr: sum by (job) (rate(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}[1m])) > 0
labels:
severity: P0
cluster: prodidc
annotations:
description: Kubernetes API client certificate is expiring in less than 7 days (the le="604800" bucket is 7 days in seconds)
summary: Kubernetes API client certificate expires within 7 days
groups:
- name: coredns.rule
rules:
- alert: CorednsResponsesP1
expr: histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 3
labels:
severity: P1
cluster: prodidc
annotations:
current_value: "{{$value}}"
description: "{{ $labels.instance }} Coredns has longer response time, response More than 3 seconds , please check !"
summary: "{{ $labels.instance }} 警告! 响应时间较长 Coredns response delay 大于 3 秒 当前值为{{$value}}"
- alert: CorednsResponsesP2
expr: histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 2.5
labels:
severity: P2
cluster: prodidc
annotations:
current_value: "{{$value}}"
description: "{{ $labels.instance }} Coredns has longer response time, response More than 2.5 seconds , please check !"
summary: "{{ $labels.instance }} 严重! 响应时间较长 Coredns response delay 大于 2.5 秒 当前值为{{$value}}"
- alert: CorednsResponsesP3
expr: histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 1.5
labels:
severity: P3
cluster: prodidc
annotations:
current_value: "{{$value}}"
description: "{{ $labels.instance }} Coredns has longer response time, response More than 1.5 seconds , please check !"
summary: "{{ $labels.instance }} 非常严重! 响应时间较长 Coredns response delay 大于 1.5 秒 当前值为{{$value}}"
- alert: CorednsAbort
expr: coredns_panic_count_total > 0
labels:
severity: P0
cluster: prodidc
annotations:
current_value: "{{$value}}"
description: "{{ $labels.instance }} Error Coredns process abnormal interrupt!!"
summary: "毁灭性错误!{{ $labels.instance }} Coredns 进程异常中断!请检查!"
- alert: CorednsQueryPerMinuteP3
expr: sum(irate(coredns_dns_request_count_total{zone !="dropped"}[1m])) > 30000 # per-second query rate computed from the two most recent samples
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: Counter of DNS requests made per zone, protocol and family.
summary: "严重!{{ $labels.instance }} coredns 每分钟查询次数高于阀值25000.当前值为{{$value}}"
- alert: CorednsQueryPerMinuteP2
expr: sum(irate(coredns_dns_request_count_total{zone !="dropped"}[1m])) > 40000
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: Counter of DNS requests made per zone, protocol and family.
summary: "严重!{{ $labels.instance }} coredns 每分钟查询次数高于阀值30000.当前值为{{$value}}"
- alert: CorednsQueryPerMinuteP1
expr: sum(irate(coredns_dns_request_count_total{zone !="dropped"}[1m])) > 50000
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: Counter of DNS requests made per zone, protocol and family.
summary: "非常严重!{{ $labels.instance }} coredns 每分钟查询次数高于阀值50000.当前值为{{$value}}"
- alert: CorednsServfailCount
expr: irate(coredns_dns_response_rcode_count_total{rcode!~"NOERROR|NXDOMAIN"}[1m]) > 10
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: The server failed to process the request; the authoritative server for the domain refused to respond or returned REFUSED.
summary: "Warning! CoreDNS failed to process requests more than 10 times; current failure count {{$value}}. Cause: the server refused to respond or returned REFUSED."
- alert: CorednsCacheMissesP3
expr: irate(coredns_cache_misses_total [1m]) > 10
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: DNS cache miss statistics
summary: "Warning! {{ $labels.instance }} CoreDNS cache-miss rate above 10; current value {{$value}}"
- alert: CorednsCacheMissesP2
expr: irate(coredns_cache_misses_total [1m]) > 30
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: DNS cache miss statistics
summary: "Critical! {{ $labels.instance }} CoreDNS cache-miss rate above 30; current value {{$value}}"
- alert: CorednsCacheMissesP1
expr: irate(coredns_cache_misses_total [1m]) > 80
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: DNS cache miss statistics
summary: "Very critical! {{ $labels.instance }} CoreDNS cache-miss rate above 80; current value {{$value}}"
- alert: CorednsRequestTimeP3
expr: histogram_quantile(0.99,sum(rate(coredns_health_request_duration_seconds_bucket[1m])) by(server, zone, le)) > 0.05 # time taken to serve health requests
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
summary: "警告!实例 {{ $labels.instance }} 每一分钟请求dns 超过设置阀值0.05秒 当前值为{{$value}}"
- alert: CorednsRequestTimeP2
expr: histogram_quantile(0.99,sum(rate(coredns_health_request_duration_seconds_bucket [1m]) ) by(server, zone, le)) > 0.1
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
summary: "严重!实例 {{ $labels.instance }} 每一分钟请求dns 超过设置阀值0.1秒 当前值为{{$value}}"
- alert: CorednsRequestTimeP1
expr: histogram_quantile(0.99,sum(rate(coredns_health_request_duration_seconds_bucket [1m]) ) by(server, zone, le)) > 1
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
summary: "非常严重!实例 {{ $labels.instance }} 每一分钟请求dns 超过设置阀值1秒 当前值为{{$value}}"
- alert: CorednsMaxOpenFilesP1
expr: sum(process_open_fds) > 2000000
labels:
severity: P1
cluster: prodidc
annotations:
current_value: '{{$value}}'
summary: "Very critical! instance {{ $labels.instance }} open file descriptors above the threshold 2000000; current value {{$value}}"
- alert: CorednsMaxOpenFilesP2
expr: sum(process_open_fds) > 1500000
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
summary: "Critical! instance {{ $labels.instance }} open file descriptors above the threshold 1500000; current value {{$value}}"
- alert: CorednsMaxOpenFilesP3
expr: sum(process_open_fds) > 1000000
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
summary: "Warning! instance {{ $labels.instance }} open file descriptors above the threshold 1000000; current value {{$value}}"
groups:
- name: etcd.rule
rules:
- alert: etcdDown
expr: up{job="etcd"} == 0
labels:
severity: P2
cluster: prodidc
annotations:
description: etcd {{ $labels.instance }} is down; {{ $labels.instance }} has been unreachable for 1 minute.
summary: etcd is down
- alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
labels:
severity: P0
cluster: prodidc
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[10m]) > 3
labels:
severity: P1
cluster: prodidc
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 10 minutes
summary: etcd leader changes are frequent
- record: instance:fd_utilization
expr: process_open_fds / process_max_fds
- alert: FdExhaustionClose
expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon'
summary: file descriptors predicted to be exhausted within 4 hours
- alert: FdExhaustionClose
expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon'
summary: file descriptors predicted to be exhausted within 1 hour
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: P2
cluster: prodidc
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour
summary: etcd cluster has a large number of failed proposals
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.05
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: etcd WAL (write-ahead log) fsync commit latency is high
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: etcd disk backend commit durations are long
- alert: EtcdDiskPressure
expr: sum(etcd_mvcc_db_total_size_in_bytes{job="etcd"}/1024/1024/1024) by (instance) > 200
for: 60m
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "Etcd {{ $labels.instance }} Disk Pressuer"
summary: "Etcd 数据占用系统盘200G,当前值:{{$value}} "
groups:
- name: kube-controller-manager.rule
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="kubernetes-controller-manager"} == 1)
for: 1m
labels:
severity: P0
cluster: prodidc
annotations:
description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
summary: No kubernetes controller manager are reachable
- alert: K8SControllerManagerDown
expr: up{job="kubernetes-controller-manager"} == 0
for: 1m
labels:
severity: P2
cluster: prodidc
annotations:
description: kubernetes controller manager {{ $labels.instance }} is down. {{ $labels.instance }} isn't reachable
summary: kubernetes controller manager is down
- alert: K8SControllerManagerUserCPU
expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-controller-manager.*",container_name!="POD"}[5m]))by(pod) > 5
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: kubernetes controller manager {{ $labels.instance }} user CPU time above 5s over the last 5 minutes
summary: kubernetes controller-manager load is high; user CPU time above 5s
- alert: K8SControllerManagerUseMemory
expr: sum(rate(container_memory_usage_bytes{pod=~"kube-controller-manager.*",container_name!="POD"}[5m])/1024/1024)by(pod) > 20
for: 5m
labels:
severity: P3
cluster: prodidc
annotations:
description: kubernetes controller manager {{ $labels.instance }} is using more than 20MB of memory
summary: kubernetes controller-manager memory usage above 20MB
- alert: K8SControllerManagerQueueTimedelay
expr: histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job="kubernetes-controller-manager"}[5m])) by(le)) > 10
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: kubernetes controller manager {{ $labels.instance }} queue wait time is more than 10s
summary: kubernetes controller-manager work-queue wait time above 10 seconds; please check the ControllerManager
groups:
- name: kubelet.rule
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 2m
labels:
severity: P2
cluster: prodidc
annotations:
summary: "Node status is NotReady"
description: "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour"
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
summary: "大量的node节点没有Ready"
description: '{{ $value }}% of Kubernetes nodes are not ready'
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 10
for: 2m
labels:
severity: P3
cluster: prodidc
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: kubelet cannot be scraped
- alert: K8SManyKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 30
for: 10m
labels:
severity: P2
cluster: prodidc
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 50
for: 10m
labels:
severity: P2
cluster: prodidc
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, above the threshold of 50
summary: more than 50 pods on a single node
groups:
- name: kube-scheduler.rule
rules:
- alert: K8SSchedulerDown
expr: absent(up{job="kubernetes-scheduler"} == 1)
for: 1m
labels:
severity: P0
cluster: prodidc
annotations:
description: "There is no running K8S scheduler. New pods are not being assigned to nodes."
summary: "all k8s scheduler is down"
- alert: K8SSchedulerDown
expr: up{job="kubernetes-scheduler"} == 0
for: 1m
labels:
severity: P2
cluster: prodidc
annotations:
description: "K8S scheduler {{ $labels.instance }} is no running. New pods are not being assigned to nodes."
summary: "k8s scheduler {{ $labels.instance }} is down"
- alert: K8SSchedulerUserCPU
expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-scheduler.*",container_name!="POD"}[5m]))by(pod) > 1
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetes scheduler {{ $labels.instance }} is user cpu time > 1s. {{ $labels.instance }} isn't reachable"
summary: "kubernetes scheduler 负载较高超过1s,当前值为{{$value}}"
- alert: K8SSchedulerUseMemory
expr: sum(rate(container_memory_usage_bytes{pod=~"kube-scheduler.*",container_name!="POD"}[5m])/1024/1024)by(pod) > 20
for: 5m
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess scheduler {{ $labels.instance }} is use memory More than 20MB"
summary: "kubernetes scheduler 使用内存超过20MB,当前值为{{$value}}MB"
- alert: K8SSchedulerPodPending
expr: sum(scheduler_pending_pods{job="kubernetes-scheduler"})by(queue) > 5
for: 5m
labels:
severity: P3
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: "kubernetess scheduler {{ $labels.instance }} is Pending pod More than 5"
summary: "kubernetes scheduler pod无法调度 > 5,当前值为{{$value}}"
- alert: K8SSchedulerPodPending
expr: sum(scheduler_pending_pods{job="kubernetes-scheduler"})by(queue) > 10
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: kubernetes scheduler {{ $labels.instance }} has more than 10 pending pods
summary: "kubernetes scheduler has unschedulable pods > 10; current value {{$value}}"
- alert: K8SSchedulerPodBinding
expr: sum(rate(scheduler_binding_duration_seconds_count{job="kubernetes-scheduler"}[5m])) > 1
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: kubernetes scheduler {{ $labels.instance }}
summary: "kubernetes scheduler pod binding appears problematic; current value {{$value}}"
- alert: K8SSchedulerVolumeSpeed
expr: sum(rate(scheduler_volume_scheduling_duration_seconds_count{job="kubernetes-scheduler"}[5m])) > 1
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: kubernetes scheduler {{ $labels.instance }}
summary: "kubernetes scheduler pod volume scheduling is slow; current value {{$value}}"
- alert: K8SSchedulerClientRequestSlow
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job="kubernetes-scheduler"}[5m])) by (verb, url, le)) > 1
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
description: kubernetes scheduler {{ $labels.instance }}
summary: "kubernetes scheduler client requests are slow; current value {{$value}}"
groups:
- name: kube-state-metrics.rules
rules:
- alert: DaemonSetNotReady
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100
for: 15m
labels:
severity: P2
cluster: prodidc
annotations:
description: Only {{$value}}% of desired pods scheduled and ready for daemonset {{$labels.namespace}}/{{$labels.daemonset}}
summary: DaemonSet pod status is not ready
- alert: DaemonSetsNotScheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
for: 10m
labels:
severity: P2
cluster: prodidc
annotations:
description: daemonsets {{$labels.namespace}}/{{$labels.daemonset}} is not scheduled.
summary: Daemonsets are not scheduled correctly
- alert: DaemonSetsMissScheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 2m
labels:
severity: P2
cluster: prodidc
annotations:
description: A number of daemonsets are running where they are not supposed to run.
summary: Daemonsets {{ $labels.daemonset }} are not scheduled correctly
groups:
- name: node.rules
rules:
- alert: NodeMemUseHigh
expr: sum by (node)(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / sum by (node)(node_memory_MemTotal_bytes) > 0.9
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: "Node memory usage 高于90%"
summary: "Node memory usage 高于90%"
- alert: NodeUnschedulable
expr: sum(kube_node_spec_unschedulable) > 0
for: 5m
labels:
severity: P4
cluster: prodidc
annotations:
description: A node has been unschedulable for 5 minutes
summary: Node is unschedulable
- alert: NodeExporterDown
expr: absent(up{job="prometheus-node-exporter"} == 1)
for: 2m
labels:
severity: P3
cluster: prodidc
annotations:
description: Prometheus could not scrape a node-exporter for more than 2m, or node-exporters have disappeared from discovery
summary: Prometheus could not scrape a node-exporter
- alert: NodeCpuUseHigh
expr: 1-avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (node) > 0.9
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
description: "node/{{$labels.node}} CPU使用率高于90%"
summary: "node/{{$labels.node}} CPU使用率高于90%"
groups:
- name: Deployment.rule
rules:
- alert: ReplicasUnavailable
expr: kube_deployment_status_replicas_unavailable > 0
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
summary: "Deployment replicas status unavailable"
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} status is unavailable, have {{ $value }} pod is unavailable more than 15 minutes."
- alert: DeploymentReplicasNotUpdated
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) unless (kube_deployment_spec_paused == 1)
for: 15m
labels:
severity: P2
cluster: prodidc
annotations:
description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
summary: pod count is below the expected replica count
groups:
- name: ingress-nginx.rule
rules:
- alert: ingressDown
expr: kube_pod_status_ready{condition="true",pod=~"ingress.*-controller.*"} < 1
for: 1m
labels:
severity: P0
cluster: prodidc
annotations:
summary: "ingress nginx namespace:{{$labels.namespace}} podname:{{$labels.pod}} is Down"
description: "ingress nginx namespace:{{$labels.namespace}} podname:{{$labels.pod}}, for more than an minute"
- alert: ingressControllerConn
expr: sum by (instance)(avg_over_time(nginx_ingress_controller_nginx_process_connections{}[2m])) > 100000
for: 5m
labels:
severity: P1
cluster: prodidc
annotations:
summary: "ingress nginx {{$labels.instance}} connection more than 100000"
description: "ingress nginx {{$labels.instance}} connection more than 100000, for more than five minute"
- alert: ingressMemUsage
expr: sum(container_memory_working_set_bytes{pod=~"ingress-.*",container!~"|filebeat|POD"} ) by (namespace,pod,service) / sum(container_spec_memory_limit_bytes{pod=~"ingress-.*",container!~"|filebeat|POD",namespace!=""}) by (namespace,pod,service) * 100 > 90 and (sum(container_memory_working_set_bytes) by (namespace,pod,service)/sum(container_spec_memory_limit_bytes) by (namespace,pod,service)) != Inf
for: 2m
labels:
severity: P1
cluster: prodidc
annotations:
summary: "ingress controller {{$labels.instance}} memory useage moth than 90%"
description: "ingress controller {{$labels.instance}} memory useage moth than 90%, for more than five minute"
- alert: ingressCpuUsage
expr: sum(rate(container_cpu_usage_seconds_total{pod=~"ingress-.*",image!=""}[1m])) by (pod, namespace,service) / (sum(container_spec_cpu_quota{pod=~"ingress-.*",image!=""}/100000) by (pod, namespace,service)) * 100 > 90 and sum by (pod,namespace,service)( rate(container_cpu_usage_seconds_total{image!="", namespace!=""}[1m] ) ) * 100 != Inf
for: 2m
labels:
severity: P1
cluster: prodidc
annotations:
summary: "ingress controller {{$labels.instance}} memory useage moth than 90%"
description: "ingress controller {{$labels.instance}} memory useage moth than 90%, for more than five minute"
- alert: controllerSSLtime
expr: nginx_ingress_controller_ssl_expire_time_seconds < (time() + (20 * 24 * 3600))
labels:
severity: P2
cluster: prodidc
annotations:
summary: "ingress ssl证书有效期小于20天"
description: "ingress controller ssl time less than 20 day"
groups:
- name: Pods.rule
rules:
- alert: PodHighCPUUsage
expr: sum(rate(container_cpu_usage_seconds_total{image!=""}[1m])) by (pod, namespace,service) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod, namespace,service)) * 100 > 95 and sum by (pod,namespace,service)( rate(container_cpu_usage_seconds_total{image!="", namespace!=""}[1m] ) ) * 100 != Inf
for: 5m
labels:
severity: P2
cluster: prodidc
annotations:
current_value: '{{$value}}'
summary: "Pod cpu Usage 高于95%"
description: "{{ $labels.namespace }}.{{ $labels.pod }} cpu usage is high above 95% for more than 5 minute."
- alert: PodRssMemHigh
expr: sum by(namespace, pod) (container_memory_rss{container!~"|filebeat|POD",namespace!="",pod!=""}) / sum by(namespace, pod) (container_spec_memory_limit_bytes{container!~"|filebeat|POD",namespace!="",pod!=""}) * 100 > 95 and (sum by(namespace, pod) (container_memory_rss) / sum by(namespace, pod) (container_spec_memory_limit_bytes)) != +Inf
for: 10m
labels:
cluster: prodidc
severity: P2
annotations:
description: "Pod ({{$labels.namespace}}/{{$labels.pod}}) Rss memory 高于95%,请检查合理性"
summary: "Pod ({{$labels.namespace}}/{{$labels.pod}}) Rss memory 高于95%,请检查合理性"
- alert: PodNotReady
expr: sum(kube_pod_status_ready{condition='true',pod!~".*-pt-.*"} != 1) by (namespace,pod)
for: 3m
labels:
severity: P0
cluster: prodidc
annotations:
summary: "Pod ({{$labels.namespace}}/{{$labels.pod}}) is NotReady"
description: "服务({{$labels.namespace}}/{{$labels.pod}})异常,请尽快检查"