A Complete Collection of Kubernetes Cluster Monitoring Metrics and Alert Thresholds (Detailed and Ready to Copy and Paste)

Table of Contents

kube-apiserver

coredns

etcd

kube-controller-manager

kubelet

kube-scheduler

kube-state-metrics

node

deployment

ingress-nginx

pod
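
Before the rules themselves, here is a minimal sketch of how the rule groups below can be wired into Prometheus. The file names and paths are placeholders (my assumption), so adjust them to your own layout; each file can be validated with promtool check rules <file> before reloading Prometheus.

# prometheus.yml (excerpt) -- load the rule files pasted from the sections below.
# The paths are placeholders; split or merge the groups however you prefer.
rule_files:
  - /etc/prometheus/rules/kube-apiserver.rules.yml
  - /etc/prometheus/rules/coredns.rules.yml
  - /etc/prometheus/rules/etcd.rules.yml
  - /etc/prometheus/rules/kube-controller-manager.rules.yml
  - /etc/prometheus/rules/kubelet.rules.yml
  - /etc/prometheus/rules/kube-scheduler.rules.yml
  - /etc/prometheus/rules/kube-state-metrics.rules.yml
  - /etc/prometheus/rules/node.rules.yml
  - /etc/prometheus/rules/deployment.rules.yml
  - /etc/prometheus/rules/ingress-nginx.rules.yml
  - /etc/prometheus/rules/pod.rules.yml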


kube-apiserver

groups:
- name: kube-apiserver.rule
  rules:
  - alert: K8SApiserverDown
    expr: up{job="apiserver"} == 0
    for: 1m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: kube-apiserver {{ $labels.instance }} is down. {{ $labels.instance }} isn't reachable or has disappeared from service discovery.
      summary: kube-apiserver is down
  - alert: K8SApiserverDown
    expr: absent(up{job="apiserver"} == 1)
    for: 10m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      description: No API servers are reachable or all have disappeared from service
        discovery
      summary: No API servers are reachable
  - alert: K8SApiserverUserCPU
    expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-apiserver.*",container_name!="POD"}[5m]))by(pod) > 1 
    for: 5m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetes apserver {{ $labels.instance }} is user cpu time > 1s. {{ $labels.instance }} isn't reachable"
      summary: "kubernetes apserver 负载较高超过1s,当前值为{{$value}}"
  - alert: K8SApiserverUserCPU
    expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-apiserver.*",container_name!="POD"}[5m]))by(pod) > 5
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetes apserver {{ $labels.instance }} is user cpu time > 5s. {{ $labels.instance }} isn't reachable"
      summary: "kubernetes apserver 负载较高超过5s,当前值为{{$value}}"
  - alert: K8SApiserverUserCPU
    expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-apiserver.*",container_name!="POD"}[5m]))by(pod) > 10
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetes apserver {{ $labels.instance }} is user cpu time > 10s. {{ $labels.instance }} isn't reachable"
      summary: "kubernetes apserver 负载较高超过10s,当前值为{{$value}}"
  - alert: K8SApiserverUseMemory
    expr: sum(container_memory_usage_bytes{pod=~"kube-apiserver.*",container_name!="POD"}/1024/1024)by(pod) > 150
    for: 5m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }} is use memory More than 150MB"
      summary: "kubernetes apserver 使用内存超过150MB,当前值为{{$value}}MB"
  - alert: K8SApiserverUseMemory
    expr: sum(container_memory_usage_bytes{pod=~"kube-apiserver.*",container_name!="POD"}/1024/1024)by(pod) > 300
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }} is use memory More than 300MB"
      summary: "kubernetes apserver 使用内存超过300MB,当前值为{{$value}}MB"
  - alert: K8SApiserverUseMemory
    expr: sum(container_memory_usage_bytes{pod=~"kube-apiserver.*",container_name!="POD"}/1024/1024)by(pod) > 600
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }} is use memory More than 600MB"
      summary: "kubernetes apserver 使用内存超过600MB,当前值为{{$value}}MB"
  - alert: K8SApiserverApiError
    expr: sum(rate(apiserver_request_total{job="apiserver",code=~"[45].."}[5m]))by (resource,subresource,verb) /sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.5
    for: 10m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }} API 4xx,5xx too many"
      summary: "kubernetes apserver 4xx,5xx错误很多,请检查"
  - alert: K8SApiserverWorkerQueue
    expr: sum(apiserver_current_inflight_requests{job="apiserver"}) > 200
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }} worker queue accumulation"
      summary: "kubernetes apserver 待处理的请求数量 > {{$value}} "
  - alert: K8SApiserverWorkerQueue
    expr: sum(apiserver_current_inflight_requests{job="apiserver"}) > 400
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }} worker queue accumulation"
      summary: "kubernetes apserver 待处理的请求数量 > {{$value}} "
  - alert: K8SApiserverQueueWait
    expr: histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{ job="apiserver"}[5m])) by (le)) > 1
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }} worker queue wite"
      summary: "kubernetes apserver 工作队列中停留的时间延时较大 > {{$value}} "
  - alert: K8SApiserverWorkerAddCount
    expr: sum(rate(workqueue_adds_total{job="apiserver"}[5m])) > 100
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess apserver {{ $labels.instance }}"
      summary: "kubernetes apserver 工作队列处理的添加总数,可能有堆积请检查 > {{$value}} "

#ssl certificates (the le buckets 1296000s and 604800s below correspond to 15 days and 7 days)
groups:
- name: kubernetes.rules
  rules:
  - alert: K8sCertificateExpirationWarning
    expr: sum by (job) (rate(apiserver_client_certificate_expiration_seconds_bucket{le="1296000"}[1m])) > 0
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: Kubernetes API Client Certificate is expiring soon (less than 15 days)
      summary: Kubernetes API client certificate will expire within 15 days

  - alert: K8sCertificateExpirationNotice
    expr: sum by (job) (rate(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}[1m])) > 0
    labels:
      severity: P0
      cluster: prodidc
    annotations:
      description: Kubernetes API Client Certificate is expiring in less than 7 days
      summary: Kubernetes API client certificate will expire within 7 days

coredns

groups:
- name: coredns.rule
  rules:
  - alert: coredns-responses-P1
    expr:  histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 3
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: "{{$value}}"
      description:  "{{ $labels.instance }}  Coredns has longer response time, response More than 3 seconds , please check !"
      summary: "{{ $labels.instance }} 警告! 响应时间较长 Coredns response delay 大于 3 秒 当前值为{{$value}}"
  - alert: coredns-responses-P2
    expr:  histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 2.5
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: "{{$value}}"
      description:  "{{ $labels.instance }}  Coredns has longer response time, response More than 2.5 seconds , please check !"
      summary: "{{ $labels.instance }} 严重! 响应时间较长 Coredns response delay 大于 2.5 秒 当前值为{{$value}}"
  - alert: coredns-responses-P3
    expr:  histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 1.5
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: "{{$value}}"
      description:  "{{ $labels.instance }}  Coredns has longer response time, response More than 1.5 seconds , please check !"
      summary: "{{ $labels.instance }} 非常严重! 响应时间较长 Coredns response delay 大于 1.5 秒 当前值为{{$value}}"
  - alert: CorednsPanic
    expr:  coredns_panic_count_total > 0 
    labels:
      severity: P0
      cluster: prodidc
    annotations:
      current_value: "{{$value}}"
      description: "{{ $labels.instance }} Error Coredns process abnormal interrupt!!"
      summary: "毁灭性错误!{{ $labels.instance }} Coredns 进程异常中断!请检查!"
  - alert: CorednsQueryRateP3
    expr:  sum(irate(coredns_dns_request_count_total{zone !="dropped"}[1m]))  > 30000 # per-second query rate calculated from the two most recent samples
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: Counter of DNS requests made per zone, protocol and family.
      summary: "严重!{{ $labels.instance }} coredns 每分钟查询次数高于阀值25000.当前值为{{$value}}"
  - alert: CorednsQueryRateP2
    expr:  sum(irate(coredns_dns_request_count_total{zone !="dropped"}[1m]))  > 40000
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: Counter of DNS requests made per zone, protocol and family.
      summary: "严重!{{ $labels.instance }} coredns 每分钟查询次数高于阀值30000.当前值为{{$value}}"
  - alert: CorednsQueryRateP1
    expr:  sum(irate(coredns_dns_request_count_total{zone !="dropped"}[1m]))  > 50000
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: Counter of DNS requests made per zone, protocol and family.
      summary: "非常严重!{{ $labels.instance }} coredns 每分钟查询次数高于阀值50000.当前值为{{$value}}"
  - alert: CorednsServfailCount
    expr: irate(coredns_dns_response_rcode_count_total{rcode!="NOERROR"} [1m]) and irate(coredns_dns_response_rcode_count_total{rcode!="NXDOMAIN"} [1m]) > 10
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: The DNS server failed to process the request, or the authoritative server for the domain refused to respond (REFUSED).
      summary: "Warning! CoreDNS is failing to process more than 10 requests per second; current failure rate: {{$value}}. The server failed or refused to respond."
  - alert: CorednsCacheMissesP3
    expr: irate(coredns_cache_misses_total  [1m]) > 10
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: DNS cache misses statistical
      summary: "警告!{{ $labels.instance }}  coredns 每分钟服务器缓存misses超过10.当前值为{{$value}}"
  - alert: CorednsCacheMissesP2
    expr: irate(coredns_cache_misses_total  [1m]) > 30
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: DNS cache misses statistical
      summary: "严重!{{ $labels.instance }}  coredns 每分钟服务器缓存misses超过30.当前值为{{$value}}"
  - alert: CorednsCacheMissesP1
    expr: irate(coredns_cache_misses_total  [1m]) > 80
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: DNS cache misses statistical
      summary: "非常严重!{{ $labels.instance }}  coredns 每分钟服务器缓存misses超过50.当前值为{{$value}}"
  - alert: CorednsRequestTimeP3
    expr: histogram_quantile(0.99,sum(rate(coredns_health_request_duration_seconds_bucket [1m]) ) by(server, zone, le)) > 0.05 # p99 request latency in seconds
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      summary: "警告!实例 {{ $labels.instance }} 每一分钟请求dns 超过设置阀值0.05秒 当前值为{{$value}}"
  - alert: CorednsRequestTimeP2
    expr: histogram_quantile(0.99,sum(rate(coredns_health_request_duration_seconds_bucket [1m]) ) by(server, zone, le)) > 0.1
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      summary: "严重!实例 {{ $labels.instance }} 每一分钟请求dns 超过设置阀值0.1秒 当前值为{{$value}}"
  - alert: CorednsRequestTimeP1
    expr: histogram_quantile(0.99,sum(rate(coredns_health_request_duration_seconds_bucket [1m]) ) by(server, zone, le)) > 1
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      summary: "非常严重!实例 {{ $labels.instance }} 每一分钟请求dns 超过设置阀值1秒 当前值为{{$value}}"
  - alert: CorednsMaxOpenFilesP1
    expr: sum(process_open_fds{job="kube-dns"}) > 2000000
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: CoreDNS process open file descriptor count
      summary: "Critical! Instance {{ $labels.instance }} has more than 2000000 open file descriptors; current value: {{$value}}"
  - alert: CorednsMaxOpenFilesP2
    expr: sum(process_open_fds{job="kube-dns"}) > 1500000
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: CoreDNS process open file descriptor count
      summary: "Severe! Instance {{ $labels.instance }} has more than 1500000 open file descriptors; current value: {{$value}}"
  - alert: CorednsMaxOpenFilesP3
    expr: sum(process_open_fds{job="kube-dns"}) > 1000000
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: CoreDNS process open file descriptor count
      summary: "Warning! Instance {{ $labels.instance }} has more than 1000000 open file descriptors; current value: {{$value}}"

etcd

groups:
- name: etcd.rule
  rules:
  - alert: etcdDown
    expr: up{job="etcd"} == 0
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: etcd {{ $labels.instance }} is down. {{ $labels.instance }} isn't reachable and continueed 1 minute.
      summary: etcd is down
  - alert: NoLeader
    expr: etcd_server_has_leader{job="etcd"} == 0
    labels:
      severity: P0
      cluster: prodidc
    annotations:
      description: etcd member {{ $labels.instance }} has no leader
      summary: etcd member has no leader
  - alert: HighNumberOfLeaderChanges
    expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[10m]) > 3
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 10 minutes
      summary: etcd leader is changing frequently
  - record: instance:fd_utilization
    expr: process_open_fds / process_max_fds
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon'
      summary: file descriptors are predicted to be exhausted within 4 hours
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon'
      summary: file descriptors are predicted to be exhausted within 1 hour
  - alert: EtcdMemberCommunicationSlow
    expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
      > 0.15
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: etcd instance {{ $labels.instance }} member communication with
        {{ $labels.To }} is slow
      summary: etcd member communication is slow
  - alert: HighNumberOfFailedProposals
    expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
        failures within the last hour
      summary: a large number of etcd cluster proposals are failing
  - alert: HighFsyncDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
      > 0.05
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: etcd instance {{ $labels.instance }} fsync durations are high
      summary: etcd WAL (write-ahead log) fsync commit latency is high
  - alert: HighCommitDurations
    expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
      > 0.25
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: etcd instance {{ $labels.instance }} commit durations are high
      summary: etcd disk backend commit durations are high
  - alert: EtcdDiskPressure
    expr: sum(etcd_mvcc_db_total_size_in_bytes{job="etcd"}/1024/1024/1024) by (instance) > 200
    for: 60m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "Etcd {{ $labels.instance }} Disk Pressuer"
      summary: "Etcd 数据占用系统盘200G,当前值:{{$value}} "

kube-controller-manager

groups:
- name: kube-controller-manager.rule
  rules:
  - alert: K8SControllerManagerDown
    expr: absent(up{job="kubernetes-controller-manager"} == 1)
    for: 1m
    labels:
      severity: P0
      cluster: prodidc
    annotations:
      description: There is no running K8S controller manager. Deployments and replication controllers are not making progress.
      summary: No kubernetes controller manager are reachable

  - alert: K8SControllerManagerDown
    expr: up{job="kubernetes-controller-manager"} == 0
    for: 1m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: kubernetes controller manager {{ $labels.instance }} is down. {{ $labels.instance }} isn't reachable
      summary: kubernetes controller manager is down

  - alert: K8SControllerManagerUserCPU
    expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-controller-manager.*",container_name!="POD"}[5m]))by(pod) > 5
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: kubernetes controller manager {{ $labels.instance }} user CPU time is above 5s
      summary: kube-controller-manager load is high, user CPU time above 5s

  - alert: K8SControllerManagerUseMemory
    expr: sum(rate(container_memory_usage_bytes{pod=~"kube-controller-manager.*",container_name!="POD"}[5m])/1024/1024)by(pod) > 20
    for: 5m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      description: kubernetes controller manager {{ $labels.instance }} memory usage is above 20MB
      summary: kube-controller-manager memory usage is above 20MB

  - alert: K8SControllerManagerQueueTimedelay
    expr: histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job="kubernetes-controller-manager"}[5m])) by(le)) > 10
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: kubernetes controller manager {{ $labels.instance }} work queue wait time is above 10s
      summary: kube-controller-manager work queue wait time is above 10 seconds, please check the controller manager

kubelet

groups:
- name: kubelet.rule
  rules:
  - alert: K8SNodeNotReady
    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
    for: 2m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      summary: "Node status is NotReady"
      description: "The Kubelet on {{ $labels.node }} has not checked in with the API, or has set itself to NotReady, for more than an hour"
  - alert: K8SManyNodesNotReady
    expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
      > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
      0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      summary: "大量的node节点没有Ready"
      description: '{{ $value }}% of Kubernetes nodes are not ready'
  - alert: K8SKubeletDown
    expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 10
    for: 2m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      description: Prometheus failed to scrape {{ $value }}% of kubelets.
      summary: kubelet cannot be scraped
  - alert: K8SManyKubeletDown
    expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
      * 100 > 30
    for: 10m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets have disappeared from service discovery.
      summary: Many Kubelets cannot be scraped
  - alert: K8SKubeletTooManyPods
    expr: kubelet_running_pod_count > 50
    for: 10m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: Kubelet {{$labels.instance}} is running {{$value}} pods, close to the limit of 50
      summary: Pod count on a single node exceeds 50

kube-scheduler

groups:
- name: kube-scheduler.rule
  rules:
  - alert: K8SSchedulerDown
    expr: absent(up{job="kubernetes-scheduler"} == 1)
    for: 1m
    labels:
      severity: P0
      cluster: prodidc
    annotations:
      description: "There is no running K8S scheduler. New pods are not being assigned to nodes."
      summary: "all k8s scheduler is down"
  - alert: K8SSchedulerDown
    expr: up{job="kubernetes-scheduler"} == 0
    for: 1m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: "K8S scheduler {{ $labels.instance }} is no running. New pods are not being assigned to nodes."
      summary: "k8s scheduler {{ $labels.instance }} is down"
  - alert: K8SSchedulerUserCPU
    expr: sum(rate(container_cpu_user_seconds_total{pod=~"kube-scheduler.*",container_name!="POD"}[5m]))by(pod) > 1
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetes scheduler {{ $labels.instance }} is user cpu time > 1s. {{ $labels.instance }} isn't reachable"
      summary: "kubernetes scheduler 负载较高超过1s,当前值为{{$value}}"

  - alert: K8SSchedulerUseMemory
    expr: sum(rate(container_memory_usage_bytes{pod=~"kube-scheduler.*",container_name!="POD"}[5m])/1024/1024)by(pod) > 20
    for: 5m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess scheduler {{ $labels.instance }} is use memory More than 20MB"
      summary: "kubernetes scheduler 使用内存超过20MB,当前值为{{$value}}MB"

  - alert: K8SSchedulerPodPending
    expr: sum(scheduler_pending_pods{job="kubernetes-scheduler"})by(queue) > 5
    for: 5m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: "kubernetess scheduler {{ $labels.instance }} is Pending pod More than 5"
      summary: "kubernetes scheduler pod无法调度 > 5,当前值为{{$value}}"

  - alert: K8SSchedulerPodPending
    expr: sum(scheduler_pending_pods{job="kubernetes-scheduler"})by(queue) > 10
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: kubernetes scheduler {{ $labels.instance }} has more than 10 pending pods
      summary: "kube-scheduler has more than 10 unschedulable (pending) pods; current value: {{$value}}"

  - alert: K8SSchedulerPodBinding
    expr: sum(rate(scheduler_binding_duration_seconds_count{job="kubernetes-scheduler"}[5m])) > 1
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: kubernetes scheduler {{ $labels.instance }}
      summary: "kube-scheduler pod binding may have problems; current value: {{$value}}"

  - alert: K8SSchedulerVolumeSpeed
    expr: sum(rate(scheduler_volume_scheduling_duration_seconds_count{job="kubernetes-scheduler"}[5m])) > 1
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: kubernetes scheduler {{ $labels.instance }}
      summary: "kube-scheduler pod volume scheduling is slow; current value: {{$value}}"

  - alert: K8SSchedulerClientRequestSlow
    expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job="kubernetes-scheduler"}[5m])) by (verb, url, le)) > 1
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      description: kubernetes scheduler {{ $labels.instance }}
      summary: "kube-scheduler client requests are slow; current value: {{$value}}"

kube-state-metrics

groups:
- name: kube-state-metrics.rules
  rules:
  - alert: DaemonSetNotReady
    expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
      * 100 < 100
    for: 15m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: Only {{$value}}% of desired pods scheduled and ready for daemonset {{$labels.namespace}}/{{$labels.daemonset}}
      summary: DaemonSet pod status is not ready
  - alert: DaemonSetsNotScheduled
    expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
      > 0
    for: 10m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: daemonsets {{$labels.namespace}}/{{$labels.daemonset}} is not scheduled.
      summary: Daemonsets are not scheduled correctly
  - alert: DaemonSetsMissScheduled
    expr: kube_daemonset_status_number_misscheduled > 0
    for: 2m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: A number of daemonsets are running where they are not supposed to run.
      summary: Daemonsets {{ $labels.daemonset }} are not scheduled correctly

node

groups:
- name: node.rules
  rules:
  - alert: NodeMemUseHigh
    expr: sum by (node)(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / sum by (node)(node_memory_MemTotal_bytes) > 0.9
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: "Node memory usage 高于90%"
      summary: "Node memory usage 高于90%"
  - alert: NodeUnschedulable
    expr: sum(kube_node_spec_unschedulable) > 0
    for: 5m
    labels:
      severity: P4
      cluster: prodidc
    annotations:
      description: a node is unschedulable for 5 minutes
      summary: Node is unschedulable
  - alert: NodeExporterDown
    expr: absent(up{job="prometheus-node-exporter"} == 1)
    for: 2m
    labels:
      severity: P3
      cluster: prodidc
    annotations:
      description: Prometheus could not scrape a node-exporter/{{$labels.node}} for more than 2m, or node-exporters have disappeared from discovery
      summary: Prometheus could not scrape a node-exporter

  - alert: NodeCpuUseHigh
    expr: 1-avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (node) > 0.9
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: "node/{{$labels.node}} CPU使用率高于90%"
      summary: "node/{{$labels.node}} CPU使用率高于90%"

deployment

groups:
- name: Deployment.rule
  rules:
  - alert: ReplicasUnavailable
    expr: kube_deployment_status_replicas_unavailable > 0
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      summary: "Deployment replicas status unavailable"
      description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} status is unavailable, have {{ $value }} pod is unavailable more than 15 minutes."
  - alert: DeploymentReplicasNotUpdated
    expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
      or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
      unless (kube_deployment_spec_paused == 1)
    for: 15m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}}
      summary: Pod count is below the expected number of replicas

ingress-nginx

groups:
- name: ingress-nginx.rule
  rules:
  - alert: ingressDown
    expr: kube_pod_status_ready{condition="true",pod=~"ingress.*-controller.*"} < 1
    for: 1m
    labels:
      severity: P0
      cluster: prodidc
    annotations:
      summary: "ingress nginx namespace:{{$labels.namespace}} podname:{{$labels.pod}} is Down"
      description: "ingress nginx namespace:{{$labels.namespace}} podname:{{$labels.pod}}, for more than an minute"

  - alert: ingressControllerConn
    expr: sum by (instance)(avg_over_time(nginx_ingress_controller_nginx_process_connections{}[2m])) > 100000 
    for: 5m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      summary: "ingress nginx {{$labels.instance}} connection more than 100000"
      description: "ingress nginx {{$labels.instance}} connection more than 100000, for more than five minute"

  - alert: ingressMemUseage
    expr: sum(container_memory_working_set_bytes{pod=~"ingress-.*",container!~"|filebeat|POD"} ) by (namespace,pod,service) / sum(container_spec_memory_limit_bytes{pod=~"ingress-.*",container!~"|filebeat|POD",namespace!=""}) by (namespace,pod,service) * 100 > 90  and (sum(container_memory_working_set_bytes) by (namespace,pod,service)/sum(container_spec_memory_limit_bytes) by (namespace,pod,service)) != Inf
    for: 2m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      summary: "ingress controller {{$labels.instance}} memory useage moth than 90%"
      description: "ingress controller {{$labels.instance}} memory useage moth than 90%, for more than five minute"

  - alert: ingressCpuUseage
    expr: sum(rate(container_cpu_usage_seconds_total{pod=~"ingress-.*",image!=""}[1m])) by (pod, namespace,service) / (sum(container_spec_cpu_quota{pod=~"ingress-.*",image!=""}/100000) by (pod, namespace,service)) * 100  > 90 and  sum by (pod,namespace,service)( rate(container_cpu_usage_seconds_total{image!="", namespace!=""}[1m] ) ) * 100  != Inf
    for: 2m
    labels:
      severity: P1
      cluster: prodidc
    annotations:
      summary: "ingress controller {{$labels.instance}} memory useage moth than 90%"
      description: "ingress controller {{$labels.instance}} memory useage moth than 90%, for more than five minute"

  - alert: controllerSSLtime
    expr: nginx_ingress_controller_ssl_expire_time_seconds < (time() + (20 * 24 * 3600))
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      summary: "ingress ssl证书有效期小于20天"
      description: "ingress controller ssl time less than 20 day"

pod

groups:
- name: Pods.rule
  rules:
  - alert: PodHighCPUUsage
    expr: sum(rate(container_cpu_usage_seconds_total{image!=""}[1m])) by (pod, namespace,service) / (sum(container_spec_cpu_quota{image!=""}/100000) by (pod, namespace,service)) * 100  > 95 and  sum by (pod,namespace,service)( rate(container_cpu_usage_seconds_total{image!="", namespace!=""}[1m] ) ) * 100  != Inf
    for: 5m
    labels:
      severity: P2
      cluster: prodidc
    annotations:
      current_value: '{{$value}}'
      summary:  "Pod cpu Usage 高于95%"
      description: "{{ $labels.namespace }}.{{ $labels.pod  }} cpu usage is high above 95%  for more than 5 minute."

  - alert: PodRssMemHigh
    expr: sum by(namespace, pod) (container_memory_rss{container!~"|filebeat|POD",namespace!="",pod!=""}) / sum by(namespace, pod) (container_spec_memory_limit_bytes{container!~"|filebeat|POD",namespace!="",pod!=""}) * 100 > 95 and (sum by(namespace, pod) (container_memory_rss) / sum by(namespace, pod) (container_spec_memory_limit_bytes)) != +Inf
    for: 10m
    labels:
      cluster: prodidc
      severity: P2
    annotations:
      description: "Pod ({{$labels.namespace}}/{{$labels.pod}}) Rss memory 高于95%,请检查合理性"
      summary: "Pod ({{$labels.namespace}}/{{$labels.pod}}) Rss memory 高于95%,请检查合理性"

  - alert: PodNotReady
    expr: sum(kube_pod_status_ready{condition='true',pod!~".*-pt-.*"} != 1) by (namespace,pod)
    for: 3m
    labels:
      severity: P0
      cluster: prodidc
    annotations:
      summary: "Pod ({{$labels.namespace}}/{{$labels.pod}}) is NotReady"
      description: "服务({{$labels.namespace}}/{{$labels.pod}})异常,请尽快检查"
