Kubernetes集群运行很多业务pod,因此监控Kubernetes集群,提前发现即将出现的问题或实时发现问题是很重要的,借助监控来保证Kubernetes集群的稳定性显得尤为重要。
为什么选用Prometheus监控Kubernetes集群?
容器监控的实现方式对比虚拟机或者物理机来说有比较大的区别,比如容器在k8s环境中可以任意横向扩容与缩容,那么就需要监控服务能够自动对新创建的容器进行监控,当容器删除后又能够及时地从监控服务中删除,而传统的zabbix的监控方式需要在每一个容器中安装启动agent,并且在容器自动发现注册及模板关联方面并没有比较好的实现方式。
采用prometheus-operator来监控kubernetes集群。Operator部署器是基于已经编写好的yaml文件,可以将prometheus server 、alertmanager、grafana、node-exporter、cadvisor、kube-state-metrics等组件一键批量部署。
# git clone -b release-0.11 https://github.com/prometheus-operator/kube-prometheus.git
# cd kube-prometheus/
# kubectl apply -f manifests/setup
# grep image: manifests/ -R | grep "k8s.gcr.io" #有部分镜像无法下载,需要自行解决
sum (container_memory_rss{container!=""}) by (node) #节点内存容器实际使用量
sum(kube_pod_container_resource_memory_limits{resource="memory"}) by (node) #节点内存limit总和
sum(kube_pod_container_resource_memory_requests{resource="memory"}) by (node) #节点内存request总和
sum(kube_node_status_allocatable_memory_bytes) by (node) #节点可用内存
sum(node_memory_MemTotal_bytes)-sum(node_memory_MemAvailable_bytes) #已经使用的总内存
sum (container_memory_rss{container!=""}) by (namespace) #命名空间级容器实际使用量
sum(kube_pod_container_resource_memory_limits{resource="memory"}) by (namespace) #命名空间级内存limit总和
sum(kube_pod_container_resource_memory_requests{resource="memory"}) by (namespace) #命名空间级内存request总和
``
count(kube_node_labels{ label_kubernetes_io_role="node"}) #获取node节点数量
count(kube_node_labels{ label_kubernetes_io_role="master"}) #获取master节点数量
count(kube_pod_status_phase{phase="Running"}) #获取pod数量
sum(kube_node_status_allocatable_memory_bytes) #集群总内存
sum (machine_memory_bytes{node=~"^.*$"}) #机器总内存,包括预留的资源
sum(kube_node_status_allocatable_cpu_cores) #集群CPU总核数
sum(node_filesystem_size_bytes{device!~"rootfs|HarddiskVolume.+",node=~"^.*$"})#集群总磁盘
sum(node_filesystem_size_bytes{device!~"rootfs|HarddiskVolume.+",node=~"^.*$"}) - sum(node_filesystem_free_bytes{device!~"rootfs|HarddiskVolume.+",node=~"^.*$"}) #集群已使用磁盘
pod-cpu-request不合理查询promsql:
ceil( sort_desc((sum(kube_pod_container_resource_requests_cpu_cores {namespace=~"xxx" }) by (namespace,pod)/ sum(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate [30d])) by (namespace,pod) >10) and sum (kube_pod_container_resource_requests_cpu_cores>0.3) by (namespace,pod) ))
pod-memory-request不合理查询promsql:
ceil(sort_desc(((sum(kube_pod_container_resource_requests_memory_bytes{namespace=~"xxx"})by (namespace,pod)/sum(max_over_time(node_namespace_pod_container:container_memory_working_set_bytes[30d]))by (namespace,pod))>2) and sum (kube_pod_container_resource_requests_memory_bytes>500*1024*1024)by (namespace,pod)))
pod-memory-limit不合理查询promsql:
ceil(sort_desc(((sum(kube_pod_container_resource_limits_memory_bytes{namespace=~"xxx"})by (namespace,pod)/sum(max_over_time(node_namespace_pod_container:container_memory_working_set_bytes[30d]))by (namespace,pod))>10) and sum (kube_pod_container_resource_limits_memory_bytes>500*1024*1024)by (namespace,pod)))
pod-cpu-limit不合理查询promsql:
ceil( sort_desc((sum(kube_pod_container_resource_limits_cpu_cores {namespace=~"xxx" }) by (namespace,pod)/ sum(max_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate [30d])) by (namespace,pod) >10) and sum (kube_pod_container_resource_limits_cpu_cores>0.3) by (namespace,pod) ))
参考1:https://help.aliyun.com/document_detail/176180.html?spm=a2c4g.11186623.6.659.598c2d39N3EVnR
参考2:https://help.aliyun.com/document_detail/436511.html
磁盘监控不能简单的通过使用百分比来做告警判断,因为1个G使用80%和1个T使用80%区别是很大的,所以我们应该监控增长趋势以及方向,根据6h的磁盘增长情况来预测在未来4h内是否会把磁盘空间用完。
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
磁盘是否有损坏
node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
node_md_disks{state="failed"} > 0
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
node_textfile_scrape_error{job="node-exporter"} == 1
(
node_timex_offset_seconds > 0.05
and
deriv(node_timex_offset_seconds[5m]) >= 0
)
or
(
node_timex_offset_seconds < -0.05
and
deriv(node_timex_offset_seconds[5m]) <= 0
)
min_over_time(node_timex_sync_status[5m]) == 0
and
node_timex_maxerror_seconds >= 16
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
pod状态为CrashLooping
Pod启动后出现未知错误并异常退出,根据restartPolicy可能会被重新启动
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1 #KubePodCrashLooping
pod状态为NotReady
sum by (namespace, pod, cluster) (
max by(namespace, pod, cluster) (
kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0 #KubePodNotReady
pod cpu使用率大于80%
100 * (sum(rate(container_cpu_usage_seconds_total[1m])) by (pod_name) / sum(label_replace(kube_pod_container_resource_limits_cpu_cores, "pod_name", "$1", "pod", "(.*)")) by (pod_name))>80
Pod的内存使用率大于80%
100 * (sum(container_memory_working_set_bytes) by (pod_name) / sum(label_replace(kube_pod_container_resource_limits_memory_bytes, "pod_name", "$1", "pod", "(.*)")) by (pod_name))>80
Pod的状态为未运行
sum (kube_pod_status_phase{phase!="Running"}) by (pod,phase)
Pod的内存大于4GB
(sum (container_memory_working_set_bytes{id!="/"})by (pod_name,container_name) /1024/1024/1024)>4
Pod重启
sum (increase (kube_pod_container_status_restarts_total{}[2m])) by (namespace,pod) >0
kube_deployment_status_observed_generation{job="kube-state-metrics"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics"} #KubeDeploymentGenerationMismatch
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics"} #KubeStatefulSetGenerationMismatch
(
kube_deployment_spec_replicas{job="kube-state-metrics"}
>
kube_deployment_status_replicas_available{job="kube-state-metrics"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m])
==
0
) #KubeDeploymentReplicasMismatch
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m])
==
0
) # KubeStatefulSetReplicasMismatch
(
max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
==
0
) #KubeStatefulSetUpdateNotRolledOut
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics"}
!=
0
) or (
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
)
) and (
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}[5m])
==
0
) #KubeDaemonSetRolloutStuck
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
and
kube_job_status_active{job="kube-state-metrics"} > 0) > 43200 #notCompleted
任务执行失败
kube_job_failed{job="kube-state-metrics"} > 0
副本数不匹配
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"}
!=
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
>
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
<
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"})
and
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0
达到最大副本数
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
==
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
and
(sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
and
(sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
远程存储使用占比监控
(
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
pv状态监控
(
kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on(namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (cluster, instance, job, namespace)
/
sum(rate(rest_client_requests_total[5m])) by (cluster, instance, job, namespace))
> 0.01
apiserver请求错误率超过预期(错误预算燃烧过快,KubeAPIErrorBudgetBurn)
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
客户端证书是否过期监控
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
聚合api错误监控
sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
KubeAPIDown
absent(up{job="apiserver"} == 1)
KubeAPITerminatedRequests
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
KubeletDown
absent(up{job="kubelet", metrics_path="/metrics"} == 1)
KubeNodeNotReady
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
KubeNodeUnreachable
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
KubeletTooManyPods
count by(cluster, node) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by(cluster, node) (
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
) > 0.95
KubeNodeReadinessFlapping -node就绪状态不稳定
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (cluster, node) > 2
Kubelet Pod生命周期事件生成器(PLEG)重新遍历(relist)耗时过长。
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
KubeletPodStartUpLatencyHigh
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
KubeSchedulerDown
absent(up{job="kube-scheduler"} == 1)
KubeControllerManagerDown
absent(up{job="kube-controller-manager"} == 1)
etcdDown
absent(up{job="etcd"} == 1)