本文借鉴于监控–Prometheus部署篇
1. prometheus权限设置
prometheus-rbac.yaml
# ClusterRole giving Prometheus read-only access to the cluster objects it
# discovers and scrapes, plus the /metrics non-resource endpoint.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core-group objects: nodes, services, endpoints and pods are listed and
  # watched; nodes/proxy allows scraping through the API server's node proxy.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingress objects. Both the legacy "extensions" group and the current
  # "networking.k8s.io" group are listed so the role works whichever group
  # the cluster serves Ingress from.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  # Raw /metrics endpoint (a non-resource URL).
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Identity the Prometheus pod runs as; bound to the "prometheus" ClusterRole
# by the ClusterRoleBinding of the same name.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: kube-system
  name: prometheus
---
# Binds the "prometheus" ClusterRole to the "prometheus" ServiceAccount in
# the kube-system namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: prometheus
# Who receives the permissions.
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: prometheus
# Which role is granted.
roleRef:
  kind: ClusterRole
  name: prometheus
  apiGroup: rbac.authorization.k8s.io
2. 配置文件
prometheus-configMap.yaml
# ConfigMap holding the Prometheus server configuration (prometheus.yml) and
# the host-level alerting rules (hoststats.rules).
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  labels:
    monitor: prometheus
  namespace: kube-system
data:
  # Alerting rules for node CPU, memory and disk usage. The file name matches
  # the rule_files glob (*.rules) declared in prometheus.yml below.
  hoststats.rules: |
    groups:
      - name: nodehost
        rules:
          # Fire when non-idle CPU usage of an instance stays above 85% for 3m.
          - alert: hostCpuUsageAlert
            expr: sum(avg (irate(node_cpu{mode!="idle",job="kubernetes_node"}[5m])) without (cpu)) by (instance) > 0.85
            for: 3m
            labels:
              severity: page
            annotations:
              summary: "Instance {{ $labels.instance }} CPU usage high"
              description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
          # Fire when used memory (total - available) stays above 85% for 5m.
          - alert: hostMemUsageAlert
            expr: (node_memory_MemTotal-node_memory_MemAvailable)/node_memory_MemTotal{job="kubernetes_node"} * 100 > 85
            for: 5m
            labels:
              severity: page
            annotations:
              summary: "Instance {{ $labels.instance }} MEM usage high"
              description: "Instance {{ $labels.instance }} Memory usage above 85% (current value: {{ $value }})"
          # Fire when root-cgroup filesystem usage on /dev/sdXN or /dev/vdXN
          # devices stays above 80% of the limit for 5m.
          - alert: hostDiskUsageAlert
            expr: container_fs_usage_bytes{device=~"^/dev/[sv]d[a-z][1-9]$",id="/"}/container_fs_limit_bytes{device=~"^/dev/[sv]d[a-z][1-9]$",id="/"} > 0.8
            for: 5m
            labels:
              severity: page
            annotations:
              summary: "Instance {{ $labels.instance }} Disk usage high"
              description: "Instance {{ $labels.instance }} Disk usage above 80% (current value: {{ $value }})"
  # Main Prometheus server configuration.
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    rule_files:
      - /etc/prometheus/rules/*.rules
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["alertmanager:9093"]
    scrape_configs:
      # API servers: keep only the default/kubernetes service's https port.
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https

      # cAdvisor metrics of every node, fetched through the API server's
      # node proxy path instead of contacting the nodes directly.
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

      # Endpoints of services annotated with prometheus.io/scrape=true.
      # Optional annotations prometheus.io/scheme, prometheus.io/path and
      # prometheus.io/port override scheme, metrics path and port.
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name

      # Probing of services annotated with prometheus.io/probe=true: the
      # service address becomes the probe target and the scrape itself is
      # sent to the exporter address configured below.
      - job_name: 'kubernetes-services'
        kubernetes_sd_configs:
          - role: service
        metrics_path: /probe
        params:
          module: [http_2xx]
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox-exporter.example.com:9115
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name

      # Probing of ingresses annotated with prometheus.io/probe=true.
      # NOTE(review): this job previously ran straight into the pod relabel
      # rules with a duplicated `regex` key; the missing part was rebuilt
      # from the upstream Prometheus Kubernetes example - confirm it matches
      # the intended setup (exporter address in particular).
      - job_name: 'kubernetes-ingresses'
        kubernetes_sd_configs:
          - role: ingress
        metrics_path: /probe
        params:
          module: [http_2xx]
        relabel_configs:
          - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_ingress_scheme, __address__, __meta_kubernetes_ingress_path]
            regex: (.+);(.+);(.+)
            replacement: ${1}://${2}${3}
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox-exporter.example.com:9115
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_ingress_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_ingress_name]
            target_label: kubernetes_name

      # Pods annotated with prometheus.io/scrape=true. Optional annotations
      # prometheus.io/path and prometheus.io/port override path and port.
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      # node-exporter endpoints; the job name is referenced by the alert
      # expressions in hoststats.rules.
      # NOTE(review): no `scheme` is set here, so unless the
      # prometheus.io/scheme annotation selects https the bearer token is
      # presumably sent over plain HTTP - confirm this is intended.
      - job_name: 'kubernetes_node'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          # Endpoint-based discovery: targets are scraped directly instead of
          # going through the Service proxy layer.
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_endpoint_port_name]
            regex: true;prometheus-node-exporter
            action: keep
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: (.+)(?::\d+);(\d+)
            replacement: $1:$2
          # Strip the __meta_kubernetes_service_label_ prefix from label names.
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          # To tell nodes apart, replace `instance` (the node-exporter
          # endpoint) with the IP of the node hosting that endpoint.
          - source_labels: [__meta_kubernetes_pod_host_ip]
            regex: '(.*)'
            replacement: '${1}'
            target_label: instance
3. app部署
prometheus.rc.yaml
# Runs a single Prometheus server pod in kube-system under the "prometheus"
# ServiceAccount, configured from the prometheus-config ConfigMap.
kind: ReplicationController
apiVersion: v1
metadata:
  name: prometheus
  namespace: kube-system
  labels:
    monitor: prometheus
spec:
  replicas: 1
  template:
    metadata:
      name: prometheus
      labels:
        monitor: prometheus
    spec:
      serviceAccountName: prometheus
      volumes:
        # TSDB storage backed by an emptyDir volume.
        - name: data
          emptyDir: {}
        # Source of prometheus.yml and hoststats.rules.
        - name: config-volume
          configMap:
            name: prometheus-config
      containers:
        - name: prometheus
          image: prom/prometheus:v2.0.0
          command:
            - '/bin/prometheus'
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention=24h'
          ports:
            - protocol: TCP
              containerPort: 9090
          resources:
            limits:
              cpu: 500m
              memory: 2500Mi
            requests:
              cpu: 100m
              memory: 100Mi
          volumeMounts:
            - name: data
              mountPath: '/prometheus'
            # Each ConfigMap key is mounted as an individual file via subPath.
            - name: config-volume
              subPath: prometheus.yml
              mountPath: '/etc/prometheus/prometheus.yml'
            - name: config-volume
              subPath: hoststats.rules
              mountPath: '/etc/prometheus/rules/hoststats.rules'
---
# Exposes the Prometheus pod (selected by monitor=prometheus) on a node port.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: kube-system
  labels:
    monitor: prometheus
spec:
  type: NodePort
  selector:
    monitor: prometheus
  ports:
    - protocol: TCP
      # Service port: requests to the Service name are forwarded to the
      # target container port.
      port: 9090
      # Port exposed by the container.
      targetPort: 9090
      # Port bound on the cluster nodes.
      # NOTE(review): 39090 is outside the default NodePort range
      # (30000-32767); presumably the API server runs with a custom
      # --service-node-port-range - confirm.
      nodePort: 39090
4. 节点采集器部署
# Runs one node-exporter pod per node, with the host's /proc and /sys
# mounted into the container so host-level metrics can be collected.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: kube-system
  labels:
    k8s-app: prometheus-node-exporter
spec:
  template:
    metadata:
      labels:
        k8s-app: prometheus-node-exporter
    spec:
      # Allow scheduling onto master nodes. The toleration field is
      # `operator` (not `operation`); `Exists` tolerates the taint whatever
      # its value is.
      tolerations:
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
      containers:
        - image: prom/node-exporter:v0.14.0
          name: node-exporter
          ports:
            # Container port 9100 is also published on the node as 9101.
            - containerPort: 9100
              protocol: TCP
              name: http
              hostPort: 9101
          volumeMounts:
            - mountPath: /host/proc
              name: proc
            - mountPath: /host/sys
              name: sys
          args:
            # Point the collectors at the host's mounted /proc and /sys.
            - "-collector.procfs=/host/proc"
            - "-collector.sysfs=/host/sys"
            # The flag and its regex value are passed as two arguments.
            - "-collector.filesystem.ignored-mount-points"
            - "^/(dev|proc|sys|host|etc|rootfs|docker)($|/)"
      volumes:
        - hostPath:
            path: /proc
          name: proc
        - hostPath:
            path: /sys
          name: sys
---
# Headless Service (clusterIP: None) in front of the node-exporter pods.
# The prometheus.io/scrape annotation and the port name
# "prometheus-node-exporter" are what the kubernetes_node scrape job's keep
# rule matches on.
kind: Service
apiVersion: v1
metadata:
  name: prometheus-node-exporter
  namespace: kube-system
  labels:
    app: prometheus-node-exporter
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/app-metrics: 'true'
    prometheus.io/app-metrics-path: '/metrics'
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    k8s-app: prometheus-node-exporter
  ports:
    - name: prometheus-node-exporter
      protocol: TCP
      port: 9100