一、监控选型
Requirements:
1)kube-state-metrics插件
2)node-exporter插件
3)prometheus插件
4)blackbox-exporter插件
5)alertmanager插件
6)prometheus-webhook-dingtalk插件
二、部署
1、namespace: kube-system
2、rbac:
serviceaccount: prometheus
clusterrole: prometheus
clusterrolebinding: prometheus
---
# ClusterRole "prometheus": read-only (get/list/watch) access to the cluster
# objects listed below, plus GET on the /metrics non-resource URL.
# Uses the stable rbac.authorization.k8s.io/v1 API; the v1beta1 version is no
# longer served as of Kubernetes 1.22.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core API group. nodes/proxy is needed because the kubelet and cAdvisor
  # scrape jobs go through /api/v1/nodes/<node>/proxy/... on the API server.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: ["extensions", "apps"]
    resources: ["deployments"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["batch", "extensions"]
    resources: ["jobs"]
    verbs: ["get", "list", "watch"]
  # Ingresses: "extensions" is kept for older clusters; networking.k8s.io is
  # the group that serves Ingress on current clusters.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# ServiceAccount "prometheus" in kube-system; it is the subject of the
# "prometheus" ClusterRoleBinding and is referenced by the workloads via
# serviceAccountName.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: kube-system
  name: prometheus
---
# Binds the "prometheus" ClusterRole to the "prometheus" ServiceAccount in
# kube-system.
# Uses the stable rbac.authorization.k8s.io/v1 API; the v1beta1 version is no
# longer served as of Kubernetes 1.22.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: kube-system
---
3、deployment: prometheus
# Deployment running one Prometheus server replica in kube-system.
# Configuration comes from the "prometheus-server-conf" ConfigMap mounted at
# /etc/prometheus; TSDB data is written to /prometheus on an emptyDir volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: kube-system
  name: prometheus-deployment
  # annotations:
  #   Set to scrape the metrics of an application deployed in a pod:
  #   prometheus.io/scrape: 'true'
  #   Scrape path (default /metrics):
  #   prometheus.io/path: '/metrics'
  #   prometheus.io/port: the relevant port
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-server
  template:
    metadata:
      labels:
        app: prometheus-server
    spec:
      serviceAccountName: prometheus
      securityContext:
        # The container process runs as root (UID 0).
        runAsUser: 0
      containers:
        - name: prometheus
          image: prom/prometheus
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus/'
          ports:
            - protocol: TCP
              containerPort: 9090
          volumeMounts:
            - mountPath: /prometheus
              name: gluster-volume
            - mountPath: /etc/prometheus
              name: config-volume
      volumes:
        # Data volume; currently an emptyDir. The commented-out lines are the
        # alternative persistent volume claim.
        - name: gluster-volume
          emptyDir: {}
          # persistentVolumeClaim:
          #   claimName: gluster-prometheus
        - name: config-volume
          configMap:
            name: prometheus-server-conf
---
# NodePort Service exposing the Prometheus web port (9090) of pods labelled
# app=prometheus-server.
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: prometheus
  labels:
    k8s-app: prometheus
spec:
  type: NodePort
  selector:
    app: prometheus-server
  ports:
    - name: web
      targetPort: 9090
      port: 9090
4、deployment: kube-state-metrics
# Deployment running one kube-state-metrics replica in kube-system.
# Server-managed fields from the original export (selfLink, uid,
# resourceVersion, generation, creationTimestamp and the
# deployment.kubernetes.io/revision annotation) are intentionally omitted:
# they are assigned by the API server / deployment controller, and a manifest
# carrying resourceVersion or uid cannot be created cleanly on another cluster.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    # The pod-role scrape job in prometheus.yml keeps only pods whose
    # grafanak8sapp label matches "true".
    grafanak8sapp: 'true'
    k8s-app: kube-state-metrics
spec:
  replicas: 1
  selector:
    matchLabels:
      grafanak8sapp: 'true'
      k8s-app: kube-state-metrics
  template:
    metadata:
      labels:
        grafanak8sapp: 'true'
        k8s-app: kube-state-metrics
    spec:
      serviceAccountName: prometheus
      containers:
        - name: kube-state-metrics
          image: 'quay.io/coreos/kube-state-metrics:v1.1.0'
          ports:
            - name: http-metrics
              containerPort: 8080
              protocol: TCP
          resources: {}
          readinessProbe:
            httpGet:
              path: /healthz
              port: 8080
              scheme: HTTP
            initialDelaySeconds: 5
            timeoutSeconds: 5
            periodSeconds: 10
            successThreshold: 1
            failureThreshold: 3
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      securityContext: {}
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 2
  progressDeadlineSeconds: 600
5、daemonset: node-exporter
# DaemonSet running node-exporter on the host network of each node.
# Changes from the original export:
#   - apiVersion moved from extensions/v1beta1 (no longer served as of
#     Kubernetes 1.16) to apps/v1, matching the other workloads in this file.
#   - Server-managed metadata (creationTimestamp, generation, resourceVersion,
#     selfLink, uid) and the deprecated spec.templateGeneration are omitted;
#     they are set by the cluster and must not be part of a manifest.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: kube-system
  labels:
    daemon: node-exporter
    grafanak8sapp: "true"
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      daemon: node-exporter
      grafanak8sapp: "true"
  template:
    metadata:
      name: node-exporter
      labels:
        # Both labels are read by the pod-role scrape job in prometheus.yml
        # (keep on grafanak8sapp, nodename relabel on daemon=node-exporter).
        daemon: node-exporter
        grafanak8sapp: "true"
    spec:
      serviceAccountName: prometheus
      containers:
        - name: node-exporter
          image: quay.io/prometheus/node-exporter:v0.15.0
          imagePullPolicy: IfNotPresent
          args:
            # Point the exporter at the host's /proc and /sys, which are
            # mounted read-only at these paths below.
            - --path.procfs=/proc_host
            - --path.sysfs=/host_sys
          ports:
            - name: node-exporter
              containerPort: 9100
              hostPort: 9100
              protocol: TCP
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - name: sys
              mountPath: /host_sys
              readOnly: true
            - name: proc
              mountPath: /proc_host
              readOnly: true
      dnsPolicy: ClusterFirst
      hostNetwork: true
      hostPID: true
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      volumes:
        - name: proc
          hostPath:
            path: /proc
            type: ""
        - name: sys
          hostPath:
            path: /sys
            type: ""
  updateStrategy:
    # Pods are only replaced with the new template when deleted manually.
    type: OnDelete
6、configmap:
# ConfigMap with the Prometheus server configuration (prometheus.yml) and the
# alerting rules (rules.yml). The Prometheus Deployment mounts it at
# /etc/prometheus.
#
# prometheus.yml is a literal block scalar ("|-"). The original used a folded
# scalar (">-"), which joins consecutive lines of equal indentation with a
# space and therefore corrupts an embedded YAML document.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-server-conf
  namespace: kube-system
  labels:
    name: prometheus-server-conf
data:
  # Scrape jobs:
  #   kubernetes-kubelet / kubernetes-cadvisor: node metrics fetched through
  #     the API server proxy (kubernetes.default.svc:443).
  #   kubernetes-kube-state: pods labelled grafanak8sapp=true.
  #   kubernetes-service-http-probe / -tcp-probe: services opted in through
  #     prometheus.io/* annotations, probed via prometheus-blackbox-exporter:9115.
  prometheus.yml: |-
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["alertmanager:9093"]
    rule_files:
      - /etc/prometheus/rules.yml
    scrape_configs:
      - job_name: 'kubernetes-kubelet'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics
      - job_name: 'kubernetes-cadvisor'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      - job_name: 'kubernetes-kube-state'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          - source_labels: [__meta_kubernetes_pod_label_grafanak8sapp]
            regex: .*true.*
            action: keep
          - source_labels: ['__meta_kubernetes_pod_label_daemon', '__meta_kubernetes_pod_node_name']
            regex: 'node-exporter;(.*)'
            action: replace
            target_label: nodename
      - job_name: 'kubernetes-service-http-probe'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: service
        # 将metrics_path由默认的/metrics改为/probe
        metrics_path: /probe
        # Optional HTTP URL parameters.
        # 生成__param_module="http_2xx"的label
        params:
          module: [http_2xx]
        relabel_configs:
          # 只保留含有label为prometheus/io=scrape的service
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_service_annotation_prometheus_io_http_probe]
            regex: true;true
            action: keep
          - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_namespace, __meta_kubernetes_service_annotation_prometheus_io_http_probe_port, __meta_kubernetes_service_annotation_prometheus_io_http_probe_path]
            action: replace
            target_label: __param_target
            regex: (.+);(.+);(.+);(.+)
            replacement: $1.$2:$3$4
          # 用__address__这个label的值创建一个名为__param_target的label为blackbox-exporter,值为内部service的访问地址,作为blackbox-exporter采集用
          #- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_http_probe_path]
          # action: replace
          # target_label: __param_target
          # regex: (.+);(.+)
          # replacement: $1$2
          # 用blackbox-exporter的service地址值”prometheus-blackbox-exporter:9115"替换原__address__的值
          - target_label: __address__
            replacement: prometheus-blackbox-exporter:9115
          - source_labels: [__param_target]
            target_label: instance
          # 去掉label name中的前缀__meta_kubernetes_service_annotation_prometheus_io_app_info_
          - action: labelmap
            regex: __meta_kubernetes_service_annotation_prometheus_io_app_info_(.+)
          #- source_labels: [__meta_kubernetes_namespace]
          # target_label: kubernetes_namespace
          #- source_labels: [__meta_kubernetes_service_name]
          # target_label: kubernetes_name
      ## kubernetes-services and kubernetes-ingresses are blackbox_exporter related
      # Example scrape config for probing services via the Blackbox Exporter.
      #
      # The relabeling allows the actual service scrape endpoint to be configured
      # for all or only some services.
      - job_name: 'kubernetes-service-tcp-probe'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: service
        # 将metrics_path由默认的/metrics改为/probe
        metrics_path: /probe
        # Optional HTTP URL parameters.
        # 生成__param_module="tcp_connect"的label
        params:
          module: [tcp_connect]
        relabel_configs:
          # 只保留含有label为prometheus/io=scrape的service
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_service_annotation_prometheus_io_tcp_probe]
            regex: true;true
            action: keep
          - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_namespace, __meta_kubernetes_service_annotation_prometheus_io_tcp_probe_port]
            action: replace
            target_label: __param_target
            regex: (.+);(.+);(.+)
            replacement: $1.$2:$3
          # 用__address__这个label的值创建一个名为__param_target的label为blackbox-exporter,值为内部service的访问地址,作为blackbox-exporter采集用
          #- source_labels: [__address__]
          # target_label: __param_target
          # 用blackbox-exporter的service地址值”prometheus-blackbox-exporter:9115"替换原__address__的值
          - target_label: __address__
            replacement: prometheus-blackbox-exporter:9115
          - source_labels: [__param_target]
            target_label: instance
          # 去掉label name中的前缀__meta_kubernetes_service_annotation_prometheus_io_app_info_
          - action: labelmap
            regex: __meta_kubernetes_service_annotation_prometheus_io_app_info_(.+)
  # Alerting rules loaded through rule_files above. Every alert carries
  # team=container, the label the Alertmanager route matches on.
  rules.yml: |
    groups:
      - name: citest
        rules:
          - alert: ContainerNotReady
            expr: kube_pod_container_status_ready{} == 0
            for: 5m
            labels:
              team: container
              severity: warning
            annotations:
              summary: "{{$labels.pod}}: kube_pod_container_status_not ready"
              description: "{{$labels.pod}}: kube_pod_container_status_ready is {{ $value }}"
          - alert: ContainerWating
            expr: kube_pod_container_status_waiting{namespace!="kube-system"} == 1
            for: 3m
            labels:
              team: container
              severity: warning
            annotations:
              summary: "{{$labels.pod}}: kube_pod_container_waiting"
          - alert: ServiceHealthCheck
            expr: probe_http_status_code == 0
            for: 1m
            labels:
              team: container
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: no health!!"
7、blackbox-exporter:
# ConfigMap carrying blackbox.yml for the blackbox exporter. It defines the
# probe modules http_2xx, http_post_2xx, tcp_connect and icmp, each with a
# 10s timeout.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-blackbox-exporter
  namespace: kube-system
  labels:
    app: prometheus-blackbox-exporter
data:
  blackbox.yml: |-
    modules:
      http_2xx:
        prober: http
        timeout: 10s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2"]
          valid_status_codes: []
          method: GET
          preferred_ip_protocol: "ip4"
      http_post_2xx: # http post 监测模块
        prober: http
        timeout: 10s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2"]
          method: POST
          preferred_ip_protocol: "ip4"
      tcp_connect:
        prober: tcp
        timeout: 10s
      icmp:
        prober: icmp
        timeout: 10s
        icmp:
          preferred_ip_protocol: "ip4"
---
# Deployment running one blackbox exporter replica in kube-system.
# apiVersion moved from extensions/v1beta1 (no longer served for Deployments
# as of Kubernetes 1.16) to apps/v1, matching the other workloads in this file.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-blackbox-exporter
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus-blackbox-exporter
  template:
    metadata:
      labels:
        app: prometheus-blackbox-exporter
    spec:
      restartPolicy: Always
      containers:
        - name: prometheus-blackbox-exporter
          image: prom/blackbox-exporter:v0.12.0
          imagePullPolicy: IfNotPresent
          args:
            # blackbox.yml comes from the ConfigMap mounted below.
            - --config.file=/etc/blackbox_exporter/blackbox.yml
            - --log.level=debug
            - --web.listen-address=:9115
          ports:
            - name: blackbox-port
              containerPort: 9115
          readinessProbe:
            tcpSocket:
              port: 9115
            initialDelaySeconds: 5
            timeoutSeconds: 5
          resources:
            requests:
              memory: 50Mi
              cpu: 100m
            limits:
              memory: 60Mi
              cpu: 200m
          volumeMounts:
            - name: config
              mountPath: /etc/blackbox_exporter
      volumes:
        - name: config
          configMap:
            name: prometheus-blackbox-exporter
---
# NodePort Service for the blackbox exporter: port 9115 inside the cluster
# (the address the Prometheus probe jobs relabel __address__ to) and node
# port 30009 on every node.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-blackbox-exporter
  namespace: kube-system
  labels:
    app: prometheus-blackbox-exporter
  annotations:
    prometheus.io/scrape: "true"
spec:
  type: NodePort
  selector:
    app: prometheus-blackbox-exporter
  ports:
    - name: blackbox
      protocol: TCP
      port: 9115
      targetPort: 9115
      nodePort: 30009
8、alertmanager
# ConfigMap with the Alertmanager configuration file. The object name and the
# data key are spelled "altermanger" (sic); the alertmanager Deployment refers
# to both by exactly this spelling, so it is left unchanged here.
#
# All alerts go to the single "webhook" receiver, which posts to the
# prometheus-webhook-dingtalk service.
# NOTE(review): group_by lists "citest", which is the rule *group* name in
# rules.yml rather than an alert label - confirm this grouping is intended.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: kube-system
  name: altermanger-config
  labels:
    name: altermanger-config
data:
  altermanger-config.yml: |-
    global:
      resolve_timeout: 5m
    route:
      receiver: webhook
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 1h
      group_by: [citest]
      routes:
        - receiver: webhook
          group_wait: 10s
          match:
            team: container
    receivers:
      - name: webhook
        webhook_configs:
          - url: http://prometheus-webhook-dingtalk:39093/dingtalk/node/send
            send_resolved: true
---
# Deployment running one Alertmanager replica in kube-system.
# apiVersion moved from extensions/v1beta1 (no longer served for Deployments
# as of Kubernetes 1.16) to apps/v1, matching the other workloads in this file.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      name: alertmanager
      labels:
        app: alertmanager
    spec:
      serviceAccountName: prometheus
      containers:
        - name: alertmanager
          image: quay.io/prometheus/alertmanager:v0.15.0
          args:
            # File name follows the "altermanger-config.yml" key (sic) of the
            # ConfigMap mounted at /etc/alertmanager.
            - '--config.file=/etc/alertmanager/altermanger-config.yml'
            - '--storage.path=/alertmanager'
          ports:
            - name: alertmanager
              containerPort: 9093
          volumeMounts:
            - name: config-volume
              mountPath: /etc/alertmanager
            - name: alertmanager
              mountPath: /alertmanager
      volumes:
        - name: config-volume
          configMap:
            name: altermanger-config
        # Alertmanager storage path; emptyDir contents are lost when the pod
        # is removed from its node.
        - name: alertmanager
          emptyDir: {}
---
# NodePort Service exposing Alertmanager on port 9093; prometheus.yml targets
# it as "alertmanager:9093".
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: kube-system
  labels:
    name: alertmanager
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
spec:
  type: NodePort
  selector:
    app: alertmanager
  ports:
    - name: alertmanager
      port: 9093
      targetPort: 9093
      protocol: TCP
9、prometheus-webhook-dingtalk
---
# Deployment running two replicas of the DingTalk webhook adapter.
# Changes from the original:
#   - apiVersion moved from apps/v1beta2 (no longer served as of Kubernetes
#     1.16) to apps/v1, matching the other workloads in this file.
#   - The original had "imagePullSecrets: [{name: IfNotPresent}]". IfNotPresent
#     is an image pull *policy* value, not the name of a registry secret, so it
#     is now set as the container's imagePullPolicy.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-webhook-dingtalk
  namespace: kube-system
  labels:
    name: prometheus-webhook-dingtalk
spec:
  replicas: 2
  selector:
    matchLabels:
      app: prometheus-webhook-dingtalk
  template:
    metadata:
      labels:
        app: prometheus-webhook-dingtalk
    spec:
      containers:
        - name: prometheus-webhook-dingtalk
          image: timonwong/prometheus-webhook-dingtalk
          imagePullPolicy: IfNotPresent
          args:
            # Profile "node" is the one addressed by the Alertmanager webhook
            # URL (/dingtalk/node/send). The access_token value is a
            # placeholder and must be replaced with a real robot token.
            - "--ding.profile=node=https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxx"
            - "--web.listen-address=:8080"
          ports:
            - containerPort: 8080
              protocol: TCP
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 500m
              # NOTE(review): 2500Mi is 25x the request - possibly a typo for
              # 250Mi; confirm before changing.
              memory: 2500Mi
---
# NodePort Service for the DingTalk webhook adapter: service port 39093 (the
# port used in the Alertmanager webhook URL) forwards to container port 8080.
apiVersion: v1
kind: Service
metadata:
  name: prometheus-webhook-dingtalk
  namespace: kube-system
  labels:
    name: prometheus-webhook-dingtalk
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
spec:
  type: NodePort
  selector:
    app: prometheus-webhook-dingtalk
  ports:
    - name: prometheus-webhook-dingtalk
      port: 39093
      targetPort: 8080
      protocol: TCP
三、监控
1、导入模版;
2、添加数据源为prometheus类型