| 监控指标 | 具体实现 | 备注 |
|---|---|---|
| Pod | cAdvisor | 集成在 kubelet 中 |
| Node | node-exporter | 以 DaemonSet 方式部署 |
| K8s 资源 | kube-state-metrics | 以 Deployment 方式部署 |
prometheus_configmap.yaml
# ConfigMap holding the Prometheus server configuration (prometheus.yml).
# The "prometheus" Deployment references this ConfigMap as its config volume.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    scrape_configs:
      # API server: keep only the https port of the default/kubernetes service.
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_service_name
              - __meta_kubernetes_endpoint_port_name
            action: keep
            regex: 'default;kubernetes;https'

      # Per-node metrics, fetched through the API server's node proxy path.
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: '__meta_kubernetes_node_label_(.+)'
          - target_label: __address__
            replacement: 'kubernetes.default.svc:443'
          - source_labels:
              - __meta_kubernetes_node_name
            regex: '(.+)'
            target_label: __metrics_path__
            replacement: '/api/v1/nodes/${1}/proxy/metrics'

      # node-exporter: plain HTTP on port 9100, addressed by node name.
      - job_name: 'kubernetes-node-exporters'
        honor_timestamps: true
        scrape_interval: 30s
        scrape_timeout: 30s
        metrics_path: /metrics
        scheme: http
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - separator: ';'
            regex: '__meta_kubernetes_node_label_(.+)'
            replacement: '$1'
            action: labelmap
          - source_labels:
              - __meta_kubernetes_node_name
            separator: ';'
            regex: '(.+)'
            target_label: __address__
            replacement: '${1}:9100'
            action: replace

      # cAdvisor metrics, fetched through the API server's node proxy path.
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: '__meta_kubernetes_node_label_(.+)'
          - target_label: __address__
            replacement: 'kubernetes.default.svc:443'
          - source_labels:
              - __meta_kubernetes_node_name
            regex: '(.+)'
            target_label: __metrics_path__
            replacement: '/api/v1/nodes/${1}/proxy/metrics/cadvisor'

      # Service endpoints that opt in via the prometheus.io/scrape annotation;
      # scheme, path and port can be overridden by further annotations.
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scrape
            action: keep
            regex: 'true'
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_scheme
            action: replace
            target_label: __scheme__
            regex: '(https?)'
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_path
            action: replace
            target_label: __metrics_path__
            regex: '(.+)'
          - source_labels:
              - __address__
              - __meta_kubernetes_service_annotation_prometheus_io_port
            action: replace
            target_label: __address__
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
          - action: labelmap
            regex: '__meta_kubernetes_service_label_(.+)'
          - source_labels:
              - __meta_kubernetes_namespace
            action: replace
            target_label: kubernetes_namespace
          - source_labels:
              - __meta_kubernetes_service_name
            action: replace
            target_label: kubernetes_name

      # Services that opt in via prometheus.io/probe: the service address is
      # passed as the "target" parameter to the exporter at
      # blackbox-exporter.example.com:9115 (/probe, module http_2xx).
      - job_name: 'kubernetes-services'
        kubernetes_sd_configs:
          - role: service
        metrics_path: /probe
        params:
          module: [http_2xx]
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_service_annotation_prometheus_io_probe
            action: keep
            regex: 'true'
          - source_labels:
              - __address__
            target_label: __param_target
          - target_label: __address__
            replacement: 'blackbox-exporter.example.com:9115'
          - source_labels:
              - __param_target
            target_label: instance
          - action: labelmap
            regex: '__meta_kubernetes_service_label_(.+)'
          - source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - source_labels:
              - __meta_kubernetes_service_name
            target_label: kubernetes_name

      # Ingresses that opt in via prometheus.io/probe: the probe target is
      # assembled as <scheme>://<address><path>.
      - job_name: 'kubernetes-ingresses'
        kubernetes_sd_configs:
          - role: ingress
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_ingress_annotation_prometheus_io_probe
            action: keep
            regex: 'true'
          - source_labels:
              - __meta_kubernetes_ingress_scheme
              - __address__
              - __meta_kubernetes_ingress_path
            regex: '(.+);(.+);(.+)'
            replacement: '${1}://${2}${3}'
            target_label: __param_target
          - target_label: __address__
            replacement: 'blackbox-exporter.example.com:9115'
          - source_labels:
              - __param_target
            target_label: instance
          - action: labelmap
            regex: '__meta_kubernetes_ingress_label_(.+)'
          - source_labels:
              - __meta_kubernetes_namespace
            target_label: kubernetes_namespace
          - source_labels:
              - __meta_kubernetes_ingress_name
            target_label: kubernetes_name

      # Pods that opt in via prometheus.io/scrape; path and port can be
      # overridden with prometheus.io/path and prometheus.io/port.
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
            action: keep
            regex: 'true'
          - source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_path
            action: replace
            target_label: __metrics_path__
            regex: '(.+)'
          - source_labels:
              - __address__
              - __meta_kubernetes_pod_annotation_prometheus_io_port
            action: replace
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
            target_label: __address__
          - action: labelmap
            regex: '__meta_kubernetes_pod_label_(.+)'
          - source_labels:
              - __meta_kubernetes_namespace
            action: replace
            target_label: kubernetes_namespace
          - source_labels:
              - __meta_kubernetes_pod_name
            action: replace
            target_label: kubernetes_pod_name
prometheus.yaml
# Service account used by the Prometheus pod (see serviceAccountName in the
# "prometheus" Deployment); the ClusterRoleBinding named "prometheus" lists
# it as its subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-serviceaccount
  namespace: prometheus
---
# Read-only cluster-wide permissions required by the Prometheus
# kubernetes_sd_configs roles (node, endpoints, service, pod, ingress) and by
# scraping through the API server's node proxy.
# Uses rbac.authorization.k8s.io/v1: the v1beta1 RBAC API is no longer served
# as of Kubernetes 1.22.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  # Ingresses are listed under both the legacy "extensions" group and
  # "networking.k8s.io" so discovery works across cluster versions.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Grants the "prometheus" ClusterRole to the Prometheus service account.
# Uses rbac.authorization.k8s.io/v1: the v1beta1 RBAC API is no longer served
# as of Kubernetes 1.22.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus-serviceaccount
    namespace: prometheus
---
# Single-replica Prometheus server. Configuration is mounted from the
# "prometheus-config" ConfigMap at /prometheus/conf; TSDB data is written to
# /prometheus/data.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    name: prometheus-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus-serviceaccount
      # Static /etc/hosts entry so the pod can resolve this node hostname.
      hostAliases:
        - ip: "192.168.0.193"
          hostnames:
            - online-k8s-node1
      containers:
        - name: prometheus
          image: prom/prometheus:v2.13.1
          command:
            - "/bin/prometheus"
          args:
            - "--config.file=/prometheus/conf/prometheus.yml"
            - "--storage.tsdb.path=/prometheus/data"
            # NOTE(review): newer Prometheus releases prefer
            # --storage.tsdb.retention.time; confirm before changing the image.
            - "--storage.tsdb.retention=30d"
          ports:
            - containerPort: 9090
              protocol: TCP
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 5000m
              memory: 2500Mi
          volumeMounts:
            - name: data
              mountPath: "/prometheus/data"
            - name: config-volume
              mountPath: "/prometheus/conf"
      volumes:
        # emptyDir: the data volume is not backed by persistent storage.
        - name: data
          emptyDir: {}
        - name: config-volume
          configMap:
            name: prometheus-config
---
# NodePort Service exposing the Prometheus web port (9090) of pods labelled
# app=prometheus.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    name: prometheus-service
  # No annotations are set; written as an explicit empty mapping.
  annotations: {}
spec:
  type: NodePort
  sessionAffinity: None
  selector:
    app: prometheus
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: 9090
node-exporter-ds.yml
# node-exporter DaemonSet. The pod uses the host network, PID and IPC
# namespaces and read-only mounts of the host's /proc, /sys and /, and
# publishes metrics on host port 9100.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: kube-system
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
    k8s-app: node-exporter
    kubernetes.io/cluster-service: "true"
    version: v0.18.1
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      k8s-app: node-exporter
      version: v0.18.1
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      creationTimestamp: null
      labels:
        k8s-app: node-exporter
        version: v0.18.1
    spec:
      # Tolerate the master NoSchedule taint so the pod can also be placed on
      # nodes carrying it.
      tolerations:
        - key: node-role.kubernetes.io/master
          operator: "Equal"
          value: ""
          effect: NoSchedule
      dnsPolicy: ClusterFirst
      hostIPC: true
      hostNetwork: true
      hostPID: true
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      containers:
        - name: prometheus-node-exporter
          image: prom/node-exporter:v0.18.1
          imagePullPolicy: IfNotPresent
          # Regex-valued flags are single-quoted so YAML passes them verbatim.
          args:
            - '--log.level=info'
            - '--path.procfs=/host/proc'
            - '--path.sysfs=/host/sys'
            - '--path.rootfs=/rootfs'
            - '--collector.vmstat'
            - '--collector.vmstat.fields=.*'
            - '--collector.netstat'
            - '--collector.netstat.fields=.*'
            - '--collector.filesystem.ignored-mount-points=^/(proc|tmpfs|shm|sys|var/lib/docker/.+)($|/)'
            # Must stay on ONE line: a wrap inside "rpc_pipefs" previously
            # split this argument in two and corrupted the regex.
            - '--collector.filesystem.ignored-fs-types=^(autofs|tmpfs|shm|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
          ports:
            - name: metrics
              containerPort: 9100
              hostPort: 9100
              protocol: TCP
          resources:
            limits:
              memory: 50Mi
            requests:
              cpu: 100m
              memory: 50Mi
          securityContext:
            privileged: true
            runAsUser: 0
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /rootfs
              readOnly: true
      volumes:
        - name: proc
          hostPath:
            path: /proc
            type: ""
        - name: sys
          hostPath:
            path: /sys
            type: ""
        - name: root
          hostPath:
            path: /
            type: ""
kube-state-metrics
yaml文件下载
https://github.com/kubernetes/kube-state-metrics/tree/master/examples/standard
修改对应的镜像即可
例如mirrorgooglecontainers/kube-state-metrics:v1.7.2
原因是 Prometheus 解析不到节点的主机名，有两种解决方案：
一、在部署 Prometheus 的时候通过 hostAliases 添加主机名映射
# Pod-spec fragment: static host entry mapping a node IP to its hostname(s).
# NOTE(review): the same hostname is listed twice; the second entry was
# presumably meant to be a different name - confirm.
hostAliases:
  - ip: "192.168.0.111"
    hostnames:
      - online-k8s-node2
      - online-k8s-node2
二、修改 Prometheus 配置文件，改用节点的 InternalIP 作为抓取地址
# scrape_configs entry: node-exporter job that builds the target address
# from the node's InternalIP (port 9100) rather than from the node name.
- job_name: 'kubernetes-node-exporters'
  honor_timestamps: true
  scrape_interval: 30s
  scrape_timeout: 30s
  metrics_path: /metrics
  scheme: http
  kubernetes_sd_configs:
    - role: node
  relabel_configs:
    - action: labelmap
      separator: ';'
      regex: '__meta_kubernetes_node_label_(.+)'
      replacement: '$1'
    - action: replace
      source_labels:
        - __meta_kubernetes_node_address_InternalIP
      separator: ';'
      regex: '(.+)'
      target_label: __address__
      replacement: '${1}:9100'
# Pod-spec fragment: tolerate the master NoSchedule taint. This is the same
# toleration that the node-exporter DaemonSet in this document declares.
tolerations:
  - key: node-role.kubernetes.io/master
    operator: "Equal"
    value: ""
    effect: NoSchedule
在 yaml 文件中修改 - "--storage.tsdb.retention=10d" 参数
需要在 Pod 模板中添加 annotations
Pod 中需要有容器开放了 /metrics 接口
# Example workload for annotation-based pod discovery: a Redis pod with a
# redis_exporter sidecar. The prometheus.io/* annotations on the pod template
# are what the "kubernetes-pods" scrape job matches on; the port annotation
# points at the sidecar's container port (9121).
# Uses apps/v1 (extensions/v1beta1 Deployments are no longer served as of
# Kubernetes 1.16); apps/v1 requires an explicit spec.selector.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9121"
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:4
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
          ports:
            - containerPort: 6379
        - name: redis-exporter
          image: oliver006/redis_exporter:latest
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
          ports:
            - containerPort: 9121
# scrape_configs entry: scrape pods that carry prometheus.io/scrape: "true".
# prometheus.io/path and prometheus.io/port, when present, override the
# metrics path and the port of the target address.
- job_name: 'kubernetes-pods'
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: 'true'
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      action: replace
      regex: '(.+)'
      target_label: __metrics_path__
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      regex: '([^:]+)(?::\d+)?;(\d+)'
      replacement: '$1:$2'
      target_label: __address__
    - action: labelmap
      regex: '__meta_kubernetes_pod_label_(.+)'
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name
grafana-deploy.yaml
# Single-replica Grafana Deployment. Basic auth is enabled and anonymous
# access is disabled through GF_* environment variables.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana-core
  namespace: kube-system
  labels:
    app: grafana
    component: core
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
        component: core
    spec:
      containers:
        - name: grafana-core
          image: grafana/grafana:7.0.3
          imagePullPolicy: IfNotPresent
          resources:
            # keep request = limit to keep this container in guaranteed class
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 100m
              memory: 100Mi
          env:
            # The following env variables set up basic auth with the default
            # admin user and admin password.
            - name: GF_AUTH_BASIC_ENABLED
              value: "true"
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            # - name: GF_AUTH_ANONYMOUS_ORG_ROLE
            #   value: Admin
            # does not really work, because of template variables in exported
            # dashboards:
            # - name: GF_DASHBOARDS_JSON_ENABLED
            #   value: "true"
          # The pod is reported ready once GET /login on port 3000 succeeds.
          readinessProbe:
            httpGet:
              path: /login
              port: 3000
            # initialDelaySeconds: 30
            # timeoutSeconds: 1
grafana-svc.yaml
# NodePort Service exposing port 3000 of pods labelled app=grafana,
# component=core.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: kube-system
  labels:
    app: grafana
    component: core
spec:
  type: NodePort
  selector:
    app: grafana
    component: core
  ports:
    - port: 3000
官网下载dashboards,然后导入即可
https://grafana.com/grafana/dashboards