K8s可视化监控告警【1】--Prometheus部署

本文借鉴于监控–Prometheus部署篇
1. prometheus权限设置
prometheus-rbac.yaml

# ClusterRole granting Prometheus the read-only access used by its Kubernetes
# service discovery and by scrapes routed through the API server.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
# Core API group: the objects watched by the node, service, endpoints and pod
# discovery roles. nodes/proxy covers scrape paths of the form
# /api/v1/nodes/<node>/proxy/... used by the cadvisor job.
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
# Ingress objects for the ingress discovery role. Ingress is served from the
# "extensions" group on older clusters and from "networking.k8s.io" on newer
# ones; naming both keeps discovery authorized on either.
- apiGroups:
  - extensions
  - networking.k8s.io
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
# Non-resource URL: permits GET on the /metrics path.
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
# Identity the Prometheus pod runs under (referenced by serviceAccountName in
# the pod spec and by the ClusterRoleBinding subjects).
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: kube-system
  name: prometheus
---
# Attaches the "prometheus" ClusterRole to the "prometheus" ServiceAccount in
# the kube-system namespace.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: prometheus
roleRef:
  kind: ClusterRole
  name: prometheus
  apiGroup: rbac.authorization.k8s.io

2. 配置文件
prometheus-configMap.yaml

# ConfigMap holding the two files consumed by the Prometheus server: the
# alerting rules (hoststats.rules) and the main configuration (prometheus.yml).
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-system
  labels:
    monitor: prometheus
data:
  # Host-level alerting rules. The CPU and memory rules select series from the
  # scrape job named "kubernetes_node".
  hoststats.rules: |
    groups:
    - name: nodehost
      rules:
      # CPU: non-idle fraction, averaged over the cores of an instance,
      # above 85% for 3 minutes.
      - alert: hostCpuUsageAlert
        expr: sum(avg (irate(node_cpu{mode!="idle",job="kubernetes_node"}[5m])) without (cpu)) by (instance) > 0.85
        for: 3m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
      # Memory: (total - available) / total, as a percentage, above 85 for
      # 5 minutes.
      - alert: hostMemUsageAlert
        expr: (node_memory_MemTotal-node_memory_MemAvailable)/node_memory_MemTotal{job="kubernetes_node"} * 100 > 85
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "Instance {{ $labels.instance }} Memory usage above 85% (current value: {{ $value }})"
      # Disk: used / limit bytes of the root cgroup (id="/") on devices named
      # /dev/sdXN or /dev/vdXN, above 0.8 for 5 minutes.
      - alert: hostDiskUsageAlert
        expr: container_fs_usage_bytes{device=~"^/dev/[sv]d[a-z][1-9]$",id="/"}/container_fs_limit_bytes{device=~"^/dev/[sv]d[a-z][1-9]$",id="/"} > 0.8
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} Disk usage high"
          description: "Instance {{ $labels.instance }} Disk usage above 80% (current value: {{ $value }})"

  # Main Prometheus server configuration.
  prometheus.yml: |
    # Scrape targets and evaluate rules every 15 seconds.
    global:
      evaluation_interval: 15s
      scrape_interval: 15s
    # Load every *.rules file found in the rules directory.
    rule_files:
    - '/etc/prometheus/rules/*.rules'
    # Alertmanager endpoint, statically configured.
    alerting:
      alertmanagers:
        - static_configs:
          - targets:
            - "alertmanager:9093"
    # Scrape jobs follow, one list item per job.
    scrape_configs:

    # Scrapes the Kubernetes API server over HTTPS, using the CA bundle and
    # token mounted from the pod's service account.
    - job_name: 'kubernetes-apiservers'
      scheme: https
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      # Of all discovered endpoints keep only the port named "https" of the
      # service "kubernetes" in namespace "default".
      - action: keep
        source_labels:
        - __meta_kubernetes_namespace
        - __meta_kubernetes_service_name
        - __meta_kubernetes_endpoint_port_name
        regex: 'default;kubernetes;https'

    # Scrapes per-node cAdvisor metrics. Every request is addressed to the API
    # server, which proxies it to the node named in the metrics path.
    - job_name: 'kubernetes-cadvisor'
      scheme: https
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      # Copy each node label onto the target, dropping the discovery prefix.
      - regex: '__meta_kubernetes_node_label_(.+)'
        action: labelmap
      # Point every target at the in-cluster API server address.
      - replacement: 'kubernetes.default.svc:443'
        target_label: __address__
      # Build the proxy path from the node name.
      - source_labels:
        - __meta_kubernetes_node_name
        regex: '(.+)'
        replacement: '/api/v1/nodes/${1}/proxy/metrics/cadvisor'
        target_label: __metrics_path__

    # Scrapes every endpoint whose Service is annotated prometheus.io/scrape=true.
    # Further Service annotations may override scheme, metrics path and port.
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      # Opt-in filter: keep only endpoints of Services annotated for scraping.
      - action: keep
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scrape
        regex: 'true'
      # prometheus.io/scheme (http or https) sets the scrape scheme.
      - action: replace
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scheme
        regex: '(https?)'
        target_label: __scheme__
      # prometheus.io/path sets the metrics path.
      - action: replace
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_path
        regex: '(.+)'
        target_label: __metrics_path__
      # prometheus.io/port replaces the port part of the target address.
      - action: replace
        source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '$1:$2'
        target_label: __address__
      # Copy Service labels onto the target, dropping the discovery prefix.
      - action: labelmap
        regex: '__meta_kubernetes_service_label_(.+)'
      # Record the namespace and Service name as ordinary labels.
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - action: replace
        source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name

    # Probes Services annotated prometheus.io/probe=true. The scrape request
    # goes to the /probe path of the address set below (a blackbox exporter,
    # judging by its host name), with the Service address passed as the
    # "target" URL parameter and "http_2xx" as the "module" parameter.
    - job_name: 'kubernetes-services'
      metrics_path: /probe
      params:
        module:
        - http_2xx
      kubernetes_sd_configs:
      - role: service
      relabel_configs:
      # Opt-in filter on the probe annotation.
      - action: keep
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_probe
        regex: 'true'
      # The discovered address becomes the ?target= parameter ...
      - source_labels:
        - __address__
        target_label: __param_target
      # ... and the request itself is redirected to the exporter.
      - replacement: 'blackbox-exporter.example.com:9115'
        target_label: __address__
      # Keep the probed address visible as the instance label.
      - source_labels:
        - __param_target
        target_label: instance
      # Copy Service labels onto the target, dropping the discovery prefix.
      - action: labelmap
        regex: '__meta_kubernetes_service_label_(.+)'
      # Record the namespace and Service name as ordinary labels.
      - source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      - source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name

    # Discovers Ingress objects annotated prometheus.io/probe=true.
    # All relabel rules read ingress-role metadata; pod metadata
    # (__meta_kubernetes_pod_*) is not attached to ingress targets.
    # NOTE(review): the annotation name suggests blackbox-style probing (as in
    # the kubernetes-services job), but this job scrapes the ingress address
    # directly - confirm which is intended.
    - job_name: 'kubernetes-ingresses'
      kubernetes_sd_configs:
      - role: ingress
      relabel_configs:
      # Opt-in filter. A mapping may carry only one `regex` key; the value
      # that matches the opt-in annotation is the one kept.
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
        action: keep
        regex: true
      # An optional prometheus.io/port annotation on the Ingress replaces the
      # port part of the target address.
      - source_labels: [__address__, __meta_kubernetes_ingress_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      # Copy Ingress labels onto the target, dropping the discovery prefix.
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
      # Record the namespace and Ingress name as ordinary labels, using the
      # same label names as the kubernetes-services job.
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_ingress_name]
        action: replace
        target_label: kubernetes_name
        
    # Scrapes node-exporter. The job name is referenced by the alert
    # expressions in hoststats.rules, so it must not be renamed.
    # NOTE(review): no `scheme` is set, so unless the prometheus.io/scheme
    # annotation says https the bearer token below is presumably sent over
    # plain HTTP - confirm the token is needed for this job at all.
    - job_name: 'kubernetes_node'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      # Endpoint-based discovery: each endpoint is scraped directly instead of
      # going through the Service proxy layer.
      - role: endpoints
      relabel_configs:
      # Keep only endpoints of Services annotated prometheus.io/scrape=true
      # whose port is named "prometheus-node-exporter".
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_endpoint_port_name]
        regex: true;prometheus-node-exporter
        action: keep
      # prometheus.io/scheme (http or https) sets the scrape scheme.
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      # prometheus.io/path sets the metrics path.
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # prometheus.io/port replaces the port part of the target address.
      # Same pattern as the other jobs: host without colons, optional port.
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      # Strip the __meta_kubernetes_service_label_ prefix from label names.
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      # To tell nodes apart, replace `instance` (by default the node-exporter
      # endpoint address) with the IP of the node hosting that endpoint.
      - source_labels: [__meta_kubernetes_pod_host_ip]
        regex: '(.*)'
        replacement: '${1}'
        target_label: instance

3. app部署
prometheus.rc.yaml

# Runs a single Prometheus server replica in kube-system. The configuration
# and rule files are mounted individually from the prometheus-config
# ConfigMap.
apiVersion: v1
kind: ReplicationController
metadata:
  name: prometheus
  namespace: kube-system
  labels:
    monitor: prometheus
spec:
  replicas: 1
  template:
    metadata:
      name: prometheus
      labels:
        monitor: prometheus
    spec:
      # Run under the "prometheus" service account.
      serviceAccountName: prometheus
      containers:
      - name: prometheus
        image: prom/prometheus:v2.0.0
        command: ["/bin/prometheus"]
        args:
        - '--config.file=/etc/prometheus/prometheus.yml'
        - '--storage.tsdb.path=/prometheus'
        # Keep 24 hours of samples.
        - '--storage.tsdb.retention=24h'
        ports:
        - protocol: TCP
          containerPort: 9090
        resources:
          limits:
            cpu: 500m
            memory: 2500Mi
          requests:
            cpu: 100m
            memory: 100Mi
        volumeMounts:
        # TSDB storage directory (matches --storage.tsdb.path).
        - name: data
          mountPath: /prometheus
        # Each ConfigMap key is mounted as a single file via subPath.
        - name: config-volume
          subPath: prometheus.yml
          mountPath: /etc/prometheus/prometheus.yml
        - name: config-volume
          subPath: hoststats.rules
          mountPath: /etc/prometheus/rules/hoststats.rules
      volumes:
      # emptyDir: the stored metrics live only as long as the pod does.
      - name: data
        emptyDir: {}
      - name: config-volume
        configMap:
          name: prometheus-config
---
# Exposes the Prometheus pod (selected by monitor=prometheus) through a
# NodePort Service.
# NOTE(review): nodePort 39090 lies outside Kubernetes' default NodePort range
# (30000-32767); presumably the cluster's API server runs with a widened
# --service-node-port-range - confirm before applying elsewhere.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: kube-system
  labels:
    monitor: prometheus
spec:
  type: NodePort
  selector:
    monitor: prometheus
  ports:
  - protocol: TCP
    port: 9090          # Service port: requests to the service name are forwarded to the target container port
    targetPort: 9090    # port opened by the container
    nodePort: 39090     # port bound on the cluster nodes

4. 节点采集器部署

# Runs node-exporter as a DaemonSet with the host's /proc and /sys mounted
# into the container.
# NOTE(review): extensions/v1beta1 DaemonSet is no longer served from
# Kubernetes 1.16 on; moving to apps/v1 (which also requires spec.selector)
# depends on the target cluster version - confirm.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: kube-system
  labels:
    k8s-app: prometheus-node-exporter
spec:
  template:
    metadata:
      labels:
        k8s-app: prometheus-node-exporter
    spec:
      tolerations:
      # Allow scheduling onto master nodes. The toleration field is named
      # `operator`; since no value is specified, Exists is used so the taint
      # is tolerated whatever its value.
      - key: node-role.kubernetes.io/master
        operator: Exists
        effect: NoSchedule
      containers:
      - image: prom/node-exporter:v0.14.0
        name: node-exporter
        ports:
        # Container port 9100 is published on the node as host port 9101.
        - containerPort: 9100
          protocol: TCP
          name: http
          hostPort: 9101
        volumeMounts:
        - mountPath: /host/proc
          name: proc
        - mountPath: /host/sys
          name: sys
        args:
        # Read procfs/sysfs from the host mounts rather than the container's.
        - -collector.procfs=/host/proc
        - -collector.sysfs=/host/sys
        # The flag and its value (a mount-point regex) are separate arguments.
        - -collector.filesystem.ignored-mount-points
        - "^/(dev|proc|sys|host|etc|rootfs|docker)($|/)"
      volumes:
      - hostPath:
          path: /proc
        name: proc
      - hostPath:
          path: /sys
        name: sys
---
# Headless Service (clusterIP: None) over the node-exporter pods. The
# prometheus.io/scrape annotation and the port name are what the
# "kubernetes_node" scrape job's keep rule matches on.
kind: Service
apiVersion: v1
metadata:
  name: prometheus-node-exporter
  namespace: kube-system
  labels:
    app: prometheus-node-exporter
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/app-metrics: 'true'
    prometheus.io/app-metrics-path: '/metrics'
spec:
  type: ClusterIP
  clusterIP: None
  selector:
    k8s-app: prometheus-node-exporter
  ports:
    - name: prometheus-node-exporter
      protocol: TCP
      port: 9100

你可能感兴趣的:(Kubernetes,Prometheus)