Prometheus+Grafana+Alertmanager构建企业级监控系统
环境
HOST-NAME | IP | K8S Role |
---|---|---|
master1 | 192.168.1.180/24 | master |
node1 | 192.168.1.181/24 | worker |
node-exporter 可以采集机器(物理机、虚拟机、云主机等)的监控指标数据,能够采集到的指标包括 CPU、内存、磁盘、网络、文件数等信息。
#查看一下k8s节点
[root@master1 .kube]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
master1 Ready control-plane,master 12d v1.20.6
node1 Ready worker 12d v1.20.6
#创建一个monitor-sa命名空间
[root@master1 .kube]# kubectl create namespace monitor-sa
namespace/monitor-sa created
#上传node-exporter.tar.gz到master1和node1的家目录
[root@master1 ~]# docker load -i node-exporter.tar.gz
ad68498f8d86: Loading layer [==================================================>] 4.628MB/4.628MB
ad8512dce2a7: Loading layer [==================================================>] 2.781MB/2.781MB
cc1adb06ef21: Loading layer [==================================================>] 16.9MB/16.9MB
Loaded image: prom/node-exporter:v0.16.0
[root@master1 ~]#
[root@node1 ~]# docker load -i node-exporter.tar.gz
ad68498f8d86: Loading layer [==================================================>] 4.628MB/4.628MB
ad8512dce2a7: Loading layer [==================================================>] 2.781MB/2.781MB
cc1adb06ef21: Loading layer [==================================================>] 16.9MB/16.9MB
Loaded image: prom/node-exporter:v0.16.0
[root@node1 ~]#
#说明 获取node-exporter的方法
在dockerhub的官网搜索
https://hub.docker.com/
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-nlOYwB94-1655106671762)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654670966393.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rLX7tj43-1655106671762)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654670983083.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Diput0ie-1655106671763)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654671144462.png)]
[root@master1 prometheus]# cat > /root/prometheus/node-export.yaml << 'END'
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: monitor-sa
labels:
name: node-exporter
spec:
selector:
matchLabels:
name: node-exporter
template:
metadata:
labels:
name: node-exporter
spec:
hostPID: true
hostIPC: true
hostNetwork: true
containers:
- name: node-exporter
image: prom/node-exporter:v0.16.0
ports:
- containerPort: 9100
resources:
requests:
cpu: 0.15
securityContext:
privileged: true
args:
- --path.procfs
- /host/proc
- --path.sysfs
- /host/sys
- --collector.filesystem.ignored-mount-points
- '"^/(sys|proc|dev|host|etc)($|/)"'
volumeMounts:
- name: dev
mountPath: /host/dev
- name: proc
mountPath: /host/proc
- name: sys
mountPath: /host/sys
- name: rootfs
mountPath: /rootfs
tolerations:
- key: "node-role.kubernetes.io/master"
operator: "Exists"
effect: "NoSchedule"
volumes:
- name: proc
hostPath:
path: /proc
- name: dev
hostPath:
path: /dev
- name: sys
hostPath:
path: /sys
- name: rootfs
hostPath:
path: /
END
[root@master1 prometheus]# kubectl apply -f node-export.yaml
daemonset.apps/node-exporter created
[root@master1 prometheus]# kubectl get pods -n monitor-sa -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
node-exporter-92k4d 1/1 Running 0 58s 192.168.1.181 node1 <none> <none>
node-exporter-d44k4 1/1 Running 0 58s 192.168.1.180 master1 <none> <none>
curl http://主机 ip:9100/metrics
#node-export 默认的监听端口是 9100,可以看到当前主机获取到的所有监控数据
curl http://192.168.1.180:9100/metrics | grep node_cpu_seconds
显示 192.168.1.180 主机 cpu 的使用情况
# HELP node_cpu_seconds_total Seconds the cpus spent in each mode.
# TYPE node_cpu_seconds_total counter
node_cpu_seconds_total{cpu="0",mode="idle"} 72963.37
node_cpu_seconds_total{cpu="0",mode="iowait"} 9.35
node_cpu_seconds_total{cpu="0",mode="irq"} 0
node_cpu_seconds_total{cpu="0",mode="nice"} 0
node_cpu_seconds_total{cpu="0",mode="softirq"} 151.4
node_cpu_seconds_total{cpu="0",mode="steal"} 0
node_cpu_seconds_total{cpu="0",mode="system"} 656.12
node_cpu_seconds_total{cpu="0",mode="user"} 267.1
#HELP:解释当前指标的含义,上面表示在每种模式下 node 节点的 cpu 花费的时间,以 s 为单位
#TYPE:说明当前指标的数据类型,上面是 counter 类型
node_cpu_seconds_total{cpu="0",mode="idle"} :
cpu0 上 idle 进程占用 CPU 的总时间,CPU 占用时间是一个只增不减的度量指标,从类型中也可以看
出 node_cpu 的数据类型是 counter(计数器)
counter(计数器):只采集单调递增的指标
curl http://192.168.1.180:9100/metrics | grep node_load
# HELP node_load1 1m load average.
# TYPE node_load1 gauge
node_load1 0.1
node_load1 该指标反映了当前主机在最近一分钟以内的负载情况,系统的负载情况会随系统资源的使用而变化,因此 node_load1 反映的是当前状态,数据可能增加也可能减少,从注释中可以看出当前指标类型为 gauge(仪表盘)。
gauge(仪表盘):统计的指标可增加也可减少
#创建一个 sa 账号 monitor
[root@master1 prometheus]# kubectl create serviceaccount monitor -n monitor-sa
serviceaccount/monitor created
[root@master1 prometheus]# kubectl get serviceaccount -n monitor-sa
NAME SECRETS AGE
default 1 79m
monitor 1 30s
#把 serviceaccount 账号 monitor 通过 clusterrolebinding 绑定到 clusterrole 上
[root@master1 prometheus]# kubectl create clusterrolebinding monitor-clusterrolebinding -n monitor-sa --clusterrole=cluster-admin --serviceaccount=monitor-sa:monitor
clusterrolebinding.rbac.authorization.k8s.io/monitor-clusterrolebinding created
#在 k8s 集群的 node1 节点上创建数据存储目录
[root@node1 ~]# mkdir /data
[root@node1 ~]# chmod 777 /data/
[root@node1 ~]# ls -ld /data
drwxrwxrwx. 2 root root 6 Jun 8 16:00 /data
[root@master1 prometheus]# cat > /root/prometheus/prometheus-cfg.yaml << 'END'
---
kind: ConfigMap
apiVersion: v1
metadata:
labels:
app: prometheus
name: prometheus-config
namespace: monitor-sa
data:
prometheus.yml: |
global:
scrape_interval: 15s #采集目标主机监控数据的时间间隔
scrape_timeout: 10s #数据采集超时时间,默认10s
evaluation_interval: 1m #触发告警检测(评估告警规则)的时间间隔,默认是1m
scrape_configs: #配置数据源,称为target,每个target用job_name命名。又分为静态配置和服务发现
- job_name: 'kubernetes-node'
kubernetes_sd_configs: #使用的是k8s的服务发现
- role: node
#使用node角色,它使用默认的kubelet提供的http端口来发现集群中的每个node节点
relabel_configs: #重新标记
- source_labels: [__address__] #配置的原始标签,匹配地址
regex: '(.*):10250' #匹配带有10250端口的url
replacement: '${1}:9100' #把匹配到的ip:10250的ip保留
target_label: __address__ #新生成的url是${1}获取的ip:9100
action: replace
- action: labelmap
#匹配到下面正则表达式的标签会被保留,如果不做regex正则的话,默认只是会显示instance标签
regex: __meta_kubernetes_node_label_(.+)
- job_name: 'kubernetes-node-cadvisor'
# 抓取cAdvisor数据,是获取kubelet上/metrics/cadvisor接口数据来获取容器的资源使用情况
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap #把匹配到的标签保留
regex: __meta_kubernetes_node_label_(.+)
#保留匹配到的具有__meta_kubernetes_node_label的标签
- target_label: __address__
#获取到的地址: __address__="192.168.1.180:10250"
replacement: kubernetes.default.svc:443
#把获取到的地址替换成新的地址kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
#把原始标签中__meta_kubernetes_node_name值匹配到
target_label: __metrics_path__
#获取__metrics_path__对应的值
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
kubernetes_sd_configs:
- role: endpoints
#使用k8s中的endpoint服务发现,采集apiserver 6443端口获取到的数据
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
#重新打标,仅抓取具有 "prometheus.io/scrape: true" annotation 的端点。意思是说如果某个 service 具有 prometheus.io/scrape = true 的 annotation 声明则抓取。annotation 本身也是键值结构,所以这里的源标签设置为键,而 regex 设置为值 true,当值匹配到 regex 设定的内容时则执行 keep 动作也就是保留,其余则丢弃。
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
#重新设置 scheme:匹配源标签 __meta_kubernetes_service_annotation_prometheus_io_scheme,也就是 prometheus.io/scheme annotation;如果源标签的值匹配到 regex(http 或 https),则把匹配到的值赋给 __scheme__。
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
# 应用中自定义暴露的指标,也许你暴露的 API 接口不是 /metrics 这个路径,那么你可以在这个 POD 对应的 service 中做一个 "prometheus.io/path = /mymetrics" 声明,上面的意思就是把你声明的这个路径赋值给 __metrics_path__,其实就是让 prometheus 来获取自定义应用暴露的 metrics 的具体路径。不过这里写的要和 service 中做好约定:如果 service 中这样写 prometheus.io/app-metrics-path: '/metrics',那么你这里就要写成 __meta_kubernetes_service_annotation_prometheus_io_app_metrics_path。
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
# 暴露自定义的应用的端口,就是把地址和你在 service 中定义的 "prometheus.io/port = " 声明做一个拼接,然后赋值给__address__,这样 prometheus 就能获取自定义应用的端口,然后通过这个端口再结合__metrics_path__来获取指标,如果__metrics_path__值不是默认的/metrics 那么就要使用上面的标签替换来获取真正暴露的具体路径。
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
END
[root@master1 prometheus]# kubectl apply -f prometheus-cfg.yaml
configmap/prometheus-config created
[root@master1 prometheus]# kubectl get configmap -n monitor-sa
NAME DATA AGE
kube-root-ca.crt 1 3h57m
prometheus-config 1 18s
[root@master1 prometheus]# kubectl describe configmap prometheus-config -n monitor-sa
Name: prometheus-config
Namespace: monitor-sa
Labels: app=prometheus
Annotations: <none>
Data
====
prometheus.yml:
----
global:
scrape_interval: 15s
scrape_timeout: 10s
evaluation_interval: 1m
scrape_configs:
- job_name: 'kubernetes-node'
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: [__address__]
regex: '(.*):10250'
replacement: '${1}:9100'
target_label: __address__
action: replace
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- job_name: 'kubernetes-node-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
Events: <none>
安装 prometheus 需要的镜像 prometheus-2-2-1.tar.gz ,上传到 k8s 的工作节点 node1 上,手动解压
这个镜像可以从hub.docker.com dockerhub上下载,也可以通过如下指令pull
docker pull prom/prometheus:v2.2.1
[root@node1 ~]# ls prometheus-2-2-1.tar.gz
prometheus-2-2-1.tar.gz
[root@node1 ~]# du -sh prometheus-2-2-1.tar.gz
110M prometheus-2-2-1.tar.gz
[root@node1 ~]# docker load -i prometheus-2-2-1.tar.gz
6a749002dd6a: Loading layer [==================================================>] 1.338MB/1.338MB
5f70bf18a086: Loading layer [==================================================>] 1.024kB/1.024kB
1692ded805c8: Loading layer [==================================================>] 2.629MB/2.629MB
035489d93827: Loading layer [==================================================>] 66.18MB/66.18MB
8b6ef3a2ab2c: Loading layer [==================================================>] 44.5MB/44.5MB
ff98586f6325: Loading layer [==================================================>] 3.584kB/3.584kB
017a13aba9f4: Loading layer [==================================================>] 12.8kB/12.8kB
4d04d79bb1a5: Loading layer [==================================================>] 27.65kB/27.65kB
75f6c078fa6b: Loading layer [==================================================>] 10.75kB/10.75kB
5e8313e8e2ba: Loading layer [==================================================>] 6.144kB/6.144kB
Loaded image: prom/prometheus:v2.2.1
[root@node1 ~]#
[root@master1 prometheus]# cat > /root/prometheus/prometheus-deploy.yaml << 'END'
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus-server
namespace: monitor-sa
labels:
app: prometheus
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
component: server
#matchExpressions:
#- {key: app, operator: In, values: [prometheus]}
#- {key: component, operator: In, values: [server]}
template:
metadata:
labels:
app: prometheus
component: server
annotations:
prometheus.io/scrape: 'false'
spec:
nodeName: node1
serviceAccountName: monitor
containers:
- name: prometheus
image: prom/prometheus:v2.2.1
imagePullPolicy: IfNotPresent
command:
- prometheus
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
- --storage.tsdb.retention=720h
- --web.enable-lifecycle ##启用prometheus热加载
ports:
- containerPort: 9090
protocol: TCP
volumeMounts:
- mountPath: /etc/prometheus/prometheus.yml
name: prometheus-config
subPath: prometheus.yml
- mountPath: /prometheus/
name: prometheus-storage-volume
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
items:
- key: prometheus.yml
path: prometheus.yml
mode: 0644
- name: prometheus-storage-volume
hostPath:
path: /data
type: Directory
END
[root@master1 prometheus]# kubectl apply -f prometheus-deploy.yaml
deployment.apps/prometheus-server created
[root@master1 prometheus]# kubectl get pods -n monitor-sa -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
node-exporter-92k4d 1/1 Running 0 4h14m 192.168.1.181 node1 <none> <none>
node-exporter-d44k4 1/1 Running 0 4h14m 192.168.1.180 master1 <none> <none>
prometheus-server-657bd8cb4d-zrmk4 1/1 Running 0 42s 10.244.166.185 node1 <none> <none>
[root@master1 prometheus]#
[root@master1 prometheus]# cat > /root/prometheus/prometheus-svc.yaml << 'END'
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: monitor-sa
labels:
app: prometheus
spec:
type: NodePort
ports:
- port: 9090
targetPort: 9090
protocol: TCP
selector:
app: prometheus
component: server
END
[root@master1 prometheus]# kubectl apply -f prometheus-svc.yaml
service/prometheus created
[root@master1 prometheus]# kubectl get service -n monitor-sa
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
prometheus NodePort 10.103.238.66 <none> 9090:31935/TCP 37s
[root@master1 prometheus]# kubectl get endpoints -n monitor-sa
NAME ENDPOINTS AGE
prometheus 10.244.166.185:9090 3m50s
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-TEexacDR-1655106671763)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654689013529.png)]
#为了每次修改配置文件可以热加载 prometheus,也就是不停止 prometheus,就可以使配置生效,
#想要使配置生效可用如下热加载命令:
[root@master1 prometheus]# kubectl get pods -n monitor-sa -o wide -l app=prometheus
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
prometheus-server-657bd8cb4d-zrmk4 1/1 Running 0 64m 10.244.166.185 node1 <none> <none>
[root@master1 prometheus]#
#10.244.166.185是 prometheus 的 pod 的 ip 地址
想要使配置生效可用如下命令热加载:
[root@master1 prometheus]# curl -X POST http://10.244.166.185:9090/-/reload
#热加载速度比较慢,可以暴力重启 prometheus,如修改上面的 prometheus-cfg.yaml 文件之后,可
执行如下强制删除:
kubectl delete -f prometheus-cfg.yaml
kubectl delete -f prometheus-deploy.yaml
然后再通过 apply 更新:
kubectl apply -f prometheus-cfg.yaml
kubectl apply -f prometheus-deploy.yaml
注意:
线上最好热加载,暴力删除可能造成监控数据的丢失
Grafana 是一个跨平台的开源的度量分析和可视化工具,可以将采集的数据可视化的展示,并及时通知给告警接收方。它主要有以下五大特点:
1、展示方式:快速灵活的客户端图表,面板插件有许多不同方式的可视化指标和日志,官方库中具
有丰富的仪表盘插件,比如热图、折线图、图表等多种展示方式;
2、数据源:Graphite,InfluxDB,OpenTSDB,Prometheus,Elasticsearch,CloudWatch 和
KairosDB 等;
3、通知提醒:以可视方式定义最重要指标的警报规则,Grafana 将不断计算并发送通知,在数据达
到阈值时通过 Slack、PagerDuty 等获得通知;
4、混合展示:在同一图表中混合使用不同的数据源,可以基于每个查询指定数据源,甚至自定义数
据源;
5、注释:使用来自不同数据源的丰富事件注释图表,将鼠标悬停在事件上会显示完整的事件元数据
和标记。
安装 Grafana 需要的镜像 heapster-grafana-amd64_v5_0_4.tar.gz,把镜像上传到 k8s 的工作节点 node1 上,手动用 docker load 导入:
[root@node1 prometheus]# ls
heapster-grafana-amd64_v5_0_4.tar.gz
[root@node1 prometheus]# du -sh heapster-grafana-amd64_v5_0_4.tar.gz
165M heapster-grafana-amd64_v5_0_4.tar.gz
[root@node1 prometheus]# docker load -i heapster-grafana-amd64_v5_0_4.tar.gz
6816d98be637: Loading layer [==================================================>] 4.642MB/4.642MB
523feee8e0d3: Loading layer [==================================================>] 161.5MB/161.5MB
43d2638621da: Loading layer [==================================================>] 230.4kB/230.4kB
f24c0fa82e54: Loading layer [==================================================>] 2.56kB/2.56kB
334547094992: Loading layer [==================================================>] 5.826MB/5.826MB
Loaded image: k8s.gcr.io/heapster-grafana-amd64:v5.0.4
这个镜像可以在hub.docker.com上搜索下载
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uSOaxG4h-1655106671764)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654740970254.png)]
#在控制节点master1上生成如下文件
[root@master1 prometheus]# cat > /root/prometheus/grafana.yaml << 'END'
apiVersion: apps/v1
kind: Deployment
metadata:
name: monitoring-grafana
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
task: monitoring
k8s-app: grafana
template:
metadata:
labels:
task: monitoring
k8s-app: grafana
spec:
containers:
- name: grafana
image: k8s.gcr.io/heapster-grafana-amd64:v5.0.4
ports:
- containerPort: 3000
protocol: TCP
volumeMounts:
- mountPath: /etc/ssl/certs
name: ca-certificates
readOnly: true
- mountPath: /var
name: grafana-storage
env:
- name: INFLUXDB_HOST
value: monitoring-influxdb
- name: GF_SERVER_HTTP_PORT
value: "3000"
# The following env variables are required to make Grafana accessible via
# the kubernetes api-server proxy. On production clusters, we recommend
# removing these env variables, setup auth for grafana, and expose the grafana
# service using a LoadBalancer or a public IP.
- name: GF_AUTH_BASIC_ENABLED
value: "false"
- name: GF_AUTH_ANONYMOUS_ENABLED
value: "true"
- name: GF_AUTH_ANONYMOUS_ORG_ROLE
value: Admin
- name: GF_SERVER_ROOT_URL
# If you're only using the API Server proxy, set this value instead:
# value: /api/v1/namespaces/kube-system/services/monitoring-grafana/proxy
value: /
volumes:
- name: ca-certificates
hostPath:
path: /etc/ssl/certs
- name: grafana-storage
emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
labels:
# For use as a Cluster add-on (https://github.com/kubernetes/kubernetes/tree/master/cluster/addons)
# If you are NOT using this as an addon, you should comment out this line.
kubernetes.io/cluster-service: 'true'
kubernetes.io/name: monitoring-grafana
name: monitoring-grafana
namespace: kube-system
spec:
# In a production setup, we recommend accessing Grafana through an external Loadbalancer
# or through a public IP.
# type: LoadBalancer
# You could also use NodePort to expose the service at a randomly-generated port
# type: NodePort
ports:
- port: 80
targetPort: 3000
selector:
k8s-app: grafana
type: NodePort
END
[root@master1 prometheus]# kubectl apply -f grafana.yaml
deployment.apps/monitoring-grafana created
service/monitoring-grafana created
[root@master1 prometheus]# kubectl get deploy -n kube-system
NAME READY UP-TO-DATE AVAILABLE AGE
calico-kube-controllers 1/1 1 1 13d
coredns 2/2 2 2 13d
monitoring-grafana 1/1 1 1 27s
[root@master1 prometheus]# kubectl get rs -n kube-system
NAME DESIRED CURRENT READY AGE
calico-kube-controllers-6949477b58 1 1 1 13d
coredns-7f89b7bc75 2 2 2 13d
monitoring-grafana-675798bf47 1 1 1 37s
[root@master1 prometheus]# kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
calico-kube-controllers-6949477b58-phvxx 1/1 Running 0 20h
calico-node-n5j7r 1/1 Running 0 13d
calico-node-r26rb 1/1 Running 0 13d
coredns-7f89b7bc75-8h7vd 1/1 Running 0 11d
coredns-7f89b7bc75-txs9t 1/1 Running 0 20h
etcd-master1 1/1 Running 0 13d
fluentd-elasticsearch-6qzdk 1/1 Running 0 3d23h
fluentd-elasticsearch-rxsgw 1/1 Running 0 3d23h
kube-apiserver-master1 1/1 Running 0 13d
kube-controller-manager-master1 1/1 Running 0 13d
kube-proxy-6jdfc 1/1 Running 0 13d
kube-proxy-n4gx7 1/1 Running 0 13d
kube-scheduler-master1 1/1 Running 0 13d
monitoring-grafana-675798bf47-x8sm8 1/1 Running 0 45s
[root@master1 prometheus]# kubectl get svc -n kube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kube-dns ClusterIP 10.96.0.10 <none> 53/UDP,53/TCP,9153/TCP 13d
monitoring-grafana NodePort 10.102.90.185 <none> 80:32738/TCP 55s
使用k8s节点IP加32738可以在浏览器访问了
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lHFLoDso-1655106671764)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654741961142.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-k0qRPFJN-1655106671765)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654742074660.png)]
查看 grafana 前端的 service
[root@master1 prometheus]# kubectl get service -n kube-system|grep grafana
monitoring-grafana NodePort 10.102.90.185 <none> 80:32738/TCP 15m
1)登陆grafana,在浏览器访问
192.168.1.180:32738
看到如下 内容
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1XzaZdOG-1655106671765)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654742074660.png)]
2)配置 grafana 界面:
开始配置 grafana 的 web 界面:
选择 Create your first data source
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EECDTIBR-1655106671766)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654743048070.png)]
导入的监控模板,可在如下链接搜索
https://grafana.com/dashboards?dataSource=prometheus&search=kubernetes
可直接导入 node_exporter.json 监控模板,这个可以把 node 节点指标显示出来
怎么导入监控模板,按如下步骤:
上面 Save & Test 测试没问题之后,就可以返回 Grafana 主页面,点击左侧的+号下面的Import
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-G6e3C2Si-1655106671766)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654744509244.png)]
出现如下界面
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-KmHRCHlT-1655106671766)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654744577982.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rrHKBso9-1655106671766)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654744789936.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XIQzK3ZC-1655106671766)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654744821590.png)]
可直接导入 docker_rev1.json,显示容器资源指标的,
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-S7DYnFKb-1655106671767)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654763653783.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-avADWxxh-1655106671767)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654763795653.png)]
kube-state-metrics 是什么?
kube-state-metrics 通过监听 API Server 生成有关资源对象的状态指标,比如 Deployment、
Node、Pod,需要注意的是 kube-state-metrics 只是简单的提供一个 metrics 数据,并不会存储这
些指标数据,所以我们可以使用 Prometheus 来抓取这些数据然后存储,主要关注的是业务相关的一
些元数据,比如 Deployment、Pod、副本状态等;调度了多少个 replicas?现在可用的有几个?多
少个 Pod 是 running/stopped/terminated 状态?Pod 重启了多少次?我有多少 job 在运行中。
安装 kube-state-metrics 组件
1)创建 sa,并对 sa 授权
[root@master1 prometheus]# cat > /root/prometheus/kube-state-metrics-rbac.yaml << 'END'
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
rules:
- apiGroups: [""]
resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
verbs: ["list", "watch"]
- apiGroups: ["extensions"]
resources: ["daemonsets", "deployments", "replicasets"]
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources: ["statefulsets"]
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources: ["cronjobs", "jobs"]
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources: ["horizontalpodautoscalers"]
verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: kube-system
END
[root@master1 prometheus]# kubectl apply -f kube-state-metrics-rbac.yaml
serviceaccount/kube-state-metrics created
clusterrole.rbac.authorization.k8s.io/kube-state-metrics created
clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics created
[root@master1 prometheus]# kubectl get serviceaccount -n kube-system|grep kube-state
kube-state-metrics 1 78s
[root@master1 prometheus]# kubectl get ClusterRole -n kube-system|grep kube-state
kube-state-metrics 2022-06-10T07:03:54Z
[root@master1 prometheus]# kubectl get ClusterRoleBinding -n kube-system|grep kube-state
kube-state-metrics ClusterRole/kube-state-metrics
安装 kube-state-metrics 组件需要的镜像上传到 k8s 工作节点node1上,手动解压:
这个镜像也可在hub.docker.com上找到
[root@node1 prometheus]# ls kube-state-metrics_1_9_0.tar.gz
kube-state-metrics_1_9_0.tar.gz
[root@node1 prometheus]# du -sh kube-state-metrics_1_9_0.tar.gz
33M kube-state-metrics_1_9_0.tar.gz
[root@node1 prometheus]#
[root@node1 prometheus]# docker load -i kube-state-metrics_1_9_0.tar.gz
932da5156413: Loading layer [==================================================>] 3.062MB/3.062MB
bd8df7c22fdb: Loading layer [==================================================>] 31MB/31MB
Loaded image: quay.io/coreos/kube-state-metrics:v1.9.0
在控制节点上生成yaml文件
[root@master1 prometheus]# cat > /root/prometheus/kube-state-metrics-deploy.yaml << 'END'
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app: kube-state-metrics
template:
metadata:
labels:
app: kube-state-metrics
spec:
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
image: quay.io/coreos/kube-state-metrics:v1.9.0
ports:
- containerPort: 8080
END
[root@master1 prometheus]# kubectl apply -f kube-state-metrics-deploy.yaml
deployment.apps/kube-state-metrics created
[root@master1 prometheus]# kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
calico-kube-controllers-6949477b58-phvxx 1/1 Running 0 2d1h
calico-node-n5j7r 1/1 Running 0 14d
calico-node-r26rb 1/1 Running 0 14d
coredns-7f89b7bc75-8h7vd 1/1 Running 0 13d
coredns-7f89b7bc75-txs9t 1/1 Running 0 2d1h
etcd-master1 1/1 Running 0 14d
fluentd-elasticsearch-6qzdk 1/1 Running 0 5d4h
fluentd-elasticsearch-rxsgw 1/1 Running 0 5d4h
kube-apiserver-master1 1/1 Running 0 14d
kube-controller-manager-master1 1/1 Running 0 14d
kube-proxy-6jdfc 1/1 Running 0 14d
kube-proxy-n4gx7 1/1 Running 0 14d
kube-scheduler-master1 1/1 Running 0 14d
kube-state-metrics-58d4957bc5-v6tn2 1/1 Running 0 7m38s
monitoring-grafana-675798bf47-x8sm8 1/1 Running 0 29h
[root@master1 prometheus]# cat > /root/prometheus/kube-state-metrics-svc.yaml << 'END'
apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: 'true'
name: kube-state-metrics
namespace: kube-system
labels:
app: kube-state-metrics
spec:
ports:
- name: kube-state-metrics
port: 8080
protocol: TCP
selector:
app: kube-state-metrics
END
[root@master1 prometheus]# kubectl apply -f kube-state-metrics-svc.yaml
service/kube-state-metrics created
[root@master1 prometheus]# kubectl get svc -n kube-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kube-dns ClusterIP 10.96.0.10 <none> 53/UDP,53/TCP,9153/TCP 15d
kube-state-metrics ClusterIP 10.102.207.144 <none> 8080/TCP 46s
monitoring-grafana NodePort 10.102.90.185 <none> 80:32738/TCP 29h
[root@master1 prometheus]# kubectl get endpoints -n kube-system
NAME ENDPOINTS AGE
kube-dns 10.244.137.66:53,10.244.137.69:53,10.244.137.66:53 + 3 more... 15d
kube-state-metrics 10.244.166.187:8080 71s
monitoring-grafana 10.244.166.186:3000 29h
在 grafana web 界面导入 Kubernetes Cluster (Prometheus)-1577674936972.json 和 Kubernetes
cluster monitoring (via Prometheus) (k8s 1.16)-1577691996738.json
文件在grafana.com上找到
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-O5bLmZRn-1655106671767)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654847818392.png)]
导入json
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vXHqZNCu-1655106671767)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654847948954.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gfgBRQkQ-1655106671768)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654848699926.png)]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-fXenx5m2-1655106671768)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654848670153.png)]
报警:指 prometheus 将监测到的异常事件发送给 alertmanager
通知:alertmanager 将报警信息发送到邮件、微信、钉钉等
创建 alertmanager 配置文件
在 k8s 的控制节点master1创建 alertmanager-cm.yaml 文件
[root@master1 prometheus]# ls alertmanager-cm.yaml
alertmanager-cm.yaml
[root@master1 prometheus]# du -sh alertmanager-cm.yaml
4.0K alertmanager-cm.yaml
[root@master1 prometheus]# cat alertmanager-cm.yaml
kind: ConfigMap
apiVersion: v1
metadata:
name: alertmanager
namespace: monitor-sa
data:
alertmanager.yml: |-
global:
resolve_timeout: 1m
smtp_smarthost: 'smtp.163.com:25'
smtp_from: 'xxxxxxxx@163.com'
smtp_auth_username: 'xxxxxxxx'
smtp_auth_password: 'xxxxxxxxxx'
smtp_require_tls: false
route: #用于配置告警分发策略
group_by: [alertname] #采用哪个标签来作为分组依据
group_wait: 10s #组告警等待时间。也就是告警产生后等待 10s,如果有同组告警一起发出
group_interval: 10s #上下两组发送告警的间隔时间
repeat_interval: 10m #重复发送告警的时间,减少相同邮件的发送频率,默认是 1h
receiver: default-receiver ##定义谁来接收告警
receivers:
- name: 'default-receiver'
email_configs:
- to: 'xxxxxxxx@qq.com'
send_resolved: true
[root@master1 prometheus]#
[root@master1 prometheus]# kubectl apply -f alertmanager-cm.yaml
configmap/alertmanager created
[root@master1 prometheus]# kubectl get cm -n monitor-sa
NAME DATA AGE
alertmanager 1 23s
alertmanager 配置文件解释说明:
smtp_smarthost: 'smtp.163.com:25'
#163 邮箱的 SMTP 服务器地址+端口
smtp_from: 'xxxxxxxx@163.com'
#这是指定从哪个邮箱发送报警
smtp_auth_username: 'xxxxxxxx' #这是发送邮箱的认证用户,不是邮箱名
smtp_auth_password: 'xxxxxxxxxx' #这是发送邮箱的授权码而不是登录密码,你们需要用自己的,不要用我的,用我的你会发不出来报警
email_configs:
- to: 'xxxxxxxx@qq.com'
#to 后面指定发送到哪个邮箱,我发送到我的 qq 邮箱,大家需要写自己的邮箱地址,不应该跟
smtp_from 的邮箱名字重复
route: #用于设置告警的分发策略
group_by: [alertname]
#alertmanager 会根据 group_by 配置将 Alert 分组
group_wait: 10s
# 分组等待时间。也就是告警产生后等待 10s,如果有同组告警一起发出
group_interval: 10s # 上下两组发送告警的间隔时间
repeat_interval: 10m # 重复发送告警的时间,减少相同邮件的发送频率,默认是 1h
receiver: default-receiver #定义谁来收告警
Prometheus 一条告警的触发流程、等待时间
报警处理流程如下:
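1. Prometheus Server 监控目标主机上暴露的 http 接口(这里假设为接口 A),按照 Prometheus 配置中 "scrape_interval" 定义的时间间隔,定期采集目标主机上监控数据。
2. 当接口 A 不可用的时候,Server 端会持续尝试从接口中取数据,直到 "scrape_timeout" 时间后停止尝试。这时候把接口的状态变为 "DOWN"。
3. Prometheus 同时根据配置的 "evaluation_interval" 的时间间隔,定期(默认 1min)的对 Alert Rule 进行评估;当到达评估周期的时候,发现接口 A 为 DOWN,即 UP=0 为真,激活 Alert,进入 "PENDING" 状态,并记录当前 active 的时间;
4. 当下一个 Alert Rule 的评估周期到来的时候,发现 UP=0 继续为真,这时候判断警报 active 的时间是否已经超出 rule 里的 "for" 持续时间,如果未超出,则进入下一个评估周期;如果时间超出,则 alert 的状态变为 "FIRING";同时调用 Alertmanager 接口,发送相关报警数据。
5. Alertmanager 收到报警数据后,会将警报信息进行分组,然后根据 Alertmanager 配置的 "group_wait" 时间先进行等待。等 wait 时间过后再发送报警信息。
6. 属于同一个 Alert Group 的警报,在等待的过程中可能进入新的 alert,如果之前的报警已经成功发出,那么间隔 "group_interval" 的时间间隔后再重新发送报警信息。比如配置的是邮件报警,那么同属一个 group 的报警信息会汇总在一个邮件里进行发送。
7. 如果 Alert Group 里的警报一直没发生变化并且已经成功发送,则等待 "repeat_interval" 时间间隔之后再重复发送相同的报警邮件;如果之前的警报没有成功发送,则相当于触发第 6 条条件,则需要等待 group_interval 时间间隔后重复发送。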
同时最后至于警报信息具体发给谁,满足什么样的条件下指定警报接收人,设置不同报警发送频率,
这里有 alertmanager 的 route 路由规则进行配置。
#创建 prometheus 和告警规则配置文件
在 k8s 的控制节点生成一个 prometheus-alertmanager-cfg.yaml 文件并上传到 k8s 的 master1 节点
[root@master1 prometheus]# cat > /root/prometheus/prometheus-alertmanager-cfg.yaml <<'END'
kind: ConfigMap
apiVersion: v1
metadata:
labels:
app: prometheus
name: prometheus-config
namespace: monitor-sa
data:
prometheus.yml: |
rule_files:
- /etc/prometheus/rules.yml
alerting:
alertmanagers:
- static_configs:
- targets: ["localhost:9093"]
global:
scrape_interval: 15s
scrape_timeout: 10s
evaluation_interval: 1m
scrape_configs:
- job_name: 'kubernetes-node'
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: [__address__]
regex: '(.*):10250'
replacement: '${1}:9100'
target_label: __address__
action: replace
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- job_name: 'kubernetes-node-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: kubernetes_pod_name
- job_name: 'kubernetes-schedule'
scrape_interval: 5s
static_configs:
- targets: ['192.168.1.180:10251']
- job_name: 'kubernetes-controller-manager'
scrape_interval: 5s
static_configs:
- targets: ['192.168.1.180:10252']
- job_name: 'kubernetes-kube-proxy'
scrape_interval: 5s
static_configs:
- targets: ['192.168.1.180:10249','192.168.1.181:10249']
- job_name: 'kubernetes-etcd'
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.crt
cert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.crt
key_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.key
scrape_interval: 5s
static_configs:
- targets: ['192.168.1.180:2379']
rules.yml: |
groups:
- name: example
rules:
- alert: kube-proxy的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy"}[1m]) * 100 > 80
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: kube-proxy的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-kube-proxy"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: scheduler的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-schedule"}[1m]) * 100 > 80
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: scheduler的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-schedule"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: controller-manager的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-controller-manager"}[1m]) * 100 > 80
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: controller-manager的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-controller-manager"}[1m]) * 100 > 0
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: apiserver的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-apiserver"}[1m]) * 100 > 80
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: apiserver的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-apiserver"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: etcd的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-etcd"}[1m]) * 100 > 80
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过80%"
- alert: etcd的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{job=~"kubernetes-etcd"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}组件的cpu使用率超过90%"
- alert: kube-state-metrics的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics"}[1m]) * 100 > 80
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过80%"
value: "{{ $value }}%"
threshold: "80%"
- alert: kube-state-metrics的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-state-metrics"}[1m]) * 100 > 0
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过90%"
value: "{{ $value }}%"
threshold: "90%"
- alert: coredns的cpu使用率大于80%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-dns"}[1m]) * 100 > 80
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过80%"
value: "{{ $value }}%"
threshold: "80%"
- alert: coredns的cpu使用率大于90%
expr: rate(process_cpu_seconds_total{k8s_app=~"kube-dns"}[1m]) * 100 > 90
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.k8s_app}}组件的cpu使用率超过90%"
value: "{{ $value }}%"
threshold: "90%"
- alert: kube-proxy打开句柄数>600
expr: process_open_fds{job=~"kubernetes-kube-proxy"} > 600
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kube-proxy打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-kube-proxy"} > 1000
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-schedule打开句柄数>600
expr: process_open_fds{job=~"kubernetes-schedule"} > 600
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-schedule打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-schedule"} > 1000
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-controller-manager打开句柄数>600
expr: process_open_fds{job=~"kubernetes-controller-manager"} > 600
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-controller-manager打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-controller-manager"} > 1000
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-apiserver打开句柄数>600
expr: process_open_fds{job=~"kubernetes-apiserver"} > 600
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-apiserver打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-apiserver"} > 1000
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: kubernetes-etcd打开句柄数>600
expr: process_open_fds{job=~"kubernetes-etcd"} > 600
for: 2s
labels:
severity: warnning
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>600"
value: "{{ $value }}"
- alert: kubernetes-etcd打开句柄数>1000
expr: process_open_fds{job=~"kubernetes-etcd"} > 1000
for: 2s
labels:
severity: critical
annotations:
description: "{{$labels.instance}}的{{$labels.job}}打开句柄数>1000"
value: "{{ $value }}"
- alert: coredns
expr: process_open_fds{k8s_app=~"kube-dns"} > 600
for: 2s
labels:
severity: warnning
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过600"
value: "{{ $value }}"
- alert: coredns
expr: process_open_fds{k8s_app=~"kube-dns"} > 1000
for: 2s
labels:
severity: critical
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 打开句柄数超过1000"
value: "{{ $value }}"
- alert: kube-proxy
expr: process_virtual_memory_bytes{job=~"kubernetes-kube-proxy"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: scheduler
expr: process_virtual_memory_bytes{job=~"kubernetes-schedule"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-controller-manager
expr: process_virtual_memory_bytes{job=~"kubernetes-controller-manager"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-apiserver
expr: process_virtual_memory_bytes{job=~"kubernetes-apiserver"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kubernetes-etcd
expr: process_virtual_memory_bytes{job=~"kubernetes-etcd"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: kube-dns
expr: process_virtual_memory_bytes{k8s_app=~"kube-dns"} > 2000000000
for: 2s
labels:
severity: warnning
annotations:
description: "插件{{$labels.k8s_app}}({{$labels.instance}}): 使用虚拟内存超过2G"
value: "{{ $value }}"
- alert: HttpRequestsAvg
expr: sum(rate(rest_client_requests_total{job=~"kubernetes-kube-proxy|kubernetes-kubelet|kubernetes-schedule|kubernetes-controller-manager|kubernetes-apiserver"}[1m])) > 1000
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): TPS超过1000"
value: "{{ $value }}"
threshold: "1000"
- alert: Pod_restarts
expr: kube_pod_container_status_restarts_total{namespace=~"kube-system|default|monitor-sa"} > 0
for: 2s
labels:
severity: warnning
annotations:
description: "在{{$labels.namespace}}名称空间下发现{{$labels.pod}}这个pod下的容器{{$labels.container}}被重启,这个监控指标是由{{$labels.instance}}采集的"
value: "{{ $value }}"
threshold: "0"
- alert: Pod_waiting
expr: kube_pod_container_status_waiting_reason{namespace=~"kube-system|default"} == 1
for: 2s
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}启动异常等待中"
value: "{{ $value }}"
threshold: "1"
- alert: Pod_terminated
expr: kube_pod_container_status_terminated_reason{namespace=~"kube-system|default|monitor-sa"} == 1
for: 2s
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.pod}}下的{{$labels.container}}被删除"
value: "{{ $value }}"
threshold: "1"
- alert: Etcd_leader
expr: etcd_server_has_leader{job="kubernetes-etcd"} == 0
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前没有leader"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_leader_changes
expr: rate(etcd_server_leader_changes_seen_total{job="kubernetes-etcd"}[1m]) > 0
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 当前leader已发生改变"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_failed
expr: rate(etcd_server_proposals_failed_total{job="kubernetes-etcd"}[1m]) > 0
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}): 服务失败"
value: "{{ $value }}"
threshold: "0"
- alert: Etcd_db_total_size
expr: etcd_debugging_mvcc_db_total_size_in_bytes{job="kubernetes-etcd"} > 10000000000
for: 2s
labels:
team: admin
annotations:
description: "组件{{$labels.job}}({{$labels.instance}}):db空间超过10G"
value: "{{ $value }}"
threshold: "10G"
- alert: Endpoint_ready
expr: kube_endpoint_address_not_ready{namespace=~"kube-system|default"} == 1
for: 2s
labels:
team: admin
annotations:
description: "空间{{$labels.namespace}}({{$labels.instance}}): 发现{{$labels.endpoint}}不可用"
value: "{{ $value }}"
threshold: "1"
- name: 物理节点状态-监控告警
rules:
- alert: 物理节点cpu使用率
expr: 100-avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)*100 > 90
for: 2s
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }}cpu使用率过高"
description: "{{ $labels.instance }}的cpu使用率超过90%,当前使用率[{{ $value }}],需要排查处理"
- alert: 物理节点内存使用率
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
for: 2s
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }}内存使用率过高"
description: "{{ $labels.instance }}的内存使用率超过90%,当前使用率[{{ $value }}],需要排查处理"
- alert: InstanceDown
expr: up == 0
for: 2s
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }}: 服务器宕机"
description: "{{ $labels.instance }}: 服务器延时超过2分钟"
- alert: 物理节点磁盘的IO性能
expr: 100-(avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) < 60
for: 2s
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 流入磁盘IO使用率过高!"
description: "{{$labels.mountpoint }} 流入磁盘IO大于60%(目前使用:{{$value}})"
- alert: 入网流量带宽
expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
for: 2s
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 流入网络带宽过高!"
description: "{{$labels.mountpoint }}流入网络带宽持续5分钟高于100M. RX带宽使用率{{$value}}"
- alert: 出网流量带宽
expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
for: 2s
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 流出网络带宽过高!"
description: "{{$labels.mountpoint }}流出网络带宽持续5分钟高于100M. TX带宽使用率{{$value}}"
- alert: TCP会话
expr: node_netstat_Tcp_CurrEstab > 1000
for: 2s
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} TCP_ESTABLISHED过高!"
description: "{{$labels.mountpoint }} TCP_ESTABLISHED大于1000%(目前使用:{{$value}}%)"
- alert: 磁盘容量
expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
for: 2s
labels:
severity: critical
annotations:
summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前使用:{{$value}}%)"
END
[root@master1 prometheus]# kubectl delete -f prometheus-cfg.yaml
configmap "prometheus-config" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-cfg.yaml
configmap/prometheus-config created
[root@master1 prometheus]# kubectl get cm -n monitor-sa
NAME DATA AGE
alertmanager 1 75m
kube-root-ca.crt 1 2d20h
prometheus-config 2 5s
安装 prometheus 和 alertmanager
需要把 alertmanager.tar.gz 镜像包上传到 k8s 的各个工作节点,这个环境仅有 node1,手动导入镜像:
[root@node1 prometheus]# ls alertmanager.tar.gz
alertmanager.tar.gz
[root@node1 prometheus]# du -sh alertmanager.tar.gz
32M alertmanager.tar.gz
[root@node1 prometheus]# docker load -i alertmanager.tar.gz
4febd3792a1f: Loading layer [==================================================>] 1.36MB/1.36MB
68d1a8b41cc0: Loading layer [==================================================>] 2.586MB/2.586MB
5f70bf18a086: Loading layer [==================================================>] 1.024kB/1.024kB
30d4e7b232e4: Loading layer [==================================================>] 12.77MB/12.77MB
6b961451fcb0: Loading layer [==================================================>] 16.59MB/16.59MB
b5abc4736d3f: Loading layer [==================================================>] 6.144kB/6.144kB
Loaded image: prom/alertmanager:v0.14.0
在 k8s 的控制节点master1生成一个 prometheus-alertmanager-deploy.yaml
[root@master1 prometheus]# cat > /root/prometheus/prometheus-alertmanager-deploy.yaml <<END
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus-server
namespace: monitor-sa
labels:
app: prometheus
spec:
replicas: 1
selector:
matchLabels:
app: prometheus
component: server
#matchExpressions:
#- {key: app, operator: In, values: [prometheus]}
#- {key: component, operator: In, values: [server]}
template:
metadata:
labels:
app: prometheus
component: server
annotations:
prometheus.io/scrape: 'false'
spec:
nodeName: node1
serviceAccountName: monitor
containers:
- name: prometheus
image: prom/prometheus:v2.2.1
imagePullPolicy: IfNotPresent
command:
- "/bin/prometheus"
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=24h"
- "--web.enable-lifecycle"
ports:
- containerPort: 9090
protocol: TCP
volumeMounts:
- mountPath: /etc/prometheus
name: prometheus-config
- mountPath: /prometheus/
name: prometheus-storage-volume
- name: k8s-certs
mountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/
- name: localtime
mountPath: /etc/localtime
- name: alertmanager
image: prom/alertmanager:v0.14.0
imagePullPolicy: IfNotPresent
args:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--log.level=debug"
ports:
- containerPort: 9093
protocol: TCP
name: alertmanager
volumeMounts:
- name: alertmanager-config
mountPath: /etc/alertmanager
- name: alertmanager-storage
mountPath: /alertmanager
- name: localtime
mountPath: /etc/localtime
volumes:
- name: prometheus-config
configMap:
name: prometheus-config
- name: prometheus-storage-volume
hostPath:
path: /data
type: Directory
- name: k8s-certs
secret:
secretName: etcd-certs
- name: alertmanager-config
configMap:
name: alertmanager
- name: alertmanager-storage
hostPath:
path: /data/alertmanager
type: DirectoryOrCreate
- name: localtime
hostPath:
path: /usr/share/zoneinfo/Asia/Shanghai
END
#生成一个 etcd-certs,这个在部署 prometheus 需要
[root@master1 prometheus]# kubectl create secret generic etcd-certs -n monitor-sa --from-file=/etc/kubernetes/pki/etcd/server.key --from-file=/etc/kubernetes/pki/etcd/server.crt --from-file=/etc/kubernetes/pki/etcd/ca.crt
secret/etcd-certs created
You have new mail in /var/spool/mail/root
[root@master1 prometheus]# kubectl get secret -n monitor-sa
NAME TYPE DATA AGE
default-token-78mw6 kubernetes.io/service-account-token 3 2d20h
etcd-certs Opaque 3 24s
monitor-token-gbxmj kubernetes.io/service-account-token 3 2d19h
[root@master1 prometheus]# kubectl describe secret etcd-certs -n monitor-sa
Name: etcd-certs
Namespace: monitor-sa
Labels: <none>
Annotations: <none>
Type: Opaque
Data
====
server.key: 1679 bytes
ca.crt: 1058 bytes
server.crt: 1176 bytes
通过 kubectl apply 更新资源清单 yaml 文件
[root@master1 prometheus]# kubectl get pods -n monitor-sa
NAME READY STATUS RESTARTS AGE
node-exporter-92k4d 1/1 Running 0 2d20h
node-exporter-d44k4 1/1 Running 0 2d20h
prometheus-server-657bd8cb4d-zrmk4 1/1 Running 0 2d15h
[root@master1 prometheus]# kubectl delete -f prometheus-deploy.yaml
deployment.apps "prometheus-server" deleted
[root@master1 prometheus]# kubectl get pods -n monitor-sa
NAME READY STATUS RESTARTS AGE
node-exporter-92k4d 1/1 Running 0 2d20h
node-exporter-d44k4 1/1 Running 0 2d20h
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-deploy.yaml
deployment.apps/prometheus-server created
[root@master1 prometheus]# kubectl get deploy -n monitor-sa
NAME READY UP-TO-DATE AVAILABLE AGE
prometheus-server 1/1 1 1 6s
[root@master1 prometheus]# kubectl get rs -n monitor-sa
NAME DESIRED CURRENT READY AGE
prometheus-server-55cd9cb6d7 1 1 1 24s
[root@master1 prometheus]# kubectl get pods -n monitor-sa -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
node-exporter-92k4d 1/1 Running 0 2d20h 192.168.1.181 node1 <none> <none>
node-exporter-d44k4 1/1 Running 0 2d20h 192.168.1.180 master1 <none> <none>
prometheus-server-55cd9cb6d7-v7rwh 2/2 Running 0 32s 10.244.166.188 node1 <none> <none>
部署 alertmanager 的 service,方便在浏览器访问
在 k8s 的控制节点生成一个 alertmanager-svc.yaml 文件
[root@master1 prometheus]# cat > /root/prometheus/alertmanager-svc.yaml <<END
---
apiVersion: v1
kind: Service
metadata:
labels:
name: prometheus
kubernetes.io/cluster-service: 'true'
name: alertmanager
namespace: monitor-sa
spec:
ports:
- name: alertmanager
nodePort: 30066
port: 9093
protocol: TCP
targetPort: 9093
selector:
app: prometheus
sessionAffinity: None
type: NodePort
END
[root@master1 prometheus]# ls alertmanager-svc.yaml
alertmanager-svc.yaml
[root@master1 prometheus]# du -sh alertmanager-svc.yaml
4.0K alertmanager-svc.yaml
[root@master1 prometheus]# kubectl apply -f alertmanager-svc.yaml
service/alertmanager created
You have new mail in /var/spool/mail/root
[root@master1 prometheus]# kubectl get svc -n monitor-sa
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager NodePort 10.110.81.137 <none> 9093:30066/TCP 16s
prometheus NodePort 10.103.238.66 <none> 9090:31935/TCP 2d16h
使用master1的IP访问alertmanager UI
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ce9rcwB7-1655106671768)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654919654087.png)]
访问 prometheus 的 web 界面
点击 status->targets,可看到如下
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gQLG0yM1-1655106671769)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654919879645.png)]
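从上面可以发现 kubernetes-controller-manager 和 kubernetes-schedule 都显示连接不上对应的端口。
可按如下方法处理(具体操作见下方截图):在 master1 上修改 /etc/kubernetes/manifests/kube-scheduler.yaml 和 /etc/kubernetes/manifests/kube-controller-manager.yaml,把 --bind-address=127.0.0.1 改为 --bind-address=192.168.1.180,把 httpGet 字段下的 host 由 127.0.0.1 改为 192.168.1.180,并删除 --port=0,然后执行 systemctl restart kubelet;kube-proxy 则通过 kubectl edit configmap kube-proxy -n kube-system 把 metricsBindAddress 改为 0.0.0.0:10249。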
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-4wyg8uGS-1655106671769)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654935268318.png)]
因为kube-proxy是受控制器管理的,所以删除后会自动重建
kubectl get pods -n kube-system|grep kube-proxy|awk '{print $1}'|xargs kubectl delete pods -n kube-system
可以看到相应的端口已经被物理机监听了
检查邮箱发现已收到邮件报警
扩展:暴力更新配置文件
修改 prometheus 任何一个配置文件之后,可通过 kubectl apply 使配置生效,执行顺序如下:
kubectl delete -f alertmanager-cm.yaml
kubectl apply -f alertmanager-cm.yaml
kubectl delete -f prometheus-alertmanager-cfg.yaml
kubectl apply -f prometheus-alertmanager-cfg.yaml
kubectl delete -f prometheus-alertmanager-deploy.yaml
kubectl apply -f prometheus-alertmanager-deploy.yaml
打开电脑版钉钉创建机器人
1.创建钉钉机器人
打开电脑版钉钉,创建一个群,创建自定义机器人,按如下步骤创建
https://ding-doc.dingtalk.com/doc#/serverapi2/qf2nxq
https://developers.dingtalk.com/document/app/custom-robot-access
我创建的机器人如下:
群设置–>智能群助手–>添加机器人–>自定义–>添加
机器人名称:test
接收群组:钉钉报警测试
安全设置:
自定义关键词:cluster1
上面配置好之后点击完成即可,这样就会创建一个 test 的报警机器人,创建机器人成功之后怎么查
看 webhook,按如下:
点击智能群助手,可以看到刚才创建的 test 这个机器人,点击 test,就会进入到 test 机器人的设
置界面
出现如下内容:
机器人名称:test
接收群组:钉钉报警测试
消息推送:开启
webhook:
这个每个人得到的不一样,复制备用
安全设置:
自定义关键词:cluster1
安装钉钉的 webhook 插件,在 k8s 的控制节点 master1 操作
prometheus-webhook-dingtalk-0.3.0.linux-amd64.tar.gz 压缩包所在的百度网盘地址如下:
链接:https://pan.baidu.com/s/1bxkiE83Nv5dEvLB1ZldEcw
提取码:ndm4
tar zxvf prometheus-webhook-dingtalk-0.3.0.linux-amd64.tar.gz
cd prometheus-webhook-dingtalk-0.3.0.linux-amd64
启动钉钉报警插件
nohup ./prometheus-webhook-dingtalk --web.listen-address="0.0.0.0:8060" --ding.profile="cluster1=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" &
这里的xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx是前面复制的webhook
对原来的 alertmanager-cm.yaml 文件做备份
cp alertmanager-cm.yaml alertmanager-cm.yaml.bak
重新生成一个新的 alertmanager-cm.yaml 文件
[root@master1 prometheus]# cat > /root/prometheus/alertmanager-cm.yaml <<END
kind: ConfigMap
apiVersion: v1
metadata:
name: alertmanager
namespace: monitor-sa
data:
alertmanager.yml: |-
global:
resolve_timeout: 1m
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: '[email protected]'
smtp_auth_username: 'xxxxx'
smtp_auth_password: 'xxxxxxxxx'
smtp_require_tls: false
route:
group_by: [alertname]
group_wait: 10s
group_interval: 10s
repeat_interval: 10m
receiver: cluster1
receivers:
- name: 'cluster1'
webhook_configs:
- url: 'http://192.168.1.180:8060/dingtalk/cluster1/send'
send_resolved: true
END
[root@master1 prometheus]# kubectl delete -f alertmanager-cm.yaml
configmap "alertmanager" deleted
[root@master1 prometheus]# kubectl apply -f alertmanager-cm.yaml
configmap/alertmanager created
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-cfg.yaml
configmap "prometheus-config" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-cfg.yaml
configmap/prometheus-config created
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-deploy.yaml
deployment.apps "prometheus-server" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-deploy.yaml
deployment.apps/prometheus-server created
注册企业微信
登陆网址:
https://work.weixin.qq.com/
找到应用管理,创建应用
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-HzPI9RnJ-1655106671769)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654950024388.png)]
应用名字 wechat
创建成功之后显示如下:
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-zDYQJTEe-1655106671770)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654950450717.png)]
[root@master1 prometheus]# cat > /root/prometheus/alertmanager-cm.yaml <<END
kind: ConfigMap
apiVersion: v1
metadata:
name: alertmanager
namespace: monitor-sa
data:
alertmanager.yml: |-
global:
resolve_timeout: 1m
smtp_smarthost: 'smtp.qq.com:465'
smtp_from: '[email protected]'
smtp_auth_username: 'xxxxx'
smtp_auth_password: 'xxxxxxx'
smtp_require_tls: false
route:
group_by: [alertname]
group_wait: 10s
group_interval: 10s
repeat_interval: 10m
receiver: prometheus
receivers:
- name: 'prometheus'
wechat_configs:
- corp_id: ww7xxxxxx #换成你自已的
to_user: '@all'
agent_id: 1000003
api_secret: xxxxxxxxxxxxxxxxxxxx #换成你自已的
END
[root@master1 prometheus]# kubectl delete -f alertmanager-cm.yaml
configmap "alertmanager" deleted
[root@master1 prometheus]# kubectl apply -f alertmanager-cm.yaml
configmap/alertmanager created
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-deploy.yaml
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-deploy.yaml
deployment.apps/prometheus-server created
[root@master1 prometheus]# kubectl get pods -n monitor-sa
NAME READY STATUS RESTARTS AGE
node-exporter-92k4d 1/1 Running 0 3d5h
node-exporter-d44k4 1/1 Running 2 3d5h
prometheus-server-55cd9cb6d7-6pj7z 2/2 Running 0 28s
发送成功
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-i2F6TD6f-1655106671770)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1654952016431.png)]
tomcat_exporter地址
链接:https://pan.baidu.com/s/1E2nDbVX3VcRxTxxowVaNtQ
提取码:nrml
下面在k8s-node节点操作
(1)制作tomcat镜像,按如下步骤
mkdir /root/tomcat_image
把上面的war包和jar包传到这个目录下
cd /root/tomcat_image
cat > Dockerfile <<END
FROM tomcat
ADD metrics.war /usr/local/tomcat/webapps/
ADD simpleclient-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_common-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_hotspot-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_servlet-0.8.0.jar /usr/local/tomcat/lib/
ADD tomcat_exporter_client-0.0.12.jar /usr/local/tomcat/lib/
END
[root@node1 tomcat_image]# docker build -t='mack/tomcat_prometheus:v1' .
[root@node1 tomcat_image]# docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
mack/tomcat_prometheus v1 d60e7a86371f 7 minutes ago 680MB
基于上面的镜像创建一个tomcat实例
下面操作在master1节点进行
cat > /root/tomcat_deploy.yaml <
创建一个service,可操作也可不操作
cat > /root/tomcat-service.yaml <<END
kind: Service #service 类型
apiVersion: v1
metadata:
# annotations:
# prometheus.io/scrape: 'true'
name: tomcat-service
spec:
selector:
app: tomcat
ports:
- nodePort: 31360
port: 80
protocol: TCP
targetPort: 8080
type: NodePort
END
kubectl apply -f tomcat-service.yaml
在prometheus上可以看到监控到tomcat的pod了
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hzumjvyw-1655106671770)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1655024399571.png)]
#master1
[root@master1 ~]# yum install mysql -y
[root@master1 ~]# yum install mariadb* -y
[root@master1 ~]# systemctl start mariadb
[root@master1 ~]# ls mysqld_exporter-0.10.0.linux-amd64.tar.gz
mysqld_exporter-0.10.0.linux-amd64.tar.gz
[root@master1 ~]# du -sh mysqld_exporter-0.10.0.linux-amd64.tar.gz
3.3M mysqld_exporter-0.10.0.linux-amd64.tar.gz
[root@master1 ~]# tar -zxvf mysqld_exporter-0.10.0.linux-amd64.tar.gz
mysqld_exporter-0.10.0.linux-amd64/
mysqld_exporter-0.10.0.linux-amd64/LICENSE
mysqld_exporter-0.10.0.linux-amd64/NOTICE
mysqld_exporter-0.10.0.linux-amd64/mysqld_exporter
[root@master1 ~]#
cd mysqld_exporter-0.10.0.linux-amd64
cp -ar mysqld_exporter /usr/local/bin/
[root@master1 mysqld_exporter-0.10.0.linux-amd64]# cd /usr/local/bin/
[root@master1 bin]# ll
total 10176
-rwxr-xr-x 1 1000 1000 10419174 Apr 25 2017 mysqld_exporter
登陆 mysql 为 mysql_exporter 创建账号并授权
创建数据库用户。
mysql
CREATE USER 'mysql_exporter'@'localhost' IDENTIFIED BY 'Abcdef123!.';
对 mysql_exporter 用户授权
mysql
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'mysql_exporter'@'localhost';
exit 退出 mysql
创建 mysql 配置文件、运行时可免密码连接数据库:
cd mysqld_exporter-0.10.0.linux-amd64
cat > my.cnf <<END
[client]
user=mysql_exporter
password=Abcdef123!.
END
启动 mysql_exporter 客户端
nohup ./mysqld_exporter --config.my-cnf=./my.cnf &
mysqld_exporter 的监听端口是 9104
修改 prometheus-alertmanager-cfg.yaml 文件,添加如下
- job_name: 'mysql'
static_configs:
- targets: ['192.168.1.180:9104']
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1MXnUv62-1655106671770)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1655026431623.png)]
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-cfg.yaml
configmap "prometheus-config" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-cfg.yaml
configmap/prometheus-config created
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-deploy.yaml
deployment.apps "prometheus-server" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-deploy.yaml
deployment.apps/prometheus-server created
grafana 导入 mysql 监控图表
mysql-overview_rev5.json
所需要的文件下载地址
链接:https://pan.baidu.com/s/1SD1TPFNjBnf9wLIfiQVqHg
提取码:uf3f
#1,在master1上下载nginx-module-vts模块
[root@master1 prometheus]# ls nginx-module-vts-master.zip
nginx-module-vts-master.zip
[root@master1 prometheus]# du -sh nginx-module-vts-master.zip
400K nginx-module-vts-master.zip
[root@master1 prometheus]# unzip nginx-module-vts-master.zip
[root@master1 prometheus]# mv nginx-module-vts-master /usr/local/
#2,安装nginx
[root@master1 prometheus]# ls nginx-1.15.7.tar.gz
nginx-1.15.7.tar.gz
[root@master1 prometheus]# du -sh nginx-1.15.7.tar.gz
1004K nginx-1.15.7.tar.gz
[root@master1 prometheus]# tar -zxvf nginx-1.15.7.tar.gz
[root@master1 prometheus]# cd nginx-1.15.7/
[root@master1 nginx-1.15.7]# ./configure --prefix=/usr/local/nginx --with-http_gzip_static_module --with-http_stub_status_module --with-http_ssl_module --with-pcre --with-file-aio --with-http_realip_module --add-module=/usr/local/nginx-module-vts-master
[root@master1 nginx-1.15.7]# make && make install
修改nginx配置文件:
vim /usr/local/nginx/conf/nginx.conf
#server下添加如下:
location /status {
vhost_traffic_status_display;
vhost_traffic_status_display_format html;
}
# http中添加如下:
vhost_traffic_status_zone;
#测试nginx配置文件是否正确:
[root@master1 nginx-1.15.7]# /usr/local/nginx/sbin/nginx -t
nginx: the configuration file /usr/local/nginx/conf/nginx.conf syntax is ok
nginx: configuration file /usr/local/nginx/conf/nginx.conf test is successful
#如果正确没问题,启动nginx
#启动nginx:
[root@master1 nginx-1.15.7]# /usr/local/nginx/sbin/nginx
#访问192.168.1.180/status可以看到nginx监控数据
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-16Flps63-1655106671771)(C:\Users\mack\AppData\Roaming\Typora\typora-user-images\1655105057783.png)]
#3,安装nginx-vts-exporter
[root@master1 prometheus]# ls nginx-vts-exporter-0.5.zip
nginx-vts-exporter-0.5.zip
[root@master1 prometheus]# du -sh nginx-vts-exporter-0.5.zip
3.2M nginx-vts-exporter-0.5.zip
[root@master1 prometheus]# unzip nginx-vts-exporter-0.5.zip
[root@master1 prometheus]# mv nginx-vts-exporter-0.5 /usr/local/
[root@master1 prometheus]# cd /usr/local/nginx-vts-exporter-0.5/bin/
[root@master1 bin]# ls -l
total 8932
-rw-r--r-- 1 root root 9145803 Mar 16 2017 nginx-vts-exporter
[root@master1 bin]# chmod a+x nginx-vts-exporter
You have new mail in /var/spool/mail/root
[root@master1 bin]# ls -l
total 8932
-rwxr-xr-x 1 root root 9145803 Mar 16 2017 nginx-vts-exporter
[root@master1 bin]# nohup ./nginx-vts-exporter -nginx.scrape_uri http://192.168.1.180/status/format/json &
[3] 17102
You have new mail in /var/spool/mail/root
[root@master1 bin]# nohup: ignoring input and appending output to ‘nohup.out’
[root@master1 bin]#
[root@master1 bin]# cat nohup.out
2022/06/13 15:31:46 Starting nginx_vts_exporter (version=0.4, branch=fix-docker-error, revision=0f3dbb44a86340d65bf3d6abbcc0ee88663cb419)
2022/06/13 15:31:46 Build context (go=go1.8, user=Administrator@LS--20151110SAS, date=20170316-03:16:26)
2022/06/13 15:31:46 Starting Server at : :9913
2022/06/13 15:31:46 Metrics endpoint: /metrics
2022/06/13 15:31:46 Metrics namespace: nginx
2022/06/13 15:31:46 Scraping information from : http://192.168.1.180/status/format/json
#nginx-vts-exporter的监听端口是9913
#4 修改prometheus-alertmanager-cfg.yaml文件
添加如下job
- job_name: 'nginx'
  scrape_interval: 5s
  static_configs:
  - targets: ['192.168.1.180:9913']
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-cfg.yaml
configmap "prometheus-config" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-cfg.yaml
configmap/prometheus-config created
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-deploy.yaml
deployment.apps "prometheus-server" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-deploy.yaml
deployment.apps/prometheus-server created
#5 在grafana界面导入nginx json
nginx: configuration file /usr/local/nginx/conf/nginx.conf test is successful
#如果正确没问题,启动nginx
#启动nginx:
[root@master1 nginx-1.15.7]# /usr/local/nginx/sbin/nginx
#访问192.168.1.180/status可以看到nginx监控数据
[外链图片转存中...(img-16Flps63-1655106671771)]
```bash
#3,安装nginx-vts-exporter
[root@master1 prometheus]# ls nginx-vts-exporter-0.5.zip
nginx-vts-exporter-0.5.zip
[root@master1 prometheus]# du -sh nginx-vts-exporter-0.5.zip
3.2M nginx-vts-exporter-0.5.zip
[root@master1 prometheus]# unzip nginx-vts-exporter-0.5.zip
[root@master1 prometheus]# mv nginx-vts-exporter-0.5 /usr/local/
[root@master1 prometheus]# cd /usr/local/nginx-vts-exporter-0.5/bin/
[root@master1 bin]# ls -l
total 8932
-rw-r--r-- 1 root root 9145803 Mar 16 2017 nginx-vts-exporter
[root@master1 bin]# chmod a+x nginx-vts-exporter
You have new mail in /var/spool/mail/root
[root@master1 bin]# ls -l
total 8932
-rwxr-xr-x 1 root root 9145803 Mar 16 2017 nginx-vts-exporter
[root@master1 bin]# nohup ./nginx-vts-exporter -nginx.scrape_uri http://192.168.1.180/status/format/json &
[3] 17102
You have new mail in /var/spool/mail/root
[root@master1 bin]# nohup: ignoring input and appending output to ‘nohup.out’
[root@master1 bin]#
[root@master1 bin]# cat nohup.out
2022/06/13 15:31:46 Starting nginx_vts_exporter (version=0.4, branch=fix-docker-error, revision=0f3dbb44a86340d65bf3d6abbcc0ee88663cb419)
2022/06/13 15:31:46 Build context (go=go1.8, user=Administrator@LS--20151110SAS, date=20170316-03:16:26)
2022/06/13 15:31:46 Starting Server at : :9913
2022/06/13 15:31:46 Metrics endpoint: /metrics
2022/06/13 15:31:46 Metrics namespace: nginx
2022/06/13 15:31:46 Scraping information from : http://192.168.1.180/status/format/json
#nginx-vts-exporter的监听端口是9913
#4 修改prometheus-alertmanager-cfg.yaml文件
添加如下job
- job_name: 'nginx'
  scrape_interval: 5s
  static_configs:
  - targets: ['192.168.1.180:9913']
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-cfg.yaml
configmap "prometheus-config" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-cfg.yaml
configmap/prometheus-config created
[root@master1 prometheus]# kubectl delete -f prometheus-alertmanager-deploy.yaml
deployment.apps "prometheus-server" deleted
[root@master1 prometheus]# kubectl apply -f prometheus-alertmanager-deploy.yaml
deployment.apps/prometheus-server created
#5 在grafana界面导入nginx json