接上一篇:K8S集群部署kube-Prometheus监控tomcat jvm
今天来实施k8s集群部署Kube-Prometheus 监控K8S集群内pod容器应用的jvm
有一个jar包需要在K8S集群运行起来。要做到监控jar包应用的jvm按以下几步。
1、将jvm-exporter整合进我们的应用,修改Dockerfile 加入jmx_exporter。
2、配置Prometheus服务自动发现
3、在Prometheus管理页面中查看服务发现
4、添加报警规则
5、添加报警接收人
整合过程很简单,只需要将jvm-exporter作为javaagent加入到我们的java启动命令就可以了
[root@k8s01 ability_jobs]# ls
config Dockerfile 'Dockerfile - 副本' prometheus
[root@k8s01 ability_jobs]# ls -l prometheus/
total 476
-rw-r--r-- 1 root root 1211 Nov 11 10:23 jmx-exporter.yaml
-rw-r--r-- 1 root root 482947 Nov 11 10:23 jmx_prometheus_javaagent-0.16.0.jar
[root@k8s01 ability_jobs]# cat prometheus/jmx-exporter.yaml
---
lowercaseOutputLabelNames: true
lowercaseOutputName: true
rules:
- pattern: 'Catalina<type=GlobalRequestProcessor, name=\"(\w+)-(\d+)\"><>(\w+):'
  name: tomcat_$3_total
  labels:
    port: "$2"
    protocol: "$1"
  help: Tomcat global $3
  type: COUNTER
- pattern: 'Catalina<j2eeType=Servlet, WebModule=//([-a-zA-Z0-9+&@#/%?=~_|!:.,;]*[-a-zA-Z0-9+&@#/%=~_|]), name=([-a-zA-Z0-9+/$%~_-|!.]*), J2EEApplication=none, J2EEServer=none><>(requestCount|maxTime|processingTime|errorCount):'
  name: tomcat_servlet_$3_total
  labels:
    module: "$1"
    servlet: "$2"
  help: Tomcat servlet $3 total
  type: COUNTER
- pattern: 'Catalina<type=ThreadPool, name="(\w+)-(\d+)"><>(currentThreadCount|currentThreadsBusy|keepAliveCount|pollerThreadCount|connectionCount):'
  name: tomcat_threadpool_$3
  labels:
    port: "$2"
    protocol: "$1"
  help: Tomcat threadpool $3
  type: GAUGE
- pattern: 'Catalina<type=Manager, host=([-a-zA-Z0-9+&@#/%?=~_|!:.,;]*[-a-zA-Z0-9+&@#/%=~_|]), context=([-a-zA-Z0-9+/$%~_-|!.]*)><>(processingTime|sessionCounter|rejectedSessions|expiredSessions):'
  name: tomcat_session_$3_total
  labels:
    context: "$2"
    host: "$1"
  help: Tomcat session $3 total
  type: COUNTER
ADD ./prometheus /opt/apps/prometheus #将jmx-exporter 和配置文件添加到镜像中
ENTRYPOINT ["/bin/sh","-c","set -e && java -javaagent:/opt/apps/prometheus/jmx_prometheus_javaagent-0.16.0.jar=9901:/opt/apps/prometheus/jmx-exporter.yaml -Xms1024m -Xmx1024m -Duser.timezone=GMT+08 -jar ability_jobs.jar"] #修改启动命令 将jmx-exporter 添加进来,还是使用9901端口
FROM harbor.creditgogogo.com/ops/centos7-jdk8
RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo "Asia/Shanghai" > /etc/timezone
# 修改Centos 默认时区
WORKDIR /opt/apps
ADD ability_jobs.jar /opt/apps/ability_jobs.jar
ADD ./prometheus /opt/apps/prometheus
ADD ./config /opt/apps/config
ADD ./lib /opt/apps/lib
EXPOSE 8090
ENTRYPOINT ["/bin/sh","-c","set -e && java -javaagent:/opt/apps/prometheus/jmx_prometheus_javaagent-0.16.0.jar=9901:/opt/apps/prometheus/jmx-exporter.yaml -Xms1024m -Xmx1024m -Duser.timezone=GMT+08 -jar ability_jobs.jar"]
对于有Service暴露的服务我们可以用 prometheus-operator 项目定义的ServiceMonitorCRD来配置服务发现,配置模板如下:
--- # ServiceMonitor 服务自动发现规则
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor # prometheus-operator 定义的CRD
metadata:
name: jmx-metrics
namespace: monitoring
labels:
k8s-apps: jmx-metrics
spec:
jobLabel: metrics #监控数据的job标签指定为metrics label的值,即加上数据标签job=jmx-metrics
selector:
matchLabels:
metrics: jmx-metrics # 自动发现 label中有metrics: jmx-metrics 的service
namespaceSelector:
matchNames: # 配置需要自动发现的命名空间,可以配置多个
- my-namespace
endpoints:
- port: http-metrics # 拉取metric的端口,这个写的是 service的端口名称,即 service yaml的spec.ports.name
interval: 15s # 拉取metric的时间间隔
--- # 服务service模板
apiVersion: v1
kind: Service
metadata:
labels:
metrics: jmx-metrics # ServiceMonitor 自动发现的关键label
name: jmx-metrics
namespace: my-namespace
spec:
ports:
- name: http-metrics #对应 ServiceMonitor 中spec.endpoints.port
port: 9093 # jmx-exporter 暴露的服务端口
targetPort: http-metrics # pod yaml 暴露的端口名
selector:
metrics: jmx-metrics # service本身的标签选择器
以上配置了my-namespace命名空间的 jmx-metrics Service的服务自动发现,Prometheus会将这个service 的所有关联pod自动加入监控,并从apiserver获取到最新的pod列表,这样当我们的服务副本扩充时也能自动添加到监控系统中。
那么对于没有创建 Service 的服务,比如以HostPort对集群外暴露服务的实例,我们可以使用 PodMonitor 来做服务发现,相关样例如下:
--- # PodMonitor 服务自动发现规则,最新的版本支持,旧版本可能不支持
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor # prometheus-operator 定义的CRD
metadata:
name: jmx-metrics
namespace: monitoring
labels:
k8s-apps: jmx-metrics
spec:
jobLabel: metrics #监控数据的job标签指定为metrics label的值,即加上数据标签job=jmx-metrics
selector:
matchLabels:
metrics: jmx-metrics # 自动发现 label中有metrics: jmx-metrics 的pod
namespaceSelector:
matchNames: # 配置需要自动发现的命名空间,可以配置多个
- my-namespace
podMetricsEndpoints:
- port: http-metrics # Pod yaml中 metric暴露端口的名称 即 spec.ports.name
interval: 15s # 拉取metric的时间间隔
--- # 需要监控的Pod模板
apiVersion: v1
kind: Pod
metadata:
labels:
metrics: jmx-metrics
name: jmx-metrics
namespace: my-namespace
spec:
containers:
- image: tomcat:9.0
name: tomcat
ports:
- containerPort: 9093
name: http-metrics
--- # 在对应的ns中创建角色
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: my-namespace
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
--- # 绑定角色 prometheus-k8s 角色到 Role
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: my-namespace
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s # Prometheus 容器使用的 serviceAccount,kube-prometheus默认使用prometheus-k8s这个用户
namespace: monitoring
K8S自动发现服务
[root@k8s01 manifests]# cat prometheus-jvm-metrices.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: jmx-metrics
namespace: monitoring
labels:
k8s-apps: jmx-metrics
spec:
jobLabel: metrics
selector:
matchLabels:
metrics: jmx-metrics
namespaceSelector:
matchNames:
- default
endpoints:
- port: http-metrics
interval: 15s
部署服务自动发现
[root@k8s01 manifests]# kubectl apply -f prometheus-jvm-metrices.yaml
servicemonitor.monitoring.coreos.com/jmx-metrics created
apiVersion: apps/v1 # 注意:apps/v1beta1 已在 Kubernetes 1.16 中移除,应使用 apps/v1
kind: Deployment
metadata:
  name: ability-jobs
spec:
  replicas: 1
  selector: # apps/v1 要求显式指定 selector,需与 template 的 labels 匹配
    matchLabels:
      app: ability-jobs
revisionHistoryLimit: 10
minReadySeconds: 20
strategy:
# indicate which strategy we want for rolling update
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 1
template:
metadata:
labels:
app: ability-jobs
spec:
terminationGracePeriodSeconds: 10
containers:
- name: ability-jobs
image: harbor.creditgogogo.com/jar/ability_jobs:latest
ports:
- containerPort: 8090
name: web
- containerPort: 9901
name: http-metrics
volumeMounts:
- name: app-logs
mountPath: /home/logs
- image: harbor.creditgogogo.com/ops/filebeat-7.10.2:1.0
imagePullPolicy: Always
name: filebeat
volumeMounts:
- name: app-logs
mountPath: /logs
- name: filebeat-config
mountPath: /etc/filebeat
volumes:
- name: app-logs
emptyDir: {}
- name: filebeat-config
configMap:
name: ability-jobs-filebeat-config
---
apiVersion: v1
kind: Service
metadata:
labels:
metrics: jmx-metrics # 添加了这个标签
name: ability-jobs
spec:
type: NodePort
ports:
- name: web
port: 8090
targetPort: 8090
nodePort: 30040
- name: http-metrics #jvm metrics端口name
port: 9901
targetPort: http-metrics
selector:
app: ability-jobs
部署
[root@k8s01 ability_jobs]# kubectl apply -f ability_jobs.yaml
服务发现配置成功后会出现在Prometheus的管理界面中:
编辑 /kube-prometheus/manifests/prometheus-rules.yaml 文件
[root@k8s01 manifests]# vim /kube-prometheus/manifests/prometheus-rules.yaml
填入以下内容:
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: jvm-metrics-rules
namespace: monitoring
spec:
groups:
- name: jvm-metrics-rules
rules:
# 在5分钟里,GC花费时间超过10%
- alert: GcTimeTooMuch
expr: increase(jvm_gc_collection_seconds_sum[5m]) > 30
for: 5m
labels:
severity: red
annotations:
summary: "{{ $labels.app }} GC时间占比超过10%"
message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} GC时间占比超过10%,当前值({{ $value }}%)"
# GC次数太多
- alert: GcCountTooMuch
expr: increase(jvm_gc_collection_seconds_count[1m]) > 30
for: 1m
labels:
severity: red
annotations:
summary: "{{ $labels.app }} 1分钟GC次数>30次"
message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} 1分钟GC次数>30次,当前值({{ $value }})"
# FGC次数太多
- alert: FgcCountTooMuch
expr: increase(jvm_gc_collection_seconds_count{gc="ConcurrentMarkSweep"}[1h]) > 3
for: 1m
labels:
severity: red
annotations:
summary: "{{ $labels.app }} 1小时的FGC次数>3次"
message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} 1小时的FGC次数>3次,当前值({{ $value }})"
# 非堆内存使用超过80%
- alert: NonheapUsageTooMuch
expr: jvm_memory_bytes_used{job="jmx-metrics", area="nonheap"} / jvm_memory_bytes_max * 100 > 80
for: 1m
labels:
severity: red
annotations:
summary: "{{ $labels.app }} 非堆内存使用>80%"
message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} 非堆内存使用率>80%,当前值({{ $value }}%)"
# 内存使用预警
- alert: HighMemUsage
expr: process_resident_memory_bytes{job="jmx-metrics"} / os_total_physical_memory_bytes * 100 > 85
for: 1m
labels:
severity: red
annotations:
summary: "{{ $labels.app }} rss内存使用率大于85%"
message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} rss内存使用率大于85%,当前值({{ $value }}%)"
已配置钉钉告警通知,这里忽略。
具体请参考:K8S集群部署之Prometheus 监控-钉钉告警
[root@k8s01 manifests]# vim /kube-prometheus/manifests/alertmanager-secret.yaml
apiVersion: v1
data: {}
kind: Secret
metadata:
name: alertmanager-main
namespace: monitoring
stringData:
alertmanager.yaml: |-
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 5m
receiver: 'webhook'
receivers:
- name: 'webhook'
webhook_configs:
- send_resolved: true
url: 'http://webhook-dingtalk:8060/dingtalk/webhook/send'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
自此,jvm监控系统配置完成。
附jvm-exporter接口返回参数示例,可以根据需要自取其中的metric
参考上一篇文章:K8S集群部署kube-Prometheus监控tomcat jvm
文章参考:jvm-exporter整合k8s+prometheus监控报警
JVM metrices 监控指标说明参考:https://segmentfault.com/a/1190000018933341?utm_source=tag-newest