1、部署相关文件已经打成tar包上传至云,链接: https://pan.baidu.com/s/1NflGeaYCWPi9243gM4gGdw
提取码:1234
使用tar包注意:
1、报警目录alert拷贝至data目录下
2、给node打标签确定下prometheus的node,标签nodetype=prometheus
[root@2dot241 sy] kubectl label node 2dot239 nodetype=prometheus
因为想让数据持久化存储,所以用nodeSelector参数固定了prometheus所在的node,因此要给相应的node打上标签
[root@2dot239 ~] mkdir -p /data/prometheus/data && chmod 777 /data/prometheus/data
然后在对应的机器上创建data目录并修改权限——这一步很重要,否则会报错
3、grafana模板 https://grafana.com/grafana/dashboards
我加载的几个模板
8919 --node
3146 --pod
创建namespaces名为monitor
[root@2dot241 sy]# cat namespaces.yml
# Namespace that holds the monitoring components deployed in this guide.
apiVersion: v1
kind: Namespace
metadata:
  name: monitor
  labels:
    name: monitor
[root@2dot241 sy]# kubectl apply -f namespaces.yml
创建rbac文件:为了让Prometheus能够访问受到认证保护的Kubernetes API,我们首先需要对Prometheus进行访问授权。Kubernetes主要使用基于角色的访问控制模型(Role-Based Access Control)来管理资源的访问权限。首先需要在Kubernetes下定义角色(ClusterRole),并为该角色赋予相应的访问权限;同时创建Prometheus所使用的账号(ServiceAccount);最后将该账号与角色进行绑定(ClusterRoleBinding)
[root@2dot241 sy]# cat rbac.yml
# ClusterRole: read-only permissions granted to Prometheus for discovery and scraping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy   # node metrics are scraped through the API server proxy path
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups:
      - extensions          # legacy Ingress API group (older clusters)
      - networking.k8s.io   # current Ingress API group
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# ServiceAccount the Prometheus pod runs as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitor
---
# Bind the ClusterRole above to the ServiceAccount in the monitor namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitor
[root@2dot241 sy]# kubectl apply -f rbac.yml
prometheus-deploy和prometheus-svc:创建prometheus服务、使用映射主机端口30003访问prometheus
首先创建configmap--就是对应prometheus的配置文件--其中主要的就是自动发现和监控规则
其中很多监控项我都注释掉了,报警太多太烦;只保留了一报警就必须处理的监控项(比如是否存活),不要紧的就不报警
[root@2dot241 sy]# cat configmap.yaml
# ConfigMap carrying the Prometheus main configuration and the alert rules.
# Both keys are mounted into the Prometheus pod under /etc/prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitor
data:
  # Mounted as /etc/prometheus/prometheus.yml
  prometheus.yml: |
    global:
      # Scrape targets and evaluate alert rules every 15s
      scrape_interval: 15s
      evaluation_interval: 15s
    # Alert rule files to load
    rule_files:
      - /etc/prometheus/rules.yml
    # Alertmanager instance(s) that receive firing alerts
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["192.168.2.241:9093"]
    # Scrape targets
    scrape_configs:
      - job_name: 'k8s-node'
        scrape_interval: 10s
        static_configs:
          - targets:
              - '192.168.2.239:9100'
              - '192.168.2.240:9100'
      # Custom scrape jobs; every job_name is independent
      - job_name: 'tomcat-pods'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_service_annotation_prometheus_io_jvm_scrape]
            regex: true;true
            action: keep
          # NOTE(review): "patn" looks like a typo for "path"; left unchanged because it
          # must match the annotation key actually set on the Services - confirm.
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_app_metrics_patn]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Build <pod ip>:<annotated port>. Without an explicit replacement the
          # default "$1" would keep only the pod IP and drop the port.
          - source_labels: [__meta_kubernetes_pod_ip, __meta_kubernetes_service_annotation_prometheus_io_app_metrics_port]
            action: replace
            target_label: __address__
            regex: (.+);(.+)
            replacement: $1:$2
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_pod_host_ip]
            action: replace
            target_label: kubernetes_host_ip
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Scrape through the API server proxy instead of contacting each node directly
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
      - job_name: 'kubernetes-services'
        kubernetes_sd_configs:
          - role: service
        metrics_path: /probe
        params:
          module: [http_2xx]
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          # NOTE(review): looks like a placeholder host - point it at a real blackbox exporter
          - target_label: __address__
            replacement: blackbox-exporter.example.com:9115
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name
      - job_name: 'kubernetes-ingresses'
        kubernetes_sd_configs:
          - role: ingress
        relabel_configs:
          - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
            regex: (.+);(.+);(.+)
            replacement: ${1}://${2}${3}
            target_label: __param_target
          # NOTE(review): looks like a placeholder host - point it at a real blackbox exporter
          - target_label: __address__
            replacement: blackbox-exporter.example.com:9115
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_ingress_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_ingress_name]
            target_label: kubernetes_name
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
      - job_name: kube-state-metrics
        static_configs:
          - targets: ['kube-state-metrics.monitor.svc.cluster.local:8080']
  # Alert rules, mounted as /etc/prometheus/rules.yml
  rules.yml: |
    groups:
      - name: test-rule
        rules:
          ############# Node monitoring #############
          - alert: k8s-node状态异常
            expr: up{job="k8s-node"} != 1
            for: 3m
            labels:
              team: k8s-node
              # severity is rendered by the WeChat template and matched by inhibit_rules
              severity: critical
            annotations:
              summary: "{{$labels.instance}}: Node节点状态异常"
              description: "可能是重启了"
          - alert: k8s-node节点CPU使用率
            expr: (1 - avg(irate(node_cpu_seconds_total{job="k8s-node",mode="idle"}[1m])) by (instance)) * 100 > 95
            for: 1m
            labels:
              team: k8s-node
              severity: warning
            annotations:
              summary: "{{$labels.instance}}: Node节点CPU使用率超过95%"
              description: "{{$labels.instance}}: Node节点当前CPU使用率为: {{ $value }}"
          #- alert: k8s-node节点内存使用率
          #  expr: (sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 95
          #  for: 1m
          #  labels:
          #    team: k8s-node
          #  annotations:
          #    description: "Node服务器[[ {{$labels.instance}} ]] 内存使用率超过95%"
          #    summary: "{{$labels.instance}} 当前内存使用率为: {{ $value }}"
          ############ Pod monitoring ############
          #- alert: 监控k8s的pod状态异常
          #  expr: up{kubernetes_namespace="monitor"} != 1
          #  for: 3m
          #  labels:
          #    team: "kube-state-metrics"
          #  annotations:
          #    description: "{{$labels.kubernetes_namespace}} 内的 pod 状态有变动"
          #    summary: "此 Pod 用于获取 k8s 监控数据, 绑定在一个节点上"
          #- alert: 应用的 pod 状态有变动
          #  expr: kube_pod_container_status_ready{namespace="test"} != 1
          #  for: 3m
          #  labels:
          #    status: "test 命名空间内的 pod {{$labels.pod}}有变动"
          #  annotations:
          #    description: " {{$labels.container}} 的 pod 状态有变动"
          #    summary: "可能是重启或者在升级版本,如果频繁重启,请跟踪排查问题"
          #- alert: 以下应用的 pod 重启次数已经超过15,请查看原因
          #  expr: kube_pod_container_status_restarts_total{namespace="product"} > 15
          #  for: 3m
          #  labels:
          #    status: " pod {{$labels.pod}} 重启次数太多"
          #  annotations:
          #    description: "Deployment {{$labels.container}} 内的 pod 重启次数太多"
          #    summary: "重启次数太多,可能是因为 pod 内应用有问题"
          ########### Java monitoring ############
          - alert: jvm线程数过高
            expr: jvm_threads_current{job="tomcat-pods"}>2000
            for: 1m
            labels:
              status: "空间内 jvm 的变动情况"
              severity: warning
            annotations:
              description: "{{$labels.kubernetes_pod_name}}: Jvm线程数过高"
              summary: '{{ $labels.kubernetes_pod_name }} : 当前你线程值为: {{ $value }}'
[root@2dot241 sy]# cat prometheus-deploy.yml
# Single-replica Prometheus Deployment pinned to the node labelled nodetype=prometheus,
# with its TSDB stored on that node's /data/prometheus/data directory.
# apps/v1 replaces the removed apps/v1beta2 API.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: prometheus-deployment
  name: prometheus
  namespace: monitor
spec:
  replicas: 1
  # The TSDB directory is a single hostPath and can only be opened by one Prometheus
  # at a time, so stop the old pod before starting the new one.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
        - image: prom/prometheus:latest
          name: prometheus
          imagePullPolicy: IfNotPresent
          command:
            - "/bin/prometheus"
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/home/prometheus"
            # --storage.tsdb.retention is deprecated and no longer accepted by
            # Prometheus 3.x; retention.time is the supported flag (keep 7 days).
            - "--storage.tsdb.retention.time=168h"
            # Enables the POST /-/reload endpoint used by reload.sh
            - "--web.enable-lifecycle"
          ports:
            - containerPort: 9090
              protocol: TCP
          volumeMounts:
            - mountPath: "/home/prometheus"
              name: data
            - mountPath: "/etc/prometheus"
              name: config-volume
            # Host timezone file; the container only needs to read it
            - mountPath: "/etc/localtime"
              readOnly: true
              name: localtime
          resources:
            requests:
              cpu: 100m
              memory: 2048Mi
            limits:
              cpu: 500m
              memory: 3180Mi
      serviceAccountName: prometheus
      # Requires: kubectl label node <node> nodetype=prometheus
      nodeSelector:
        nodetype: prometheus
      volumes:
        - name: data
          hostPath:
            # Must already exist on the selected node and be writable by the container
            path: "/data/prometheus/data"
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: localtime
          hostPath:
            path: "/etc/localtime"
            type: File
[root@2dot241 sy] kubectl label node 2dot239 nodetype=prometheus
因为想让数据持久化存储,所以用nodeSelector参数固定了prometheus所在的node,因此要给相应的node打上标签
[root@2dot239 ~] mkdir -p /data/prometheus/data && chmod 777 /data/prometheus/data
然后在对应的机器上创建data目录和修改权限
[root@2dot241 sy] cat prometheus-svc.yml
# NodePort Service exposing the Prometheus web UI/API on port 30003 of every node.
kind: Service
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitor
spec:
  type: NodePort
  ports:
    - port: 9090
      targetPort: 9090
      nodePort: 30003
  selector:
    app: prometheus
[root@2dot241 sy]# kubectl apply -f configmap.yaml
[root@2dot241 sy]# kubectl apply -f prometheus-svc.yml
[root@2dot241 sy]# kubectl apply -f prometheus-deploy.yml
[root@2dot241 sy]# cat reload.sh
#!/bin/bash
# Apply the edited ConfigMap and ask Prometheus to reload its configuration.
#
# The mounted ConfigMap is refreshed inside the pod only after a delay, so if a
# changed rule does not show up in the Prometheus UI, run this script again.
#
# Optional environment overrides:
#   CONFIGMAP_FILE     manifest to apply             (default: configmap.yaml)
#   PROMETHEUS_URL     base URL of Prometheus        (default: http://192.168.2.241:30003)
#   SYNC_WAIT_SECONDS  wait before triggering reload (default: 30)
set -euo pipefail

readonly CONFIGMAP_FILE="${CONFIGMAP_FILE:-configmap.yaml}"
readonly PROMETHEUS_URL="${PROMETHEUS_URL:-http://192.168.2.241:30003}"
readonly SYNC_WAIT_SECONDS="${SYNC_WAIT_SECONDS:-30}"

kubectl apply -f "${CONFIGMAP_FILE}"

# Give the updated ConfigMap time to reach the pod's mounted volume.
sleep "${SYNC_WAIT_SECONDS}"

# Capture body and status separately so a rejected configuration is reported
# with its error text instead of silently looking like success.
response=$(curl -sS -X POST -w '\n%{http_code}' "${PROMETHEUS_URL}/-/reload")
http_code=${response##*$'\n'}
body=${response%$'\n'*}

if [[ "${http_code}" != "200" ]]; then
  printf 'prometheus reload failed (HTTP %s): %s\n' "${http_code}" "${body}" >&2
  exit 1
fi
printf 'prometheus reload triggered\n'
这个时候已经可以进入到prometheus窗口了接下来安装采集pod和service相关的插件
安装kube-state-metrics
git地址:https://github.com/kubernetes/kube-state-metrics/tree/v1.9.8 下面也写了很详细的安装步骤就是这句话(要部署此项目,您只需运行即可kubectl apply -f examples/standard创建 Kubernetes 服务和部署。(注意:如果你的kubernetes集群的版本不是1.8+,请调整一些资源的apiVersion,查看yaml文件了解更多信息)。)
#我选这个版本是因为超过这个版本的那些镜像写的都是google的还得找镜像地址。。。。我懒
#下载zip包--我用的1.9.7没啥区别,我更换测试了下
[root@2dot241 sy] unzip kube-state-metrics-1.9.7.zip
[root@2dot241 sy] cd kube-state-metrics-1.9.7/examples/standard/
#这个包里默认用的namespace是kube-system我们用的是monitor所以要修改下
[root@2dot241 standard] sed -i s/kube-system/monitor/g ./*
[root@2dot241 standard] kubectl apply -f .
到这里基本安装完成,剩下图形展示和报警。但是这里有一个问题:在prometheus的可视化界面中,targets里没有pod信息,而grafana中却有。我到现在也不清楚是什么原因,如果有人知道请告诉我!谢谢,看下图:
剩下grafana和报警,我们使用docker安装
创建grafana数据目录
[root@2dot241 sy] mkdir -p /data/grafana/data && chmod -R 777 /data/grafana/
[root@2dot241 sy] cat start-grafana.sh
#!/bin/bash
# (Re)create the Grafana container, publishing port 3000 and keeping data and
# logs under /data/grafana on the host.
set -euo pipefail

readonly CONTAINER_NAME="grafana"
readonly GRAFANA_IMAGE="grafana/grafana:8.0.6"
readonly GRAFANA_HOME="/data/grafana"

# Create both bind-mount sources up front; a missing source would otherwise be
# auto-created by Docker as a root-owned directory.
# NOTE(review): 777 mirrors the manual chmod step of this guide; tightening it to
# the container's user would be better - confirm the uid the image runs as.
mkdir -p "${GRAFANA_HOME}/data" "${GRAFANA_HOME}/log"
chmod 777 "${GRAFANA_HOME}/data" "${GRAFANA_HOME}/log"

# Remove a previous container by exact name. Matching "grafana" anywhere in the
# `docker ps` output could hit unrelated containers, and calling docker stop/rm
# with an empty list is an error on the first run.
if docker container inspect "${CONTAINER_NAME}" >/dev/null 2>&1; then
  docker stop "${CONTAINER_NAME}"
  docker rm "${CONTAINER_NAME}"
fi

docker run -d \
  --name="${CONTAINER_NAME}" \
  --restart=always \
  -p 3000:3000 \
  -m 4096m \
  -v "${GRAFANA_HOME}/data:/var/lib/grafana" \
  -v "${GRAFANA_HOME}/log:/var/log/grafana" \
  "${GRAFANA_IMAGE}"
[root@2dot241 sy] bash start-grafana.sh
创建完成后,访问宿主机的3000端口就能打开grafana了
1、安装完之后,需要添加数据源(data source),类型直接选prometheus,地址就是之前创建的prometheus界面:http://192.168.2.241:30003
2、添加模板dashboard(列出几个常用的)
点import导入,有两种方式:直接填写官网模板号码(比如下面的8919),或者导入json
https://grafana.com/dashboards/8919 node的cpu、内存等
https://grafana.com/dashboards/3146 pod 13105
https://grafana.com/dashboards/8588 deployment
安装alertmanager
我这里只用了微信报警,想用钉钉和邮件的可以查看最下面的链接,他们配置的都比较多
创建alert数据目录
mkdir /data/alert/data
[root@2dot241 alert] cat alertmanager.yml
# Alertmanager configuration: every alert is routed to a single WeChat Work receiver.
global:
  resolve_timeout: 5m
templates:
  - "/etc/alertmanager/template/*.tmpl"   # location of the message template files
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 6m
  receiver: wechat
receivers:
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'wwd2849aba51a0c8220'   # WeChat Work corp id (see the admin console)
        to_party: '2'                    # WeChat department id
        agent_id: '1000008'              # WeChat Work application id
        # Application secret from the admin console. Never publish the real value:
        # fill it in locally and keep this file out of shared storage.
        api_secret: '<YOUR_WECHAT_API_SECRET>'
        send_resolved: true              # also notify when the alert is resolved
        # Must name a template defined in the template files loaded above
        message: '{{ template "wechat.default.message" . }}'
  #- name: 'default'
  #  email_configs:
  #    - to: ""
  #      send_resolved: true
  #      from: ""
  #      smarthost: "smtp.xxx.com:25"
  #      auth_username: ""
  #      auth_password: ""
  #  webhook_configs:
  #    - url: 'http://192.168.50.60:8060/dingtalk/ops_dingding/send'
  #      send_resolved: true
# Suppress a warning while a critical alert with the same alertname is firing.
# Only takes effect for alerts that carry a "severity" label.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname']
[root@2dot241 alert] mkdir template
[root@2dot241 template] cat wechat.tmpl 报警模板,在template目录中创建
{{/* WeChat message body: one section per firing alert, followed by one section per resolved alert. The level line reads the "severity" label and the host line reads the "instance" label, so alerts without those labels render these fields empty. Timestamps are formatted in the local timezone of the process rendering the template. */}}
{{ define "wechat.default.message" }}
{{- range .Alerts.Firing -}}
=====监控报警=====
告警状态:{{ .Status }}
告警级别:{{ .Labels.severity }}
告警类型:{{ .Labels.alertname }}
告警应用:{{ .Annotations.summary }}
告警主机:{{ .Labels.instance }}
告警详情:{{ .Annotations.description }}
告警时间:{{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}
=======end=======
{{ end }}
{{- range .Alerts.Resolved -}}
=====报警恢复=====
告警状态:{{ .Status }}
告警级别:{{ .Labels.severity }}
告警类型:{{ .Labels.alertname }}
告警应用:{{ .Annotations.summary }}
告警主机:{{ .Labels.instance }}
告警详情:{{ .Annotations.description }}
告警时间:{{ .StartsAt.Local.Format "2006-01-02 15:04:05" }}
恢复时间: {{ .EndsAt.Local.Format "2006-01-02 15:04:05" }}
=======end=======
{{ end }}
{{- end }}
[root@2dot241 alert] cat start-alert.sh
#!/bin/bash
# (Re)create the Alertmanager container, publishing port 9093 and mounting the
# configuration, templates and data directory from /data/alert on the host.
set -euo pipefail

readonly CONTAINER_NAME="alertmanager"
readonly ALERTMANAGER_IMAGE="prom/alertmanager:latest"
readonly ALERT_HOME="/data/alert"

# A missing bind-mount source is auto-created by Docker as a directory, which
# would leave a directory where the configuration file is expected - fail early.
if [[ ! -f "${ALERT_HOME}/alertmanager.yml" ]]; then
  printf 'missing config file: %s\n' "${ALERT_HOME}/alertmanager.yml" >&2
  exit 1
fi

# NOTE(review): the data directory must be writable by the user the image runs
# as - confirm ownership/permissions if Alertmanager reports write errors.
mkdir -p "${ALERT_HOME}/data" "${ALERT_HOME}/template"

# Remove a previous container by exact name. Matching "alertmanager" anywhere in
# the `docker ps` output could hit unrelated containers, and calling docker
# stop/rm with an empty list is an error on the first run.
if docker container inspect "${CONTAINER_NAME}" >/dev/null 2>&1; then
  docker stop "${CONTAINER_NAME}"
  docker rm "${CONTAINER_NAME}"
fi

docker run -d \
  --name "${CONTAINER_NAME}" \
  --restart=always \
  -p 9093:9093 \
  -v /etc/localtime:/etc/localtime:ro \
  -v "${ALERT_HOME}/alertmanager.yml:/etc/alertmanager/alertmanager.yml" \
  -v "${ALERT_HOME}/data:/alertmanager" \
  -v "${ALERT_HOME}/template:/etc/alertmanager/template" \
  "${ALERTMANAGER_IMAGE}"
[root@2dot241 alert]# ./start-alert.sh 执行脚本就可以了
参考文档:
https://shenshengkun.github.io/posts/e2e612d1.html (多数镜像无法下载、报警需要修改报警信息)
https://blog.csdn.net/qq_31547771/article/details/119217023?spm=1001.2014.3001.5501 (无报警、监控信息、rule文件分离)
https://www.prometheus.wang/kubernetes/READMD.html (介绍详细,配置为阶段性,适用于学习了解)