kube-prometheus stack | k8s 1.20 | k8s 1.21 | k8s 1.22 | k8s 1.23 | k8s 1.24 |
release-0.8 | ✓ | ✓ | × | × | × |
release-0.9 | × | ✓ | ✓ | × | × |
release-0.10 | × | × | ✓ | ✓ | × |
release-0.11 | × | × | × | ✓ | ✓ |
main | × | × | × | × | ✓ |
kube-prometheus-0.11.0
k8s-v1.23.9
1.获取项目
mkdir ~/kube-prometheus && cd ~/kube-prometheus
wget https://github.com/prometheus-operator/kube-prometheus/archive/refs/tags/v0.11.0.tar.gz
tar xf v0.11.0.tar.gz && rm -rf v0.11.0.tar.gz
2、拉取(导入)镜像(docker版)
cd kube-prometheus-0.11.0
[root@master ~/kube-prometheus/kube-prometheus-0.11.0]# find ./manifests -type f |xargs grep 'image: '|sort|uniq|awk '{print $3}'|grep ^[a-zA-Z]|grep -Evw 'error|kubeRbacProxy'|sort -rn|uniq |grep -n ".*"
1:quay.io/prometheus/prometheus:v2.36.1
2:quay.io/prometheus-operator/prometheus-operator:v0.57.0
3:quay.io/prometheus/node-exporter:v1.3.1
4:quay.io/prometheus/blackbox-exporter:v0.21.0
5:quay.io/prometheus/alertmanager:v0.24.0
6:quay.io/brancz/kube-rbac-proxy:v0.12.0
7:k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1
8:k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.5.0
9:jimmidyson/configmap-reload:v0.5.0
10:grafana/grafana:8.5.5
[root@master ~/kube-prometheus/kube-prometheus-0.11.0]#
# grep image prometheusAdapter-deployment.yaml
image: k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1
# grep image kubeStateMetrics-deployment.yaml
image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.5.0
cd ~/kube-prometheus/kube-prometheus-0.11.0/manifests
#1、 修改镜像(默认谷歌k8s.gcr.io)
sed -i 's/k8s.gcr.io\/prometheus-adapter/v5cn/g' prometheusAdapter-deployment.yaml
把 k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1
修改为 v5cn/prometheus-adapter:v0.9.1
# 2、修改镜像(默认谷歌k8s.gcr.io)
sed -i 's/k8s.gcr.io\/kube-state-metrics/dyrnq/g' kubeStateMetrics-deployment.yaml
把 k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.5.0
修改为 dyrnq/kube-state-metrics:v2.5.0
# k8s-master上pull镜像
find ~/kube-prometheus/kube-prometheus-0.11.0/manifests -type f |xargs grep 'image: '|sort|uniq|awk '{print $3}'|grep ^[a-zA-Z]|grep -Evw 'error|kubeRbacProxy'|sort -rn|uniq |xargs -i docker pull {}
# 所有node节点pull镜像
cat > ~/pull.sh << 'EOF'
v5cn/prometheus-adapter:v0.9.1
quay.io/prometheus/prometheus:v2.36.1
quay.io/prometheus-operator/prometheus-operator:v0.57.0
quay.io/prometheus/node-exporter:v1.3.1
quay.io/prometheus/blackbox-exporter:v0.21.0
quay.io/prometheus/alertmanager:v0.24.0
quay.io/brancz/kube-rbac-proxy:v0.12.0
jimmidyson/configmap-reload:v0.5.0
grafana/grafana:8.5.5
dyrnq/kube-state-metrics:v2.5.0
EOF
cat ~/pull.sh |xargs -i docker pull {}
3、部署kube-prometheus项目
cd ~/kube-prometheus/kube-prometheus-0.11.0
# 方式1
kubectl apply --server-side -f manifests/setup
kubectl apply -f manifests/
# 方式2(可能要多执行几遍,所以推荐使用方式1)
kubectl apply --server-side -Rf manifests
4、暴露prometheus、grafana、alertmanager服务(将ClusterIP修改为NodePort)
#1、prometheus
kubectl patch svc/prometheus-k8s -n monitoring --patch '{"spec": {"type":"NodePort"}}'
#2、grafana
kubectl patch svc/grafana -n monitoring --patch '{"spec": {"type":"NodePort"}}'
#3、alertmanager
kubectl patch svc/alertmanager-main -n monitoring --patch '{"spec": {"type":"NodePort"}}'
kubectl get svc -n monitoring |grep NodePort
5、监控 kube-controller-manager + kube-scheduler
a、默认没监控到以上两个服务的原因分析:
1、和 ServiceMonitor 的定义有关系
2、先来查看下 kube-scheduler 组件对应的 ServiceMonitor 资源的定义
3、在 ServiceMonitor 资源对象里的 selector.matchLabels 会在 kube-system 这个命名空间下面匹配具有 app.kubernetes.io/name=kube-scheduler 这样标签的 Service
4、但是系统中根本就没有对应的 Service:(问题所在)
5、所以需要去创建一个对应的 Service 对象,才能与 ServiceMonitor 进行关联:(解决问题)
cat manifests/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml
kubectl get svc -n kube-system -l app.kubernetes.io/name=kube-scheduler
No resources found in kube-system namespace.
# cat manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml
#2、先来查看下 kube-scheduler组件对应的 ServiceMonitor资源的定义
# cat manifests/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app.kubernetes.io/name: kube-scheduler
    app.kubernetes.io/part-of: kube-prometheus
  name: kube-scheduler
  namespace: monitoring
spec:
  endpoints:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    interval: 30s
    port: https-metrics
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  jobLabel: app.kubernetes.io/name
  namespaceSelector:
    matchNames:
    - kube-system
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-scheduler
b、解决问题:
mkdir ~/my-kube-prometheus && cd ~/my-kube-prometheus
1、对kube-Controller-manager的监控
cat > ~/my-kube-prometheus/prometheus-kubeControllerManagerService.yaml << EOF
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-controller-manager
  labels: #必须和上面的 ServiceMonitor 下面的 matchLabels 保持一致
    app.kubernetes.io/name: kube-controller-manager
spec:
  selector:
    component: kube-controller-manager
  ports:
  - name: https-metrics
    port: 10257
    targetPort: 10257 #controller-manager的安全端口为10257
EOF
其中最重要的是上面 labels 和 selector 部分:labels 区域的配置必须和我们上面的 ServiceMonitor 对象中的 selector 保持一致;selector 下面配置的是 component=kube-controller-manager(后面 kube-scheduler 的 Service 同理,配置的是 component=kube-scheduler)。为什么会是这个 label 标签呢?下面以 kube-scheduler 为例,我们可以去 describe 下 kube-scheduler 这个 Pod:
# kubectl describe pod kube-scheduler-master -n kube-system
Name: kube-scheduler-master
Namespace: kube-system
Priority: 2000001000
Priority Class Name: system-node-critical
Node: master/192.168.1.201
Start Time: Tue, 04 Jan 2022 10:09:14 +0800
Labels: component=kube-scheduler
tier=control-plane
......
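可以看到 kube-scheduler 这个 Pod 具有 component=kube-scheduler 和 tier=control-plane 这两个标签,而前面这个标签具有更唯一的特性,所以使用前面这个标签较好。kube-controller-manager 的 Pod 同理(对应标签应为 component=kube-controller-manager,可用同样的 describe 命令确认),这样上面创建的 Service 就可以和对应的 Pod 进行关联了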
kubectl apply -f ~/my-kube-prometheus/prometheus-kubeControllerManagerService.yaml
kubectl get svc -n kube-system
sed -i 's/bind-address=127.0.0.1/bind-address=0.0.0.0/g' /etc/kubernetes/manifests/kube-controller-manager.yaml
因为 kube-controller-manager 启动的时候默认绑定的是 127.0.0.1 地址,所以要通过 IP 地址去访问就被拒绝了,所以需要将 --bind-address=127.0.0.1 更改为 --bind-address=0.0.0.0,更改后 kube-controller-manager 会自动重启,重启完成后再去查看 Prometheus 上面的采集目标就正常了
2、对kube-Scheduler的监控
cat > ~/my-kube-prometheus/prometheus-kubeSchedulerService.yaml << EOF
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-scheduler
  labels: #必须和上面的 ServiceMonitor 下面的 matchLabels 保持一致
    app.kubernetes.io/name: kube-scheduler
spec:
  selector:
    component: kube-scheduler
  ports:
  - name: https-metrics
    port: 10259
    targetPort: 10259 #需要注意现在版本默认的安全端口是10259
EOF
kubectl apply -f ~/my-kube-prometheus/prometheus-kubeSchedulerService.yaml
sed -i 's/bind-address=127.0.0.1/bind-address=0.0.0.0/g' /etc/kubernetes/manifests/kube-scheduler.yaml
如果要清理 Prometheus-Operator,可以直接删除对应的资源清单即可:
# kubectl delete -f manifests
# kubectl delete -f manifests/setup
感谢:linuxkaifa