Kubernetes Monitoring
1. Installing cAdvisor/exporters + Prometheus + Grafana
1.1 Configure and install NFS
NFS server (Ubuntu):
apt-get install nfs-kernel-server
# Create a shared directory at /data/pvdata
mkdir /data/pvdata
# Set ownership of the shared directory
centos:
chown nfsnobody:nfsnobody /data/pvdata
ubuntu:
chown nobody:nogroup /data/pvdata
vim /etc/exports
# Fill in the IP range you want to allow
/data/pvdata xxx.xxx.xxx.0/24(rw,async,all_squash)
exportfs -rv
# Should print: exporting xxx.xxx.xxx.0/24:/data/pvdata
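To double-check from the server side, showmount (shipped with the NFS utilities) lists the active exports:
showmount -e localhost
# should list /data/pvdata xxx.xxx.xxx.0/24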
# The NFS client must be installed on the node where the Prometheus service will run
apt-get update
apt-get install nfs-common
Test that NFS is reachable from another node; mount command:
mkdir /kubernetes
mount <nfs-server-ip>:/data/pvdata /kubernetes
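A quick read/write check over the mount confirms that the export options and directory ownership allow writes (the test file name is arbitrary):
touch /kubernetes/nfs-write-test && ls -l /kubernetes/nfs-write-test
rm /kubernetes/nfs-write-test
umount /kubernetes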
1.2 Prometheus configuration
mkdir /data/k8s/yaml/kube-system/prometheus
cd /data/k8s/yaml/kube-system/prometheus/
# Download the YAML deployment manifests from the Kubernetes GitHub repository
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-rbac.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-configmap.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-service.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/prometheus-statefulset.yaml
1.2.1 Modify prometheus-statefulset.yaml
# Delete the 10 lines at the bottom of the file:
  volumeClaimTemplates:
    - metadata:
        name: prometheus-data
      spec:
        storageClassName: standard
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: "16Gi"
# Add the following 3 lines under the pod template's volumes instead (using your own NFS-backed PVC as storage); see the sketch below
        - name: prometheus-data
          persistentVolumeClaim:
            claimName: prometheus-data
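For orientation, a sketch of how the pod spec's volumes block ends up looking; the config-volume entry is assumed from the stock addon manifest:
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: prometheus-data
          persistentVolumeClaim:
            claimName: prometheus-data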
1.2.2 Add the PV/PVC YAML file
# Create the directory where Prometheus data will be stored
mkdir /data/pvdata/prometheus
# Note: on Ubuntu use nobody instead of nfsnobody
chown nfsnobody. /data/pvdata/prometheus
cat > prometheus-pvc-data.yaml << EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-data
spec:
  storageClassName: prometheus-data
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /data/pvdata/prometheus
    server: <nfs-server-ip>
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: kube-system
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 10Gi
  storageClassName: prometheus-data
EOF
1.2.3 Change the Service type to NodePort and update the Prometheus image version
# In prometheus-service.yaml, set:
type: NodePort
# In prometheus-statefulset.yaml:
# Note: the default CPU/memory settings are sized for roughly 10 nodes / 30 pods; adjust if your cluster is larger
Update the Prometheus image to the latest version, v2.13.0.
Add to args: --storage.tsdb.retention.time=<desired retention period>
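A sketch of the resulting container fragment in prometheus-statefulset.yaml; the flag layout and container name are assumed from the stock addon manifest, and the 15d retention value is only an example:
      containers:
        - name: prometheus-server
          image: prom/prometheus:v2.13.0
          args:
            - --config.file=/etc/config/prometheus.yml
            - --storage.tsdb.path=/data
            - --storage.tsdb.retention.time=15d
            - --web.enable-lifecycle
The --web.enable-lifecycle flag is what enables the /-/reload endpoint used later in section 1.6.7.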
1.2.4 Install and configure Prometheus
# Apply the YAML files
kubectl apply -f prometheus-rbac.yaml
kubectl apply -f prometheus-configmap.yaml
kubectl apply -f prometheus-pvc-data.yaml
kubectl apply -f prometheus-service.yaml
kubectl apply -f prometheus-statefulset.yaml
# Check that the installation succeeded
kubectl get pods -n kube-system |grep prometheus
# Find the NODE that the Prometheus pod is scheduled on
kubectl get pods -n kube-system -o wide |grep prometheus
# Get the NodePort of the Prometheus service
kubectl get service -n kube-system
prometheus NodePort xxx.xxx.xxx.xxx 9090:32809/TCP 5d20h
Access the Prometheus UI at NodeIP:NodePort (32809 in this example).
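A quick smoke test from the command line, using Prometheus's built-in health endpoint (substitute your node IP and NodePort):
curl http://<node-ip>:32809/-/healthy
# should respond that Prometheus is healthy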
1.3 Install node-exporter
# Download the node-exporter YAML files
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/node-exporter-ds.yml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/node-exporter-service.yaml
# Apply node-exporter
kubectl apply -f node-exporter-service.yaml
kubectl apply -f node-exporter-ds.yml
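To verify the DaemonSet is up and exporting, list its pods and scrape a node's metrics endpoint; 9100 is node-exporter's default port (if it is not exposed on the node itself, curl the pod IP from the first command instead):
kubectl get pods -n kube-system -o wide | grep node-exporter
curl -s http://<node-ip>:9100/metrics | head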
1.4 Deploy kube-state-metrics
# Download the YAML files
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-service.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-rbac.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/kube-state-metrics-deployment.yaml
# Apply the YAML files
kubectl apply -f kube-state-metrics-service.yaml
kubectl apply -f kube-state-metrics-rbac.yaml
kubectl apply -f kube-state-metrics-deployment.yaml
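kube-state-metrics exposes cluster-object metrics over HTTP; a check via port-forward, assuming the stock service name and port 8080 from the addon manifest:
kubectl port-forward -n kube-system svc/kube-state-metrics 8080:8080 &
curl -s http://localhost:8080/metrics | grep kube_pod_status_phase | head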
1.5 Deploy Grafana
1.5.1 Create the data storage directory
mkdir /data/pvdata/prometheus-grafana
ubuntu:
chown nobody. /data/pvdata/prometheus-grafana
centos:
chown nfsnobody. /data/pvdata/prometheus-grafana
1.5.2 Create the Grafana PVC
cat > grafana-pvc.yaml << EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-grafana
spec:
  storageClassName: prometheus-grafana
  capacity:
    storage: 1Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /data/pvdata/prometheus-grafana
    server: <nfs-server-ip>
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-grafana
  namespace: kube-system
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: prometheus-grafana
EOF
1.5.3 grafana-deployment.yaml
# Adjust the CPU and memory for this service as needed
cat > grafana-deployment.yaml << EOF
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: grafana
  namespace: kube-system
  labels:
    app: grafana
spec:
  revisionHistoryLimit: 10
  template:
    metadata:
      labels:
        app: grafana
        component: prometheus
    spec:
      #nodeSelector:
      #  kubernetes.io/hostname: <node-name>   # optionally pin Grafana to a specific node
      containers:
        - name: grafana
          env:
            - name: GF_SECURITY_ADMIN_USER
              value: admin
            - name: GF_SECURITY_ADMIN_PASSWORD
              value: admin
          image: grafana/grafana:6.4.3
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 3000
              name: grafana
          readinessProbe:
            failureThreshold: 10
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            initialDelaySeconds: 30
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 30
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              cpu: 100m
              memory: 256Mi
            requests:
              cpu: 100m
              memory: 256Mi
          volumeMounts:
            - mountPath: /var/lib/grafana
              subPath: grafana
              name: grafana-volumes
      volumes:
        - name: grafana-volumes
          persistentVolumeClaim:
            claimName: prometheus-grafana
---
# ------------------- APP Service ------------------- #
kind: Service
apiVersion: v1
metadata:
  labels:
    app: grafana
  name: grafana
  namespace: kube-system
spec:
  #type: ClusterIP
  type: NodePort
  ports:
    - port: 80
      targetPort: 3000
  selector:
    app: grafana
EOF
1.5.4 Apply the files and find the service IP and port
kubectl apply -f grafana-pvc.yaml
kubectl apply -f grafana-deployment.yaml
# Check the service and port
kubectl get service -n kube-system
grafana NodePort xxx.xxx.xxx.xxx 80:31920/TCP 3d23h
kubectl get pods -n kube-system -o wide
Find the node the Grafana pod landed on; Grafana is then reachable at that node's IP plus the NodePort obtained above (31920 in this example).
The default login is admin / admin. After logging in, configure a data source: choose Prometheus as the type and fill in the Prometheus service IP and port under the Data Sources tab.
To get a monitoring view quickly, import dashboard template 10000.
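The data source can also be registered without the UI through Grafana's HTTP API; a sketch assuming the admin/admin credentials above and the in-cluster Prometheus service address (adjust host, port, and URL to your setup):
curl -X POST http://admin:admin@<grafana-node-ip>:31920/api/datasources \
  -H 'Content-Type: application/json' \
  -d '{"name":"Prometheus","type":"prometheus","url":"http://prometheus.kube-system.svc:9090","access":"proxy","isDefault":true}'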
1.6 Kubernetes alerting
1.6.1 Download the required Alertmanager YAML files
# Recommended: run Alertmanager on the same node as Prometheus, to avoid mount failures on nodes missing the NFS client.
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-pvc.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-service.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-deployment.yaml
curl -O https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/prometheus/alertmanager-configmap.yaml
1.6.2 Create the directory for alert data
mkdir /data/pvdata/prometheus-alertmanager
chown nfsnobody. /data/pvdata/prometheus-alertmanager  # on Ubuntu use nobody instead
1.6.3 Create alertmanager-pvc.yaml
cat > alertmanager-pvc.yaml << EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: prometheus-alertmanager
spec:
  storageClassName: prometheus-alertmanager
  capacity:
    storage: 1Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /data/pvdata/prometheus-alertmanager
    server: <nfs-server-ip>
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-alertmanager
  namespace: kube-system
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: prometheus-alertmanager
EOF
1.6.4 Modify alertmanager-deployment.yaml
# Change the claimName at the bottom of the file to the PVC created above
      - name: storage-volume
        persistentVolumeClaim:
          claimName: prometheus-alertmanager
1.6.5 Modify alertmanager-service.yaml
# Change spec.type to NodePort so the service can be reached via a node IP and the randomly mapped port.
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
    kubernetes.io/name: "Alertmanager"
spec:
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 9093
  selector:
    k8s-app: alertmanager
  type: NodePort
1.6.6 Deploy Alertmanager
kubectl apply -f alertmanager-pvc.yaml
kubectl apply -f alertmanager-configmap.yaml
kubectl apply -f alertmanager-service.yaml
kubectl apply -f alertmanager-deployment.yaml
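A quick check that the pod came up and that a NodePort was assigned:
kubectl get pods -n kube-system | grep alertmanager
kubectl get service -n kube-system | grep alertmanager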
1.6.7 Create alerting rules
kubectl edit configmaps prometheus-config -n kube-system
# Add the following under "prometheus.yml: |"
    global:
      scrape_interval: 5s        # how often to scrape targets
      evaluation_interval: 5s    # how often to evaluate alerting rules
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["<alertmanager-service-ip>:<port>"]
    rule_files:
      - "/etc/config/rules.yml"
# Create the alerting rules by appending at the bottom of the ConfigMap's data section
  rules.yml: |
    groups:
    - name: monitor
      rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          team: kube-system
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
# Reload the configuration
curl -X POST http://<prometheus-node-ip>:<nodeport>/-/reload
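After the reload, the new rule should be visible through Prometheus's rules API (same host and port as the reload call):
curl -s http://<prometheus-node-ip>:<nodeport>/api/v1/rules | grep InstanceDown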
1.6.8 Configure email alerts
# Modify the alertmanager-configmap.yaml file
cat > alertmanager-configmap.yaml << EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 3m                  # how long without updates before an alert is marked resolved
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: '[email protected]'
      smtp_auth_username: '[email protected]'
      smtp_auth_password: 'PASSWORD'
      smtp_require_tls: false
    route:
      group_by: ['example']
      group_wait: 60s
      group_interval: 60s
      repeat_interval: 12h
      receiver: 'webhook'
    receivers:
    - name: 'webhook'
      webhook_configs:
      - url: 'web_hook_url'                # fill in your webhook URL
        send_resolved: false               # whether to notify when an alert resolves
      email_configs:
      - to: '[email protected]'
        send_resolved: false
EOF
kubectl delete configmaps -n kube-system alertmanager-config
kubectl apply -f alertmanager-configmap.yaml
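To exercise the notification path end to end without waiting for a real outage, a test alert can be posted straight to Alertmanager's v1 API; the alert name and labels below are made up for the test:
curl -X POST http://<alertmanager-node-ip>:<nodeport>/api/v1/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","team":"kube-system"},"annotations":{"summary":"manual test alert"}}]'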
Reference:
https://www.jianshu.com/p/e76053b6f3f5