git clone https://github.com/VictoriaMetrics/VictoriaMetrics
cd VictoriaMetrics
make vmalert
构建生成的二进制文件将放置在 VictoriaMetrics/bin 文件夹中。
vim alert.rules
# Example vmalert rule file (alert.rules).
groups:
  - name: test-rule
    rules:
      # Fires once the expression `up == 0` has held for 2 minutes.
      - alert: 主机状态
        expr: up == 0
        for: 2m
        labels:
          status: warning
        annotations:
          summary: "{{$labels.instance}}:服务器关闭"
          description: "{{$labels.instance}}:服务器关闭"
使用 helm 安装时,prometheus 和 alertmanager 的配置在同一个文件中。
# Alibaba Cloud chart mirror.
helm repo add aliyuncs https://apphub.aliyuncs.com
# Official "stable" chart repository. The former
# kubernetes-charts.storage.googleapis.com bucket has been decommissioned;
# the archived stable charts are now served from charts.helm.sh.
helm repo add stable https://charts.helm.sh/stable
helm search repo prometheus-operator
NAME CHART VERSION APP VERSION DESCRIPTION
aliyuncs/prometheus-operator 8.7.0 0.35.0 Provides easy monitoring definitions for Kubern...
stable/prometheus-operator 8.13.7 0.38.1 Provides easy monitoring definitions for Kubern...
helm install mypro aliyuncs/prometheus-operator
#helm list
NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
mypro default 1 2020-06-09 09:32:37.091220013 +0800 CST deployed prometheus-operator-8.7.0 0.35.0
#helm status mypro
NAME: mypro
LAST DEPLOYED: Tue Jun 9 09:32:37 2020
NAMESPACE: default
STATUS: deployed
REVISION: 1
NOTES:
The Prometheus Operator has been installed. Check its status by running:
kubectl --namespace default get pods -l "release=mypro"
Visit https://github.com/coreos/prometheus-operator for instructions on how
to create & configure Alertmanager and Prometheus instances using the Operator.
#kubectl --namespace default get pods -l "release=mypro"
NAME READY STATUS RESTARTS AGE
mypro-grafana-f5b868868-8ckgs 2/2 Running 0 55m
mypro-prometheus-node-exporter-dg6w4 1/1 Running 0 55m
mypro-prometheus-node-exporter-x9l4b 1/1 Running 0 55m
mypro-prometheus-operator-operator-5b458d4659-p7t4l 2/2 Running 0 55m
cat grafana-ingress.yaml
# Routes HTTP traffic for host grafana.com to the mypro-grafana Service.
# NOTE: extensions/v1beta1 Ingress was removed in Kubernetes 1.22. On newer
# clusters use networking.k8s.io/v1, which requires `pathType` and a
# `backend.service.name` / `backend.service.port.number` block instead.
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: ingress-grafana
spec:
  rules:
    - host: grafana.com
      http:
        paths:
          - backend:
              serviceName: mypro-grafana
              servicePort: 80
cat prometheus-ingress.yaml
# Routes HTTP traffic for host prometheus.com to the Prometheus Service
# created by the "mypro" release.
# NOTE: extensions/v1beta1 Ingress was removed in Kubernetes 1.22. On newer
# clusters use networking.k8s.io/v1, which requires `pathType` and a
# `backend.service.name` / `backend.service.port.number` block instead.
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: ingress-prometheus
spec:
  rules:
    - host: prometheus.com
      http:
        paths:
          - backend:
              serviceName: mypro-prometheus-operator-prometheus
              servicePort: 9090
cat alertmanager-ingress.yaml
# Routes HTTP traffic for host alertmanager.com to the Alertmanager Service
# created by the "mypro" release.
# NOTE: extensions/v1beta1 Ingress was removed in Kubernetes 1.22. On newer
# clusters use networking.k8s.io/v1, which requires `pathType` and a
# `backend.service.name` / `backend.service.port.number` block instead.
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: ingress-alertmanager
spec:
  rules:
    - host: alertmanager.com
      http:
        paths:
          - backend:
              serviceName: mypro-prometheus-operator-alertmanager
              servicePort: 9093
# kubectl get ingress
NAME CLASS HOSTS ADDRESS PORTS AGE
ingress-alertmanager alertmanager.com x.x.x.x 80 29m
ingress-grafana grafana.com x.x.x.x 80 32m
ingress-prometheus prometheus.com x.x.x.x 80 30m
访问以下几个url
http://grafana.com
http://prometheus.com
http://alertmanager.com
# cat alertmanger_config.yaml
global:
  resolve_timeout: 5m  # resolve timeout; defaults to 5m

# Paths of the notification template files. Relative paths and globs such as
# template/*.tmpl are also accepted.
templates:
  - '/usr/local/alertmanager/template/default.tmpl'

# Routing tree.
route:
  group_by: [alertname]  # labels used to group alerts
  receiver: ops_notify  # default receiver
  # How long to wait before sending the first notification for a new group.
  group_wait: 30s
  # How long to wait before notifying about new alerts added to a group.
  group_interval: 60s
  # How long to wait before re-sending a notification that was already sent.
  # Set explicitly to 1h here; Alertmanager's built-in default is 4h.
  repeat_interval: 1h
  routes:
    - receiver: ops_notify  # recipient for basic alerts
      group_wait: 10s
      match_re:
        # Regex matched against the alert name defined in the alerting rules.
        alertname: 实例存活告警|磁盘使用率告警
    - receiver: info_notify  # recipient for informational alerts
      group_wait: 10s
      match_re:
        alertname: 内存使用率告警|CPU使用率告警|目录大小告警

receivers:
  # Receiver for basic alerts.
  - name: ops_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook1/send
        send_resolved: true  # also notify when the alert is resolved
        # message: '{{ template "wechat.default.message" . }}'
  # Receiver for informational alerts.
  - name: info_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook1/send
        send_resolved: true
        # message: '{{ template "wechat.default.message" . }}'

# An inhibition rule mutes alerts matching one set of matchers (target) while
# an alert matching another set of matchers (source) is firing. Both alerts
# must carry identical values for the labels listed in `equal`.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# Encode the config as ONE base64 line. GNU base64 wraps its output at 76
# columns by default, and a wrapped value breaks when pasted into the Secret.
base64 < alertmanger_config.yaml | tr -d '\n'; echo
# Paste the encoded string as the value of the config entry under `data`
# (presumably the `alertmanager.yaml` key - verify in the editor).
kubectl edit secret alertmanager-mypro-prometheus-operator-alertmanager
prometheus-operator 可以使用 PrometheusRule 来动态地添加自定义监控项
kubectl get prometheus mypro-prometheus-operator-prometheus -o jsonpath={".spec.ruleSelector"};echo
# cat PrometheusRule.yaml
# PrometheusRule that adds a custom alerting rule on the 1-minute load average.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    # `app` and `release` must match the labels selected by spec.ruleSelector
    # of the Prometheus resource. If you create your own Prometheus resource,
    # point its ruleSelector at the labels set on this PrometheusRule.
    app: prometheus-operator
    release: mypro
    prometheus: test-example
  name: test-load1-prometheusrule
spec:
  groups:
    - name: test-load-1
      rules:
        # Fires once `node_load1 > 1` has held for 2 minutes.
        - alert: test-load-1
          expr: node_load1 > 1
          for: 2m
          labels:
            team: node
          annotations:
            summary: "{{$labels.instance}}: load 1 >1"
            description: "{{$labels.instance}}: job {{$labels.job}} 测试测试 负载大于1"
#导入
kubectl apply -f PrometheusRule.yaml
# kubectl exec -it prometheus-mypro-prometheus-operator-prometheus-0 sh
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl kubectl exec [POD] -- [COMMAND] instead.
Defaulting container name to prometheus.
Use 'kubectl describe pod/prometheus-mypro-prometheus-operator-prometheus-0 -n default' to see all of the containers in this pod.
/prometheus $ ls /etc/prometheus/rules/prometheus-mypro-prometheus-operator-prometheus-rulefiles-0/default-test-load1-prometheusrule.yaml
/etc/prometheus/rules/prometheus-mypro-prometheus-operator-prometheus-rulefiles-0/default-test-load1-prometheusrule.yaml
/prometheus $ cat /etc/prometheus/rules/prometheus-mypro-prometheus-operator-prometheus-rulefiles-0/default-test-load1-prometheusrule.yaml
groups:
- name: test-load-1
rules:
- alert: test-load-1
annotations:
description: '{{$labels.instance}}: job {{$labels.job}} 测试测试 负载大于1'
summary: '{{$labels.instance}}: load 1 >1'
expr: node_load1 > 1
for: 2m
labels:
team: node