Prometheus Alertmanager 邮件报警:

下载网址:
https://prometheus.io/download

tar zxfv alertmanager-0.15.2.linux-amd64.tar.gz -C /space/

mv /space/alertmanager-0.15.2.linux-amd64 /space/altermanager

vi /space/altermanager/altermanager.yml

# Alertmanager configuration: SMTP settings, routing tree and receivers.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  # The two addresses below are redacted placeholders; replace them with
  # the real sender account.
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'xxxxx'

route:
  # NOTE(review): none of the alert rules in this document set a label
  # named 'down', so all alerts end up in a single group; consider
  # grouping by 'alertname' instead - confirm the intent.
  group_by: ['down']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'shprom'

receivers:
  # The route above references 'shprom', so the receiver must be defined;
  # an undefined receiver makes Alertmanager reject the configuration.
  - name: 'shprom'
    email_configs:
      # Placeholder recipient; replace with the address that should
      # receive the alert mails.
      - to: 'recipient@example.com'

:wq

注:

465端口方式:

# Variant of the "global" section that talks to the SMTP server on
# port 465. Only this section changes; route/receivers stay as before.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'
  # Redacted placeholder addresses; replace with the real sender account.
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  # This is the mailbox authorization code, not the login password.
  smtp_auth_password: 'xxxx'
  # NOTE(review): STARTTLS is disabled here, presumably because port 465
  # already uses implicit TLS - confirm against the mail provider.
  smtp_require_tls: false

/space/altermanager/alertmanager --config.file=/space/altermanager/altermanager.yml

mkdir /space/prometheus/rules

vi /space/prometheus/rules/down.yml

# Rule group "down": alerts when a scrape target stops answering.
groups:
  - name: down
    rules:
      - alert: InstanceDown
        expr: up == 0
        # The condition must hold for 30 seconds before the alert fires.
        for: 30s
        labels:
          user: shprom
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          # Duration in the text matches the "for" clause above.
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds."

:wq

vi /space/prometheus/rules/mem.yml

# Rule group "mem": alerts when used memory (total minus free, buffers
# and cache) exceeds 80% of total memory for one minute.
groups:
  - name: mem
    rules:
      - alert: NodeMemoryUsage
        # Metric names carry the "_bytes" suffix, consistent with the
        # node_filesystem_*_bytes metrics used by the /home rule.
        # NOTE(review): older node_exporter releases expose these metrics
        # without the suffix - confirm the exporter version in use.
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "{{ $labels.instance }} High Memory usage detected"
          description: "{{ $labels.instance }}: Memory usage is above 80% (current value is:{{ $value }})"

:wq

vi /space/prometheus/rules/cpu.yml

# Rule group "cpu": alerts when the non-idle CPU share of an instance
# stays above 80% for one minute.
groups:
  - name: cpu
    rules:
      - alert: NodeCPUUsage
        # 100 minus the average idle percentage per instance.
        # node_cpu_seconds_total is used for consistency with the
        # node_filesystem_*_bytes naming in the /home rule.
        # NOTE(review): older node_exporter releases call this metric
        # node_cpu - confirm the exporter version in use.
        expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "{{ $labels.instance }} High CPU usage detected"
          description: "{{ $labels.instance }}: CPU usage is above 80% (current value is:{{ $value }})"

:wq

vi /space/prometheus/rules/home.yml

# Rule group "home": alerts when the filesystem mounted at /home is more
# than 80% full for one minute.
groups:
  - name: home
    rules:
      - alert: NodeHomeUsage
        # 100 minus the percentage of space still available on /home.
        expr: (100 - (node_filesystem_avail_bytes{mountpoint="/home"} / node_filesystem_size_bytes{mountpoint="/home"}) * 100) > 80
        for: 1m
        labels:
          severity: page
        annotations:
          # The texts describe disk usage, which is what the expression measures.
          summary: "{{ $labels.instance }} High /home disk usage detected"
          description: "{{ $labels.instance }}: /home disk usage is above 80% (current value is:{{ $value }})"

:wq

vi /space/prometheus/prometheus.yml

# Fragment of prometheus.yml: the Alertmanager to send alerts to and the
# rule files to load.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'
            # - alertmanager:9093

rule_files:
  - "rules/down.yml"
  - "rules/mem.yml"
  - "rules/cpu.yml"
  # The /home usage rule file is created in this document as well, so it
  # has to be listed here to be loaded.
  - "rules/home.yml"

:wq

/space/prometheus/prometheus --config.file=/space/prometheus/prometheus.yml --storage.tsdb.path=/space/prometheus/data

可以访问 http://ip:9090,在 Status → Rules 页面和 Alerts 页面确认规则是否生效