If you don't want to read through the installation steps, jump straight to Section 8, "Configuring Log Alerts".
https://github.com/prometheus/prometheus/releases
[root@testqwe prometheus-2.41.0.linux-amd64]# wget https://github.com/prometheus/prometheus/releases/download/v2.41.0/prometheus-2.41.0.linux-amd64.tar.gz
[root@testqwe prometheus-2.41.0.linux-amd64]# tar -zxvf prometheus-2.41.0.linux-amd64.tar.gz
alertmanagers: Alertmanager (alerting) related configuration
rule_files: paths to the alerting rule files
scrape_configs: scrape target (metrics collection) configuration
[root@testqwe prometheus-2.41.0.linux-amd64]# cat prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.40.233:9093
            # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/root/prometheus-2.41.0.linux-amd64/rules/*_rules.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: 'promethues-node'
    static_configs:
      - targets: ['192.168.40.233:9100']
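The rule_files glob above points at /root/prometheus-2.41.0.linux-amd64/rules/*_rules.yml, but no metric alerting rule is shown in this post. As a minimal sketch (the file name, alert name and threshold below are illustrative, not taken from the original setup), a simple node-down rule could look like this:
[root@testqwe prometheus-2.41.0.linux-amd64]# mkdir -p rules
[root@testqwe prometheus-2.41.0.linux-amd64]# cat rules/node_rules.yml   # hypothetical example
groups:
  - name: node-alerts
    rules:
      - alert: InstanceDown        # illustrative alert name
        expr: up == 0              # the target failed its most recent scrape
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."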
[root@testqwe prometheus-2.41.0.linux-amd64]# cat /lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/root/prometheus-2.41.0.linux-amd64/
ExecStart=/root/prometheus-2.41.0.linux-amd64/prometheus --config.file=/root/prometheus-2.41.0.linux-amd64/prometheus.yml
[Install]
WantedBy=multi-user.target
[root@testqwe prometheus-2.41.0.linux-amd64]# systemctl daemon-reload
[root@testqwe prometheus-2.41.0.linux-amd64]# systemctl start prometheus.service
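Before relying on it, you can validate the configuration with the promtool binary shipped in the same tarball and hit the built-in health endpoint (assuming the default port 9090):
[root@testqwe prometheus-2.41.0.linux-amd64]# ./promtool check config prometheus.yml
[root@testqwe prometheus-2.41.0.linux-amd64]# curl -s http://localhost:9090/-/healthy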
https://github.com/prometheus/node_exporter/releases
[root@testqwe ~]# wget https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz
[root@testqwe ~]# tar -zxvf node_exporter-1.5.0.linux-amd64.tar.gz
[root@testqwe node_exporter-1.5.0.linux-amd64]# cat /lib/systemd/system/node-exporter.service
[Unit]
Description=Prometheus Node Exporter
After=network.target
[Service]
ExecStart=/root/node_exporter-1.5.0.linux-amd64/node_exporter
[Install]
WantedBy=multi-user.target
[root@testqwe node_exporter-1.5.0.linux-amd64]# systemctl daemon-reload
[root@testqwe node_exporter-1.5.0.linux-amd64]# systemctl start node-exporter.service
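A quick sanity check that the exporter is answering on its default port 9100; the promethues-node target should then show as UP on http://192.168.40.233:9090/targets:
[root@testqwe node_exporter-1.5.0.linux-amd64]# curl -s http://localhost:9100/metrics | head -n 5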
https://github.com/timonwong/prometheus-webhook-dingtalk/releases
[root@testqwe ~]# wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
[root@testqwe ~]# tar -zxvf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
Edit webhook1:
url: the DingTalk robot's webhook URL
secret: the DingTalk robot's signing secret (加签)
[root@testqwe prometheus-webhook-dingtalk]# cat config.example.yml
## Request timeout
# timeout: 5s

## Uncomment following line in order to write template from scratch (be careful!)
#no_builtin_template: true

## Customizable templates path
#templates:
#  - contrib/templates/legacy/template.tmpl

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
#default_message:
#  title: '{{ template "legacy.title" . }}'
#  text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=24788be6b1869b9b3ee134577d91778f5677fba60619affasdq1c201499a843c
    # secret for signature
    secret: SECda59729c25d76e46d7648d052b567eb92c8aeaa8f568437c3ebcdb8ff08711e7
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # Customize template content
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      mobiles: ['156xxxx8827', '189xxxx8325']
[root@testqwe prometheus-webhook-dingtalk]# cat /lib/systemd/system/prometheus-webhook-dingtalk.service
[Unit]
Description=https://github.com/timonwong/prometheus-webhook-dingtalk/releases/
After=network-online.target
[Service]
Restart=on-failure
ExecStart=/root/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/root/prometheus-webhook-dingtalk/config.example.yml
[Install]
WantedBy=multi-user.target
[root@testqwe prometheus-webhook-dingtalk]# systemctl daemon-reload
[root@testqwe prometheus-webhook-dingtalk]# systemctl start prometheus-webhook-dingtalk.service
# Or start it in the background:
[root@testqwe prometheus-webhook-dingtalk]# ./prometheus-webhook-dingtalk --config.file=config.example.yml >dingtalk.log 2>&1 &
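To test the bridge before wiring up Alertmanager, you can POST a minimal Alertmanager-style webhook payload to the webhook1 target; the alert below is a made-up test message, and the exact fields the default template renders may vary by version:
[root@testqwe prometheus-webhook-dingtalk]# curl -s -H "Content-Type: application/json" \
  -d '{"version":"4","status":"firing","receiver":"webhook1","groupLabels":{"alertname":"TestAlert"},"commonLabels":{"alertname":"TestAlert","severity":"warning"},"commonAnnotations":{"summary":"test message from curl"},"externalURL":"http://192.168.40.233:9093","alerts":[{"status":"firing","labels":{"alertname":"TestAlert","severity":"warning"},"annotations":{"summary":"test message from curl"},"startsAt":"2023-01-03T18:00:00Z","endsAt":"0001-01-01T00:00:00Z"}]}' \
  http://localhost:8060/dingtalk/webhook1/send
If the robot's keyword/signature settings are correct, the test message shows up in the DingTalk group.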
https://github.com/prometheus/alertmanager/releases
[root@testqwe ~]# wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
[root@testqwe ~]# tar -zxvf alertmanager-0.25.0.linux-amd64.tar.gz
Configure the DingTalk alerting integration: webhook_configs
[root@testqwe alertmanager-0.25.0.linux-amd64]# cat alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://192.168.40.233:8060/dingtalk/webhook1/send'
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
Reference: https://prometheus.io/docs/alerting/latest/configuration/
## Example Alertmanager configuration file
global:
  resolve_timeout: 5m
  # SMTP settings
  smtp_from: "[email protected]"
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: "[email protected]"
  smtp_auth_password: "auth_pass"
  smtp_require_tls: true
# Routing and grouping
route:
  receiver: ops
  group_wait: 30s        # Wait this long for more alerts of the same group; identical alerts arriving within 30s are batched into one group.
  group_interval: 5m     # Alerts in a group are merged into one notification, sent after 5m if the group's content has not changed.
  repeat_interval: 24h   # Resend interval: if the alert is still not resolved after this time, the notification is sent again.
  group_by: [alertname]  # Labels to group alerts by
  routes:
    - match:
        team: operations # Match on the team label to route to a different receiver
      receiver: 'ops'
    - match_re:
        service: nginx|apache
      receiver: 'web'
    - match_re:
        service: hbase|spark
      receiver: 'hadoop'
    - match_re:
        service: mysql|mongodb
      receiver: 'db'
# Receivers define the recipients and the notification channels
receivers:
  # The ops receiver
  - name: ops
    email_configs:
      - to: '[email protected],[email protected]'
        send_resolved: true
        headers:
          subject: "[operations] alert mail"
          from: "Alert Center"
          to: "小煜狼皇"
    # DingTalk
    webhook_configs:
      - url: http://localhost:8070/dingtalk/ops/send
    # WeCom (WeChat Work)
    wechat_configs:
      - corp_id: 'ww5421dksajhdasjkhj'
        api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
        send_resolved: true
        to_party: '2'
        agent_id: '1000002'
        api_secret: 'Tm1kkEE3RGqVhv5hO-khdakjsdkjsahjkdksahjkdsahkj'
  # web
  - name: web
    email_configs:
      - to: '[email protected]'
        send_resolved: true
        headers: { Subject: "[web] alert mail" }  # Subject of the notification e-mail
    webhook_configs:
      - url: http://localhost:8070/dingtalk/web/send
      - url: http://localhost:8070/dingtalk/ops/send
  # db
  - name: db
    email_configs:
      - to: '[email protected]'
        send_resolved: true
        headers: { Subject: "[db] alert mail" }  # Subject of the notification e-mail
    webhook_configs:
      - url: http://localhost:8070/dingtalk/db/send
      - url: http://localhost:8070/dingtalk/ops/send
  # hadoop
  - name: hadoop
    email_configs:
      - to: '[email protected]'
        send_resolved: true
        headers: { Subject: "[hadoop] alert mail" }  # Subject of the notification e-mail
    webhook_configs:
      - url: http://localhost:8070/dingtalk/hadoop/send
      - url: http://localhost:8070/dingtalk/ops/send
# Inhibition
inhibit_rules:  # inhibition rules
  - source_match:   # When an alert matching the source labels is firing, suppress alerts matching the target labels; here the source matches status: 'High'
      status: 'High'
    target_match:
      status: 'Warning'
    equal: ['alertname', 'operations', 'instance']  # Inhibition only applies when these labels have identical values in both alerts, i.e. both alerts must carry these three labels with matching values.
About inhibit_rules:
Alertmanager's inhibition mechanism keeps users from being flooded by a cascade of alerts that are all caused by one underlying problem. For example, when a cluster becomes unavailable, the user usually wants a single alert saying the cluster is down, rather than a pile of notifications about every application and middleware service in that cluster being unhealthy.
When an alert matching source_match (or source_match_re) is firing, any alert that matches target_match (or target_match_re) and has identical values for all labels listed in equal is inhibited and will not be sent.
With the configuration above, as long as alertname, operations and instance are identical on both alerts, a status: High alert suppresses status: Warning alerts.
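Before (re)starting Alertmanager, the configuration can be validated with the amtool binary that ships in the release tarball:
[root@testqwe alertmanager-0.25.0.linux-amd64]# ./amtool check-config alertmanager.yml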
[root@testqwe prometheus-webhook-dingtalk]# cat /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=root
ExecStart=/root/alertmanager-0.25.0.linux-amd64/alertmanager --config.file=/root/alertmanager-0.25.0.linux-amd64/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@testqwe ~]# systemctl daemon-reload
[root@testqwe ~]# systemctl start alertmanager.service
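To exercise the whole notification path (Alertmanager -> prometheus-webhook-dingtalk -> DingTalk) without waiting for a real failure, you can push a throwaway alert into Alertmanager's v2 API; the alert name and labels here are arbitrary test values:
[root@testqwe ~]# curl -s -H "Content-Type: application/json" \
  -d '[{"labels":{"alertname":"ManualTest","severity":"warning","instance":"192.168.40.233:9100"},"annotations":{"summary":"manual test alert"}}]' \
  http://192.168.40.233:9093/api/v2/alerts
After group_wait (30s in the config above) the test notification should arrive in the DingTalk group.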
https://github.com/grafana/loki/releases
[root@testqwe ~]# wget https://github.com/grafana/loki/releases/download/v2.7.1/loki-linux-amd64.zip
[root@testqwe ~]# unzip loki-linux-amd64.zip
The main changes are in the ruler section.
[root@testqwe ~]# cat loki/loki.yaml
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

ingester:
  wal:
    enabled: true
    dir: /root/loki/wal
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 1h       # Any chunk not receiving new logs in this time will be flushed
  max_chunk_age: 1h           # All chunks will be flushed when they hit this age, default is 1h
  chunk_target_size: 1048576  # Loki will attempt to build chunks up to 1.5MB, flushing first if chunk_idle_period or max_chunk_age is reached first
  chunk_retain_period: 30s    # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
  max_transfer_retries: 0     # Chunk transfers disabled

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /root/loki/boltdb-shipper-active
    cache_location: /root/loki/boltdb-shipper-cache
    cache_ttl: 24h            # Can be increased for faster performance over longer query periods, uses more disk space
    shared_store: filesystem
  filesystem:
    directory: /root/loki/chunks

compactor:
  working_directory: /root/loki/boltdb-shipper-compactor
  shared_store: filesystem

limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

table_manager:
  retention_deletes_enabled: false
  retention_period: 0s

ruler:
  # Callback/query URL included in alert notifications
  # If you use Grafana, point this at Grafana's explore page
  external_url: http://192.168.40.233:3000
  # Alertmanager address
  alertmanager_url: http://192.168.40.233:9093
  enable_alertmanager_v2: true
  # Consistent hash ring for the ruler, used to support multiple instances and sharding
  ring:
    kvstore:
      store: inmemory
  # Enable the Loki rules API
  enable_api: true
  # Shard rules across multiple ruler instances
  enable_sharding: true
  # Path for temporary rule files
  rule_path: /root/loki/rules-temp
  # Rule storage
  # Supports local storage (local) and object stores (azure, gcs, s3, swift)
  storage:
    type: local
    local:
      directory: /root/loki/rules
  # Rule load/flush interval
  flush_period: 1m
[root@testqwe ~]# cat /usr/lib/systemd/system/loki.service
[Unit]
Description=loki server
Wants=network-online.target
After=network-online.target
[Service]
ExecStart=/root/loki/loki-linux-amd64 -config.file=/root/loki/loki.yaml -target=all
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=loki
[Install]
WantedBy=default.target
[root@testqwe ~]# systemctl daemon-reload
[root@testqwe ~]# systemctl start loki.service
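Loki exposes a readiness endpoint on its HTTP port; it should return ready once the ingester has joined the ring (this can take a few seconds after startup):
[root@testqwe ~]# curl -s http://localhost:3100/ready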
https://github.com/grafana/loki/releases
[root@testqwe ~]# wget https://github.com/grafana/loki/releases/download/v2.7.1/promtail-linux-amd64.zip
[root@testqwe ~]# unzip promtail-linux-amd64.zip
client: the Loki push address
scrape_configs: log scraping configuration
Detailed configuration reference: https://cloud.tencent.com/developer/article/1824988
[root@testqwe ~]# cat promtail/promtail.yaml
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /var/log/positions.yaml  # This location needs to be writeable by promtail.

client:
  url: http://192.168.40.233:3100/loki/api/v1/push

scrape_configs:
  - job_name: system
    pipeline_stages:
    static_configs:
      - targets:
          - localhost
        labels:
          job: varlogs
          host: 192.168.40.233
          __path__: /var/log/{secure,messages,*log}
[root@testqwe ~]# cat /usr/lib/systemd/system/promtail.service
[Unit]
Description=promtail server
Wants=network-online.target
After=network-online.target
[Service]
ExecStart=/root/promtail/promtail-linux-amd64 -config.file=/root/promtail/promtail.yaml
StandardOutput=syslog
StandardError=syslog
SyslogIdentifier=promtail
[Install]
WantedBy=default.target
[root@testqwe ~]# systemctl daemon-reload
[root@testqwe ~]# systemctl start promtail.service
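Once promtail is shipping logs, the labels from the scrape config above should be visible through Loki's query API:
[root@testqwe ~]# curl -s http://192.168.40.233:3100/loki/api/v1/labels
[root@testqwe ~]# curl -s -G http://192.168.40.233:3100/loki/api/v1/query_range \
  --data-urlencode 'query={job="varlogs"}' --data-urlencode 'limit=5'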
https://github.com/grafana/grafana/releases
[root@testqwe ~]# wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.3.2-1.x86_64.rpm
[root@testqwe ~]# yum install grafana-enterprise-9.3.2-1.x86_64.rpm
[root@testqwe ~]# systemctl start grafana-server.service
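Grafana listens on port 3000 by default (first login admin/admin). The Loki data source can be added in the UI, or scripted through Grafana's data source API; the data source name and credentials below are defaults/assumptions:
[root@testqwe ~]# curl -s -u admin:admin -H "Content-Type: application/json" \
  -d '{"name":"Loki","type":"loki","url":"http://192.168.40.233:3100","access":"proxy"}' \
  http://localhost:3000/api/datasources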
This alert example monitors /var/log/secure for failed SSH logins. A sample log line is: Jan 3 18:26:06 testqwe sshd[22177]: Invalid user rmzx from 192.168.13.168 port 33640. The alert message extracts the time, the user name and the target IP address from the log line.
Named capture groups of the form (?P<name>...) in the regexp stage pull those fields out of the log line and expose them as labels (alert_time, dst_ssh_user_name, dst_ssh_ip, ...).
Rule meaning: fire an alert whenever "Invalid user" appears more than 0 times within 1 minute.
[root@testqwe fake]# cat /root/loki/rules/fake/ssh-new.yml
- name: ssh-monitoring
  rules:
    - alert: SSHLoginFailure
      expr: |
        sum by (alert_time, ssh_user, dst_ssh_user_name, ssh_from, dst_ssh_ip)
          (rate(
            {filename="/var/log/secure"}
              |~ "Invalid user"
              | regexp "(?P<alert_time>\\w{3}\\s+\\d{1}\\s+\\d{2}:\\d{2}:\\d{2}).*(?P<ssh_user>user.*)\\s(?P<dst_ssh_user_name>\\w+)\\s(?P<ssh_from>from)\\s(?P<dst_ssh_ip>\\d{1,9}.\\d{0,9}.\\d{0,9}.\\d{0,9}) "[1m]))
        > 0
      for: 0
      labels:
        severity: error-log
      annotations:
        summary: "Time: {{ $labels.alert_time }}
          Target IP: {{ $labels.dst_ssh_ip }}
          User: {{ $labels.dst_ssh_user_name }}"
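Since enable_api is on in the ruler block, you can confirm that Loki loaded the rule group after restarting it (or after the ruler's next poll), without waiting for an actual alert:
[root@testqwe ~]# systemctl restart loki.service
[root@testqwe ~]# curl -s http://192.168.40.233:3100/loki/api/v1/rules
[root@testqwe ~]# curl -s http://192.168.40.233:3100/prometheus/api/v1/rules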
The result looks like this:
I haven't figured out how to customize the alert template yet, so I'm making do with the default for now; something to look into later.