1.编辑graylog配置文件:
$ vim /etc/graylog/server.conf
添加如下smtp配置内容:
# Email transport
transport_email_enabled = true
transport_email_hostname = smtp.163.com
transport_email_port = 25
transport_email_use_auth = true
transport_email_use_tls = false
transport_email_use_ssl = false
transport_email_auth_username = [email protected]
transport_email_auth_password = xiao*******
#transport_email_subject_prefix = [graylog]
transport_email_from_email = [email protected]
transport_email_web_interface_url = http://localhost:9000
保存修改并退出编辑。
2.配置通知:
点击“Alerts->Manage notifications”
点击“Add new notification”
添加notification,如下图
点击“Test”测试通知是否配置正确
3.配置触发条件:
点击“Alerts->Manage conditions”
点击“Add new condition”
添加condition,如下图
至此,一个简单的告警配置已经完成。
上述告警示例表示当5分钟内接收到的日志消息超过10条的时候,发送告警通知给邮箱 [email protected]。
收到的告警通知邮件如下图:
快速入门
以下是基于官方关于hawkular alert的示例在本地的实践,更改了官方示例中不正确的部分。
添加警报
警报触发器的组成:
- 一组触发警报/事件的条件;
- 满足条件时要执行的一个或多个操作(发送电子邮件,调用webhook等);
- 一些额外的元数据(如严重性);
下面示例的触发条件为:当气温低于0摄氏度时,发送一封电子邮件至“[email protected]”。
此处为了演示方便将所有的内容放在了一个文件中,如下(其中的“//”注释仅用于说明;标准JSON不支持注释,实际导入前请将其删除):
trigger_definition.json
{
"triggers": [
{
"trigger": {
"id": "temperature-trigger", //指定trigger id
"name": "Trigger for the temperature sensor", //指定trigger name
"severity": "HIGH", //级别
"enabled": true, //启用
"actions": [
{
"actionPlugin": "email", //触发后将会以邮件的形式提醒
"actionId": "notify-admin"
}
]
},
"conditions": [
{
"triggerMode": "FIRING",
"type": "threshold",
"dataId": "hm_g_temperature", //指标数据(此处指标名称为:temperature,由于是gauge类型,因此加“hm_g_前缀”)
"operator": "LT",
"threshold": 0
}
]
}
],
"actions": [
{
"actionPlugin": "email",
"actionId": "notify-admin",
"properties": {
"to": "[email protected]" //邮箱地址,可改成你可用的邮箱
}
}
]
}
导入触发器定义:
curl -u myUsername:myPassword -X POST http://localhost:8080/hawkular/alerts/import/all -d @trigger_definition.json \
-H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
测试
基于之前的配置,导入满足触发条件的指标数据,将会发送电子邮件。
补充:hawkular默认使用localhost:25发送邮件。
metrics_day_2.json
[
{"timestamp": 1468620000000, "value": 8},
{"timestamp": 1468621800000, "value": 6},
{"timestamp": 1468623600000, "value": 3},
{"timestamp": 1468625400000, "value": 0},
{"timestamp": 1468627200000, "value": -2},
{"timestamp": 1468629000000, "value": -3},
{"timestamp": 1468630800000, "value": -1},
{"timestamp": 1468632600000, "value": 2},
{"timestamp": 1468634400000, "value": 4},
{"timestamp": 1468636200000, "value": 5},
{"timestamp": 1468638000000, "value": 8},
{"timestamp": 1468639800000, "value": 12},
{"timestamp": 1468641600000, "value": 13},
{"timestamp": 1468643400000, "value": 12},
{"timestamp": 1468645200000, "value": 13.4},
{"timestamp": 1468647000000, "value": 14},
{"timestamp": 1468648800000, "value": 14.3},
{"timestamp": 1468650600000, "value": 14.6},
{"timestamp": 1468652400000, "value": 17},
{"timestamp": 1468654200000, "value": 17.3},
{"timestamp": 1468656000000, "value": 17.5},
{"timestamp": 1468657800000, "value": 17.9},
{"timestamp": 1468659600000, "value": 18},
{"timestamp": 1468661400000, "value": 18.7},
{"timestamp": 1468663200000, "value": 19.2}
]
存入指标数据:
curl -u myUsername:myPassword -X POST http://localhost:8080/hawkular/metrics/gauges/temperature/raw -d @metrics_day_2.json \
-H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
完成之后可以在日志文件中看到如下的输出:
15:10:02,110 INFO [org.hawkular.alerts.actions.api] (standalone-action-3) HAWKALERT240001: Plugin [email] has received an action message: [StandaloneActionMessage[action=Action[eventId='temperature-trigger-1501053001894-ee945e8a-7c26-4a5a-8236-de476d21f70c', ctime=1501053001903, event=Alert [tenantId=myTenant, triggerId=temperature-trigger, severity=HIGH, status=OPEN, ctime=1501053001894, lifecycle=[LifeCycle{user='system', status=OPEN, stime=1501053001894}], resolvedEvalSets=null], result='WAITING']]]
15:10:02,113 INFO [org.hawkular.alerts.actions.api] (standalone-action-4) HAWKALERT240001: Plugin [email] has received an action message: [StandaloneActionMessage[action=Action[eventId='temperature-trigger-1501053001895-d8046519-6e4e-430d-85e9-059f6333f549', ctime=1501053001903, event=Alert [tenantId=myTenant, triggerId=temperature-trigger, severity=HIGH, status=OPEN, ctime=1501053001895, lifecycle=[LifeCycle{user='system', status=OPEN, stime=1501053001895}], resolvedEvalSets=null], result='WAITING']]]
15:10:02,121 INFO [org.hawkular.alerts.actions.api] (standalone-action-5) HAWKALERT240001: Plugin [email] has received an action message: [StandaloneActionMessage[action=Action[eventId='temperature-trigger-1501053001892-55aa4be1-e955-4eaf-8d14-b79e8d4ec24f', ctime=1501053001897, event=Alert [tenantId=myTenant, triggerId=temperature-trigger, severity=HIGH, status=OPEN, ctime=1501053001892, lifecycle=[LifeCycle{user='system', status=OPEN, stime=1501053001892}], resolvedEvalSets=null], result='WAITING']]]
此时,我们可以在Cassandra数据库中查看一下我们的警报配置。
- 进入当前集群hawkular中的node1节点,启动cqlsh:
[root@localhost node1]# ./bin/cqlsh
Connected to hawkular at 127.0.0.1:9042.
[cqlsh 5.0.1 | Cassandra 3.0.12 | CQL spec 3.4.0 | Native protocol v4]
Use HELP for help.
cqlsh>
- 查看所有的keyspace:
cqlsh> describe keyspaces;
system_schema  hawkular_metrics  hawkular_alerts  system_traces  system_auth  system  system_distributed
- 进入hawkular_alerts键空间:
cqlsh> use hawkular_alerts;
cqlsh:hawkular_alerts>
- 查看该空间中的所有表:
cqlsh:hawkular_alerts> describe tables;
alerts_statuses  events  cassalog  actions_history_ctimes  dampenings  actions_definitions  events_triggers  tags  alerts_triggers  alerts_severities  actions_history_alerts  action_plugins  alerts_lifecycle  actions_history  triggers_actions  alerts_stimes  actions_history_results  sys_config  alerts_ctimes  events_categories  actions_history_actions  alerts  triggers  conditions  events_ctimes
- 查看表actions_definitions中的数据:
cqlsh:hawkular_alerts> select * from actions_definitions;

 tenantid | actionplugin | actionid     | payload
----------+--------------+--------------+---------------------------------------------------------------------------------------------------------------------------------
 myTenant | email        | notify-admin | {"tenantId":"myTenant","actionPlugin":"email","actionId":"notify-admin","global":false,"properties":{"to":"[email protected]"}}

(1 rows)
从上述结果可以看到我们之前导入的action定义信息。
登录邮箱可看到如下图的邮件:
补充:
Hawkular Alerting Details中介绍了alert的condition部分dataId的命名规则:不同类型metric的dataId需要添加不同的前缀,具体如下:
- hm_a: availability
- hm_c: counter
- hm_cr: counter rate
- hm_g: gauge
- hm_gr: gauge rate
- hm_s: string
因此,当我们有一个gauge类型的指标temperature时,dataId就要写成:hm_g_temperature,否则就不会发生告警(所以,按照官方文档实践的同学就要注意啦,记得要改过来哟!)。
常用API命令示例
如果使用的是openshift中的hawkular metrics,则需要将-u "username:userpassword"替换为-H "Authorization: Bearer XXXXXX",并使用https。
查看alert状态
curl -u "username:userpassword" -X GET http://localhost:8080/hawkular/alerts/status -H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
响应结果:
{"distributed":"false","Implementation-Version":"1.6.0.Final","Built-From-Git-SHA1":"82b3bb25a3b5a6d3e0e793110c6c544d6d0ce1cf","status":"STARTED"}
导入triggers和actions定义
curl -u "username:userpassword" -X POST http://localhost:8080/hawkular/alerts/import/all -d @trigger_definition.json -H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
响应结果:
{"triggers":[{"trigger":{"tenantId":"myTenant","id":"memory_usage-trigger","name":"Trigger for the memory sensor","type":"STANDARD","eventType":"ALERT","eventCategory":null,"eventText":null,"severity":"HIGH","actions":[{"tenantId":"myTenant","actionPlugin":"email","actionId":"notify-admin"}],"autoDisable":false,"autoEnable":false,"autoResolve":false,"autoResolveAlerts":true,"autoResolveMatch":"ALL","enabled":true,"firingMatch":"ALL","source":"_none_"},"conditions":[{"tenantId":"myTenant","triggerId":"memory_usage-trigger","triggerMode":"FIRING","type":"THRESHOLD","conditionSetSize":1,"conditionSetIndex":1,"conditionId":"myTenant-memory_usage-trigger-FIRING-1-1","dataId":"hm_g_elasticsearch/9f5dae41-71d0-11e7-97e5-fa163eaf6ed5/memory/usage","operator":"GT","threshold":1.8E9}]}],"actions":[{"tenantId":"myTenant","actionPlugin":"email","actionId":"notify-admin","global":false,"properties":{"to":"[email protected]"}}]}
导出triggers和actions定义
curl -u "username:userpassword" -X GET http://localhost:8080/hawkular/alerts/export -H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
响应结果:
{"triggers":[{"trigger":{"tenantId":"myTenant","id":"temperature-trigger","name":"Trigger for the temperature sensor","type":"STANDARD","eventType":"ALERT","eventCategory":null,"eventText":null,"severity":"HIGH","actions":[{"tenantId":"myTenant","actionPlugin":"email","actionId":"notify-admin"}],"autoDisable":false,"autoEnable":false,"autoResolve":false,"autoResolveAlerts":true,"autoResolveMatch":"ALL","enabled":true,"firingMatch":"ALL","source":"_none_"},"conditions":[{"tenantId":"myTenant","triggerId":"temperature-trigger","triggerMode":"FIRING","type":"THRESHOLD","conditionSetSize":1,"conditionSetIndex":1,"conditionId":"myTenant-temperature-trigger-FIRING-1-1","dataId":"hm_g_temperature","operator":"LT","threshold":0.0}]}],"actions":[{"tenantId":"myTenant","actionPlugin":"email","actionId":"notify-admin","global":false,"properties":{"to":"[email protected]"}}]}
查看可使用的action插件
curl -u "username:userpassword" -X GET http://localhost:8080/hawkular/alerts/plugins -H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
响应结果:
["elasticsearch","email","webhook"]
获取已经定义的trigger
curl -u "username:userpassword" -X GET http://localhost:8080/hawkular/alerts/triggers/your_trigger_id -H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
响应结果:
{"tenantId":"myTenant","id":"temperature-trigger","name":"Trigger for the temperature sensor","type":"STANDARD","eventType":"ALERT","eventCategory":null,"eventText":null,"severity":"HIGH","actions":[{"tenantId":"myTenant","actionPlugin":"email","actionId":"notify-admin"}],"autoDisable":false,"autoEnable":false,"autoResolve":false,"autoResolveAlerts":true,"autoResolveMatch":"ALL","enabled":true,"firingMatch":"ALL","source":"_none_"}
获取特定trigger的condition
curl -u "username:userpassword" -X GET http://localhost:8080/hawkular/alerts/triggers/temperature-trigger/conditions -H "Content-Type: application/json" -H "Hawkular-Tenant: myTenant"
响应结果:
[{"tenantId":"myTenant","triggerId":"temperature-trigger","triggerMode":"FIRING","type":"THRESHOLD","conditionSetSize":1,"conditionSetIndex":1,"conditionId":"myTenant-temperature-trigger-FIRING-1-1","dataId":"hm_g_temperature","operator":"LT","threshold":0.0}]
参考文献
- Alerts REST API。
- Alerts example。
- Hawkular Alerting for Developers。