Notify组件定义了路由处理过程中的receiver pipeline(本文不包含Silence和Inhibit部分),包含处理等待间隔的WaitStage、去重处理的DedupStage、重试处理的RetryStage和记录发送结果的SetNotifiesStage,实现上类似于中间件的方式,一层层地顺序处理。创建pipeline的函数定义如下:
// createStage creates a pipeline of stages for a receiver.
//
// For every integration returned by BuildReceiverIntegrations it assembles a
// MultiStage made of, in order, a wait stage, a dedup stage, a retry stage and
// a set-notifies stage, and collects those pipelines into a FanoutStage.
func createStage(rc *config.Receiver, tmpl *template.Template, wait func() time.Duration, notificationLog NotificationLog, logger log.Logger) Stage {
	var fanout FanoutStage

	for _, integration := range BuildReceiverIntegrations(rc, tmpl, logger) {
		// The receiver key identifies this integration in the notification log.
		receiver := &nflogpb.Receiver{
			GroupName:   rc.Name,
			Integration: integration.name,
			Idx:         uint32(integration.idx),
		}

		pipeline := MultiStage{
			NewWaitStage(wait),
			NewDedupStage(integration, notificationLog, receiver),
			NewRetryStage(integration, rc.Name),
			NewSetNotifiesStage(notificationLog, receiver),
		}
		fanout = append(fanout, pipeline)
	}

	return fanout
}
等待间隔用来设置发送告警前的等待时间。在集群模式下,需要根据每个peer在集群中的位置设置不同的等待时间(位置 × 基础超时时间);如果只有一个Server本身,等待间隔设置为0:
// clusterWait returns a function that inspects the current peer state and returns
// a duration of one base timeout for each peer with a higher ID than ourselves.
func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration {
return func() time.Duration {
return time.Duration(p.Position()) * timeout
}
具体的实现上采用一个timer来传递信号,只有等待时间到达后才返回对应的alerts;由于各个stage是串行执行的,所以消息传递会暂停一段时间。
// Exec implements the Stage interface.
//
// It blocks for the duration returned by ws.wait() and then passes the alerts
// through unchanged. If ctx is done before the wait elapses, it returns no
// alerts together with ctx.Err().
func (ws *WaitStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	// Use an explicit timer instead of time.After so it can be stopped, and
	// its resources released, as soon as ctx finishes first.
	timer := time.NewTimer(ws.wait())
	defer timer.Stop()

	select {
	case <-timer.C:
	case <-ctx.Done():
		return ctx, nil, ctx.Err()
	}
	return ctx, alerts, nil
}
DedupStage用于管理告警的去重,传递的参数中包含了一个NotificationLog,用来保存告警的发送记录。当有多个机器组成集群的时候,NotificationLog会通过协议进行通信,传递彼此的记录信息。假如集群中的A机器发送了告警,该记录会传递给B机器,并进行merge操作,这样B机器在发送告警的时候如果查询到该告警已经发送,则不再进行告警发送。关于NotificationLog的实现nflog可以查看nflog/nflog.go文件。
// DedupStage filters alerts.
// Filtering happens based on a notification log.
type DedupStage struct {
// nflog is the notification log that Exec queries for earlier entries.
nflog NotificationLog
// recv identifies the receiver; Exec uses it to scope notification log queries.
recv *nflogpb.Receiver
// conf is the notifier configuration.
// NOTE(review): its use is not visible in this excerpt — confirm.
conf notifierConfig
// now returns a time.Time; presumably the current time, injectable for
// tests — TODO confirm against the constructor.
now func() time.Time
// hash maps an alert to a 64-bit value; presumably used to compare sets of
// alerts against the notification log — TODO confirm.
hash func(*types.Alert) uint64
}
具体的处理逻辑如下:
// Exec implements the Stage interface.
//
// It queries the notification log for the entry matching the current group
// key and receiver. The alerts are passed on only when needsUpdate reports
// that a notification is required; otherwise no alerts are returned, which
// suppresses the notification. A query error other than nflog.ErrNotFound, or
// more than one matching entry, is returned as an error.
func (n *DedupStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	...
	entries, err := n.nflog.Query(nflog.QGroupKey(gkey), nflog.QReceiver(n.recv))
	if err != nil && err != nflog.ErrNotFound {
		return ctx, nil, err
	}

	// At most one entry is expected per (group key, receiver) pair. A nil
	// entry means nothing has been logged for this pair yet.
	var entry *nflogpb.Entry
	switch len(entries) {
	case 0:
	case 1:
		entry = entries[0]
	default:
		// Any size above one is unexpected, not just exactly two; without the
		// default branch larger results would be treated as "no entry".
		return ctx, nil, fmt.Errorf("unexpected entry result size %d", len(entries))
	}

	if n.needsUpdate(entry, firingSet, resolvedSet, repeatInterval) {
		return ctx, alerts, nil
	}
	return ctx, nil, nil
}
其中的nflog.Query将根据receiver和group key进行查询;如果查找到了发送记录,并且needsUpdate判断不需要再次发送,则不再返回对应的alerts。nflog设置了GC用来删除过期的日志记录,防止记录一直存在于log中导致告警无法继续发送。
RetryStage利用backoff策略来管理告警的重发,对于没有发送成功的告警将不断重试,直到超时为止。numFailedNotifications用来统计发送失败次数的metrics,numNotifications用来统计发送成功次数的metrics。
// Wait for either the next retry tick or the end of the context, whichever
// comes first. (Fragment: the enclosing loop and function are not shown.)
select {
case <-tick.C:
// Time the attempt so its latency can be recorded per integration.
now := time.Now()
retry, err := r.integration.Notify(ctx, sent...)
notificationLatencySeconds.WithLabelValues(r.integration.name).Observe(time.Since(now).Seconds())
if err != nil {
// Count the failed attempt and log it at debug level.
numFailedNotifications.WithLabelValues(r.integration.name).Inc()
level.Debug(l).Log("msg", "Notify attempt failed", "attempt", i, "integration", r.integration.name, "receiver", r.groupName, "err", err)
if !retry {
// The integration flagged the error as not retryable: stop retrying.
return ctx, alerts, fmt.Errorf("cancelling notify retry for %q due to unrecoverable error: %s", r.integration.name, err)
}
// Save this error to be able to return the last seen error by an
// integration upon context timeout.
iErr = err
} else {
// Success: count the notification and pass the alerts on.
numNotifications.WithLabelValues(r.integration.name).Inc()
return ctx, alerts, nil
}
case <-ctx.Done():
// Prefer the last integration error over the bare context error, since it
// says why the attempts failed.
if iErr != nil {
return ctx, nil, iErr
}
return ctx, nil, ctx.Err()
}
SetNotifiesStage用来把已发送告警的信息写入nflog,该模块仅仅用于记录由本AM实例发送出去的告警(即经过Dedup组件过滤并由Retry组件成功发送的alerts)。
// Exec implements the Stage interface.
//
// It reads the group key, the firing alerts and the resolved alerts from ctx
// and writes them to the notification log for this stage's receiver. The
// alerts are passed through unchanged together with the result of the log
// write. If any of the three context values is missing, it returns an error
// and no alerts.
func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
	groupKey, found := GroupKey(ctx)
	if !found {
		return ctx, nil, fmt.Errorf("group key missing")
	}

	firingAlerts, found := FiringAlerts(ctx)
	if !found {
		return ctx, nil, fmt.Errorf("firing alerts missing")
	}

	resolvedAlerts, found := ResolvedAlerts(ctx)
	if !found {
		return ctx, nil, fmt.Errorf("resolved alerts missing")
	}

	// The alerts are forwarded even when the log write fails; the caller
	// receives the write error alongside them.
	logErr := n.nflog.Log(n.recv, groupKey, firingAlerts, resolvedAlerts)
	return ctx, alerts, logErr
}
Notify组件根据用户的配置路由发送告警信息,比如通过webhook,email,wechat,slack等。 我们内部消息系统是通过mattermost实现,可以兼容slack的配置。用户可以自定义自己的消息发送路由,notify/impl.go中有不同方式具体的实现细节.
// A Notifier notifies about alerts under constraints of the given context.
// It returns an error if unsuccessful and a flag whether the error is
// recoverable. This information is useful for a retry logic.
type Notifier interface {
// Notify sends the given alerts. When the returned error is non-nil, the
// boolean reports whether the attempt may be retried; callers stop
// retrying when it is false.
Notify(context.Context, ...*types.Alert) (bool, error)
}
// An Integration wraps a notifier and its config to be uniquely identified by
// name and index from its origin in the configuration.
type Integration struct {
// notifier is the wrapped Notifier implementation.
notifier Notifier
// conf is the notifier configuration.
// NOTE(review): its use is not visible in this excerpt — confirm.
conf notifierConfig
// name is the integration name; it labels metrics and log lines and is
// stored as the Integration of the notification log receiver.
name string
// idx is the integration's index; it is stored (as uint32) as the Idx of
// the notification log receiver.
idx int
}
Integration定义一个集成路由组件,包含用户的配置信息和名称以及发送告警的实现。自定义的notify路由需要满足该Notifier接口,实现Notify方法。 比如下面是webhook的实现,首先定义一个管理webhook的结构体Webhook,包含基本的配置和模板信息,WebhookMessage定义了发送webhook的信息
// Webhook implements a Notifier for generic webhooks.
type Webhook struct {
// conf is the webhook configuration; Notify reads its URL and HTTPConfig.
conf *config.WebhookConfig
// tmpl produces the template data embedded in each outgoing message.
tmpl *template.Template
// logger receives diagnostics, such as a missing group key.
logger log.Logger
}
// NewWebhook returns a new Webhook that uses the given configuration,
// template and logger.
func NewWebhook(conf *config.WebhookConfig, t *template.Template, l log.Logger) *Webhook {
	hook := &Webhook{
		conf:   conf,
		tmpl:   t,
		logger: l,
	}
	return hook
}
// WebhookMessage defines the JSON object sent to webhook endpoints.
type WebhookMessage struct {
// Data is the embedded template data built from the alerts being notified.
*template.Data
// The protocol version.
Version string `json:"version"`
// GroupKey is the group key taken from the notification context.
GroupKey string `json:"groupKey"`
}
基本结构定义完成后就可以编写具体的发送函数Notify来实现告警的发送:根据告警系统发送的告警信息(可能不止一个),通过模板生成对应的消息;由于可能包含多个告警,因此用GroupKey来返回聚合组的相关信息。生成的WebhookMessage经过JSON序列化后通过http协议传递到配置的web接口中,最后由w.retry(resp.StatusCode)检查是否发送成功,如果失败则返回错误信息。
// Notify implements the Notifier interface.
//
// It builds a WebhookMessage from the alerts and the group key found in ctx,
// JSON-encodes it and POSTs it to the configured URL. The returned flag tells
// the caller whether a failed attempt may be retried: false when encoding the
// message or building the HTTP client fails, true when creating or sending
// the request fails, and otherwise whatever w.retry derives from the response
// status code.
func (w *Webhook) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
	tmplData := w.tmpl.Data(receiverName(ctx, w.logger), groupLabels(ctx, w.logger), alerts...)

	// A missing group key is only logged; the message is still sent with
	// whatever value GroupKey returned.
	key, found := GroupKey(ctx)
	if !found {
		level.Error(w.logger).Log("msg", "group key missing")
	}

	payload := new(bytes.Buffer)
	encodeErr := json.NewEncoder(payload).Encode(&WebhookMessage{
		Data:     tmplData,
		Version:  "4",
		GroupKey: key,
	})
	if encodeErr != nil {
		return false, encodeErr
	}

	request, err := http.NewRequest("POST", w.conf.URL, payload)
	if err != nil {
		return true, err
	}
	request.Header.Set("Content-Type", contentTypeJSON)
	request.Header.Set("User-Agent", userAgentHeader)

	client, err := commoncfg.NewHTTPClientFromConfig(w.conf.HTTPConfig)
	if err != nil {
		return false, err
	}

	response, err := ctxhttp.Do(ctx, client, request)
	if err != nil {
		return true, err
	}
	// Only the status code is needed; the body is closed without being read.
	response.Body.Close()

	return w.retry(response.StatusCode)
}