Prometheus in Practice and Source Code Analysis: alerting

Prometheus does not only collect metrics; it can also act as an alerting service: when the configured conditions on the collected data are met, it raises alerts. Let's start with the alerting-related configuration.

rule_files:
  [ - <filepath_glob> ... ]

# A list of scrape configurations.
scrape_configs:
  [ - <scrape_config> ... ]

# Alerting specifies settings related to the Alertmanager.
alerting:
  alert_relabel_configs:
    [ - <relabel_config> ... ]
  alertmanagers:
    [ - <alertmanager_config> ... ]

The rule_files entries point at the files in which alerting rules are defined, for example:

# Alert for any instance that is unreachable for >5 minutes.
ALERT InstanceDown
  IF up == 0
  FOR 5m
  LABELS { severity = "page" }
  ANNOTATIONS {
    summary = "Instance {{ $labels.instance }} down",
    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
  }

# Alert for any instance that has a median request latency >1s.
ALERT APIHighRequestLatency
  IF api_http_request_latencies_second{quantile="0.5"} > 1
  FOR 1m
  ANNOTATIONS {
    summary = "High request latency on {{ $labels.instance }}",
    description = "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)",
  }

The rest of this post walks through how alerts are generated and how notifications are sent. First, let's look at how the rules are loaded:

func (m *Manager) loadGroups(interval time.Duration, filenames ...string) (map[string]*Group, error) {
    rules := []Rule{}
    for _, fn := range filenames {
        content, err := ioutil.ReadFile(fn)
        if err != nil {
            return nil, err
        }
        stmts, err := promql.ParseStmts(string(content))
        if err != nil {
            return nil, fmt.Errorf("error parsing %s: %s", fn, err)
        }

        for _, stmt := range stmts {
            var rule Rule

            switch r := stmt.(type) {
            case *promql.AlertStmt:
                rule = NewAlertingRule(r.Name, r.Expr, r.Duration, r.Labels, r.Annotations)

            case *promql.RecordStmt:
                rule = NewRecordingRule(r.Name, r.Expr, r.Labels)

            default:
                panic("retrieval.Manager.LoadRuleFiles: unknown statement type")
            }
            rules = append(rules, rule)
        }
    }

    // Currently there is no group syntax implemented. Thus all rules
    // are read into a single default group.
    g := NewGroup("default", interval, rules, m.opts)
    groups := map[string]*Group{g.name: g}
    return groups, nil
}

loadGroups reads each rule file and turns its statements into Rule objects. Rules can in principle be grouped; since no group syntax exists yet, all of them end up in a single group named default.
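
As a quick illustration of what loadGroups is working with, here is a standalone sketch (not part of the Prometheus source): promql.ParseStmts turns rule text into statement values, and an ALERT rule becomes a *promql.AlertStmt whose Name, Expr, Duration, Labels and Annotations are exactly what loadGroups passes to NewAlertingRule.

package main

import (
    "fmt"
    "log"

    "github.com/prometheus/prometheus/promql"
)

func main() {
    // Parse one rule the same way loadGroups parses a whole rule file.
    stmts, err := promql.ParseStmts(`
ALERT InstanceDown
  IF up == 0
  FOR 5m
  LABELS { severity = "page" }
`)
    if err != nil {
        log.Fatal(err)
    }
    for _, stmt := range stmts {
        if a, ok := stmt.(*promql.AlertStmt); ok {
            // These fields are what NewAlertingRule receives above.
            fmt.Println(a.Name, a.Duration, a.Expr.String())
        }
    }
}
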
Next, let's look at how an alert is defined:

type Alert struct {
    State       AlertState
    Labels      model.LabelSet
    Annotations model.LabelSet
    // Value is the value of the alerting expression at the last evaluation.
    Value model.SampleValue
    // ActiveAt is when the alert first became pending; ResolvedAt stays 0
    // until the alert has fired and then resolved.
    ActiveAt, ResolvedAt model.Time
}

const (
    StateInactive AlertState = iota
    // The condition has been true for less than the hold (FOR) duration.
    StatePending
    // The condition has held for at least the hold (FOR) duration.
    StateFiring
)

The code above defines an alert and its three states.
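
The intended lifecycle is inactive → pending → firing: when the rule expression first returns a result the alert becomes pending, and once the condition has held for the FOR (hold) duration it starts firing. A minimal sketch of that transition (a hypothetical helper, not the actual source; the real logic lives in Eval, shown further down):

// nextState illustrates the transition implemented by AlertingRule.Eval:
// a pending alert becomes firing once its condition has been true for at
// least the hold (FOR) duration.
func nextState(cur AlertState, activeFor, holdDuration time.Duration) AlertState {
    if cur == StatePending && activeFor >= holdDuration {
        return StateFiring
    }
    return cur
}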

type AlertingRule struct {
    // The name of the alert.
    name string
    // The vector expression from which alerts are generated.
    vector promql.Expr
    // How long the expression must hold before the alert goes from pending to firing.
    holdDuration time.Duration
    // Extra labels and annotations attached to the resulting alerts.
    labels      model.LabelSet
    annotations model.LabelSet
    // mtx protects active: the alerts currently pending or firing,
    // keyed by the fingerprint of their label set.
    mtx    sync.Mutex
    active map[model.Fingerprint]*Alert
}

The active map records the alerts currently triggered by this rule, keyed by the fingerprint of their label set.
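
Because the map is keyed per label-set fingerprint, a single rule can track many concurrent alerts (one per instance, for example). Purely as an illustration of the data layout (names approximate, not quoted from the source), an accessor over this map would look like:

// currentAlerts copies out the alerts this rule currently tracks, one per
// label-set fingerprint, under the rule's mutex.
func (r *AlertingRule) currentAlerts() []*Alert {
    r.mtx.Lock()
    defer r.mtx.Unlock()
    alerts := make([]*Alert, 0, len(r.active))
    for _, a := range r.active {
        alerts = append(alerts, a)
    }
    return alerts
}
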
Now let's follow the flow, starting with the run method in rules/manager.go:

func (g *Group) run() {
    defer close(g.terminated)

    // Wait an initial amount to have consistently slotted intervals.
    select {
    case <-time.After(g.offset()):
    case <-g.done:
        return
    }

    iter := func() {
        iterationsScheduled.Inc()
        if g.opts.SampleAppender.NeedsThrottling() {
            iterationsSkipped.Inc()
            return
        }
        start := time.Now()
        g.Eval()

        iterationDuration.Observe(time.Since(start).Seconds())
    }
    iter()

    tick := time.NewTicker(g.interval)
    defer tick.Stop()

    for {
        select {
        case <-g.done:
            return
        default:
            select {
            case <-g.done:
                return
            case <-tick.C:
                iter()
            }
        }
    }
}
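
The initial select waits for g.offset(), which aligns the first evaluation to the group's interval rather than starting at an arbitrary wall-clock time. A simplified sketch of the idea (the actual implementation also shifts each group by a per-group offset so that different groups are spread across the interval):

// offset (simplified): sleep until the next boundary of g.interval.
func (g *Group) offset() time.Duration {
    now := time.Now().UnixNano()
    base := now - (now % int64(g.interval))
    next := base + int64(g.interval)
    return time.Duration(next - now)
}
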

g.Eval() iterates over the group's rules and evaluates each of them:

func (g *Group) Eval() {
    var (
        now = model.Now()
        wg  sync.WaitGroup
    )

    for _, rule := range g.rules {
        rtyp := string(typeForRule(rule))

        wg.Add(1)
        // BUG(julius): Look at fixing thundering herd.
        go func(rule Rule) {
            defer wg.Done()

            defer func(t time.Time) {
                evalDuration.WithLabelValues(rtyp).Observe(time.Since(t).Seconds())
            }(time.Now())

            evalTotal.WithLabelValues(rtyp).Inc()

            vector, err := rule.Eval(g.opts.Context, now, g.opts.QueryEngine, g.opts.ExternalURL.Path)
            if err != nil {
                // Canceled queries are intentional termination of queries. This normally
                // happens on shutdown and thus we skip logging of any errors here.
                if _, ok := err.(promql.ErrQueryCanceled); !ok {
                    log.Warnf("Error while evaluating rule %q: %s", rule, err)
                }
                evalFailures.WithLabelValues(rtyp).Inc()
                return
            }

            if ar, ok := rule.(*AlertingRule); ok {
                g.sendAlerts(ar, now)
            }
            var (
                numOutOfOrder = 0
                numDuplicates = 0
            )
            for _, s := range vector {
                if err := g.opts.SampleAppender.Append(s); err != nil {
                    switch err {
                    case local.ErrOutOfOrderSample:
                        numOutOfOrder++
                        log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
                    case local.ErrDuplicateSampleForTimestamp:
                        numDuplicates++
                        log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
                    default:
                        log.With("sample", s).With("error", err).Warn("Rule evaluation result discarded")
                    }
                }
            }
            if numOutOfOrder > 0 {
                log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order result from rule evaluation")
            }
            if numDuplicates > 0 {
                log.With("numDropped", numDuplicates).Warn("Error on ingesting results from rule evaluation with different value but same timestamp")
            }
        }(rule)
    }
    wg.Wait()
}

The alerts themselves are produced by the Eval method in rules/alerting.go:

func (r *AlertingRule) Eval(ctx context.Context, ts model.Time, engine *promql.Engine, externalURLPath string) (model.Vector, error) {
    query, err := engine.NewInstantQuery(r.vector.String(), ts)
    if err != nil {
        return nil, err
    }
    res, err := query.Exec(ctx).Vector()
    if err != nil {
        return nil, err
    }

    r.mtx.Lock()
    defer r.mtx.Unlock()

    // Create pending alerts for any new vector elements in the alert expression
    // or update the expression value for existing elements.
    resultFPs := map[model.Fingerprint]struct{}{}

    for _, smpl := range res {
        // Provide the alert information to the template.
        l := make(map[string]string, len(smpl.Metric))
        for k, v := range smpl.Metric {
            l[string(k)] = string(v)
        }

        tmplData := struct {
            Labels map[string]string
            Value  float64
        }{
            Labels: l,
            Value:  float64(smpl.Value),
        }
        // Inject some convenience variables that are easier to remember for users
        // who are not used to Go's templating system.
        defs := "{{$labels := .Labels}}{{$value := .Value}}"

        expand := func(text model.LabelValue) model.LabelValue {
            tmpl := template.NewTemplateExpander(
                ctx,
                defs+string(text),
                "__alert_"+r.Name(),
                tmplData,
                ts,
                engine,
                externalURLPath,
            )
            result, err := tmpl.Expand()
            if err != nil {
                result = fmt.Sprintf("<error expanding template: %s>", err)
                log.Warnf("Error expanding alert template %v with data '%v': %s", r.Name(), tmplData, err)
            }
            return model.LabelValue(result)
        }

        delete(smpl.Metric, model.MetricNameLabel)
        labels := make(model.LabelSet, len(smpl.Metric)+len(r.labels)+1)
        for ln, lv := range smpl.Metric {
            labels[ln] = lv
        }
        for ln, lv := range r.labels {
            labels[ln] = expand(lv)
        }
        labels[model.AlertNameLabel] = model.LabelValue(r.Name())

        annotations := make(model.LabelSet, len(r.annotations))
        for an, av := range r.annotations {
            annotations[an] = expand(av)
        }
        fp := smpl.Metric.Fingerprint()
        resultFPs[fp] = struct{}{}

        // Check whether we already have alerting state for the identifying label set.
        // Update the last value and annotations if so, create a new alert entry otherwise.
        if alert, ok := r.active[fp]; ok && alert.State != StateInactive {
            alert.Value = smpl.Value
            alert.Annotations = annotations
            continue
        }

        r.active[fp] = &Alert{
            Labels:      labels,
            Annotations: annotations,
            ActiveAt:    ts,
            State:       StatePending,
            Value:       smpl.Value,
        }
    }

    var vec model.Vector
    // Check if any pending alerts should be removed or fire now. Write out alert timeseries.
    for fp, a := range r.active {
        if _, ok := resultFPs[fp]; !ok {
            if a.State != StateInactive {
                vec = append(vec, r.sample(a, ts, false))
            }
            // If the alert was previously firing, keep it around for a given
            // retention time so it is reported as resolved to the AlertManager.
            if a.State == StatePending || (a.ResolvedAt != 0 && ts.Sub(a.ResolvedAt) > resolvedRetention) {
                delete(r.active, fp)
            }
            if a.State != StateInactive {
                a.State = StateInactive
                a.ResolvedAt = ts
            }
            continue
        }

        if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
            vec = append(vec, r.sample(a, ts, false))
            a.State = StateFiring
        }

        vec = append(vec, r.sample(a, ts, true))
    }

    return vec, nil
}
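
The r.sample(a, ts, set) calls at the end emit the synthetic ALERTS time series that Prometheus stores for each alert: value 1 while the alert is pending or firing, 0 once it goes inactive. Roughly, in sketch form (label and field names approximate):

// sample turns an alert into a sample of the synthetic ALERTS metric.
// `set` selects value 1 (alert active) or 0 (alert going inactive).
func (r *AlertingRule) sample(alert *Alert, ts model.Time, set bool) *model.Sample {
    metric := model.Metric(alert.Labels.Clone())
    metric[model.MetricNameLabel] = "ALERTS"
    metric["alertstate"] = model.LabelValue(alert.State.String())
    s := &model.Sample{Metric: metric, Timestamp: ts, Value: 0}
    if set {
        s.Value = 1
    }
    return s
}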

This method is fairly long, but it is mostly state bookkeeping: res is the set of series returned by evaluating the rule's query expression, and each matching series is turned into (or updates) an active alert.
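
The actual notification step is g.sendAlerts(ar, now), called from Group.Eval above: it collects the rule's alerts that are past the pending state and hands them to the notification handler, which batches them and pushes them to the Alertmanagers configured under alerting:. A simplified sketch of that step (field and option names approximate):

// sendAlerts forwards a rule's firing (and resolved) alerts to the
// notification handler for delivery to the Alertmanager.
func (g *Group) sendAlerts(rule *AlertingRule, ts model.Time) error {
    var alerts model.Alerts
    for _, alert := range rule.currentAlerts() {
        // Pending alerts have not yet satisfied the FOR duration; skip them.
        if alert.State == StatePending {
            continue
        }
        a := &model.Alert{
            StartsAt:    alert.ActiveAt.Add(rule.holdDuration).Time(),
            Labels:      alert.Labels,
            Annotations: alert.Annotations,
        }
        if alert.ResolvedAt != 0 {
            a.EndsAt = alert.ResolvedAt.Time()
        }
        alerts = append(alerts, a)
    }
    if len(alerts) > 0 {
        g.opts.NotificationHandler.Send(alerts...)
    }
    return nil
}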
