Prometheus does not only collect metrics; it can also act as an alerting service, raising alerts whenever the configured conditions match. Let's first look at the alerting-related parts of the prometheus.yml configuration skeleton.
rule_files:
  [ - <filepath_glob> ... ]

# A list of scrape configurations.
scrape_configs:
  [ - <scrape_config> ... ]

# Alerting specifies settings related to the Alertmanager.
alerting:
  alert_relabel_configs:
    [ - <relabel_config> ... ]
  alertmanagers:
    [ - <alertmanager_config> ... ]
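As a concrete illustration, a minimal setup could look like the sketch below. The alert.rules filename and the localhost:9093 Alertmanager address are placeholder values, and static_configs is one of the discovery mechanisms accepted under alertmanager_config:

rule_files:
  - "alert.rules"

alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:9093"]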
The files listed under rule_files are where the alerting rules are defined, for example:
# Alert for any instance that is unreachable for >5 minutes.
ALERT InstanceDown
  IF up == 0
  FOR 5m
  LABELS { severity = "page" }
  ANNOTATIONS {
    summary = "Instance {{ $labels.instance }} down",
    description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.",
  }

# Alert for any instance that has a median request latency >1s.
ALERT APIHighRequestLatency
  IF api_http_request_latencies_second{quantile="0.5"} > 1
  FOR 1m
  ANNOTATIONS {
    summary = "High request latency on {{ $labels.instance }}",
    description = "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)",
  }
Now let's walk through how alerts are generated and dispatched, starting with how the rules are loaded:
func (m *Manager) loadGroups(interval time.Duration, filenames ...string) (map[string]*Group, error) {
	rules := []Rule{}
	for _, fn := range filenames {
		content, err := ioutil.ReadFile(fn)
		if err != nil {
			return nil, err
		}
		stmts, err := promql.ParseStmts(string(content))
		if err != nil {
			return nil, fmt.Errorf("error parsing %s: %s", fn, err)
		}

		for _, stmt := range stmts {
			var rule Rule

			switch r := stmt.(type) {
			case *promql.AlertStmt:
				rule = NewAlertingRule(r.Name, r.Expr, r.Duration, r.Labels, r.Annotations)

			case *promql.RecordStmt:
				rule = NewRecordingRule(r.Name, r.Expr, r.Labels)

			default:
				panic("retrieval.Manager.LoadRuleFiles: unknown statement type")
			}
			rules = append(rules, rule)
		}
	}

	// Currently there is no group syntax implemented. Thus all rules
	// are read into a single default group.
	g := NewGroup("default", interval, rules, m.opts)
	groups := map[string]*Group{g.name: g}

	return groups, nil
}
loadGroups reads the rule files and parses every statement into a Rule. Rules are meant to be groupable, but since no group syntax is implemented yet, everything ends up in a single default group.
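To see the parsing step in isolation, here is a minimal sketch that feeds the InstanceDown rule from the example above into promql.ParseStmts, using only the types that loadGroups itself relies on (it assumes the Prometheus 1.x promql package this post is reading):

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/promql"
)

func main() {
	const ruleText = `
ALERT InstanceDown
  IF up == 0
  FOR 5m
  LABELS { severity = "page" }
`
	stmts, err := promql.ParseStmts(ruleText)
	if err != nil {
		fmt.Println("parse error:", err)
		return
	}
	for _, stmt := range stmts {
		// loadGroups runs the same type switch to build its Rule values.
		if alert, ok := stmt.(*promql.AlertStmt); ok {
			fmt.Printf("alert %s fires when %s has held for %s\n",
				alert.Name, alert.Expr, alert.Duration)
		}
	}
}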
Next, let's look at how an alert is defined:
type Alert struct {
	State AlertState

	Labels      model.LabelSet
	Annotations model.LabelSet

	// The value at the last evaluation of the alerting expression.
	Value model.SampleValue

	// ResolvedAt is 0 while the alert is still active.
	ActiveAt, ResolvedAt model.Time
}

const (
	// StateInactive: the rule's condition does not currently match.
	StateInactive AlertState = iota
	// StatePending: the condition has matched for less than the hold (FOR) duration.
	StatePending
	// StateFiring: the condition has matched for at least the hold (FOR) duration.
	StateFiring
)
The above defines an alert and its three possible states.
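The transitions between these states, which the Eval method below implements, can be summarized in a small sketch. This helper does not exist in the Prometheus source; it is written against the types above purely as an illustration:

// nextState is a hypothetical helper summarizing the transitions: a label set
// appearing in the query result moves Inactive -> Pending, stays Pending until
// it has been active for the hold (FOR) duration, then becomes Firing;
// disappearing from the result resolves it back to Inactive.
func nextState(cur AlertState, inResult bool, activeFor, hold time.Duration) AlertState {
	switch {
	case !inResult:
		return StateInactive
	case cur == StateInactive:
		return StatePending
	case cur == StatePending && activeFor >= hold:
		return StateFiring
	default:
		return cur
	}
}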
type AlertingRule struct {
	// The name of the alert.
	name string
	// The expression from which to generate alerts.
	vector promql.Expr
	// How long a label set must keep matching before Pending becomes Firing.
	holdDuration time.Duration
	// Extra labels to attach to the resulting alert samples.
	labels model.LabelSet
	// Non-identifying key/value pairs, expanded as templates.
	annotations model.LabelSet

	// Protects the field below.
	mtx sync.Mutex
	// The currently active (Pending or Firing) alerts, keyed by the
	// fingerprint of their label set.
	active map[model.Fingerprint]*Alert
}
The active map records the alerts currently triggered by this rule, keyed by the fingerprint of their label set.
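A fingerprint is a hash of a label set, so every distinct combination of label values produced by the alert expression is tracked as its own alert instance. A small self-contained sketch using the common model package (the label values here are made up):

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

func main() {
	// Two instances of the same alert differ only in their labels,
	// so they get distinct fingerprints and separate Alert entries.
	a := model.LabelSet{"alertname": "InstanceDown", "instance": "10.0.0.1:9100"}
	b := model.LabelSet{"alertname": "InstanceDown", "instance": "10.0.0.2:9100"}
	fmt.Println(a.Fingerprint(), b.Fingerprint()) // two different fingerprints
}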
Now let's follow the flow in rules/manager.go, starting with the run method:
func (g *Group) run() {
	defer close(g.terminated)

	// Wait an initial amount to have consistently slotted intervals.
	select {
	case <-time.After(g.offset()):
	case <-g.done:
		return
	}

	iter := func() {
		iterationsScheduled.Inc()
		if g.opts.SampleAppender.NeedsThrottling() {
			iterationsSkipped.Inc()
			return
		}
		start := time.Now()
		g.Eval()

		iterationDuration.Observe(time.Since(start).Seconds())
	}
	iter()

	tick := time.NewTicker(g.interval)
	defer tick.Stop()

	for {
		select {
		case <-g.done:
			return
		default:
			select {
			case <-g.done:
				return
			case <-tick.C:
				iter()
			}
		}
	}
}
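The initial g.offset() wait is what the "consistently slotted intervals" comment refers to: the first evaluation is aligned to an interval boundary rather than to whenever the group happened to start. A rough sketch of that alignment (an illustration only, not the exact Prometheus implementation, which also spreads different groups out within the interval):

// alignedOffset returns how long to sleep so that the first evaluation
// lands on the next multiple of interval.
func alignedOffset(interval time.Duration) time.Duration {
	now := time.Now().UnixNano()
	next := now - (now % int64(interval)) + int64(interval)
	return time.Duration(next - now)
}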
g.Eval() then iterates over the group's rules and evaluates each of them:
func (g *Group) Eval() {
	var (
		now = model.Now()
		wg  sync.WaitGroup
	)

	for _, rule := range g.rules {
		rtyp := string(typeForRule(rule))

		wg.Add(1)
		// BUG(julius): Look at fixing thundering herd.
		go func(rule Rule) {
			defer wg.Done()

			defer func(t time.Time) {
				evalDuration.WithLabelValues(rtyp).Observe(time.Since(t).Seconds())
			}(time.Now())

			evalTotal.WithLabelValues(rtyp).Inc()

			vector, err := rule.Eval(g.opts.Context, now, g.opts.QueryEngine, g.opts.ExternalURL.Path)
			if err != nil {
				// Canceled queries are intentional termination of queries. This normally
				// happens on shutdown and thus we skip logging of any errors here.
				if _, ok := err.(promql.ErrQueryCanceled); !ok {
					log.Warnf("Error while evaluating rule %q: %s", rule, err)
				}
				evalFailures.WithLabelValues(rtyp).Inc()
				return
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.sendAlerts(ar, now)
			}

			var (
				numOutOfOrder = 0
				numDuplicates = 0
			)
			for _, s := range vector {
				if err := g.opts.SampleAppender.Append(s); err != nil {
					switch err {
					case local.ErrOutOfOrderSample:
						numOutOfOrder++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					case local.ErrDuplicateSampleForTimestamp:
						numDuplicates++
						log.With("sample", s).With("error", err).Debug("Rule evaluation result discarded")
					default:
						log.With("sample", s).With("error", err).Warn("Rule evaluation result discarded")
					}
				}
			}
			if numOutOfOrder > 0 {
				log.With("numDropped", numOutOfOrder).Warn("Error on ingesting out-of-order result from rule evaluation")
			}
			if numDuplicates > 0 {
				log.With("numDropped", numDuplicates).Warn("Error on ingesting results from rule evaluation with different value but same timestamp")
			}
		}(rule)
	}
	wg.Wait()
}
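Note the g.sendAlerts(ar, now) call above: after an alerting rule has been evaluated, its non-Pending alerts are handed to the notification layer, which batches them and posts them to the Alertmanager. A hedged sketch of that filtering step follows; currentAlerts and the Notifier field are assumed from context, not quoted from the source:

// Simplified sketch of sendAlerts: only alerts past the Pending state are
// sent; resolved alerts carry an end timestamp so the Alertmanager can
// close them.
func (g *Group) sendAlerts(rule *AlertingRule, ts model.Time) {
	var alerts model.Alerts
	for _, alert := range rule.currentAlerts() {
		if alert.State == StatePending {
			continue // not firing yet, nothing to notify
		}
		a := &model.Alert{
			Labels:      alert.Labels,
			Annotations: alert.Annotations,
			StartsAt:    alert.ActiveAt.Add(rule.holdDuration).Time(),
		}
		if alert.ResolvedAt != 0 {
			a.EndsAt = alert.ResolvedAt.Time()
		}
		alerts = append(alerts, a)
	}
	if len(alerts) > 0 {
		g.opts.Notifier.Send(alerts...)
	}
}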
The Eval method in rules/alerting.go is what actually produces the alerts:
func (r *AlertingRule) Eval(ctx context.Context, ts model.Time, engine *promql.Engine, externalURLPath string) (model.Vector, error) {
	query, err := engine.NewInstantQuery(r.vector.String(), ts)
	if err != nil {
		return nil, err
	}
	res, err := query.Exec(ctx).Vector()
	if err != nil {
		return nil, err
	}

	r.mtx.Lock()
	defer r.mtx.Unlock()

	// Create pending alerts for any new vector elements in the alert expression
	// or update the expression value for existing elements.
	resultFPs := map[model.Fingerprint]struct{}{}

	for _, smpl := range res {
		// Provide the alert information to the template.
		l := make(map[string]string, len(smpl.Metric))
		for k, v := range smpl.Metric {
			l[string(k)] = string(v)
		}

		tmplData := struct {
			Labels map[string]string
			Value  float64
		}{
			Labels: l,
			Value:  float64(smpl.Value),
		}
		// Inject some convenience variables that are easier to remember for users
		// who are not used to Go's templating system.
		defs := "{{$labels := .Labels}}{{$value := .Value}}"

		expand := func(text model.LabelValue) model.LabelValue {
			tmpl := template.NewTemplateExpander(
				ctx,
				defs+string(text),
				"__alert_"+r.Name(),
				tmplData,
				ts,
				engine,
				externalURLPath,
			)
			result, err := tmpl.Expand()
			if err != nil {
				result = fmt.Sprintf("<error expanding template: %s>", err)
				log.Warnf("Error expanding alert template %v with data '%v': %s", r.Name(), tmplData, err)
			}
			return model.LabelValue(result)
		}

		delete(smpl.Metric, model.MetricNameLabel)
		labels := make(model.LabelSet, len(smpl.Metric)+len(r.labels)+1)
		for ln, lv := range smpl.Metric {
			labels[ln] = lv
		}
		for ln, lv := range r.labels {
			labels[ln] = expand(lv)
		}
		labels[model.AlertNameLabel] = model.LabelValue(r.Name())

		annotations := make(model.LabelSet, len(r.annotations))
		for an, av := range r.annotations {
			annotations[an] = expand(av)
		}
		fp := smpl.Metric.Fingerprint()
		resultFPs[fp] = struct{}{}

		// Check whether we already have alerting state for the identifying label set.
		// Update the last value and annotations if so, create a new alert entry otherwise.
		if alert, ok := r.active[fp]; ok && alert.State != StateInactive {
			alert.Value = smpl.Value
			alert.Annotations = annotations
			continue
		}

		r.active[fp] = &Alert{
			Labels:      labels,
			Annotations: annotations,
			ActiveAt:    ts,
			State:       StatePending,
			Value:       smpl.Value,
		}
	}

	var vec model.Vector
	// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
	for fp, a := range r.active {
		if _, ok := resultFPs[fp]; !ok {
			if a.State != StateInactive {
				vec = append(vec, r.sample(a, ts, false))
			}
			// If the alert was previously firing, keep it around for a given
			// retention time so it is reported as resolved to the AlertManager.
			if a.State == StatePending || (a.ResolvedAt != 0 && ts.Sub(a.ResolvedAt) > resolvedRetention) {
				delete(r.active, fp)
			}
			if a.State != StateInactive {
				a.State = StateInactive
				a.ResolvedAt = ts
			}
			continue
		}

		if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
			vec = append(vec, r.sample(a, ts, false))
			a.State = StateFiring
		}

		vec = append(vec, r.sample(a, ts, true))
	}

	return vec, nil
}
This method is a bit long, but it is mostly state management: res holds the time series returned by executing the query expression, i.e. the targets that currently match the alert condition, and each of them is turned into a new Pending alert or updates an existing active one. Label sets that drop out of the result are resolved, with firing alerts retained for resolvedRetention so their resolution can still be reported to the Alertmanager.
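Besides being sent to the Alertmanager, the r.sample(...) calls at the end also write the alert state back into storage as the synthetic ALERTS time series, so the current and historical state of every alert can itself be queried with an ordinary PromQL expression, e.g.:

ALERTS{alertname="InstanceDown", alertstate="firing"}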