diff --git a/README.md b/README.md
index c9a1b67..a07bbd7 100644
--- a/README.md
+++ b/README.md
@@ -153,6 +153,8 @@ Examples:
 | total=2 firing=1 pending=0 inactive=1
 
 Flags:
+  -S, --label-key-state string      Use the given AlertRule label to override the exit state for firing alerts.
+                                    If this flag is set, the plugin looks for warning/critical/ok in the provided label key
       --exclude-alert stringArray   Alerts to ignore. Can be used multiple times and supports regex.
       --exclude-label stringArray   The label of one or more specific alerts to exclude. This parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example'
@@ -170,6 +172,11 @@ Flags:
   -P, --problems                    Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed
 ```
 
+The `--label-key-state` flag can be used to override the exit code for firing alerts.
+When the flag is set, the plugin looks for the given label key on the AlertRule and uses
+its value (`warning`/`critical`/`ok`) as the exit code.
+An invalid value will result in an UNKNOWN exit code.
+
 #### Checking all defined alerts
 
 ```bash
diff --git a/cmd/alert.go b/cmd/alert.go
index be8466a..f6a68f1 100644
--- a/cmd/alert.go
+++ b/cmd/alert.go
@@ -22,6 +22,7 @@ type AlertConfig struct {
     ExcludeLabels []string
     IncludeLabels []string
     ProblemsOnly  bool
+    StateLabelKey string
     NoAlertsState string
 }
@@ -144,8 +145,9 @@ inactive = 0`,
             // Handle Inactive Alerts
             if len(rl.AlertingRule.Alerts) == 0 {
-                // Counting states for perfdata
-                switch rl.GetStatus() {
+                // Counting states for perfdata. We don't use the state-label override here
+                // to keep the actual count from Prometheus
+                switch rl.GetStatus("") {
                 case 0:
                     counterInactive++
                 case 1:
@@ -156,7 +158,7 @@ inactive = 0`,
                 sc := result.NewPartialResult()
 
-                _ = sc.SetState(rl.GetStatus())
+                _ = sc.SetState(rl.GetStatus(cliAlertConfig.StateLabelKey))
                 sc.Output = rl.GetOutput()
                 overall.AddSubcheck(sc)
             }
@@ -165,8 +167,9 @@ inactive = 0`,
             if len(rl.AlertingRule.Alerts) > 0 {
                 // Handle Pending or Firing Alerts
                 for _, alert := range rl.AlertingRule.Alerts {
-                    // Counting states for perfdata
-                    switch rl.GetStatus() {
+                    // Counting states for perfdata. We don't use the state-label override here
+                    // to keep the actual count from Prometheus
+                    switch rl.GetStatus("") {
                     case 0:
                         counterInactive++
                     case 1:
@@ -177,7 +180,7 @@ inactive = 0`,
                     sc := result.NewPartialResult()
 
-                    _ = sc.SetState(rl.GetStatus())
+                    _ = sc.SetState(rl.GetStatus(cliAlertConfig.StateLabelKey))
                     // Set the alert in the internal Type to generate the output
                     rl.Alert = alert
                     sc.Output = rl.GetOutput()
@@ -248,6 +251,10 @@ func init() {
     fs.BoolVarP(&cliAlertConfig.ProblemsOnly, "problems", "P", false,
         "Display only alerts which status is not inactive/OK. Note that in combination with the --name flag this might result in no alerts being displayed")
+
+    fs.StringVarP(&cliAlertConfig.StateLabelKey, "label-key-state", "S", "",
+        "Use the given AlertRule label to override the exit state for firing alerts."+
+            "\nIf this flag is set, the plugin looks for warning/critical/ok in the provided label key")
 }
 
 // Function to convert state to integer.
diff --git a/cmd/alert_test.go b/cmd/alert_test.go
index 2a5124c..7565d8a 100644
--- a/cmd/alert_test.go
+++ b/cmd/alert_test.go
@@ -304,6 +304,22 @@ exit status 2
             args:     []string{"run", "../main.go", "alert", "--exclude-label", "team=database", "--exclude-label", "severity=critical"},
             expected: "[OK] - 0 Alerts: 0 Firing - 0 Pending - 0 Inactive\n\\_ [OK] No alerts retrieved\n|total=0 firing=0 pending=0 inactive=0\n\n",
         },
+        {
+            name: "alert-state-label",
+            server: httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+                w.WriteHeader(http.StatusOK)
+                w.Write(loadTestdata(alertTestDataSet1))
+            })),
+            args: []string{"run", "../main.go", "alert", "--label-key-state=icinga"},
+            expected: `[WARNING] - 3 Alerts: 1 Firing - 1 Pending - 1 Inactive
+\_ [OK] [HostOutOfMemory] is inactive
+\_ [WARNING] [SqlAccessDeniedRate] - Job: [mysql] on Instance: [localhost] is pending - value: 0.40 - {"alertname":"SqlAccessDeniedRate","instance":"localhost","job":"mysql","severity":"warning"}
+\_ [OK] [BlackboxTLS] - Job: [blackbox] on Instance: [https://localhost:443] is firing - value: -6065338.00 - {"alertname":"TLS","instance":"https://localhost:443","job":"blackbox","severity":"critical"}
+|total=3 firing=1 pending=1 inactive=1
+
+exit status 1
+`,
+        },
     }
 
     for _, test := range tests {
diff --git a/internal/alert/alert.go b/internal/alert/alert.go
index 9733a04..841ab37 100644
--- a/internal/alert/alert.go
+++ b/internal/alert/alert.go
@@ -58,8 +58,10 @@ func FlattenRules(groups []v1.RuleGroup, wantedGroups []string) []Rule {
     return rules
 }
 
-func (a *Rule) GetStatus() (status int) {
-    switch a.AlertingRule.State {
+func (a *Rule) GetStatus(labelKey string) (status int) {
+    state := a.AlertingRule.State
+
+    switch state {
     case string(v1.AlertStateFiring):
         status = check.Critical
     case string(v1.AlertStatePending):
@@ -70,6 +72,26 @@
         status = check.Unknown
     }
 
+    if state == string(v1.AlertStateFiring) && labelKey != "" {
+        stateLabel, ok := a.AlertingRule.Labels[model.LabelName(labelKey)]
+        // If there is no such label key, we're done
+        if !ok {
+            return status
+        }
+
+        lb := strings.ToLower(string(stateLabel))
+        switch lb {
+        case "warning":
+            status = check.Warning
+        case "critical":
+            status = check.Critical
+        case "ok":
+            status = check.OK
+        default:
+            status = check.Unknown
+        }
+    }
+
     return status
 }
diff --git a/internal/alert/alert_test.go b/internal/alert/alert_test.go
index 392d1e6..0d3ca5b 100644
--- a/internal/alert/alert_test.go
+++ b/internal/alert/alert_test.go
@@ -10,7 +10,6 @@ import (
 )
 
 func TestGetStatus(t *testing.T) {
-
     testTime := time.Now()
 
     ar := v1.AlertingRule{
@@ -49,19 +48,67 @@ func TestGetStatus(t *testing.T) {
         Alert:        ar.Alerts[0],
     }
 
-    actual := r.GetStatus()
+    actual := r.GetStatus("")
 
     if actual != check.Critical {
         t.Error("\nActual: ", actual, "\nExpected: ", check.Critical)
     }
 
     r.AlertingRule.State = "pending"
-    actual = r.GetStatus()
+    actual = r.GetStatus("")
 
     if actual != check.Warning {
         t.Error("\nActual: ", actual, "\nExpected: ", check.Warning)
     }
 }
 
+func TestGetStatus_WithLabel(t *testing.T) {
+    ar := v1.AlertingRule{
+        Alerts: []*v1.Alert{
+            {
+                Annotations: model.LabelSet{
+                    "summary": "High request latency",
+                },
+                Labels: model.LabelSet{
+                    "alertname": "HighRequestLatency",
+                    "severity":  "page",
+                },
+                State: v1.AlertStateFiring,
+                Value: "1e+00",
+            },
+        },
+        Annotations: model.LabelSet{
+            "summary": "High request latency",
+        },
+        Labels: model.LabelSet{
+            "severity":    "page",
+            "icingaState": "OK",
+        },
+        Duration:       600,
+        Health:         v1.RuleHealthGood,
+        Name:           "HighRequestLatency",
+        Query:          "job:request_latency_seconds:mean5m{job=\"myjob\"} > 0.5",
+        LastError:      "",
+        EvaluationTime: 0.5,
+        State:          "firing",
+    }
+
+    r := Rule{
+        AlertingRule: ar,
+        Alert:        ar.Alerts[0],
+    }
+
+    actual := r.GetStatus("icingaState")
+    if actual != check.OK {
+        t.Error("\nActual: ", actual, "\nExpected: ", check.OK)
+    }
+
+    r.AlertingRule.State = "pending"
+    actual = r.GetStatus("icingaState")
+    if actual != check.Warning {
+        t.Error("\nActual: ", actual, "\nExpected: ", check.Warning)
+    }
+}
+
 func TestGetOutput(t *testing.T) {
     testTime := time.Now()
diff --git a/testdata/alertmanager/alert.rules b/testdata/alertmanager/alert.rules
index 933df38..d2f71d0 100644
--- a/testdata/alertmanager/alert.rules
+++ b/testdata/alertmanager/alert.rules
@@ -7,6 +7,7 @@ groups:
       for: 0m
       labels:
         severity: critical
+        icingaState: warning
       annotations:
         summary: Prometheus target missing (instance {{ $labels.instance }})
         description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -16,6 +17,7 @@ groups:
       for: 0m
       labels:
         severity: low
+        icingaState: warning
       annotations:
         summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
         description: "A Prometheus AlertManager job has disappeared\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/testdata/unittest/alertDataset1.json b/testdata/unittest/alertDataset1.json
index ef5ebc5..54a346b 100644
--- a/testdata/unittest/alertDataset1.json
+++ b/testdata/unittest/alertDataset1.json
@@ -87,6 +87,7 @@
       "duration": 0,
       "labels": {
         "severity": "critical",
+        "icinga": "ok",
         "team": "network"
       },
       "annotations": {
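
Taken together, the patch makes the state-label override apply only to firing alerts: pending rules keep their WARNING mapping, and the perfdata counters keep the raw state reported by Prometheus. Below is a rough, self-contained sketch of that mapping. It is illustrative only: the function name, the map-based labels, and the plain integer exit codes are assumptions for the example, not the plugin's actual types, which use `Rule.GetStatus` and the `check` constants shown in the diff above.

```go
package main

import (
	"fmt"
	"strings"
)

// overrideState mirrors the idea behind the new GetStatus(labelKey) parameter:
// the override only applies to firing alerts, a missing label keeps the
// original state, and an unrecognized value maps to UNKNOWN.
// The integer codes follow the usual plugin convention
// (0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN).
func overrideState(baseState int, alertState string, labels map[string]string, labelKey string) int {
	if alertState != "firing" || labelKey == "" {
		return baseState
	}

	value, ok := labels[labelKey]
	if !ok {
		// No such label on the rule: keep the state derived from Prometheus.
		return baseState
	}

	switch strings.ToLower(value) {
	case "ok":
		return 0
	case "warning":
		return 1
	case "critical":
		return 2
	default:
		// An invalid value results in UNKNOWN, as described in the README.
		return 3
	}
}

func main() {
	labels := map[string]string{"icinga": "ok"}

	// A firing alert (normally CRITICAL) is downgraded to OK by its label.
	fmt.Println(overrideState(2, "firing", labels, "icinga")) // 0

	// Pending alerts are never overridden, so WARNING is kept.
	fmt.Println(overrideState(1, "pending", labels, "icinga")) // 1

	// An unrecognized label value yields UNKNOWN.
	fmt.Println(overrideState(2, "firing", map[string]string{"icinga": "page"}, "icinga")) // 3
}
```

The adjusted test data exercises the same behaviour: `alertDataset1.json` gains an `icinga: ok` label on a firing, critical-severity rule, which is what turns that alert's partial result into `[OK]` in the new `alert-state-label` CLI test, while the pending `SqlAccessDeniedRate` alert still drives the overall `[WARNING]` and exit status 1.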