From 88f0ab0233d8a1c63595f47ae3cff8e6b687769e Mon Sep 17 00:00:00 2001 From: Rebeccah Date: Mon, 19 Sep 2022 13:03:20 -0400 Subject: [PATCH] add new alert rule for when error rate is over a certain rate, also fix typo in URL and in grafana alert rule Important learning: no newlines in rules/equations. It turns out datasourceUid can be set in prometheus_source.yml, and it can be anything we want, so I have set it to awx_alert. The PBFAnumbersetc value it was set to before was an autogenerated UID, and it would actually work with just that generated value, but because we want it to make sense, we're setting the value in prometheus_source.yml. Finally, update the docs to reflect the Grafana docs and to show how to export new rules a user might want to add. Co-authored-by: Elijah DeLee --- tools/docker-compose/README.md | 10 +- tools/grafana/alerting/alerts.yml | 224 +++++++++++++++++- .../grafana/datasources/prometheus_source.yml | 1 + 3 files changed, 226 insertions(+), 9 deletions(-) diff --git a/tools/docker-compose/README.md b/tools/docker-compose/README.md index 9080a6339b..585c2cd5bb 100644 --- a/tools/docker-compose/README.md +++ b/tools/docker-compose/README.md @@ -483,5 +483,11 @@ $ PROMETHEUS=yes GRAFANA=yes make docker-compose ### Alerts in Grafana -We are configuring alerts in grafana using the provisioning files method. This feature is new in Grafana as of August, 2022. Documentation can be found: https://grafana.com/docs/grafana/latest/administration/provisioning/#alerting however it does not fully show all parameters to the config. One way to understand how to build rules is to build them in the UI and use chrometools to inspect the payload as you save the rules. It appears that the "data" portion of the payload for each rule is the same syntax as needed in the provisioning file config. 
To reload the alerts without restarting the container, from within the container you can send a POST with `curl -X POST http://admin:admin@localhost:3000/api/admin/provisioning/alerting/relo -ad`. Keep in mind the grafana container does not default contain `curl` and you can get it with `apk add curl`. +We are configuring alerts in Grafana using the provisioning files method. This feature is new in Grafana as of August 2022. Documentation can be found at https://grafana.com/docs/grafana/latest/administration/provisioning/#alerting; however, it does not fully show all parameters of the config. + +One way to understand how to build rules is to build them in the UI and use the browser's developer tools (e.g. Chrome DevTools) to inspect the payload as you save the rules. It appears that the "data" portion of the payload for each rule uses the same syntax as the provisioning file config. To reload the alerts without restarting the container, from within the container you can send a POST with `curl -X POST http://admin:admin@localhost:3000/api/admin/provisioning/alerting/reload`. Keep in mind the Grafana container does not contain `curl`. You can install it with the command `apk add curl`. + +Another way to export rules is to explore the API. +1. Get all the folders: `GET` to `/api/folders` +2. 
Get the rules `GET` to `/api/ruler/grafana/api/v1/rules/{{ Folder }}` + diff --git a/tools/grafana/alerting/alerts.yml b/tools/grafana/alerting/alerts.yml index 155bcf9733..2a3760d84e 100644 --- a/tools/grafana/alerting/alerts.yml +++ b/tools/grafana/alerting/alerts.yml @@ -6,10 +6,167 @@ groups: name: awx_rules orgId: 1 rules: - - condition: A + - condition: if_failures_too_high # refId of the final math expression in data dashboardUid: awx data: - - datasourceUid: PBFA97CFB590B2093 + - refId: total_errors + queryType: '' + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: awx_alert + model: + editorMode: code + expr: >- # =~ makes failed|error a regex alternation; = would match that literal string + max(delta(awx_instance_status_total{instance="awx1:8013", + status=~"failed|error"}[30m])) + hide: false + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: true + refId: total_errors + - refId: max_errors + queryType: '' + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: '-100' + model: + conditions: + - evaluator: + params: + - 80 + - 0 + type: gt + operator: + type: and + query: + params: + - total_errors + reducer: + params: [] + type: max + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: total_errors + hide: false + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: max_errors + type: reduce + - refId: total_success + queryType: '' + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: awx_alert + model: + datasource: + type: prometheus + uid: awx_alert + editorMode: code + expr: >- + max(delta(awx_instance_status_total{instance="awx1:8013", + status="successful"}[30m])) + hide: false + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: true + refId: total_success + - refId: max_success + queryType: '' + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: '-100' + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: + - total_success + reducer: + params: [] + type: max + type: query + datasource: + name: 
Expression + type: __expr__ + uid: __expr__ + expression: total_success + hide: false + intervalMs: 1000 + maxDataPoints: 43200 + reducer: max + refId: max_success + type: reduce + - refId: if_failures_too_high # must equal this rule's condition + queryType: '' + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: '-100' + model: + conditions: + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: + - max_success + reducer: + params: [] + type: avg + type: query + - evaluator: + params: + - 0 + - 0 + type: gt + operator: + type: and + query: + params: + - max_success + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: $max_errors / ($max_errors+$max_success) >= .2 + hide: false + intervalMs: 1000 + maxDataPoints: 43200 + refId: if_failures_too_high + type: math + for: 30m + noDataState: OK + panelId: 2 + title: failure_rate_exceeded_20_percent + uid: failure_rate_exceeded_20_percent + - condition: if_redis_queue_too_large + dashboardUid: awx + data: + - datasourceUid: awx_alert model: editorMode: code expr: irate(callback_receiver_events_insert_db{node='awx_1'}[1m]) @@ -55,11 +212,11 @@ groups: relativeTimeRange: from: 0 to: 0 - - datasourceUid: PBFA97CFB590B2093 + - datasourceUid: awx_alert model: datasource: type: prometheus - uid: PBFA97CFB590B2093 + uid: awx_alert editorMode: code expr: callback_receiver_events_queue_size_redis{node='awx_1'} hide: false @@ -125,9 +282,7 @@ groups: name: Expression type: __expr__ uid: __expr__ - expression: '( - ${mean_redis_queue_size} > - ($mean_event_insertion_rate\ * 120))' + expression: '($mean_redis_queue_size > ($mean_event_insertion_rate * 120))' hide: false intervalMs: 1000 maxDataPoints: 43200 @@ -143,3 +298,58 @@ groups: panelId: 1 title: redis_queue_too_large_to_clear_in_2_min uid: redis_queue_too_large_to_clear_in_2_min + - condition: if_capacity_is_too_low + dashboardUid: awx + noDataState: OK # camelCase, matching the other rules in this file + execErrState: Error # camelCase, same convention as noDataState + data: + - refId: remaining_capacity + 
queryType: '' + relativeTimeRange: + from: 1800 + to: 0 + datasourceUid: awx_alert + model: + editorMode: builder + expr: awx_instance_remaining_capacity{instance="awx1:8013"} + hide: false + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: true + refId: remaining_capacity + - refId: if_capacity_is_too_low + queryType: '' + relativeTimeRange: + from: 0 + to: 0 + datasourceUid: '-100' + model: + conditions: + - evaluator: + params: + - 20 + - 0 + type: lt + operator: + type: when + query: + params: + - remaining_capacity + reducer: + params: [] + type: avg + type: query + datasource: + name: Expression + type: __expr__ + uid: __expr__ + expression: remaining_capacity + hide: false + intervalMs: 1000 + maxDataPoints: 43200 + refId: if_capacity_is_too_low + type: classic_conditions + for: 30m + title: if_capacity_is_too_low + uid: if_capacity_is_too_low diff --git a/tools/grafana/datasources/prometheus_source.yml b/tools/grafana/datasources/prometheus_source.yml index 22619c637f..80b9a88e5c 100644 --- a/tools/grafana/datasources/prometheus_source.yml +++ b/tools/grafana/datasources/prometheus_source.yml @@ -10,3 +10,4 @@ datasources: editable: true jsonData: timeInterval: 5s + uid: awx_alert