Files
awx/tools/grafana/alerting/alerts.yml
Rebeccah 88f0ab0233 Add a new alert rule that fires when the error rate exceeds a certain threshold; also fix
a typo in the URL and in the Grafana alert rule

Important lesson learned: rules/equations must not contain newlines.

It turns out that datasourceUid can be set in prometheus_source.yml, and it can be anything we want, so I have set it to awx_alert. The PBFAnumbersetc value it was set to before was an autogenerated UID; it would actually work with just that generated value, but because we want the value to make sense, we now set it explicitly in prometheus_source.yml.

Finally, update the docs to reflect the Grafana docs and to explain how to export new rules a user might want to add.

Co-authored-by: Elijah DeLee <kdelee@redhat.com>
2022-09-23 15:05:57 -04:00

356 lines
10 KiB
YAML

---
# Grafana alert-rule provisioning for the `awx` folder.
#
# One rule group (`awx_rules`, evaluated every 60s) holding three rules:
#   1. failure_rate_exceeded_20_percent
#   2. redis_queue_too_large_to_clear_in_2_min
#   3. if_capacity_is_too_low
#
# Conventions used throughout this file:
#   * Each rule's `condition` names the `refId` of the entry in that rule's own
#     `data` list whose result decides whether the rule fires.
#   * `datasourceUid: awx_alert` is the Prometheus datasource; `'-100'` is the
#     expression pseudo-datasource and is quoted so it stays a string.
#   * NOTE(review): the `conditions` lists on `reduce`/`math` expressions look
#     like exporter leftovers (presumably only `classic_conditions` reads
#     them) -- confirm before removing. They are preserved unchanged.
apiVersion: 1
groups:
  - folder: awx
    interval: 60s
    name: awx_rules
    orgId: 1
    rules:
      # ---------------------------------------------------------------------
      # Rule 1: errors / (errors + successes) >= 0.2, sustained for 30m.
      # ---------------------------------------------------------------------
      # `condition` must be a refId from `data`; `compare` is the final math
      # expression that yields the boolean result.
      - condition: compare
        dashboardUid: awx
        data:
          - refId: total_errors
            queryType: ''
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_alert
            model:
              editorMode: code
              # `=~` (regex match) is required for the `failed|error`
              # alternation; with `=` the matcher would only select a status
              # literally named "failed|error". The folded scalar (`>-`)
              # joins the two lines with a space, so the query has no newline.
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status=~"failed|error"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_errors
          - refId: max_errors
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [80, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [total_errors]
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_errors
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_errors
              type: reduce
          - refId: total_success
            queryType: ''
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_alert
            model:
              datasource:
                type: prometheus
                uid: awx_alert
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status="successful"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_success
          - refId: max_success
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [total_success]
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_success
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_success
              type: reduce
          - refId: compare
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [max_success]
                  reducer:
                    params: []
                    type: avg
                  type: query
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [max_success]
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              # Failure ratio over the window; true when it reaches 20%.
              expression: '$max_errors / ($max_errors+$max_success) >= .2'
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: compare
              type: math
        for: 30m
        noDataState: OK
        panelId: 2
        title: failure_rate_exceeded_20_percent
        uid: failure_rate_exceeded_20_percent

      # ---------------------------------------------------------------------
      # Rule 2: redis queue size > 120 x event insertion rate (per second),
      # i.e. more than two minutes of backlog, sustained for 60s.
      # ---------------------------------------------------------------------
      # `condition` points at the final math expression's refId.
      - condition: redis_queue_growing_faster_than_insertion_rate
        dashboardUid: awx
        data:
          - refId: events_insertion_rate_per_second
            queryType: ''
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_alert
            model:
              editorMode: code
              expr: irate(callback_receiver_events_insert_db{node='awx_1'}[1m])
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: events_insertion_rate_per_second
          - refId: mean_event_insertion_rate
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [3]
                    type: gt
                  operator:
                    type: and
                  query:
                    # NOTE(review): `event_insertion_rate` is not a refId in
                    # this rule; kept as-is pending confirmation it is unused.
                    params: [event_insertion_rate]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: '-100'
              expression: events_insertion_rate_per_second
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: mean
              refId: mean_event_insertion_rate
              type: reduce
          - refId: redis_queue_size
            queryType: ''
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_alert
            model:
              datasource:
                type: prometheus
                uid: awx_alert
              editorMode: code
              expr: callback_receiver_events_queue_size_redis{node='awx_1'}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: redis_queue_size
          - refId: mean_redis_queue_size
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [3]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [event_insertion_rate]
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: '-100'
              expression: redis_queue_size
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              # NOTE(review): refId says "mean" but the reducer is `last`;
              # left unchanged -- confirm which one is intended.
              reducer: last
              refId: mean_redis_queue_size
              type: reduce
          - refId: redis_queue_growing_faster_than_insertion_rate
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params: [mean_redis_queue_size]
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: '($mean_redis_queue_size > ($mean_event_insertion_rate * 120))'
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: redis_queue_growing_faster_than_insertion_rate
              type: math
        for: 60s
        noDataState: OK
        panelId: 1
        title: redis_queue_too_large_to_clear_in_2_min
        uid: redis_queue_too_large_to_clear_in_2_min

      # ---------------------------------------------------------------------
      # Rule 3: average remaining capacity below 20, sustained for 30m.
      # ---------------------------------------------------------------------
      - condition: if_capacity_is_too_low
        dashboardUid: awx
        # camelCase keys, matching `noDataState` on the other rules in this
        # file; the snake_case spellings used previously did not match them.
        noDataState: OK
        execErrState: Error
        data:
          - refId: remaining_capacity
            queryType: ''
            relativeTimeRange:
              from: 1800
              to: 0
            datasourceUid: awx_alert
            model:
              editorMode: builder
              expr: awx_instance_remaining_capacity{instance="awx1:8013"}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: remaining_capacity
          - refId: if_capacity_is_too_low
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params: [20, 0]
                    type: lt
                  operator:
                    type: when
                  query:
                    params: [remaining_capacity]
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: remaining_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: if_capacity_is_too_low
              type: classic_conditions
        for: 30m
        title: if_capacity_is_too_low
        uid: if_capacity_is_too_low