awx/tools/grafana/alerting/alerts.yml
Elijah DeLee d50c97ae22
Updates to Grafana Dashboard and example alerts
More fun in the grafana dashboard. The rows organize the panels and are
collapsable. Also, tested with multiple nodes and fixed some
labeling issues when there are more than one node.

Update grafana alerting readme info and some fun prose about one of the
alerts as well as some reorganizing of the code for clarity.

finally, drop the time to fire for alerts because it's better to have them be a bit touchy so users can verify they work vs. not being sure.
2022-10-11 11:14:22 -04:00

537 lines
16 KiB
YAML

---
# Grafana alerting rule file: a single group, `awx_rules`, in folder `awx`,
# whose rule list follows under `rules:`.
# NOTE(review): every key below `- folder` sits at column 0 in this copy; they
# presumably nest under that list item — confirm against the repository file.
apiVersion: 1
groups:
- folder: awx
interval: 10s
name: awx_rules
orgId: 1
# NOTE(review): these two keys are snake_case, while each rule below sets the
# camelCase `noDataState` itself — confirm the consumer reads the group-level
# spelling at all.
exec_err_state: Alerting
no_data_state: NoData
rules:
# Rule: failure_rate_exceeded_20_percent
# Condition (`compare`): max_errors / (max_errors + max_success) >= .2
# Pipeline: two Prometheus queries (30m deltas of the failed|error and the
# successful status counters for instance awx1:8013), each collapsed by a
# `reduce` (max) expression, then compared by one `math` expression.
# NOTE(review): the `conditions` lists under the reduce/math models look like
# UI-export residue; presumably only `expression`/`reducer`/`type` drive
# evaluation — confirm before relying on them.
- for: 5m
  noDataState: OK
  # panelId alone does not say which dashboard the panel belongs to; the redis
  # rule in this group pairs it with `dashboardUid: awx`, so do the same here.
  # NOTE(review): assumes panel 2 lives on dashboard `awx` — confirm.
  dashboardUid: awx
  panelId: 2
  title: failure_rate_exceeded_20_percent
  uid: failure_rate_exceeded_20_percent
  condition: compare
  data:
    # Query: failed + errored job count change over the last 30 minutes.
    - refId: total_errors
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        editorMode: code
        expr: >-
          max(delta(awx_instance_status_total{instance="awx1:8013",
          status=~"failed|error"}[30m]))
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: total_errors
    # Expression: reduce `total_errors` to a single value with `max`.
    - refId: max_errors
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 80
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - total_errors
            reducer:
              params: []
              type: max
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: total_errors
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: max
        refId: max_errors
        type: reduce
    # Query: successful job count change over the last 30 minutes.
    - refId: total_success
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: code
        expr: >-
          max(delta(awx_instance_status_total{instance="awx1:8013",
          status="successful"}[30m]))
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: total_success
    # Expression: reduce `total_success` to a single value with `max`.
    - refId: max_success
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - total_success
            reducer:
              params: []
              type: max
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: total_success
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: max
        refId: max_success
        type: reduce
    # Expression: the rule condition — error share of all finished jobs.
    - refId: compare
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - max_success
            reducer:
              params: []
              type: avg
            type: query
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - max_success
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        # Quoted like the math expressions of the other rules in this file.
        expression: "$max_errors / ($max_errors+$max_success) >= .2"
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: compare
        type: math
# Rule: redis_queue_too_large_to_clear_in_2_min
# Condition: last redis queue size > mean event insertion rate * 120
# (with a per-second rate, 120 corresponds to the 2 minutes in the title).
# Pipeline: two Prometheus queries for node 'awx_1', each collapsed by a
# `reduce` expression (mean / last), then compared by one `math` expression.
# NOTE(review): the `conditions` lists under the reduce/math models look like
# UI-export residue (two of them name `event_insertion_rate`, which is not a
# refId defined in this rule); presumably only `expression`/`reducer`/`type`
# drive evaluation — confirm before relying on them.
- for: 60s
  noDataState: OK
  panelId: 1
  title: redis_queue_too_large_to_clear_in_2_min
  uid: redis_queue_too_large_to_clear_in_2_min
  condition: redis_queue_growing_faster_than_insertion_rate
  dashboardUid: awx
  data:
    # Query: instantaneous rate of callback-receiver DB inserts.
    - refId: events_insertion_rate_per_second
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: awx_prometheus
      model:
        editorMode: code
        expr: irate(callback_receiver_events_insert_db{node='awx_1'}[1m])
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: events_insertion_rate_per_second
      queryType: ""
    # Expression: mean of the insertion-rate series.
    - refId: mean_event_insertion_rate
      relativeTimeRange:
        from: 0
        to: 0
      # Quoted so the UID stays a string, matching the other rules' "-100";
      # a bare -100 is parsed as an integer.
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 3
              type: gt
            operator:
              type: and
            query:
              params:
                - event_insertion_rate
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: "-100"
        expression: events_insertion_rate_per_second
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: mean
        refId: mean_event_insertion_rate
        type: reduce
      queryType: ""
    # Query: current size of the callback-receiver redis queue.
    - refId: redis_queue_size
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: code
        expr: callback_receiver_events_queue_size_redis{node='awx_1'}
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: redis_queue_size
      queryType: ""
    # Expression: last value of the queue-size series.
    - refId: last_redis_queue_size
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 3
              type: gt
            operator:
              type: and
            query:
              params:
                - event_insertion_rate
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: "-100"
        expression: redis_queue_size
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_redis_queue_size
        type: reduce
      queryType: ""
    # Expression: the rule condition.
    - refId: redis_queue_growing_faster_than_insertion_rate
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - last_redis_queue_size
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: "($last_redis_queue_size > ($mean_event_insertion_rate * 120))"
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        # Every other model in this file repeats its query's refId; this one
        # was missing it.
        refId: redis_queue_growing_faster_than_insertion_rate
        type: math
# Rule: capacity_below_10_percent
# Condition (`pending_jobs_and_capacity_compare`):
#   (10% of total capacity > remaining capacity) && pending jobs > 1
# Pipeline: three Prometheus queries (remaining capacity, total capacity,
# pending jobs), each collapsed by a `reduce` (last) expression; one `math`
# expression takes 10% of total capacity; a final `math` expression combines
# everything into the condition.
# NOTE(review): the `conditions` lists under the reduce/math models look like
# UI-export residue (one names `capacity_below_10%`, which is not a refId
# defined in this rule); presumably only `expression`/`reducer`/`type` drive
# evaluation — confirm before relying on them.
- for: 60s
  noDataState: OK
  # panelId alone does not say which dashboard the panel belongs to; the redis
  # rule in this group pairs it with `dashboardUid: awx`, so do the same here.
  # NOTE(review): assumes panel 3 lives on dashboard `awx` — confirm.
  dashboardUid: awx
  panelId: 3
  uid: capacity_below_10_percent
  title: capacity_below_10_percent
  condition: pending_jobs_and_capacity_compare
  data:
    # Query: remaining capacity summed over every series.
    # NOTE(review): this sum has no `instance` filter while `total_capacity`
    # below is filtered to instance="awx1:8013" — confirm the two are
    # comparable when more than one node is scraped.
    - refId: remaining_capacity
      queryType: ""
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: awx_prometheus
      model:
        editorMode: code
        expr: sum(awx_instance_remaining_capacity)
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: remaining_capacity
    # Expression: last value of `remaining_capacity`.
    - refId: last_remaining_capacity
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 3
              type: outside_range
            operator:
              type: and
            query:
              params:
                - total_capacity
            reducer:
              params: []
              type: percent_diff
            type: query
        datasource:
          type: __expr__
          uid: "-100"
        expression: remaining_capacity
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_remaining_capacity
        type: reduce
    # Query: total capacity as reported by instance awx1:8013.
    - refId: total_capacity
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: code
        expr: sum(awx_instance_capacity{instance="awx1:8013"})
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: total_capacity
    # Expression: last value of `total_capacity`.
    - refId: last_total_capacity
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - capacity_below_10%
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: total_capacity
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_total_capacity
        type: reduce
    # Expression: 10% of the last total capacity.
    - refId: 10_percent_total_capacity
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - last_total_capacity
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: "$last_total_capacity*.10"
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: 10_percent_total_capacity
        type: math
    # Query: pending job count as reported by instance awx1:8013.
    - refId: pending_jobs
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: builder
        expr: awx_pending_jobs_total{instance="awx1:8013"}
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: pending_jobs
    # Expression: last value of `pending_jobs`.
    - refId: last_pending_jobs
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - pending_jobs_and_capacity_compare
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: pending_jobs
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_pending_jobs
        type: reduce
    # Expression: the rule condition.
    - refId: pending_jobs_and_capacity_compare
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - 10_percent_total_capacity
            reducer:
              params: []
              type: last
            type: query
          - evaluator:
              params:
                - 0
                - 0
              type: gt
            operator:
              type: and
            query:
              params:
                - pending_jobs
            reducer:
              params: []
              type: last
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        # Double-quoted scalar continued on a second line; the line break
        # folds to a single space, giving "... $last_pending_jobs > 1".
        expression:
          "($10_percent_total_capacity > $last_remaining_capacity) && $last_pending_jobs
          > 1"
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        # NOTE(review): `reducer` on a `math` expression looks like a leftover
        # from a reduce expression — confirm it is unused.
        reducer: mean
        refId: pending_jobs_and_capacity_compare
        type: math