awx/tools/grafana/alerting/alerts.yml
Chris Meyers ae1235b223 Rename container hostname from awx_1 to awx-1
* Django and other webservers that care about proper hostnames don't
  like underscores in them.
2024-04-03 15:58:17 -04:00

537 lines
16 KiB
YAML

---
# Alert-rule provisioning document: a single rule group whose alert rules
# are the entries listed under `rules`.
apiVersion: 1
groups:
# The one rule group in this file; `folder`, `name` and `orgId` identify it.
- folder: awx
# Interval setting for the group (10 seconds).
interval: 10s
name: awx_rules
orgId: 1
# NOTE(review): the rules below set `noDataState` (camelCase) individually;
# confirm that these group-level snake_case keys are honored by the
# provisioning loader rather than silently ignored.
exec_err_state: Alerting
no_data_state: NoData
# Each entry under `rules` is one alert rule: a `condition` refId plus the
# `data` nodes (queries and expressions) it is computed from.
rules:
# Rule: failure_rate_exceeded_20_percent
# The alert condition is the `compare` math node at the end of `data`:
#   max_errors / (max_errors + max_success) >= .2
# where max_errors / max_success are `max` reductions of the two Prometheus
# queries below. The rule uses `for: 5m` and treats missing data as OK.
- for: 5m
  noDataState: OK
  # Dashboard panel this rule is linked to. `dashboardUid` accompanies
  # `panelId`, matching the redis-queue rule in this file.
  dashboardUid: awx
  panelId: 2
  title: failure_rate_exceeded_20_percent
  uid: failure_rate_exceeded_20_percent
  condition: compare
  data:
    # Prometheus query: max 30m delta of the failed|error status counters.
    - refId: total_errors
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        editorMode: code
        expr: >-
          max(delta(awx_instance_status_total{instance="awx1:8013",
          status=~"failed|error"}[30m]))
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: total_errors

    # Reduce expression: `max` of total_errors.
    - refId: max_errors
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        # NOTE(review): `conditions` looks like classic-condition scaffolding;
        # presumably unused by a `type: reduce` node - confirm before pruning.
        conditions:
          - evaluator:
              params: [80, 0]
              type: gt
            operator:
              type: and
            query:
              params: [total_errors]
            reducer:
              params: []
              type: max
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: total_errors
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: max
        refId: max_errors
        type: reduce

    # Prometheus query: max 30m delta of the successful status counter.
    - refId: total_success
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: code
        expr: >-
          max(delta(awx_instance_status_total{instance="awx1:8013",
          status="successful"}[30m]))
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: total_success

    # Reduce expression: `max` of total_success.
    - refId: max_success
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: [total_success]
            reducer:
              params: []
              type: max
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: total_success
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: max
        refId: max_success
        type: reduce

    # Math expression used as the rule's `condition`.
    # NOTE(review): when both inputs are 0 the ratio is 0/0; confirm how the
    # expression engine evaluates that case (the capacity rule in this file
    # guards its comparison with an extra `&&` term).
    - refId: compare
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: [max_success]
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: "$max_errors / ($max_errors+$max_success) >= .2"
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: compare
        type: math
# Rule: redis_queue_too_large_to_clear_in_2_min
# The alert condition is the final math node:
#   last_redis_queue_size > mean_event_insertion_rate * 120
# i.e. the latest queue size exceeds 120 times the mean insertion rate
# (presumably 120 seconds' worth of events, going by the per-second refId
# name). The rule uses `for: 60s` and treats missing data as OK.
- for: 60s
  noDataState: OK
  dashboardUid: awx
  panelId: 1
  title: redis_queue_too_large_to_clear_in_2_min
  uid: redis_queue_too_large_to_clear_in_2_min
  condition: redis_queue_growing_faster_than_insertion_rate
  data:
    # Prometheus query: irate over 1m of the DB insert counter for awx-1.
    - refId: events_insertion_rate_per_second
      queryType: ""
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: awx_prometheus
      model:
        editorMode: code
        expr: irate(callback_receiver_events_insert_db{node='awx-1'}[1m])
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: events_insertion_rate_per_second

    # Reduce expression: `mean` of events_insertion_rate_per_second.
    # "-100" is quoted so it stays a string, as in the other rules.
    - refId: mean_event_insertion_rate
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        # NOTE(review): `conditions` names `event_insertion_rate`, which is
        # not a refId defined in this rule; presumably the block is unused by
        # a `type: reduce` node - confirm before pruning.
        conditions:
          - evaluator:
              params: [3]
              type: gt
            operator:
              type: and
            query:
              params: [event_insertion_rate]
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: "-100"
        expression: events_insertion_rate_per_second
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: mean
        refId: mean_event_insertion_rate
        type: reduce

    # Prometheus query: current redis event queue size for awx-1.
    - refId: redis_queue_size
      queryType: ""
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: code
        expr: callback_receiver_events_queue_size_redis{node='awx-1'}
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: redis_queue_size

    # Reduce expression: `last` value of redis_queue_size.
    - refId: last_redis_queue_size
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params: [3]
              type: gt
            operator:
              type: and
            query:
              params: [event_insertion_rate]
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: "-100"
        expression: redis_queue_size
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_redis_queue_size
        type: reduce

    # Math expression used as the rule's `condition`.
    - refId: redis_queue_growing_faster_than_insertion_rate
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: [last_redis_queue_size]
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: "($last_redis_queue_size > ($mean_event_insertion_rate * 120))"
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        # Model refId mirrors the node's refId, as every other model here does.
        refId: redis_queue_growing_faster_than_insertion_rate
        type: math
# Rule: capacity_below_10_percent
# The alert condition is the final math node:
#   (10_percent_total_capacity > last_remaining_capacity)
#     && last_pending_jobs > 1
# where 10_percent_total_capacity = last_total_capacity * .10. The rule uses
# `for: 60s` and treats missing data as OK.
- for: 60s
  noDataState: OK
  # Dashboard panel this rule is linked to. `dashboardUid` accompanies
  # `panelId`, matching the redis-queue rule in this file.
  dashboardUid: awx
  panelId: 3
  uid: capacity_below_10_percent
  title: capacity_below_10_percent
  condition: pending_jobs_and_capacity_compare
  data:
    # Prometheus query: sum of remaining capacity.
    # NOTE(review): unlike total_capacity below, this query has no
    # `instance` matcher - confirm that is intended.
    - refId: remaining_capacity
      queryType: ""
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: awx_prometheus
      model:
        editorMode: code
        expr: sum(awx_instance_remaining_capacity)
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: remaining_capacity

    # Reduce expression: `last` value of remaining_capacity.
    - refId: last_remaining_capacity
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        # NOTE(review): `conditions` looks like classic-condition scaffolding;
        # presumably unused by a `type: reduce` node - confirm before pruning.
        conditions:
          - evaluator:
              params: [3]
              type: outside_range
            operator:
              type: and
            query:
              params: [total_capacity]
            reducer:
              params: []
              type: percent_diff
            type: query
        datasource:
          type: __expr__
          uid: "-100"
        expression: remaining_capacity
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_remaining_capacity
        type: reduce

    # Prometheus query: sum of capacity reported via awx1:8013.
    - refId: total_capacity
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: code
        expr: sum(awx_instance_capacity{instance="awx1:8013"})
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: total_capacity

    # Reduce expression: `last` value of total_capacity.
    - refId: last_total_capacity
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        # NOTE(review): the query param below is not a refId defined in this
        # rule; presumably harmless because the block is unused - confirm.
        conditions:
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: ["capacity_below_10%"]
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: total_capacity
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_total_capacity
        type: reduce

    # Math expression: 10% of the latest total capacity.
    - refId: 10_percent_total_capacity
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: [last_total_capacity]
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: "$last_total_capacity*.10"
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: 10_percent_total_capacity
        type: math

    # Prometheus query: pending jobs counter from awx1:8013.
    - refId: pending_jobs
      queryType: ""
      relativeTimeRange:
        from: 600
        to: 0
      datasourceUid: awx_prometheus
      model:
        datasource:
          type: prometheus
          uid: awx_prometheus
        editorMode: builder
        expr: awx_pending_jobs_total{instance="awx1:8013"}
        hide: false
        intervalMs: 1000
        legendFormat: __auto
        maxDataPoints: 43200
        range: true
        refId: pending_jobs

    # Reduce expression: `last` value of pending_jobs.
    - refId: last_pending_jobs
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: [pending_jobs_and_capacity_compare]
            reducer:
              params: []
              type: avg
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        expression: pending_jobs
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: last_pending_jobs
        type: reduce

    # Math expression used as the rule's `condition`.
    - refId: pending_jobs_and_capacity_compare
      queryType: ""
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: "-100"
      model:
        conditions:
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: [10_percent_total_capacity]
            reducer:
              params: []
              type: last
            type: query
          - evaluator:
              params: [0, 0]
              type: gt
            operator:
              type: and
            query:
              params: [pending_jobs]
            reducer:
              params: []
              type: last
            type: query
        datasource:
          name: Expression
          type: __expr__
          uid: __expr__
        # Folded scalar: the two lines join with a single space.
        expression: >-
          ($10_percent_total_capacity > $last_remaining_capacity)
          && $last_pending_jobs > 1
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: mean
        refId: pending_jobs_and_capacity_compare
        type: math