mirror of
https://github.com/ansible/awx.git
synced 2026-01-12 10:30:03 -03:30
* Django and other webservers that care about proper hostnames don't like underscores in them.
537 lines
16 KiB
YAML
537 lines
16 KiB
YAML
---
# Alert rule provisioning for the `awx` folder: one rule group (`awx_rules`,
# evaluated every 10s) containing three rules. Each rule lists its queries and
# expressions under `data`; `condition` names the refId whose result decides
# whether the rule fires.
#
# NOTE(review): many reduce/math models below carry a `conditions` list, and
# some of those lists name refIds that do not exist in the rule (for example
# `event_insertion_rate`, `capacity_below_10%`). They look like UI-export
# leftovers that only classic-condition expressions would read -- confirm
# before pruning.
apiVersion: 1
groups:
  - folder: awx
    interval: 10s
    name: awx_rules
    orgId: 1
    # NOTE(review): the rules below set `noDataState` themselves; confirm
    # whether these snake_case group-level keys are honoured by the loader.
    exec_err_state: Alerting
    no_data_state: NoData
    rules:
      # Rule 1: fires when `compare` holds for 5m, i.e. when
      # max_errors / (max_errors + max_success) >= .2.
      # NOTE(review): this rule has a panelId but no dashboardUid (the redis
      # rule sets `dashboardUid: awx`) -- confirm whether that is intended.
      - for: 5m
        noDataState: OK
        panelId: 2
        title: failure_rate_exceeded_20_percent
        uid: failure_rate_exceeded_20_percent
        condition: compare
        data:
          # Change in failed/error job counts over a 30m window.
          - refId: total_errors
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status=~"failed|error"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_errors
          # Reduce total_errors to a single value (max).
          - refId: max_errors
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - total_errors
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_errors
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_errors
              type: reduce
          # Change in successful job counts over a 30m window.
          - refId: total_success
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status="successful"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_success
          # Reduce total_success to a single value (max).
          - refId: max_success
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - total_success
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_success
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_success
              type: reduce
          # Alert condition: share of errors among errors + successes.
          - refId: compare
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - max_success
                  reducer:
                    params: []
                    type: avg
                  type: query
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - max_success
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: $max_errors / ($max_errors+$max_success) >= .2
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: compare
              type: math

      # Rule 2: fires when, for 60s, the last redis queue size exceeds the
      # mean event insertion rate multiplied by 120 (two minutes of inserts
      # at a per-second rate).
      - for: 60s
        noDataState: OK
        panelId: 1
        title: redis_queue_too_large_to_clear_in_2_min
        uid: redis_queue_too_large_to_clear_in_2_min
        condition: redis_queue_growing_faster_than_insertion_rate
        dashboardUid: awx
        data:
          - refId: events_insertion_rate_per_second
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: irate(callback_receiver_events_insert_db{node='awx-1'}[1m])
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: events_insertion_rate_per_second
            queryType: ""
          # Reduce the insertion rate series to its mean.
          - refId: mean_event_insertion_rate
            relativeTimeRange:
              from: 0
              to: 0
            # Quoted so the UID stays a string, matching the other rules.
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 3
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - event_insertion_rate
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: "-100"
              expression: events_insertion_rate_per_second
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: mean
              refId: mean_event_insertion_rate
              type: reduce
            queryType: ""
          - refId: redis_queue_size
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: callback_receiver_events_queue_size_redis{node='awx-1'}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: redis_queue_size
            queryType: ""
          # Reduce the queue size series to its last value.
          - refId: last_redis_queue_size
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 3
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - event_insertion_rate
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: "-100"
              expression: redis_queue_size
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_redis_queue_size
              type: reduce
            queryType: ""
          # Alert condition: queue larger than 120x the mean insertion rate.
          - refId: redis_queue_growing_faster_than_insertion_rate
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - last_redis_queue_size
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "($last_redis_queue_size > ($mean_event_insertion_rate * 120))"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              # Repeats the outer refId, as every other model in this file does.
              refId: redis_queue_growing_faster_than_insertion_rate
              type: math

      # Rule 3: fires when, for 60s, 10% of total capacity is greater than the
      # remaining capacity and more than one job is pending.
      # NOTE(review): this rule has a panelId but no dashboardUid (the redis
      # rule sets `dashboardUid: awx`) -- confirm whether that is intended.
      - for: 60s
        noDataState: OK
        panelId: 3
        uid: capacity_below_10_percent
        title: capacity_below_10_percent
        condition: pending_jobs_and_capacity_compare
        data:
          # NOTE(review): unlike total_capacity below, this query has no
          # `instance` selector -- confirm both sums cover the same series.
          - refId: remaining_capacity
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: sum(awx_instance_remaining_capacity)
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: remaining_capacity
          # Reduce remaining_capacity to its last value.
          - refId: last_remaining_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 3
                    type: outside_range
                  operator:
                    type: and
                  query:
                    params:
                      - total_capacity
                  reducer:
                    params: []
                    type: percent_diff
                  type: query
              datasource:
                type: __expr__
                uid: "-100"
              expression: remaining_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_remaining_capacity
              type: reduce
          - refId: total_capacity
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: sum(awx_instance_capacity{instance="awx1:8013"})
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_capacity
          # Reduce total_capacity to its last value.
          - refId: last_total_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - capacity_below_10%
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_total_capacity
              type: reduce
          # 10% of the last total capacity value.
          - refId: 10_percent_total_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - last_total_capacity
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "$last_total_capacity*.10"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: 10_percent_total_capacity
              type: math
          - refId: pending_jobs
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: builder
              expr: awx_pending_jobs_total{instance="awx1:8013"}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: pending_jobs
          # Reduce pending_jobs to its last value.
          - refId: last_pending_jobs
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - pending_jobs_and_capacity_compare
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: pending_jobs
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_pending_jobs
              type: reduce
          # Alert condition: low remaining capacity while jobs are pending.
          - refId: pending_jobs_and_capacity_compare
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - 10_percent_total_capacity
                  reducer:
                    params: []
                    type: last
                  type: query
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - pending_jobs
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression:
                "($10_percent_total_capacity > $last_remaining_capacity) && $last_pending_jobs
                > 1"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              # NOTE(review): `reducer` on a `type: math` model looks like an
              # export leftover -- confirm it is unused before removing.
              reducer: mean
              refId: pending_jobs_and_capacity_compare
              type: math