mirror of
https://github.com/ansible/awx.git
synced 2026-01-11 01:57:35 -03:30
Merge pull request #13001 from kdelee/moooore-dashboard
Moooore 🐮 dashboard
This commit is contained in:
commit
50614b961e
@ -180,7 +180,7 @@ services:
|
||||
image: postgres:12
|
||||
container_name: tools_postgres_1
|
||||
# additional logging settings for postgres can be found at https://www.postgresql.org/docs/current/runtime-config-logging.html
|
||||
command: postgres -c log_destination=stderr -c log_min_messages=info -c log_min_duration_statement={{ pg_log_min_duration_statement|default(1000) }}
|
||||
command: postgres -c log_destination=stderr -c log_min_messages=info -c log_min_duration_statement={{ pg_log_min_duration_statement|default(1000) }} -c max_connections={{ pg_max_connections|default(1024) }}
|
||||
environment:
|
||||
POSTGRES_HOST_AUTH_METHOD: trust
|
||||
POSTGRES_USER: {{ pg_username }}
|
||||
|
||||
@ -36,9 +36,18 @@ GRAFANA=true PROMETHEUS=true EXTRA_SOURCES_ANSIBLE_OPTS="-e scrape_interval=1 ad
|
||||
|
||||
We are configuring alerts in Grafana using the provisioning files method. This feature is new in Grafana as of August 2022. Documentation can be found at https://grafana.com/docs/grafana/latest/administration/provisioning/#alerting; however, it does not show all of the parameters available in the config.
|
||||
|
||||
One way to understand how to build rules is to build them in the UI and use chrometools to inspect the payload as you save the rules. It appears that the "data" portion of the payload for each rule is the same syntax as needed in the provisioning file config. To reload the alerts without restarting the container, from within the container you can send a POST with `curl -X POST http://admin:admin@localhost:3000/api/admin/provisioning/alerting/reload`. Keep in mind the grafana container does not contain `curl`. You can install it with the command `apk add curl`.
|
||||
One way to understand how to build rules is to build them in the UI and use your browser's developer tools (e.g. Chrome DevTools) to inspect the payload as you save the rules. It appears that the "data" portion of the payload for each rule uses the same syntax as the provisioning file config. To reload the alerts without restarting the container, you can send a POST from your terminal with `curl -X POST http://admin:admin@localhost:3001/api/admin/provisioning/alerting/reload`.
|
||||
|
||||
Another way to export rules is to explore the API.
|
||||
1. Get all the folders: `GET` to `/api/folders`
|
||||
2. Get the rules: `GET` to `/api/ruler/grafana/api/v1/rules/{{ Folder }}`
|
||||
|
||||
You can do this via curl or in the web browser.
|
||||
|
||||
### Included Alerts
|
||||
|
||||
#### Alert if remaining capacity low and pending jobs exist
|
||||
|
||||
We want to know if jobs are pending but we lack the capacity in the cluster to run them. Our approach is to sum all remaining capacity in the cluster and compare it to the total capacity of the cluster. If less than 10% of our capacity is remaining and we have pending jobs, and this is true for more than 180s, we will fire the alert.
|
||||
|
||||
This alert is named "capacity_below_10_percent" and can be found at https://github.com/ansible/awx/blob/devel/tools/grafana/alerting/alerts.yml
|
||||
|
||||
@ -2,15 +2,21 @@
|
||||
apiVersion: 1
|
||||
groups:
|
||||
- folder: awx
|
||||
interval: 60s
|
||||
interval: 10s
|
||||
name: awx_rules
|
||||
orgId: 1
|
||||
exec_err_state: Alerting
|
||||
no_data_state: NoData
|
||||
rules:
|
||||
- condition: if_failures_too_high
|
||||
dashboardUid: awx
|
||||
- for: 5m
|
||||
noDataState: OK
|
||||
panelId: 2
|
||||
title: failure_rate_exceeded_20_percent
|
||||
uid: failure_rate_exceeded_20_percent
|
||||
condition: compare
|
||||
data:
|
||||
- refId: total_errors
|
||||
queryType: ''
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
@ -19,7 +25,7 @@ groups:
|
||||
editorMode: code
|
||||
expr: >-
|
||||
max(delta(awx_instance_status_total{instance="awx1:8013",
|
||||
status="failed|error"}[30m]))
|
||||
status=~"failed|error"}[30m]))
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
@ -27,11 +33,11 @@ groups:
|
||||
range: true
|
||||
refId: total_errors
|
||||
- refId: max_errors
|
||||
queryType: ''
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: '-100'
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
@ -60,7 +66,7 @@ groups:
|
||||
refId: max_errors
|
||||
type: reduce
|
||||
- refId: total_success
|
||||
queryType: ''
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
@ -80,11 +86,11 @@ groups:
|
||||
range: true
|
||||
refId: total_success
|
||||
- refId: max_success
|
||||
queryType: ''
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: '-100'
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
@ -113,11 +119,11 @@ groups:
|
||||
refId: max_success
|
||||
type: reduce
|
||||
- refId: compare
|
||||
queryType: ''
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: '-100'
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
@ -158,15 +164,19 @@ groups:
|
||||
maxDataPoints: 43200
|
||||
refId: compare
|
||||
type: math
|
||||
for: 30m
|
||||
- for: 60s
|
||||
noDataState: OK
|
||||
panelId: 2
|
||||
title: failure_rate_exceeded_20_percent
|
||||
uid: failure_rate_exceeded_20_percent
|
||||
- condition: if_redis_queue_too_large
|
||||
panelId: 1
|
||||
title: redis_queue_too_large_to_clear_in_2_min
|
||||
uid: redis_queue_too_large_to_clear_in_2_min
|
||||
condition: redis_queue_growing_faster_than_insertion_rate
|
||||
dashboardUid: awx
|
||||
data:
|
||||
- datasourceUid: awx_prometheus
|
||||
- refId: events_insertion_rate_per_second
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: awx_prometheus
|
||||
model:
|
||||
editorMode: code
|
||||
expr: irate(callback_receiver_events_insert_db{node='awx_1'}[1m])
|
||||
@ -177,11 +187,11 @@ groups:
|
||||
range: true
|
||||
refId: events_insertion_rate_per_second
|
||||
queryType: ""
|
||||
refId: events_insertion_rate_per_second
|
||||
- refId: mean_event_insertion_rate
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
from: 0
|
||||
to: 0
|
||||
- datasourceUid: -100
|
||||
datasourceUid: -100
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
@ -208,11 +218,11 @@ groups:
|
||||
refId: mean_event_insertion_rate
|
||||
type: reduce
|
||||
queryType: ""
|
||||
refId: mean_event_insertion_rate
|
||||
- refId: redis_queue_size
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
from: 300
|
||||
to: 0
|
||||
- datasourceUid: awx_prometheus
|
||||
datasourceUid: awx_prometheus
|
||||
model:
|
||||
datasource:
|
||||
type: prometheus
|
||||
@ -226,11 +236,11 @@ groups:
|
||||
range: true
|
||||
refId: redis_queue_size
|
||||
queryType: ""
|
||||
refId: redis_queue_size
|
||||
- refId: last_redis_queue_size
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
- datasourceUid: -100
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: -100
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
@ -257,11 +267,12 @@ groups:
|
||||
refId: last_redis_queue_size
|
||||
type: reduce
|
||||
queryType: ""
|
||||
refId: last_redis_queue_size
|
||||
- refId: redis_queue_growing_faster_than_insertion_rate
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
- datasourceUid: -100
|
||||
datasourceUid: -100
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
@ -282,44 +293,35 @@ groups:
|
||||
name: Expression
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: '($last_redis_queue_size > ($mean_event_insertion_rate * 120))'
|
||||
expression: "($last_redis_queue_size > ($mean_event_insertion_rate * 120))"
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: redis_queue_growing_faster_than_insertion_rate
|
||||
type: math
|
||||
queryType: ""
|
||||
refId: redis_queue_growing_faster_than_insertion_rate
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
for: 60s
|
||||
- for: 60s
|
||||
noDataState: OK
|
||||
panelId: 1
|
||||
title: redis_queue_too_large_to_clear_in_2_min
|
||||
uid: redis_queue_too_large_to_clear_in_2_min
|
||||
- condition: if_capacity_is_too_low
|
||||
dashboardUid: awx
|
||||
no_data_state: OK
|
||||
exec_err_state: Error
|
||||
panelId: 3
|
||||
uid: capacity_below_10_percent
|
||||
title: capacity_below_10_percent
|
||||
condition: pending_jobs_and_capacity_compare
|
||||
data:
|
||||
- refId: remaining_capacity
|
||||
queryType: ''
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 1800
|
||||
from: 300
|
||||
to: 0
|
||||
datasourceUid: awx_prometheus
|
||||
model:
|
||||
editorMode: builder
|
||||
expr: awx_instance_remaining_capacity{instance="awx1:8013"}
|
||||
editorMode: code
|
||||
expr: sum(awx_instance_remaining_capacity)
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: true
|
||||
refId: remaining_capacity
|
||||
- refId: if_capacity_is_too_low
|
||||
queryType: ''
|
||||
- refId: last_remaining_capacity
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
@ -328,14 +330,63 @@ groups:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 20
|
||||
- 0
|
||||
type: lt
|
||||
- 3
|
||||
type: outside_range
|
||||
operator:
|
||||
type: when
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- remaining_capacity
|
||||
- total_capacity
|
||||
reducer:
|
||||
params: []
|
||||
type: percent_diff
|
||||
type: query
|
||||
datasource:
|
||||
type: __expr__
|
||||
uid: "-100"
|
||||
expression: remaining_capacity
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: last_remaining_capacity
|
||||
type: reduce
|
||||
- refId: total_capacity
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: awx_prometheus
|
||||
model:
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: awx_prometheus
|
||||
editorMode: code
|
||||
expr: sum(awx_instance_capacity{instance="awx1:8013"})
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: true
|
||||
refId: total_capacity
|
||||
- refId: last_total_capacity
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- capacity_below_10%
|
||||
reducer:
|
||||
params: []
|
||||
type: avg
|
||||
@ -344,12 +395,142 @@ groups:
|
||||
name: Expression
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: remaining_capacity
|
||||
expression: total_capacity
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: if_capacity_is_too_low
|
||||
type: classic_conditions
|
||||
for: 30m
|
||||
title: if_capacity_is_too_low
|
||||
uid: if_capacity_is_too_low
|
||||
reducer: last
|
||||
refId: last_total_capacity
|
||||
type: reduce
|
||||
- refId: 10_percent_total_capacity
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- last_total_capacity
|
||||
reducer:
|
||||
params: []
|
||||
type: avg
|
||||
type: query
|
||||
datasource:
|
||||
name: Expression
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: "$last_total_capacity*.10"
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
refId: 10_percent_total_capacity
|
||||
type: math
|
||||
- refId: pending_jobs
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: awx_prometheus
|
||||
model:
|
||||
datasource:
|
||||
type: prometheus
|
||||
uid: awx_prometheus
|
||||
editorMode: builder
|
||||
expr: awx_pending_jobs_total{instance="awx1:8013"}
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
legendFormat: __auto
|
||||
maxDataPoints: 43200
|
||||
range: true
|
||||
refId: pending_jobs
|
||||
- refId: last_pending_jobs
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- pending_jobs_and_capacity_compare
|
||||
reducer:
|
||||
params: []
|
||||
type: avg
|
||||
type: query
|
||||
datasource:
|
||||
name: Expression
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: pending_jobs
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: last
|
||||
refId: last_pending_jobs
|
||||
type: reduce
|
||||
- refId: pending_jobs_and_capacity_compare
|
||||
queryType: ""
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
datasourceUid: "-100"
|
||||
model:
|
||||
conditions:
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- 10_percent_total_capacity
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
- evaluator:
|
||||
params:
|
||||
- 0
|
||||
- 0
|
||||
type: gt
|
||||
operator:
|
||||
type: and
|
||||
query:
|
||||
params:
|
||||
- pending_jobs
|
||||
reducer:
|
||||
params: []
|
||||
type: last
|
||||
type: query
|
||||
datasource:
|
||||
name: Expression
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression:
|
||||
"($10_percent_total_capacity > $last_remaining_capacity) && $last_pending_jobs
|
||||
> 1"
|
||||
hide: false
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
reducer: mean
|
||||
refId: pending_jobs_and_capacity_compare
|
||||
type: math
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user