---
# Grafana alert-rule provisioning for AWX.
#
# One rule group (`awx_rules`, evaluated every 10s) holding three rules:
#   * failure_rate_exceeded_20_percent
#   * redis_queue_too_large_to_clear_in_2_min
#   * capacity_below_10_percent
#
# Each rule is a small pipeline of `data` entries:
#   Prometheus query -> `reduce` expression (series -> single number)
#                    -> `math` expression (the boolean named by `condition`).
#
# Expression steps use the expression datasource, written as the *string*
# "-100". It is quoted everywhere so the id is never parsed as an integer.
#
# NOTE(review): the `conditions` lists under reduce/math models look like
# editor-exported scaffolding; several reference refIds that do not exist in
# the rule (e.g. `event_insertion_rate`, `capacity_below_10%`). They are kept
# verbatim -- confirm they are unused before pruning them.
apiVersion: 1

groups:
  - folder: awx
    interval: 10s
    name: awx_rules
    orgId: 1
    # NOTE(review): the per-rule keys used below are camelCase
    # (`noDataState`); confirm these snake_case group-level keys are
    # actually read by the provisioner.
    exec_err_state: Alerting
    no_data_state: NoData
    rules:
      # ---------------------------------------------------------------
      # Condition: errors / (errors + successes) >= 0.2, held for 5m.
      # Both inputs are max(delta(...[30m])) of awx_instance_status_total
      # on instance awx1:8013.
      # NOTE(review): `max()` over status=~"failed|error" takes the larger
      # of the matched series, not their sum -- confirm that is intended.
      # NOTE(review): when both counts are 0 the ratio is 0/0 -- confirm
      # how that evaluates.
      # ---------------------------------------------------------------
      - for: 5m
        noDataState: OK
        # panelId is paired with dashboardUid; assumes panel 2 is on the
        # `awx` dashboard, as panel 1 is for the redis rule -- TODO confirm.
        dashboardUid: awx
        panelId: 2
        title: failure_rate_exceeded_20_percent
        uid: failure_rate_exceeded_20_percent
        condition: compare
        data:
          - refId: total_errors
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status=~"failed|error"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_errors

          # Reduce the error series to its max value.
          - refId: max_errors
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [80, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - total_errors
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_errors
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_errors
              type: reduce

          - refId: total_success
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status="successful"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_success

          # Reduce the success series to its max value.
          - refId: max_success
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - total_success
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_success
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_success
              type: reduce

          # Alert condition: failure ratio at or above 20 %.
          - refId: compare
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - max_success
                  reducer:
                    params: []
                    type: avg
                  type: query
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - max_success
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "$max_errors / ($max_errors+$max_success) >= .2"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: compare
              type: math

      # ---------------------------------------------------------------
      # Condition: last redis queue size > mean insert rate * 120,
      # held for 60s (120 s = the "2 min" in the title).
      # ---------------------------------------------------------------
      - for: 60s
        noDataState: OK
        dashboardUid: awx
        panelId: 1
        title: redis_queue_too_large_to_clear_in_2_min
        uid: redis_queue_too_large_to_clear_in_2_min
        condition: redis_queue_growing_faster_than_insertion_rate
        data:
          - refId: events_insertion_rate_per_second
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: irate(callback_receiver_events_insert_db{node='awx-1'}[1m])
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: events_insertion_rate_per_second

          # Mean of the insert-rate series over the query window.
          - refId: mean_event_insertion_rate
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [3]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - event_insertion_rate
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: "-100"
              expression: events_insertion_rate_per_second
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: mean
              refId: mean_event_insertion_rate
              type: reduce

          - refId: redis_queue_size
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: callback_receiver_events_queue_size_redis{node='awx-1'}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: redis_queue_size

          # Most recent queue-size sample.
          - refId: last_redis_queue_size
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [3]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - event_insertion_rate
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: "-100"
              expression: redis_queue_size
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_redis_queue_size
              type: reduce

          # Alert condition: queue larger than 120 s worth of inserts.
          - refId: redis_queue_growing_faster_than_insertion_rate
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - last_redis_queue_size
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "($last_redis_queue_size > ($mean_event_insertion_rate * 120))"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: redis_queue_growing_faster_than_insertion_rate
              type: math

      # ---------------------------------------------------------------
      # Condition: remaining capacity < 10 % of total capacity AND more
      # than one pending job, held for 60s.
      # NOTE(review): remaining capacity is summed with no label filter
      # while total capacity and pending jobs select
      # instance="awx1:8013" -- confirm the scopes are meant to differ.
      # ---------------------------------------------------------------
      - for: 60s
        noDataState: OK
        # panelId is paired with dashboardUid; assumes panel 3 is on the
        # `awx` dashboard, as panel 1 is for the redis rule -- TODO confirm.
        dashboardUid: awx
        panelId: 3
        uid: capacity_below_10_percent
        title: capacity_below_10_percent
        condition: pending_jobs_and_capacity_compare
        data:
          - refId: remaining_capacity
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: sum(awx_instance_remaining_capacity)
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: remaining_capacity

          # Most recent remaining-capacity sample.
          - refId: last_remaining_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [3]
                    type: outside_range
                  operator:
                    type: and
                  query:
                    params:
                      - total_capacity
                  reducer:
                    params: []
                    type: percent_diff
                  type: query
              datasource:
                type: __expr__
                uid: "-100"
              expression: remaining_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_remaining_capacity
              type: reduce

          - refId: total_capacity
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: sum(awx_instance_capacity{instance="awx1:8013"})
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_capacity

          # Most recent total-capacity sample.
          - refId: last_total_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - capacity_below_10%
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_total_capacity
              type: reduce

          # Threshold: 10 % of total capacity.
          - refId: 10_percent_total_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - last_total_capacity
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "$last_total_capacity*.10"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: 10_percent_total_capacity
              type: math

          - refId: pending_jobs
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: builder
              expr: awx_pending_jobs_total{instance="awx1:8013"}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: pending_jobs

          # Most recent pending-jobs sample.
          - refId: last_pending_jobs
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - pending_jobs_and_capacity_compare
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: pending_jobs
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_pending_jobs
              type: reduce

          # Alert condition: low remaining capacity while jobs are pending.
          - refId: pending_jobs_and_capacity_compare
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              conditions:
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - 10_percent_total_capacity
                  reducer:
                    params: []
                    type: last
                  type: query
                - evaluator:
                    params: [0, 0]
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - pending_jobs
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "($10_percent_total_capacity > $last_remaining_capacity) && $last_pending_jobs > 1"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: mean
              refId: pending_jobs_and_capacity_compare
              type: math