# awx/tools/grafana/alerting/alerts.yml

---
# Provisioned alert rules for AWX metrics.
apiVersion: 1
groups:
  # Single rule group: identity first (org, name, folder), then scheduling.
  - orgId: 1
    name: awx_rules
    folder: awx
    interval: 10s
    # NOTE(review): these two keys use snake_case at group level, while the
    # rules below set the camelCase per-rule key noDataState. Confirm the
    # consumer actually reads them here.
    exec_err_state: Alerting
    no_data_state: NoData
    rules:
      # Fires when the failed/error count is at least 20% of the
      # (failed/error + successful) count, sustained for 5 minutes.
      # Expression models carry only the fields their expression type uses
      # (reduce: expression + reducer; math: expression).
      - uid: failure_rate_exceeded_20_percent
        title: failure_rate_exceeded_20_percent
        condition: compare
        for: 5m
        noDataState: OK
        # panelId only identifies a panel together with its dashboard; "awx" is
        # the dashboard the other panel-linked rule in this group points at.
        dashboardUid: awx
        panelId: 2
        data:
          # Largest 30-minute delta among the failed/error status series
          # scraped from awx1:8013, queried over the last 10 minutes.
          - refId: total_errors
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status=~"failed|error"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_errors
          # Expression: collapse total_errors to its maximum value.
          - refId: max_errors
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_errors
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_errors
              type: reduce
          # Same query as total_errors, but for the "successful" status.
          - refId: total_success
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status="successful"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_success
          # Expression: collapse total_success to its maximum value.
          - refId: max_success
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_success
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_success
              type: reduce
          # Alert condition: errors / (errors + successes) >= 0.2.
          - refId: compare
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: $max_errors / ($max_errors+$max_success) >= .2
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: compare
              type: math
      # Fires when the redis event queue holds more events than would be
      # inserted in 120 seconds at the recent mean insertion rate, sustained
      # for 60 seconds.
      # Expression models carry only the fields their expression type uses
      # (reduce: expression + reducer; math: expression).
      - uid: redis_queue_too_large_to_clear_in_2_min
        title: redis_queue_too_large_to_clear_in_2_min
        condition: redis_queue_growing_faster_than_insertion_rate
        for: 60s
        noDataState: OK
        dashboardUid: awx
        panelId: 1
        data:
          # Instantaneous per-second event insertion rate for node awx_1,
          # queried over the last 5 minutes.
          - refId: events_insertion_rate_per_second
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: irate(callback_receiver_events_insert_db{node='awx_1'}[1m])
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: events_insertion_rate_per_second
          # Expression: mean of the insertion-rate series.
          # The expression UID is quoted so it stays the string "-100" rather
          # than being parsed as an integer.
          - refId: mean_event_insertion_rate
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                type: __expr__
                uid: "-100"
              expression: events_insertion_rate_per_second
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: mean
              refId: mean_event_insertion_rate
              type: reduce
          # Redis event queue size for node awx_1 over the last 5 minutes.
          - refId: redis_queue_size
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: callback_receiver_events_queue_size_redis{node='awx_1'}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: redis_queue_size
          # Expression: most recent queue size.
          - refId: last_redis_queue_size
            queryType: ""
            # from/to must be nested under relativeTimeRange, not siblings of it.
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                type: __expr__
                uid: "-100"
              expression: redis_queue_size
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_redis_queue_size
              type: reduce
          # Alert condition: queue size > mean insertion rate * 120 seconds.
          - refId: redis_queue_growing_faster_than_insertion_rate
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "($last_redis_queue_size > ($mean_event_insertion_rate * 120))"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: redis_queue_growing_faster_than_insertion_rate
              type: math
      # Fires when remaining capacity is below 10% of total capacity while
      # more than one job is pending, sustained for 60 seconds.
      # Expression models carry only the fields their expression type uses
      # (reduce: expression + reducer; math: expression).
      - uid: capacity_below_10_percent
        title: capacity_below_10_percent
        condition: pending_jobs_and_capacity_compare
        for: 60s
        noDataState: OK
        # panelId only identifies a panel together with its dashboard; "awx" is
        # the dashboard the other panel-linked rule in this group points at.
        dashboardUid: awx
        panelId: 3
        data:
          # Summed remaining capacity over the last 5 minutes. Pinned to the
          # same scrape target as total_capacity and pending_jobs so both sides
          # of the comparison are computed over the same set of series.
          - refId: remaining_capacity
            queryType: ""
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: sum(awx_instance_remaining_capacity{instance="awx1:8013"})
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: remaining_capacity
          # Expression: most recent remaining capacity.
          - refId: last_remaining_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                type: __expr__
                uid: "-100"
              expression: remaining_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_remaining_capacity
              type: reduce
          # Summed total capacity over the last 10 minutes.
          - refId: total_capacity
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: sum(awx_instance_capacity{instance="awx1:8013"})
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_capacity
          # Expression: most recent total capacity.
          - refId: last_total_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_total_capacity
              type: reduce
          # Expression: the 10%-of-total threshold.
          - refId: 10_percent_total_capacity
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: "$last_total_capacity*.10"
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: 10_percent_total_capacity
              type: math
          # Pending job count over the last 10 minutes.
          - refId: pending_jobs
            queryType: ""
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: builder
              expr: awx_pending_jobs_total{instance="awx1:8013"}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: pending_jobs
          # Expression: most recent pending job count.
          - refId: last_pending_jobs
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: pending_jobs
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_pending_jobs
              type: reduce
          # Alert condition: threshold exceeds remaining capacity AND more
          # than one job is pending.
          - refId: pending_jobs_and_capacity_compare
            queryType: ""
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: "-100"
            model:
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              # Folded scalar: the two lines join with a single space.
              expression: >-
                ($10_percent_total_capacity > $last_remaining_capacity)
                && $last_pending_jobs > 1
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: pending_jobs_and_capacity_compare
              type: math