# awx/tools/grafana/alerting/alerts.yml

---
# Grafana alert-rule provisioning file; every rule below belongs to the single
# `awx_rules` group declared here.
apiVersion: 1
groups:
  # Group metadata: owning org, group name, target folder and the interval
  # declared for the group (60s).
  - orgId: 1
    name: awx_rules
    folder: awx
    interval: 60s
    rules:
      # Rule: failure_rate_exceeded_20_percent
      # Pipeline (by refId):
      #   total_errors  -> max 30m delta of failed/error status counts
      #   max_errors    -> reduce(total_errors, max)
      #   total_success -> max 30m delta of successful status counts
      #   max_success   -> reduce(total_success, max)
      #   compare       -> errors / (errors + successes) >= .2
      # `condition` must name one of the refIds in `data`; `compare` is the
      # final boolean expression, so it is the one the rule evaluates.
      - condition: compare
        dashboardUid: awx
        data:
          # Prometheus query for failed/error statuses on instance awx1:8013.
          - refId: total_errors
            queryType: ''
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              # `=~` (regex match) is required here: with `=` the selector
              # would look for the literal label value "failed|error" and
              # never match a series whose status is "failed" or "error".
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status=~"failed|error"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_errors
          # Reduce expression: collapses total_errors to its max value.
          # NOTE(review): the `conditions` list looks like UI-export residue
          # for a `reduce` node - confirm before removing.
          - refId: max_errors
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params:
                      - 80
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - total_errors
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_errors
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_errors
              type: reduce
          # Prometheus query for successful statuses on instance awx1:8013.
          - refId: total_success
            queryType: ''
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: >-
                max(delta(awx_instance_status_total{instance="awx1:8013",
                status="successful"}[30m]))
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: total_success
          # Reduce expression: collapses total_success to its max value.
          - refId: max_success
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - total_success
                  reducer:
                    params: []
                    type: max
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: total_success
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: max
              refId: max_success
              type: reduce
          # Math expression: true when errors make up at least 20% of
          # (errors + successes). This is the rule condition.
          # NOTE(review): the two `conditions` entries below are identical and
          # look like UI-export residue for a `math` node - confirm before
          # removing.
          - refId: compare
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - max_success
                  reducer:
                    params: []
                    type: avg
                  type: query
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - max_success
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: $max_errors / ($max_errors+$max_success) >= .2
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: compare
              type: math
        for: 30m
        noDataState: OK
        panelId: 2
        title: failure_rate_exceeded_20_percent
        uid: failure_rate_exceeded_20_percent
      # Rule: redis_queue_too_large_to_clear_in_2_min
      # Pipeline (by refId):
      #   events_insertion_rate_per_second -> irate of DB event inserts (1m)
      #   mean_event_insertion_rate        -> reduce(..., mean)
      #   redis_queue_size                 -> current redis callback queue size
      #   last_redis_queue_size            -> reduce(redis_queue_size, last)
      #   redis_queue_growing_faster_than_insertion_rate
      #                                    -> queue size > mean rate * 120
      # `condition` must name one of the refIds in `data`; the final math
      # expression is the one the rule evaluates.
      - condition: redis_queue_growing_faster_than_insertion_rate
        dashboardUid: awx
        data:
          # Prometheus query: per-second event insertion rate for node awx_1.
          - refId: events_insertion_rate_per_second
            queryType: ''
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: code
              expr: irate(callback_receiver_events_insert_db{node='awx_1'}[1m])
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: events_insertion_rate_per_second
          # Reduce expression: mean of the insertion-rate series.
          # The UID is quoted so it stays a string, matching the other rules
          # in this file (unquoted -100 is parsed as an integer).
          # NOTE(review): `conditions` references `event_insertion_rate`,
          # which is not a refId in this rule; it looks like UI-export residue
          # for a `reduce` node - confirm before removing.
          - refId: mean_event_insertion_rate
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params:
                      - 3
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - event_insertion_rate
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: '-100'
              expression: events_insertion_rate_per_second
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: mean
              refId: mean_event_insertion_rate
              type: reduce
          # Prometheus query: redis callback queue size for node awx_1.
          - refId: redis_queue_size
            queryType: ''
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: awx_prometheus
            model:
              datasource:
                type: prometheus
                uid: awx_prometheus
              editorMode: code
              expr: callback_receiver_events_queue_size_redis{node='awx_1'}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: redis_queue_size
          # Reduce expression: last value of the queue-size series.
          - refId: last_redis_queue_size
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params:
                      - 3
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - event_insertion_rate
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: '-100'
              expression: redis_queue_size
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              reducer: last
              refId: last_redis_queue_size
              type: reduce
          # Math expression: true when the queue holds more events than the
          # mean per-second insertion rate multiplied by 120 (presumably the
          # 2-minute window named in the rule title, in seconds).
          - refId: redis_queue_growing_faster_than_insertion_rate
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - last_redis_queue_size
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: '($last_redis_queue_size > ($mean_event_insertion_rate * 120))'
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: redis_queue_growing_faster_than_insertion_rate
              type: math
        for: 60s
        noDataState: OK
        panelId: 1
        title: redis_queue_too_large_to_clear_in_2_min
        uid: redis_queue_too_large_to_clear_in_2_min
      # Rule: if_capacity_is_too_low
      # Pipeline (by refId):
      #   remaining_capacity     -> remaining capacity of instance awx1:8013
      #   if_capacity_is_too_low -> classic condition: avg(remaining_capacity)
      #                             is lower than 20
      - condition: if_capacity_is_too_low
        dashboardUid: awx
        data:
          # Prometheus query over the last 1800 seconds.
          - refId: remaining_capacity
            queryType: ''
            relativeTimeRange:
              from: 1800
              to: 0
            datasourceUid: awx_prometheus
            model:
              editorMode: builder
              expr: awx_instance_remaining_capacity{instance="awx1:8013"}
              hide: false
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: true
              refId: remaining_capacity
          # Classic-conditions expression; this is the rule condition.
          - refId: if_capacity_is_too_low
            queryType: ''
            relativeTimeRange:
              from: 0
              to: 0
            datasourceUid: '-100'
            model:
              conditions:
                - evaluator:
                    params:
                      - 20
                      - 0
                    type: lt
                  operator:
                    type: when
                  query:
                    params:
                      - remaining_capacity
                  reducer:
                    params: []
                    type: avg
                  type: query
              datasource:
                name: Expression
                type: __expr__
                uid: __expr__
              expression: remaining_capacity
              hide: false
              intervalMs: 1000
              maxDataPoints: 43200
              refId: if_capacity_is_too_low
              type: classic_conditions
        for: 30m
        # camelCase keys, matching the `noDataState` key used by the other
        # rules in this file; the previous snake_case spellings
        # (`no_data_state`, `exec_err_state`) are presumably not recognised by
        # the provisioning loader - verify against the Grafana version in use.
        noDataState: OK
        execErrState: Error
        title: if_capacity_is_too_low
        uid: if_capacity_is_too_low