Sensible log behavior when redis is unavailable (#15466)

* Sensible log behavior when redis is unavailable

* Consistent behavior with dispatcher and callback
This commit is contained in:
Alan Rominger
2025-04-10 16:45:05 -04:00
committed by GitHub
parent 7a3010f0e6
commit c3ee0c2d8a
6 changed files with 48 additions and 13 deletions

View File

@@ -9,6 +9,7 @@ from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily
from prometheus_client.registry import CollectorRegistry
from django.conf import settings
from django.http import HttpRequest
import redis.exceptions
from rest_framework.request import Request
from awx.main.consumers import emit_channel_notification
@@ -290,8 +291,12 @@ class Metrics(MetricsNamespace):
def send_metrics(self):
# more than one thread could be calling this at the same time, so should
# acquire redis lock before sending metrics
lock = self.conn.lock(root_key + '-' + self._namespace + '_lock')
if not lock.acquire(blocking=False):
try:
lock = self.conn.lock(root_key + '-' + self._namespace + '_lock')
if not lock.acquire(blocking=False):
return
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Connection error in send_metrics: {exc}')
return
try:
current_time = time.time()

View File

@@ -15,6 +15,7 @@ from datetime import timedelta
from django import db
from django.conf import settings
import redis.exceptions
from ansible_base.lib.logging.runtime import log_excess_runtime
@@ -130,10 +131,13 @@ class AWXConsumerBase(object):
@log_excess_runtime(logger, debug_cutoff=0.05, cutoff=0.2)
def record_statistics(self):
if time.time() - self.last_stats > 1: # buffer stat recording to once per second
save_data = self.pool.debug()
try:
self.redis.set(f'awx_{self.name}_statistics', self.pool.debug())
self.redis.set(f'awx_{self.name}_statistics', save_data)
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Redis connection error saving {self.name} status data:\n{exc}\nmissed data:\n{save_data}')
except Exception:
logger.exception(f"encountered an error communicating with redis to store {self.name} statistics")
logger.exception(f"Unknown redis error saving {self.name} status data:\nmissed data:\n{save_data}")
self.last_stats = time.time()
def run(self, *args, **kwargs):
@@ -189,7 +193,10 @@ class AWXConsumerPG(AWXConsumerBase):
current_time = time.time()
self.pool.produce_subsystem_metrics(self.subsystem_metrics)
self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
self.subsystem_metrics.pipe_execute()
try:
self.subsystem_metrics.pipe_execute()
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Redis connection error saving dispatcher metrics, error:\n{exc}')
self.listen_cumulative_time = 0.0
self.last_metrics_gather = current_time

View File

@@ -86,6 +86,7 @@ class CallbackBrokerWorker(BaseWorker):
return os.getpid()
def read(self, queue):
has_redis_error = False
try:
res = self.redis.blpop(self.queue_name, timeout=1)
if res is None:
@@ -95,14 +96,21 @@ class CallbackBrokerWorker(BaseWorker):
self.subsystem_metrics.inc('callback_receiver_events_popped_redis', 1)
self.subsystem_metrics.inc('callback_receiver_events_in_memory', 1)
return json.loads(res[1])
except redis.exceptions.ConnectionError as exc:
# Low noise log, because very common and many workers will write this
logger.error(f"redis connection error: {exc}")
has_redis_error = True
time.sleep(5)
except redis.exceptions.RedisError:
logger.exception("encountered an error communicating with redis")
has_redis_error = True
time.sleep(1)
except (json.JSONDecodeError, KeyError):
logger.exception("failed to decode JSON message from redis")
finally:
self.record_statistics()
self.record_read_metrics()
if not has_redis_error:
self.record_statistics()
self.record_read_metrics()
return {'event': 'FLUSH'}

View File

@@ -1,10 +1,13 @@
# Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved.
from django.conf import settings
from django.core.management.base import BaseCommand
from awx.main.analytics.subsystem_metrics import CallbackReceiverMetricsServer
import redis
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
import redis.exceptions
from awx.main.analytics.subsystem_metrics import CallbackReceiverMetricsServer
from awx.main.dispatch.control import Control
from awx.main.dispatch.worker import AWXConsumerRedis, CallbackBrokerWorker
@@ -27,7 +30,10 @@ class Command(BaseCommand):
return
consumer = None
CallbackReceiverMetricsServer().start()
try:
CallbackReceiverMetricsServer().start()
except redis.exceptions.ConnectionError as exc:
raise CommandError(f'Callback receiver could not connect to redis, error: {exc}')
try:
consumer = AWXConsumerRedis(

View File

@@ -3,8 +3,10 @@
import logging
import yaml
import redis
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError
from awx.main.dispatch import get_task_queuename
from awx.main.dispatch.control import Control
@@ -63,7 +65,10 @@ class Command(BaseCommand):
consumer = None
DispatcherMetricsServer().start()
try:
DispatcherMetricsServer().start()
except redis.exceptions.ConnectionError as exc:
raise CommandError(f'Dispatcher could not connect to redis, error: {exc}')
try:
queues = ['tower_broadcast_all', 'tower_settings_change', get_task_queuename()]

View File

@@ -10,6 +10,8 @@ import time
import sys
import signal
import redis
# Django
from django.db import transaction
from django.utils.translation import gettext_lazy as _, gettext_noop
@@ -120,6 +122,8 @@ class TaskBase:
self.subsystem_metrics.pipe_execute()
else:
logger.debug(f"skipping recording {self.prefix} metrics, last recorded {time_last_recorded} seconds ago")
except redis.exceptions.ConnectionError as exc:
logger.warning(f"Redis connection error saving metrics for {self.prefix}, error: {exc}")
except Exception:
logger.exception(f"Error saving metrics for {self.prefix}")