Sensible log behavior when redis is unavailable (#15466)

* Sensible log behavior when redis is unavailable

* Consistent behavior with dispatcher and callback
This commit is contained in:
Alan Rominger
2025-04-10 16:45:05 -04:00
committed by GitHub
parent 7a3010f0e6
commit c3ee0c2d8a
6 changed files with 48 additions and 13 deletions

View File

@@ -15,6 +15,7 @@ from datetime import timedelta
from django import db
from django.conf import settings
import redis.exceptions
from ansible_base.lib.logging.runtime import log_excess_runtime
@@ -130,10 +131,13 @@ class AWXConsumerBase(object):
@log_excess_runtime(logger, debug_cutoff=0.05, cutoff=0.2)
def record_statistics(self):
if time.time() - self.last_stats > 1: # buffer stat recording to once per second
save_data = self.pool.debug()
try:
self.redis.set(f'awx_{self.name}_statistics', self.pool.debug())
self.redis.set(f'awx_{self.name}_statistics', save_data)
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Redis connection error saving {self.name} status data:\n{exc}\nmissed data:\n{save_data}')
except Exception:
logger.exception(f"encountered an error communicating with redis to store {self.name} statistics")
logger.exception(f"Unknown redis error saving {self.name} status data:\nmissed data:\n{save_data}")
self.last_stats = time.time()
def run(self, *args, **kwargs):
@@ -189,7 +193,10 @@ class AWXConsumerPG(AWXConsumerBase):
current_time = time.time()
self.pool.produce_subsystem_metrics(self.subsystem_metrics)
self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
self.subsystem_metrics.pipe_execute()
try:
self.subsystem_metrics.pipe_execute()
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Redis connection error saving dispatcher metrics, error:\n{exc}')
self.listen_cumulative_time = 0.0
self.last_metrics_gather = current_time

View File

@@ -86,6 +86,7 @@ class CallbackBrokerWorker(BaseWorker):
return os.getpid()
def read(self, queue):
has_redis_error = False
try:
res = self.redis.blpop(self.queue_name, timeout=1)
if res is None:
@@ -95,14 +96,21 @@ class CallbackBrokerWorker(BaseWorker):
self.subsystem_metrics.inc('callback_receiver_events_popped_redis', 1)
self.subsystem_metrics.inc('callback_receiver_events_in_memory', 1)
return json.loads(res[1])
except redis.exceptions.ConnectionError as exc:
# Low noise log, because very common and many workers will write this
logger.error(f"redis connection error: {exc}")
has_redis_error = True
time.sleep(5)
except redis.exceptions.RedisError:
logger.exception("encountered an error communicating with redis")
has_redis_error = True
time.sleep(1)
except (json.JSONDecodeError, KeyError):
logger.exception("failed to decode JSON message from redis")
finally:
self.record_statistics()
self.record_read_metrics()
if not has_redis_error:
self.record_statistics()
self.record_read_metrics()
return {'event': 'FLUSH'}