mirror of
https://github.com/ansible/awx.git
synced 2026-02-23 22:16:00 -03:30
Sensible log behavior when redis is unavailable (#15466)
* Sensible log behavior when redis is unavailable * Consistent behavior with dispatcher and callback
This commit is contained in:
@@ -9,6 +9,7 @@ from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily
|
||||
from prometheus_client.registry import CollectorRegistry
|
||||
from django.conf import settings
|
||||
from django.http import HttpRequest
|
||||
import redis.exceptions
|
||||
from rest_framework.request import Request
|
||||
|
||||
from awx.main.consumers import emit_channel_notification
|
||||
@@ -290,8 +291,12 @@ class Metrics(MetricsNamespace):
|
||||
def send_metrics(self):
|
||||
# more than one thread could be calling this at the same time, so should
|
||||
# acquire redis lock before sending metrics
|
||||
lock = self.conn.lock(root_key + '-' + self._namespace + '_lock')
|
||||
if not lock.acquire(blocking=False):
|
||||
try:
|
||||
lock = self.conn.lock(root_key + '-' + self._namespace + '_lock')
|
||||
if not lock.acquire(blocking=False):
|
||||
return
|
||||
except redis.exceptions.ConnectionError as exc:
|
||||
logger.warning(f'Connection error in send_metrics: {exc}')
|
||||
return
|
||||
try:
|
||||
current_time = time.time()
|
||||
|
||||
@@ -15,6 +15,7 @@ from datetime import timedelta
|
||||
|
||||
from django import db
|
||||
from django.conf import settings
|
||||
import redis.exceptions
|
||||
|
||||
from ansible_base.lib.logging.runtime import log_excess_runtime
|
||||
|
||||
@@ -130,10 +131,13 @@ class AWXConsumerBase(object):
|
||||
@log_excess_runtime(logger, debug_cutoff=0.05, cutoff=0.2)
|
||||
def record_statistics(self):
|
||||
if time.time() - self.last_stats > 1: # buffer stat recording to once per second
|
||||
save_data = self.pool.debug()
|
||||
try:
|
||||
self.redis.set(f'awx_{self.name}_statistics', self.pool.debug())
|
||||
self.redis.set(f'awx_{self.name}_statistics', save_data)
|
||||
except redis.exceptions.ConnectionError as exc:
|
||||
logger.warning(f'Redis connection error saving {self.name} status data:\n{exc}\nmissed data:\n{save_data}')
|
||||
except Exception:
|
||||
logger.exception(f"encountered an error communicating with redis to store {self.name} statistics")
|
||||
logger.exception(f"Unknown redis error saving {self.name} status data:\nmissed data:\n{save_data}")
|
||||
self.last_stats = time.time()
|
||||
|
||||
def run(self, *args, **kwargs):
|
||||
@@ -189,7 +193,10 @@ class AWXConsumerPG(AWXConsumerBase):
|
||||
current_time = time.time()
|
||||
self.pool.produce_subsystem_metrics(self.subsystem_metrics)
|
||||
self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
|
||||
self.subsystem_metrics.pipe_execute()
|
||||
try:
|
||||
self.subsystem_metrics.pipe_execute()
|
||||
except redis.exceptions.ConnectionError as exc:
|
||||
logger.warning(f'Redis connection error saving dispatcher metrics, error:\n{exc}')
|
||||
self.listen_cumulative_time = 0.0
|
||||
self.last_metrics_gather = current_time
|
||||
|
||||
|
||||
@@ -86,6 +86,7 @@ class CallbackBrokerWorker(BaseWorker):
|
||||
return os.getpid()
|
||||
|
||||
def read(self, queue):
|
||||
has_redis_error = False
|
||||
try:
|
||||
res = self.redis.blpop(self.queue_name, timeout=1)
|
||||
if res is None:
|
||||
@@ -95,14 +96,21 @@ class CallbackBrokerWorker(BaseWorker):
|
||||
self.subsystem_metrics.inc('callback_receiver_events_popped_redis', 1)
|
||||
self.subsystem_metrics.inc('callback_receiver_events_in_memory', 1)
|
||||
return json.loads(res[1])
|
||||
except redis.exceptions.ConnectionError as exc:
|
||||
# Low noise log, because very common and many workers will write this
|
||||
logger.error(f"redis connection error: {exc}")
|
||||
has_redis_error = True
|
||||
time.sleep(5)
|
||||
except redis.exceptions.RedisError:
|
||||
logger.exception("encountered an error communicating with redis")
|
||||
has_redis_error = True
|
||||
time.sleep(1)
|
||||
except (json.JSONDecodeError, KeyError):
|
||||
logger.exception("failed to decode JSON message from redis")
|
||||
finally:
|
||||
self.record_statistics()
|
||||
self.record_read_metrics()
|
||||
if not has_redis_error:
|
||||
self.record_statistics()
|
||||
self.record_read_metrics()
|
||||
|
||||
return {'event': 'FLUSH'}
|
||||
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
# Copyright (c) 2015 Ansible, Inc.
|
||||
# All Rights Reserved.
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from awx.main.analytics.subsystem_metrics import CallbackReceiverMetricsServer
|
||||
import redis
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
import redis.exceptions
|
||||
|
||||
from awx.main.analytics.subsystem_metrics import CallbackReceiverMetricsServer
|
||||
from awx.main.dispatch.control import Control
|
||||
from awx.main.dispatch.worker import AWXConsumerRedis, CallbackBrokerWorker
|
||||
|
||||
@@ -27,7 +30,10 @@ class Command(BaseCommand):
|
||||
return
|
||||
consumer = None
|
||||
|
||||
CallbackReceiverMetricsServer().start()
|
||||
try:
|
||||
CallbackReceiverMetricsServer().start()
|
||||
except redis.exceptions.ConnectionError as exc:
|
||||
raise CommandError(f'Callback receiver could not connect to redis, error: {exc}')
|
||||
|
||||
try:
|
||||
consumer = AWXConsumerRedis(
|
||||
|
||||
@@ -3,8 +3,10 @@
|
||||
import logging
|
||||
import yaml
|
||||
|
||||
import redis
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from awx.main.dispatch import get_task_queuename
|
||||
from awx.main.dispatch.control import Control
|
||||
@@ -63,7 +65,10 @@ class Command(BaseCommand):
|
||||
|
||||
consumer = None
|
||||
|
||||
DispatcherMetricsServer().start()
|
||||
try:
|
||||
DispatcherMetricsServer().start()
|
||||
except redis.exceptions.ConnectionError as exc:
|
||||
raise CommandError(f'Dispatcher could not connect to redis, error: {exc}')
|
||||
|
||||
try:
|
||||
queues = ['tower_broadcast_all', 'tower_settings_change', get_task_queuename()]
|
||||
|
||||
@@ -10,6 +10,8 @@ import time
|
||||
import sys
|
||||
import signal
|
||||
|
||||
import redis
|
||||
|
||||
# Django
|
||||
from django.db import transaction
|
||||
from django.utils.translation import gettext_lazy as _, gettext_noop
|
||||
@@ -120,6 +122,8 @@ class TaskBase:
|
||||
self.subsystem_metrics.pipe_execute()
|
||||
else:
|
||||
logger.debug(f"skipping recording {self.prefix} metrics, last recorded {time_last_recorded} seconds ago")
|
||||
except redis.exceptions.ConnectionError as exc:
|
||||
logger.warning(f"Redis connection error saving metrics for {self.prefix}, error: {exc}")
|
||||
except Exception:
|
||||
logger.exception(f"Error saving metrics for {self.prefix}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user