Prevent Dispatcher deadlock when Redis disappears (#14249)

This fixes https://github.com/ansible/awx/issues/14245, which has more information about the issue. This change addresses both of the following:
- A clashing signal handler: a callback was registered to fire when the task manager times out, but that callback was also being hit in cases where we did not expect it. The dispatcher timeout now uses SIGUSR1 instead of SIGTERM.
- A failure to report metrics should not make us crash, so that is now fixed as well.

Signed-off-by: Rick Elrod <rick@elrod.me>
Co-authored-by: Alan Rominger <arominge@redhat.com>
2026-03-03 09:48:51 -03:30 · 2023-07-18 10:43:46 -05:00
parent 8ddc19a927
commit 48edb15a03
3 changed files with 42 additions and 29 deletions
--- a/awx/main/dispatch/pool.py
+++ b/awx/main/dispatch/pool.py
@@ -417,16 +417,16 @@ class AutoscalePool(WorkerPool):
                # the task manager to never do more work
                current_task = w.current_task
                if current_task and isinstance(current_task, dict):
-                    endings = ['tasks.task_manager', 'tasks.dependency_manager', 'tasks.workflow_manager']
+                    endings = ('tasks.task_manager', 'tasks.dependency_manager', 'tasks.workflow_manager')
                    current_task_name = current_task.get('task', '')
-                    if any(current_task_name.endswith(e) for e in endings):
+                    if current_task_name.endswith(endings):
                        if 'started' not in current_task:
                            w.managed_tasks[current_task['uuid']]['started'] = time.time()
                        age = time.time() - current_task['started']
                        w.managed_tasks[current_task['uuid']]['age'] = age
                        if age > self.task_manager_timeout:
-                            logger.error(f'{current_task_name} has held the advisory lock for {age}, sending SIGTERM to {w.pid}')
-                            os.kill(w.pid, signal.SIGTERM)
+                            logger.error(f'{current_task_name} has held the advisory lock for {age}, sending SIGUSR1 to {w.pid}')
+                            os.kill(w.pid, signal.SIGUSR1)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
--- a/awx/main/dispatch/worker/base.py
+++ b/awx/main/dispatch/worker/base.py
@@ -121,10 +121,9 @@ class AWXConsumerBase(object):
        if time.time() - self.last_stats > 1:  # buffer stat recording to once per second
            try:
                self.redis.set(f'awx_{self.name}_statistics', self.pool.debug())
-                self.last_stats = time.time()
            except Exception:
                logger.exception(f"encountered an error communicating with redis to store {self.name} statistics")
-                self.last_stats = time.time()
+            self.last_stats = time.time()

    def run(self, *args, **kwargs):
        signal.signal(signal.SIGINT, self.stop)
@@ -175,9 +174,12 @@ class AWXConsumerPG(AWXConsumerBase):

        # record subsystem metrics for the dispatcher
        if current_time - self.last_metrics_gather > 20:
-            self.pool.produce_subsystem_metrics(self.subsystem_metrics)
-            self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
-            self.subsystem_metrics.pipe_execute()
+            try:
+                self.pool.produce_subsystem_metrics(self.subsystem_metrics)
+                self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
+                self.subsystem_metrics.pipe_execute()
+            except Exception:
+                logger.exception(f"encountered an error trying to store {self.name} metrics")
            self.listen_cumulative_time = 0.0
            self.last_metrics_gather = current_time

@@ -250,8 +252,8 @@ class BaseWorker(object):
                    break
            except QueueEmpty:
                continue
-            except Exception as e:
-                logger.error("Exception on worker {}, restarting: ".format(idx) + str(e))
+            except Exception:
+                logger.exception("Exception on worker {}, reconnecting: ".format(idx))
                continue
            try:
                for conn in db.connections.all():