Avoid unbounded kwargs by fetching subtasks inside handle_work_error

Update tests to new handle_work_error call pattern Handle blame correctly with multiple serial deps add new test case corresponding to this scenario
2026-06-25 16:38:03 -02:30 · 2022-12-19 11:19:38 -05:00
parent b7f2825909
commit 94b34b801c
5 changed files with 72 additions and 70 deletions
--- a/awx/main/scheduler/task_manager.py
+++ b/awx/main/scheduler/task_manager.py
@@ -507,7 +507,7 @@ class TaskManager(TaskBase):
        return None

    @timeit
-    def start_task(self, task, instance_group, dependent_tasks=None, instance=None):
+    def start_task(self, task, instance_group, instance=None):
        # Just like for process_running_tasks, add the job to the dependency graph and
        # ask the TaskManagerInstanceGroups object to update consumed capacity on all
        # implicated instances and container groups.
@@ -524,14 +524,6 @@ class TaskManager(TaskBase):
            ScheduleTaskManager().schedule()
        from awx.main.tasks.system import handle_work_error, handle_work_success

-        dependent_tasks = dependent_tasks or []
-
-        task_actual = {
-            'type': get_type_for_model(type(task)),
-            'id': task.id,
-        }
-        dependencies = [{'type': get_type_for_model(type(t)), 'id': t.id} for t in dependent_tasks]
-
        task.status = 'waiting'

        (start_status, opts) = task.pre_start()
@@ -563,6 +555,7 @@ class TaskManager(TaskBase):
        # apply_async does a NOTIFY to the channel dispatcher is listening to
        # postgres will treat this as part of the transaction, which is what we want
        if task.status != 'failed' and type(task) is not WorkflowJob:
+            task_actual = {'type': get_type_for_model(type(task)), 'id': task.id}
            task_cls = task._get_task_class()
            task_cls.apply_async(
                [task.pk],
@@ -570,7 +563,7 @@ class TaskManager(TaskBase):
                queue=task.get_queue_name(),
                uuid=task.celery_task_id,
                callbacks=[{'task': handle_work_success.name, 'kwargs': {'task_actual': task_actual}}],
-                errbacks=[{'task': handle_work_error.name, 'args': [task.celery_task_id], 'kwargs': {'subtasks': [task_actual] + dependencies}}],
+                errbacks=[{'task': handle_work_error.name, 'kwargs': {'task_actual': task_actual}}],
            )

        # In exception cases, like a job failing pre-start checks, we send the websocket status message
@@ -609,7 +602,7 @@ class TaskManager(TaskBase):
            if isinstance(task, WorkflowJob):
                # Previously we were tracking allow_simultaneous blocking both here and in DependencyGraph.
                # Double check that using just the DependencyGraph works for Workflows and Sliced Jobs.
-                self.start_task(task, None, task.get_jobs_fail_chain(), None)
+                self.start_task(task, None, None)
                continue

            found_acceptable_queue = False
@@ -637,7 +630,7 @@ class TaskManager(TaskBase):
                execution_instance = self.tm_models.instances[control_instance.hostname].obj
                task.log_lifecycle("controller_node_chosen")
                task.log_lifecycle("execution_node_chosen")
-                self.start_task(task, self.controlplane_ig, task.get_jobs_fail_chain(), execution_instance)
+                self.start_task(task, self.controlplane_ig, execution_instance)
                found_acceptable_queue = True
                continue

@@ -645,7 +638,7 @@ class TaskManager(TaskBase):
                if not self.tm_models.instance_groups[instance_group.name].has_remaining_capacity(task):
                    continue
                if instance_group.is_container_group:
-                    self.start_task(task, instance_group, task.get_jobs_fail_chain(), None)
+                    self.start_task(task, instance_group, None)
                    found_acceptable_queue = True
                    break

@@ -670,7 +663,7 @@ class TaskManager(TaskBase):
                        )
                    )
                    execution_instance = self.tm_models.instances[execution_instance.hostname].obj
-                    self.start_task(task, instance_group, task.get_jobs_fail_chain(), execution_instance)
+                    self.start_task(task, instance_group, execution_instance)
                    found_acceptable_queue = True
                    break
                else: