Improve transactional integrity for starting controller jobs in dispatcherd (#16300)

Remove SELECT FOR UPDATE from job dispatch to reduce transaction rollbacks Move status transition from BaseTask.transition_status (which used SELECT FOR UPDATE inside transaction.atomic()) into dispatch_waiting_jobs. The new approach uses filter().update() which is atomic at the database level without requiring explicit row locks, reducing transaction contention and rollbacks observed in perfscale testing. The transition_status method was an artifact of the feature flag era where we needed to support both old and new code paths. Since dispatch_waiting_jobs is already a singleton (on_duplicate='queue_one') scoped to the local node, the de-duplication logic is unnecessary. Status is updated after task submission to dispatcherd, so the job's UUID is in the dispatch pipeline before being marked running — preventing the reaper from incorrectly reaping jobs during the handoff window. RunJob.run() handles the race where a worker picks up the task before the status update lands by accepting waiting and transitioning it to running itself. Signed-off-by: Seth Foster <fosterbseth@gmail.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-28 00:08:44 -03:30 · 2026-02-26 14:16:36 -05:00
parent a21f9fbdb8
commit 2c71bcda32
2 changed files with 43 additions and 32 deletions
--- a/awx/main/tasks/jobs.py
+++ b/awx/main/tasks/jobs.py
@@ -17,7 +17,6 @@ import urllib.parse as urlparse
 # Django
 from django.conf import settings
 from django.db import transaction
 # Shared code for the AWX platform
 from awx_plugins.interfaces._temporary_private_container_api import CONTAINER_ROOT, get_incontainer_path
@@ -205,6 +204,7 @@ def dispatch_waiting_jobs(binder):
        if not kwargs:
            kwargs = {}
        binder.control('run', data={'task': serialize_task(uj._get_task_class()), 'args': [uj.id], 'kwargs': kwargs, 'uuid': uj.celery_task_id})
        UnifiedJob.objects.filter(pk=uj.pk, status='waiting').update(status='running', start_args='')
 class BaseTask(object):
@@ -551,48 +551,32 @@ class BaseTask(object):
    def should_use_fact_cache(self):
        return False
    def transition_status(self, pk: int) -> bool:
        """Atomically transition status to running, if False returned, another process got it"""
        with transaction.atomic():
            # Explanation of parts for the fetch:
            # .values - avoid loading a full object, this is known to lead to deadlocks due to signals
            #   the signals load other related rows which another process may be locking, and happens in practice
            # of=('self',) - keeps FK tables out of the lock list, another way deadlocks can happen
            # .get - just load the single job
            instance_data = UnifiedJob.objects.select_for_update(of=('self',)).values('status', 'cancel_flag').get(pk=pk)
            # If status is not waiting (obtained under lock) then this process does not have clearence to run
            if instance_data['status'] == 'waiting':
                if instance_data['cancel_flag']:
                    updated_status = 'canceled'
                else:
                    updated_status = 'running'
                # Explanation of the update:
                # .filter - again, do not load the full object
                # .update - a bulk update on just that one row, avoid loading unintended data
                UnifiedJob.objects.filter(pk=pk).update(status=updated_status, start_args='')
            elif instance_data['status'] == 'running':
                logger.info(f'Job {pk} is being ran by another process, exiting')
                return False
        return True
    @with_path_cleanup
    @with_signal_handling
    def run(self, pk, **kwargs):
        """
        Run the job/task and capture its output.
        """
        if not self.instance:  # Used to skip fetch for local runs
            if not self.transition_status(pk):
                logger.info(f'Job {pk} is being ran by another process, exiting')
                return
-        # Load the instance
+        if not self.instance:  # Used to skip fetch for local runs
-        self.instance = self.update_model(pk)
+            # Load the instance
            self.instance = self.update_model(pk)
        # status should be "running" from dispatch_waiting_jobs,
        # but may still be "waiting" if the worker picked this up before the status update landed.
        if self.instance.status == 'waiting':
            UnifiedJob.objects.filter(pk=pk).update(status="running", start_args='')
            self.instance.refresh_from_db()
        if self.instance.status != 'running':
            logger.error(f'Not starting {self.instance.status} task pk={pk} because its status "{self.instance.status}" is not expected')
            return
        if self.instance.cancel_flag:
            self.instance = self.update_model(pk, status='canceled')
            self.instance.websocket_emit_status('canceled')
            return
        self.instance.websocket_emit_status("running")
        status, rc = 'error', None
        self.runner_callback.event_ct = 0
--- a/awx/main/tests/functional/tasks/test_tasks_jobs.py
+++ b/awx/main/tests/functional/tasks/test_tasks_jobs.py
@@ -29,3 +29,30 @@ def test_cancel_flag_on_start(jt_linked, caplog):
    job = Job.objects.get(id=job.id)
    assert job.status == 'canceled'
@pytest.mark.django_db
 def test_runjob_run_can_accept_waiting_status(jt_linked, mocker):
    """Test that RunJob.run() can accept a job in 'waiting' status and transition it to 'running'
    before the pre_run_hook is called"""
    job = jt_linked.create_unified_job()
    job.status = 'waiting'
    job.save()
    status_at_pre_run = None
    def capture_status(instance, private_data_dir):
        nonlocal status_at_pre_run
        instance.refresh_from_db()
        status_at_pre_run = instance.status
    mock_pre_run = mocker.patch.object(RunJob, 'pre_run_hook', side_effect=capture_status)
    task = RunJob()
    try:
        task.run(job.id)
    except Exception:
        pass
    mock_pre_run.assert_called_once()
    assert status_at_pre_run == 'running'