[AAP-74497] Reset orphaned waiting jobs when controller node is deprovisioned (#16467)

Reset orphaned waiting jobs when controller node is deprovisioned

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Seth Foster
2026-06-02 10:46:52 -04:00
committed by GitHub
parent b14b9e1771
commit 5cc467d4cf
2 changed files with 46 additions and 0 deletions

View File

@@ -160,3 +160,38 @@ class TestJobReaper(object):
assert job.started > ref_time
assert job.status == 'running'
assert job.job_explanation == ''
def test_waiting_job_reset_when_controller_node_deprovisioned(self):
"""When a controller pod is replaced (e.g. K8s rollout), waiting jobs
assigned to the now-gone controller_node should be reset to pending
by the task manager so they can be re-dispatched."""
from awx.main.scheduler import TaskManager
live_inst = Instance(hostname='awx-task-live', node_type='control')
live_inst.save()
# No instance record for 'awx-task-dead' — it was already deprovisioned
job = Job.objects.create(status='waiting', controller_node='awx-task-dead', execution_node='')
tm = TaskManager()
tm.reap_jobs_from_orphaned_instances()
job.refresh_from_db()
assert job.status == 'pending'
assert job.controller_node == ''
assert job.execution_node == ''
@pytest.mark.parametrize('node_type', ['control', 'hybrid'])
def test_waiting_job_not_reset_when_controller_node_alive(self, node_type):
"""Waiting jobs on a live control or hybrid node should not be touched."""
from awx.main.scheduler import TaskManager
live_inst = Instance(hostname='awx-task-live', node_type=node_type)
live_inst.save()
job = Job.objects.create(status='waiting', controller_node='awx-task-live', execution_node='')
tm = TaskManager()
tm.reap_jobs_from_orphaned_instances()
job.refresh_from_db()
assert job.status == 'waiting'
assert job.controller_node == 'awx-task-live'