Files
awx/awx/main/tests/functional/test_dispatch.py
Seth Foster 5cc467d4cf [AAP-74497] Reset orphaned waiting jobs when controller node is deprovisioned (#16467)
Reset orphaned waiting jobs when controller node is deprovisioned

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-06-02 10:46:52 -04:00

198 lines
6.7 KiB
Python

import datetime
from unittest import mock
from django.utils.timezone import now as tz_now
import pytest
from awx.main.models import Job, WorkflowJob, Instance
from awx.main.dispatch import reaper
from awx.main.tasks import system
from dispatcherd.publish import task
'''
Prevent logger.<warn, debug, error> calls from triggering database operations
'''
@pytest.fixture(autouse=True)
def _disable_database_settings(mocker):
m = mocker.patch('awx.conf.settings.SettingsWrapper.all_supported_settings', new_callable=mock.PropertyMock)
m.return_value = []
def restricted(a, b):
raise AssertionError("This code should not run because it isn't decorated with @task")
@task()
def add(a, b):
return a + b
class BaseTask(object):
def add(self, a, b):
return add(a, b)
class Restricted(object):
def run(self, a, b):
raise AssertionError("This code should not run because it isn't decorated with @task")
@task()
class Adder(BaseTask):
def run(self, a, b):
return super(Adder, self).add(a, b)
@task(queue='hard-math')
def multiply(a, b):
return a * b
yesterday = tz_now() - datetime.timedelta(days=1)
minute = tz_now() - datetime.timedelta(seconds=120)
now = tz_now()
@pytest.mark.django_db
class TestJobReaper(object):
@pytest.mark.parametrize(
'status, execution_node, controller_node, modified, fail',
[
('running', '', '', None, False), # running, not assigned to the instance
('running', 'awx', '', None, True), # running, has the instance as its execution_node
('running', '', 'awx', None, True), # running, has the instance as its controller_node
],
)
def test_should_reap(self, status, fail, execution_node, controller_node, modified):
i = Instance(hostname='awx')
i.save()
j = Job(
status=status,
execution_node=execution_node,
controller_node=controller_node,
start_args='SENSITIVE',
)
j.save()
if modified:
# we have to edit the modification time _without_ calling save()
# (because .save() overwrites it to _now_)
Job.objects.filter(id=j.id).update(modified=modified)
reaper.reap(i)
job = Job.objects.first()
if fail:
assert job.status == 'failed'
assert 'marked as failed' in job.job_explanation
assert job.start_args == ''
else:
assert job.status == status
def test_waiting_job_sent_back_to_pending(self):
this_inst = Instance(hostname='awx')
this_inst.save()
lost_inst = Instance(hostname='lost', node_type=Instance.Types.EXECUTION, node_state=Instance.States.UNAVAILABLE)
lost_inst.save()
job = Job.objects.create(status='waiting', controller_node=lost_inst.hostname, execution_node='lost')
system._heartbeat_handle_lost_instances([lost_inst], this_inst)
job.refresh_from_db()
assert job.status == 'pending'
assert job.controller_node == ''
assert job.execution_node == ''
@pytest.mark.parametrize(
'excluded_uuids, fail, started',
[
(['abc123'], False, None),
([], False, None),
([], True, minute),
],
)
def test_do_not_reap_excluded_uuids(self, excluded_uuids, fail, started):
"""Modified Test to account for ref_time in reap()"""
i = Instance(hostname='awx')
i.save()
j = Job(
status='running',
execution_node='awx',
controller_node='',
start_args='SENSITIVE',
celery_task_id='abc123',
)
j.save()
if started:
Job.objects.filter(id=j.id).update(started=started)
# if the UUID is excluded, don't reap it
reaper.reap(i, excluded_uuids=excluded_uuids, ref_time=now)
job = Job.objects.first()
if fail:
assert job.status == 'failed'
assert 'marked as failed' in job.job_explanation
assert job.start_args == ''
else:
assert job.status == 'running'
def test_workflow_does_not_reap(self):
i = Instance(hostname='awx')
i.save()
j = WorkflowJob(status='running', execution_node='awx')
j.save()
reaper.reap(i)
assert WorkflowJob.objects.first().status == 'running'
def test_should_not_reap_new(self):
"""
This test is designed specifically to ensure that jobs that are launched after the dispatcher has provided a list of UUIDs aren't reaped.
It is very racy and this test is designed with that in mind
"""
i = Instance(hostname='awx')
# ref_time is set to 10 seconds in the past to mimic someone launching a job in the heartbeat window.
ref_time = tz_now() - datetime.timedelta(seconds=10)
# creating job at current time
job = Job.objects.create(status='running', controller_node=i.hostname)
reaper.reap(i, ref_time=ref_time)
# explictly refreshing from db to ensure up to date cache
job.refresh_from_db()
assert job.started > ref_time
assert job.status == 'running'
assert job.job_explanation == ''
def test_waiting_job_reset_when_controller_node_deprovisioned(self):
"""When a controller pod is replaced (e.g. K8s rollout), waiting jobs
assigned to the now-gone controller_node should be reset to pending
by the task manager so they can be re-dispatched."""
from awx.main.scheduler import TaskManager
live_inst = Instance(hostname='awx-task-live', node_type='control')
live_inst.save()
# No instance record for 'awx-task-dead' — it was already deprovisioned
job = Job.objects.create(status='waiting', controller_node='awx-task-dead', execution_node='')
tm = TaskManager()
tm.reap_jobs_from_orphaned_instances()
job.refresh_from_db()
assert job.status == 'pending'
assert job.controller_node == ''
assert job.execution_node == ''
@pytest.mark.parametrize('node_type', ['control', 'hybrid'])
def test_waiting_job_not_reset_when_controller_node_alive(self, node_type):
"""Waiting jobs on a live control or hybrid node should not be touched."""
from awx.main.scheduler import TaskManager
live_inst = Instance(hostname='awx-task-live', node_type=node_type)
live_inst.save()
job = Job.objects.create(status='waiting', controller_node='awx-task-live', execution_node='')
tm = TaskManager()
tm.reap_jobs_from_orphaned_instances()
job.refresh_from_db()
assert job.status == 'waiting'
assert job.controller_node == 'awx-task-live'