mirror of
https://github.com/ansible/awx.git
synced 2026-06-25 16:38:03 -02:30
Fix job cancel chain bugs (#16325)
* Fix job cancel chain bugs
* Early relief valve for canceled jobs, ATF related changes
* Add test and fix for approval nodes as well
* Revert unwanted change
* Refactor workflow approval nodes to make it more clean
* Revert data structure changes
* Delete local utility file
* Review comment addressing
* Use canceled status in websocket
* Delete slop
* Add agent marker
* Bugbot comment about status websocket mismatch
This commit is contained in:
@@ -0,0 +1,274 @@
|
||||
# Generated by Claude Opus 4.6 (claude-opus-4-6)
#
# Test file for cancel + dependency chain behavior and workflow cancel propagation.
#
# These tests verify:
#
# 1. TaskManager.process_job_dep_failures() correctly distinguishes canceled vs
#    failed dependencies in the job_explanation message.
#
# 2. TaskManager.process_pending_tasks() transitions pending jobs with
#    cancel_flag=True directly to canceled status.
#
# 3. WorkflowManager + TaskManager together cancel all spawned jobs in a
#    workflow and finalize the workflow as canceled.
|
||||
|
||||
import pytest
|
||||
from unittest import mock
|
||||
|
||||
from awx.main.scheduler import TaskManager, DependencyManager, WorkflowManager
|
||||
from awx.main.models import JobTemplate, ProjectUpdate, WorkflowApproval, WorkflowJobTemplate
|
||||
from awx.main.models.workflow import WorkflowApprovalTemplate
|
||||
from . import create_job
|
||||
|
||||
|
||||
@pytest.fixture
def scm_on_launch_objects(job_template_factory):
    """Build the 'jt' job template objects with a project that updates on launch.

    The project produced by ``job_template_factory`` has
    ``scm_update_on_launch`` switched on and ``scm_update_cache_timeout`` set
    to 0, and is then saved with ``skip_update=True``.

    Returns the object bundle produced by ``job_template_factory``.
    """
    factory_kwargs = {
        'organization': 'org1',
        'project': 'proj',
        'inventory': 'inv',
        'credential': 'cred',
    }
    created = job_template_factory('jt', **factory_kwargs)

    project = created.project
    project.scm_update_on_launch = True
    project.scm_update_cache_timeout = 0
    # NOTE(review): skip_update=True presumably keeps this save from kicking
    # off a project update on its own -- confirm against Project.save().
    project.save(skip_update=True)

    return created
|
||||
|
||||
|
||||
def _create_job_with_dependency(objects):
    """Create a pending job and let DependencyManager spawn its dependency.

    The job is created from ``objects.job_template`` with
    ``dependencies_processed=False``; ``DependencyManager().schedule()`` is
    then run while ``UnifiedJobTemplate.update`` is patched out. The helper
    asserts that exactly one dependent job exists afterwards and that its
    real instance is a ``ProjectUpdate``.

    Returns (job, project_update).
    """
    pending_job = create_job(objects.job_template, dependencies_processed=False)

    update_target = 'awx.main.models.unified_jobs.UnifiedJobTemplate.update'
    with mock.patch(update_target):
        DependencyManager().schedule()

    # Exactly one dependency is expected, and it must be a project update.
    assert pending_job.dependent_jobs.count() == 1
    dependency = pending_job.dependent_jobs.first()
    assert isinstance(dependency.get_real_instance(), ProjectUpdate)

    return pending_job, dependency
|
||||
|
||||
|
||||
@pytest.mark.django_db
class TestCanceledDependencyFailsBlockedJob:
    """A blocked job must be failed once its project-update dependency ends badly.

    Both tests put the dependency into a terminal state (canceled or failed),
    run the task manager, and check the blocked job's status and
    ``job_explanation``.
    """

    @staticmethod
    def _schedule_after_dependency_ends(objects, **dependency_fields):
        """Finish the dependency with ``dependency_fields`` and run TaskManager.

        Returns the blocked job, refreshed from the database after the
        scheduling pass.
        """
        blocked_job, dependency = _create_job_with_dependency(objects)

        # Write the terminal state straight to the row via a queryset update.
        ProjectUpdate.objects.filter(pk=dependency.pk).update(**dependency_fields)

        with mock.patch("awx.main.scheduler.TaskManager.start_task"):
            TaskManager().schedule()

        blocked_job.refresh_from_db()
        return blocked_job

    def test_canceled_dependency_fails_blocked_job(self, controlplane_instance_group, scm_on_launch_objects):
        """A canceled dependency fails the blocked job with a
        'Previous Task Canceled' explanation."""
        blocked_job = self._schedule_after_dependency_ends(
            scm_on_launch_objects,
            status='canceled',
            cancel_flag=True,
        )

        assert blocked_job.status == 'failed'
        assert 'Previous Task Canceled' in blocked_job.job_explanation

    def test_failed_dependency_fails_blocked_job(self, controlplane_instance_group, scm_on_launch_objects):
        """A failed dependency fails the blocked job with a
        'Previous Task Failed' explanation."""
        blocked_job = self._schedule_after_dependency_ends(
            scm_on_launch_objects,
            status='failed',
        )

        assert blocked_job.status == 'failed'
        assert 'Previous Task Failed' in blocked_job.job_explanation
|
||||
|
||||
|
||||
@pytest.mark.django_db
class TestTaskManagerCancelsPendingJobsWithCancelFlag:
    """Pending jobs that carry ``cancel_flag`` should go straight to canceled.

    Each test runs one ``TaskManager().schedule()`` pass with ``start_task``
    patched out and then inspects the resulting job status.
    """

    @staticmethod
    def _make_job_template(job_template_factory):
        """Return the job template built by the factory for these tests."""
        created = job_template_factory(
            'jt',
            organization='org1',
            project='proj',
            inventory='inv',
            credential='cred',
        )
        return created.job_template

    @staticmethod
    def _request_cancel(job):
        """Set and persist ``cancel_flag`` on ``job``; return the job."""
        job.cancel_flag = True
        job.save(update_fields=['cancel_flag'])
        return job

    def test_pending_job_with_cancel_flag_is_canceled(self, controlplane_instance_group, job_template_factory):
        """A pending job with cancel_flag=True ends up canceled and is never
        handed to start_task."""
        template = self._make_job_template(job_template_factory)
        flagged_job = self._request_cancel(create_job(template))

        with mock.patch("awx.main.scheduler.TaskManager.start_task") as mock_start:
            TaskManager().schedule()

        flagged_job.refresh_from_db()
        assert flagged_job.status == 'canceled'
        assert 'canceled before it started' in flagged_job.job_explanation
        assert not mock_start.called

    def test_pending_job_without_cancel_flag_is_not_canceled(self, controlplane_instance_group, job_template_factory):
        """Sanity check: a pending job without cancel_flag is not canceled by
        the task manager."""
        template = self._make_job_template(job_template_factory)
        plain_job = create_job(template)

        with mock.patch("awx.main.scheduler.TaskManager.start_task"):
            TaskManager().schedule()

        plain_job.refresh_from_db()
        assert plain_job.status != 'canceled'

    def test_multiple_pending_jobs_with_cancel_flag_bulk_canceled(self, controlplane_instance_group, job_template_factory):
        """Several pending jobs with cancel_flag=True are all canceled within
        a single task manager cycle."""
        jt = self._make_job_template(job_template_factory)
        jt.allow_simultaneous = True
        jt.save()

        # Three flagged jobs from the same template.
        jobs = [self._request_cancel(create_job(jt)) for _ in range(3)]

        with mock.patch("awx.main.scheduler.TaskManager.start_task") as mock_start:
            TaskManager().schedule()

        for j in jobs:
            j.refresh_from_db()
            assert j.status == 'canceled', f"Job {j.id} should be canceled but is {j.status}"
            assert 'canceled before it started' in j.job_explanation
        assert not mock_start.called
|
||||
|
||||
|
||||
@pytest.mark.django_db
class TestWorkflowCancelFinalizesWorkflow:
    """Canceling a workflow job should cancel its children and the workflow.

    The tests drive WorkflowManager and TaskManager by hand, one scheduling
    pass at a time, and check that spawned jobs/approvals and finally the
    workflow job itself reach the 'canceled' status.
    """

    @staticmethod
    def _launch_workflow(workflow_template):
        """Create and start a workflow job; return it once it is running."""
        workflow_job = workflow_template.create_unified_job()
        workflow_job.signal_start()

        # One TaskManager pass moves the workflow job to running.
        TaskManager().schedule()
        workflow_job.refresh_from_db()
        assert workflow_job.status == 'running'
        return workflow_job

    @staticmethod
    def _request_cancel(workflow_job):
        """Set and persist ``cancel_flag`` on the workflow job."""
        workflow_job.cancel_flag = True
        workflow_job.save(update_fields=['cancel_flag'])

    def test_cancel_workflow_with_parallel_nodes(self, inventory, project, controlplane_instance_group):
        """Cancel a four-node parallel workflow while one job is running and
        verify every job and the workflow itself end up canceled."""
        jt = JobTemplate.objects.create(
            allow_simultaneous=False,
            inventory=inventory,
            project=project,
            playbook='helloworld.yml',
        )
        wfjt = WorkflowJobTemplate.objects.create(name='test-cancel-wf')
        for _ in range(4):
            wfjt.workflow_nodes.create(unified_job_template=jt)

        wj = self._launch_workflow(wfjt)

        # WorkflowManager spawns one job per node.
        WorkflowManager().schedule()
        assert jt.jobs.count() == 4

        # Pretend the oldest job is already running; with
        # allow_simultaneous=False the other three stay blocked behind it.
        running_job = jt.jobs.order_by('created').first()
        running_job.status = 'running'
        running_job.celery_task_id = 'fake-task-id'
        running_job.controller_node = 'test-node'
        running_job.save(update_fields=['status', 'celery_task_id', 'controller_node'])

        self._request_cancel(wj)

        # This pass is expected to propagate cancel_flag to the child jobs;
        # the dispatcher call is patched because no dispatcher exists in tests.
        with mock.patch('awx.main.models.unified_jobs.UnifiedJob.cancel_dispatcher_process'):
            WorkflowManager().schedule()

        # Nothing will actually stop the "running" job here, so simulate it.
        running_job.status = 'canceled'
        running_job.save(update_fields=['status'])

        # The remaining pending jobs carry cancel_flag and should be canceled
        # by the task manager rather than started.
        with mock.patch("awx.main.scheduler.TaskManager.start_task") as mock_start:
            DependencyManager().schedule()
            TaskManager().schedule()

        for job in jt.jobs.all():
            job.refresh_from_db()
            assert job.status == 'canceled', f"Job {job.id} should be canceled but is {job.status}"
        assert not mock_start.called

        # A last WorkflowManager pass finalizes the workflow job.
        WorkflowManager().schedule()
        wj.refresh_from_db()
        assert wj.status == 'canceled'

    def test_cancel_workflow_with_approval_node(self, controlplane_instance_group):
        """Cancel a workflow that is waiting on a pending approval node with
        a downstream node behind it.

        Expected outcome after a single WorkflowManager pass: the approval is
        canceled, the downstream node is marked do_not_run without a job, and
        the workflow job is canceled.
        """
        approval_template = WorkflowApprovalTemplate.objects.create(name='test-approval', timeout=0)
        wfjt = WorkflowJobTemplate.objects.create(name='test-cancel-approval-wf')
        approval_node = wfjt.workflow_nodes.create(unified_job_template=approval_template)

        # The downstream node is just another approval, to keep things simple.
        downstream_template = WorkflowApprovalTemplate.objects.create(name='test-downstream', timeout=0)
        downstream_node = wfjt.workflow_nodes.create(unified_job_template=downstream_template)
        approval_node.success_nodes.add(downstream_node)

        wj = self._launch_workflow(wfjt)

        # Only the root approval is spawned; the downstream node waits.
        WorkflowManager().schedule()
        spawned_approvals = WorkflowApproval.objects.filter(unified_job_node__workflow_job=wj)
        assert spawned_approvals.count() == 1

        approval_job = spawned_approvals.get()
        assert approval_job.status == 'pending'

        self._request_cancel(wj)

        # The approval must be canceled by WorkflowManager itself; the test
        # never runs TaskManager after the cancel request.
        WorkflowManager().schedule()

        approval_job.refresh_from_db()
        assert approval_job.status == 'canceled', f"Approval should be canceled directly by WorkflowManager but is {approval_job.status}"

        # Downstream node: flagged do_not_run and no job spawned for it.
        downstream_wj_node = wj.workflow_nodes.get(unified_job_template=downstream_template)
        assert downstream_wj_node.do_not_run is True
        assert downstream_wj_node.job is None

        # The workflow is finalized in that same pass.
        wj.refresh_from_db()
        assert wj.status == 'canceled'
|
||||
Reference in New Issue
Block a user