mirror of
https://github.com/ansible/awx.git
synced 2026-03-21 02:47:35 -02:30
Fix race with heartbeat and reaper logic (#13713)
* Fix race with heartbeat and reaper logic * Fix tests to fail when over drift over heartbeat time * replaced modified with started time for reap() code and added test * fixed logic bug and cleaned up tests * Added comments to tests to call out reasoning
This commit is contained in:
@@ -70,7 +70,7 @@ def reap_waiting(instance=None, status='failed', job_explanation=None, grace_per
|
|||||||
reap_job(j, status, job_explanation=job_explanation)
|
reap_job(j, status, job_explanation=job_explanation)
|
||||||
|
|
||||||
|
|
||||||
def reap(instance=None, status='failed', job_explanation=None, excluded_uuids=None):
|
def reap(instance=None, status='failed', job_explanation=None, excluded_uuids=None, ref_time=None):
|
||||||
"""
|
"""
|
||||||
Reap all jobs in running for this instance.
|
Reap all jobs in running for this instance.
|
||||||
"""
|
"""
|
||||||
@@ -79,9 +79,11 @@ def reap(instance=None, status='failed', job_explanation=None, excluded_uuids=No
|
|||||||
else:
|
else:
|
||||||
hostname = instance.hostname
|
hostname = instance.hostname
|
||||||
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
|
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
|
||||||
jobs = UnifiedJob.objects.filter(
|
base_Q = Q(status='running') & (Q(execution_node=hostname) | Q(controller_node=hostname)) & ~Q(polymorphic_ctype_id=workflow_ctype_id)
|
||||||
Q(status='running') & (Q(execution_node=hostname) | Q(controller_node=hostname)) & ~Q(polymorphic_ctype_id=workflow_ctype_id)
|
if ref_time:
|
||||||
)
|
jobs = UnifiedJob.objects.filter(base_Q & Q(started__lte=ref_time))
|
||||||
|
else:
|
||||||
|
jobs = UnifiedJob.objects.filter(base_Q)
|
||||||
if excluded_uuids:
|
if excluded_uuids:
|
||||||
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
|
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
|
||||||
for j in jobs:
|
for j in jobs:
|
||||||
|
|||||||
@@ -581,7 +581,7 @@ def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
|
|||||||
active_task_ids = []
|
active_task_ids = []
|
||||||
for task_list in worker_tasks.values():
|
for task_list in worker_tasks.values():
|
||||||
active_task_ids.extend(task_list)
|
active_task_ids.extend(task_list)
|
||||||
reaper.reap(instance=this_inst, excluded_uuids=active_task_ids)
|
reaper.reap(instance=this_inst, excluded_uuids=active_task_ids, ref_time=datetime.fromisoformat(dispatch_time))
|
||||||
if max(len(task_list) for task_list in worker_tasks.values()) <= 1:
|
if max(len(task_list) for task_list in worker_tasks.values()) <= 1:
|
||||||
reaper.reap_waiting(instance=this_inst, excluded_uuids=active_task_ids, ref_time=datetime.fromisoformat(dispatch_time))
|
reaper.reap_waiting(instance=this_inst, excluded_uuids=active_task_ids, ref_time=datetime.fromisoformat(dispatch_time))
|
||||||
|
|
||||||
|
|||||||
@@ -337,6 +337,8 @@ class TestTaskPublisher:
|
|||||||
|
|
||||||
|
|
||||||
yesterday = tz_now() - datetime.timedelta(days=1)
|
yesterday = tz_now() - datetime.timedelta(days=1)
|
||||||
|
minute = tz_now() - datetime.timedelta(seconds=120)
|
||||||
|
now = tz_now()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
@@ -379,13 +381,15 @@ class TestJobReaper(object):
|
|||||||
assert job.status == status
|
assert job.status == status
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'excluded_uuids, fail',
|
'excluded_uuids, fail, started',
|
||||||
[
|
[
|
||||||
(['abc123'], False),
|
(['abc123'], False, None),
|
||||||
([], True),
|
([], False, None),
|
||||||
|
([], True, minute),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_do_not_reap_excluded_uuids(self, excluded_uuids, fail):
|
def test_do_not_reap_excluded_uuids(self, excluded_uuids, fail, started):
|
||||||
|
"""Modified Test to account for ref_time in reap()"""
|
||||||
i = Instance(hostname='awx')
|
i = Instance(hostname='awx')
|
||||||
i.save()
|
i.save()
|
||||||
j = Job(
|
j = Job(
|
||||||
@@ -396,10 +400,13 @@ class TestJobReaper(object):
|
|||||||
celery_task_id='abc123',
|
celery_task_id='abc123',
|
||||||
)
|
)
|
||||||
j.save()
|
j.save()
|
||||||
|
if started:
|
||||||
|
Job.objects.filter(id=j.id).update(started=started)
|
||||||
|
|
||||||
# if the UUID is excluded, don't reap it
|
# if the UUID is excluded, don't reap it
|
||||||
reaper.reap(i, excluded_uuids=excluded_uuids)
|
reaper.reap(i, excluded_uuids=excluded_uuids, ref_time=now)
|
||||||
job = Job.objects.first()
|
job = Job.objects.first()
|
||||||
|
|
||||||
if fail:
|
if fail:
|
||||||
assert job.status == 'failed'
|
assert job.status == 'failed'
|
||||||
assert 'marked as failed' in job.job_explanation
|
assert 'marked as failed' in job.job_explanation
|
||||||
@@ -415,3 +422,20 @@ class TestJobReaper(object):
|
|||||||
reaper.reap(i)
|
reaper.reap(i)
|
||||||
|
|
||||||
assert WorkflowJob.objects.first().status == 'running'
|
assert WorkflowJob.objects.first().status == 'running'
|
||||||
|
|
||||||
|
def test_should_not_reap_new(self):
|
||||||
|
"""
|
||||||
|
This test is designed specifically to ensure that jobs that are launched after the dispatcher has provided a list of UUIDs aren't reaped.
|
||||||
|
It is very racy and this test is designed with that in mind
|
||||||
|
"""
|
||||||
|
i = Instance(hostname='awx')
|
||||||
|
# ref_time is set to 10 seconds in the past to mimic someone launching a job in the heartbeat window.
|
||||||
|
ref_time = tz_now() - datetime.timedelta(seconds=10)
|
||||||
|
# creating job at current time
|
||||||
|
job = Job.objects.create(status='running', controller_node=i.hostname)
|
||||||
|
reaper.reap(i, ref_time=ref_time)
|
||||||
|
# explictly refreshing from db to ensure up to date cache
|
||||||
|
job.refresh_from_db()
|
||||||
|
assert job.started > ref_time
|
||||||
|
assert job.status == 'running'
|
||||||
|
assert job.job_explanation == ''
|
||||||
|
|||||||
Reference in New Issue
Block a user