Files
awx/awx/main/dispatch/reaper.py
Gabriel Muniz e15f4de0dd Fix race with heartbeat and reaper logic (#13713)
* Fix race with heartbeat and reaper logic

* Fix tests to fail when over drift over heartbeat time

* replaced modified with started time for reap() code and added test

* fixed logic bug and cleaned up tests

* Added comments to tests to call out reasoning
2023-03-17 14:24:31 -04:00

91 lines
3.5 KiB
Python

from datetime import timedelta
import logging
from django.db.models import Q
from django.conf import settings
from django.utils.timezone import now as tz_now
from django.contrib.contenttypes.models import ContentType
from awx.main.models import Instance, UnifiedJob, WorkflowJob
logger = logging.getLogger('awx.main.dispatch')
def startup_reaping():
"""
If this particular instance is starting, then we know that any running jobs are invalid
so we will reap those jobs as a special action here
"""
jobs = UnifiedJob.objects.filter(status='running', controller_node=Instance.objects.my_hostname())
job_ids = []
for j in jobs:
job_ids.append(j.id)
reap_job(
j,
'failed',
job_explanation='Task was marked as running at system start up. The system must have not shut down properly, so it has been marked as failed.',
)
if job_ids:
logger.error(f'Unified jobs {job_ids} were reaped on dispatch startup')
def reap_job(j, status, job_explanation=None):
j.refresh_from_db(fields=['status', 'job_explanation'])
status_before = j.status
if status_before not in ('running', 'waiting'):
# just in case, don't reap jobs that aren't running
return
j.status = status
j.start_args = '' # blank field to remove encrypted passwords
if j.job_explanation:
j.job_explanation += ' ' # Separate messages for readability
if job_explanation is None:
j.job_explanation += 'Task was marked as running but was not present in the job queue, so it has been marked as failed.'
else:
j.job_explanation += job_explanation
j.save(update_fields=['status', 'start_args', 'job_explanation'])
if hasattr(j, 'send_notification_templates'):
j.send_notification_templates('failed')
j.websocket_emit_status(status)
logger.error(f'{j.log_format} is no longer {status_before}; reaping')
def reap_waiting(instance=None, status='failed', job_explanation=None, grace_period=None, excluded_uuids=None, ref_time=None):
"""
Reap all jobs in waiting for this instance.
"""
if grace_period is None:
grace_period = settings.JOB_WAITING_GRACE_PERIOD + settings.TASK_MANAGER_TIMEOUT
if instance is None:
hostname = Instance.objects.my_hostname()
else:
hostname = instance.hostname
if ref_time is None:
ref_time = tz_now()
jobs = UnifiedJob.objects.filter(status='waiting', modified__lte=ref_time - timedelta(seconds=grace_period), controller_node=hostname)
if excluded_uuids:
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
for j in jobs:
reap_job(j, status, job_explanation=job_explanation)
def reap(instance=None, status='failed', job_explanation=None, excluded_uuids=None, ref_time=None):
"""
Reap all jobs in running for this instance.
"""
if instance is None:
hostname = Instance.objects.my_hostname()
else:
hostname = instance.hostname
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
base_Q = Q(status='running') & (Q(execution_node=hostname) | Q(controller_node=hostname)) & ~Q(polymorphic_ctype_id=workflow_ctype_id)
if ref_time:
jobs = UnifiedJob.objects.filter(base_Q & Q(started__lte=ref_time))
else:
jobs = UnifiedJob.objects.filter(base_Q)
if excluded_uuids:
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
for j in jobs:
reap_job(j, status, job_explanation=job_explanation)