mirror of
https://github.com/ansible/awx.git
synced 2026-02-21 21:20:08 -03:30
Add grace period settings for task manager timeout, and pod / job waiting reapers
Co-authored-by: Alan Rominger <arominge@redhat.com>
This commit is contained in:
committed by
Alan Rominger
parent
c649809eb2
commit
3c51cb130f
@@ -2,6 +2,7 @@ from datetime import timedelta
|
||||
import logging
|
||||
|
||||
from django.db.models import Q
|
||||
from django.conf import settings
|
||||
from django.utils.timezone import now as tz_now
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
|
||||
@@ -34,7 +35,9 @@ def startup_reaping():
|
||||
|
||||
|
||||
def reap_job(j, status):
|
||||
if UnifiedJob.objects.get(id=j.id).status not in ('running', 'waiting'):
|
||||
j.refresh_from_db(fields=['status'])
|
||||
status_before = j.status
|
||||
if status_before not in ('running', 'waiting'):
|
||||
# just in case, don't reap jobs that aren't running
|
||||
return
|
||||
j.status = status
|
||||
@@ -49,13 +52,16 @@ def reap_job(j, status):
|
||||
if hasattr(j, 'send_notification_templates'):
|
||||
j.send_notification_templates('failed')
|
||||
j.websocket_emit_status(status)
|
||||
logger.error('{} is no longer running; reaping'.format(j.log_format))
|
||||
logger.error(f'{j.log_format} is no longer {status_before}; reaping')
|
||||
|
||||
|
||||
def reap_waiting(instance=None, status='failed', grace_period=60):
|
||||
def reap_waiting(instance=None, status='failed', grace_period=None, excluded_uuids=None):
|
||||
"""
|
||||
Reap all jobs in waiting for this instance.
|
||||
"""
|
||||
if grace_period is None:
|
||||
grace_period = settings.JOB_WAITING_GRACE_PERIOD + settings.TASK_MANAGER_TIMEOUT
|
||||
|
||||
me = instance
|
||||
if me is None:
|
||||
try:
|
||||
@@ -65,11 +71,13 @@ def reap_waiting(instance=None, status='failed', grace_period=60):
|
||||
return
|
||||
now = tz_now()
|
||||
jobs = UnifiedJob.objects.filter(status='waiting', modified__lte=now - timedelta(seconds=grace_period), controller_node=me.hostname)
|
||||
if excluded_uuids:
|
||||
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
|
||||
for j in jobs:
|
||||
reap_job(j, status)
|
||||
|
||||
|
||||
def reap(instance=None, status='failed', excluded_uuids=[]):
|
||||
def reap(instance=None, status='failed', excluded_uuids=None):
|
||||
"""
|
||||
Reap all jobs in running for this instance.
|
||||
"""
|
||||
@@ -83,6 +91,8 @@ def reap(instance=None, status='failed', excluded_uuids=[]):
|
||||
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
|
||||
jobs = UnifiedJob.objects.filter(
|
||||
Q(status='running') & (Q(execution_node=me.hostname) | Q(controller_node=me.hostname)) & ~Q(polymorphic_ctype_id=workflow_ctype_id)
|
||||
).exclude(celery_task_id__in=excluded_uuids)
|
||||
)
|
||||
if excluded_uuids:
|
||||
jobs = jobs.exclude(celery_task_id__in=excluded_uuids)
|
||||
for j in jobs:
|
||||
reap_job(j, status)
|
||||
|
||||
Reference in New Issue
Block a user