From 59bd73bff8f8b2c3fd6cc470197b480d1ddb0f6d Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Thu, 12 Aug 2021 12:55:08 -0400 Subject: [PATCH] add setting for notification job status retry loop * We trigger notifications when the callback receiver processes the playbook_on_stats event. This is the last event in ansible-playbook and the process should exit very shortly after this event is emitted. The trouble comes in with the isolated node feature. There is a management playbook that runs periodically that pulls the events from the remote node. It's possible that the management playbook runs, gets the playbook_on_stats event, but does not see that the playbook is finished running. Therefore the job status is still seen as 'running' BUT we have kicked off the notification for the job. The notification worker will enter a loop waiting on the job to enter the finished state. In this case the time it takes for the job to enter the finished state can be long, roughly 2 * the management playbook run time. * This new setting allows the user to increase the time that the notification spends waiting for the job to enter the finished state. 
--- awx/main/tasks/system.py | 2 +- awx/settings/defaults.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index 008c3bcb2f..4397a79bed 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -846,7 +846,7 @@ def handle_work_error(task_id, *args, **kwargs): def handle_success_and_failure_notifications(job_id): uj = UnifiedJob.objects.get(pk=job_id) retries = 0 - while retries < 5: + while retries < settings.AWX_NOTIFICATION_JOB_FINISH_MAX_RETRY: if uj.finished: uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed') return diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index b11710495e..d3ea0c245d 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -983,6 +983,9 @@ BROADCAST_WEBSOCKET_NEW_INSTANCE_POLL_RATE_SECONDS = 10 # How often websocket process will generate stats BROADCAST_WEBSOCKET_STATS_POLL_RATE_SECONDS = 5 +# Number of times to retry sending a notification when waiting on a job to finish. +AWX_NOTIFICATION_JOB_FINISH_MAX_RETRY = 5 + DJANGO_GUID = {'GUID_HEADER_NAME': 'X-API-Request-Id'} # Name of the default task queue