From d0c5c3d3cf7f2290d5741f4eb33db999388399e3 Mon Sep 17 00:00:00 2001 From: chris meyers Date: Mon, 25 Oct 2021 15:31:48 -0400 Subject: [PATCH] add work_unit_id to job lifecycle --- awx/main/models/unified_jobs.py | 7 ++++++- awx/main/tasks.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 5281f25e6f..671daf104d 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -1497,7 +1497,12 @@ class UnifiedJob( return False def log_lifecycle(self, state, blocked_by=None): - extra = {'type': self._meta.model_name, 'task_id': self.id, 'state': state} + extra = { + 'type': self._meta.model_name, + 'task_id': self.id, + 'state': state, + 'work_unit_id': self.work_unit_id, + } if self.unified_job_template: extra["template_name"] = self.unified_job_template.name if state == "blocked" and blocked_by: diff --git a/awx/main/tasks.py b/awx/main/tasks.py index acc60fbf98..93ab6ccd81 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -3107,7 +3107,21 @@ class AWXReceptorJob: _kw['tlsclient'] = get_tls_client(use_stream_tls) result = receptor_ctl.submit_work(worktype=self.work_type, payload=sockout.makefile('rb'), params=self.receptor_params, signwork=self.sign_work, **_kw) self.unit_id = result['unitid'] + # Update the job with the work unit in-memory so that the log_lifecycle + # will print out the work unit that is to be associated with the job in the database + # via the update_model() call. + # We want to log the work_unit_id as early as possible. A failure can happen in between + # when we start the job in receptor and when we associate the job <-> work_unit_id. + # In that case, there will be work running in receptor and Controller will not know + # which Job it is associated with. + # We do not programatically handle this case. Ideally, we would handle this with a reaper case. + # The two distinct job lifecycle log events below allow for us to at least detect when this + # edge case occurs. If the lifecycle event work_unit_id_received occurs without the + # work_unit_id_assigned event then this case may have occured. + self.task.instance.work_unit_id = result['unitid'] # Set work_unit_id in-memory only + self.task.instance.log_lifecycle("work_unit_id_received") self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid']) + self.task.instance.log_lifecycle("work_unit_id_assigned") sockin.close() sockout.close()