Fix cancel bug - WorkflowManager cancel in transaction (#14608)

This fixes a bug where jobs within a workflow job were not canceled
  when the workflow job was canceled by the user.

The fix is to submit the cancel request as part of the
  transaction in which WorkflowManager commits its work.
  This requires sending the message without expecting a reply,
  so in that case the cancel is sent as a plain control message instead of control-with-reply.
This commit is contained in:
Alan Rominger 2023-10-30 15:30:18 -04:00 committed by GitHub
parent f4c53aaf22
commit 93c329d9d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 13 additions and 4 deletions

View File

@ -37,8 +37,11 @@ class Control(object):
def running(self, *args, **kwargs):
    """Issue the 'running' command through ``self.control_with_reply``.

    All positional and keyword arguments are forwarded unchanged, and
    whatever ``control_with_reply`` returns is returned to the caller.
    """
    return self.control_with_reply('running', *args, **kwargs)
def cancel(self, task_ids, *args, **kwargs):
    """Issue the 'cancel' command through ``self.control_with_reply``.

    ``task_ids`` is passed along as ``extra_data={'task_ids': task_ids}``;
    any further positional/keyword arguments are forwarded unchanged. The
    return value is whatever ``control_with_reply`` returns.

    NOTE(review): a second ``cancel`` definition follows immediately after
    this one. If both live in the same scope the later one wins; this looks
    like the pre-change side of a diff hunk -- confirm before relying on it.
    """
    return self.control_with_reply('cancel', *args, extra_data={'task_ids': task_ids}, **kwargs)
def cancel(self, task_ids, with_reply=True):
    """Issue a 'cancel' command for the given task ids.

    With ``with_reply`` true (the default) the command goes through
    ``self.control_with_reply`` with ``extra_data={'task_ids': task_ids}``
    and that call's result is returned.

    With ``with_reply`` false the message is handed to ``self.control``
    with ``'reply_to': None`` and nothing is returned (implicit ``None``).
    """
    if with_reply:
        return self.control_with_reply('cancel', extra_data={'task_ids': task_ids})
    else:
        # The task ids are already part of the message dict; they are passed
        # again as the ``extra_data`` keyword.
        # NOTE(review): ``control``'s signature is not visible here, so it is
        # unclear whether ``extra_data`` is actually consumed -- confirm.
        self.control({'control': 'cancel', 'task_ids': task_ids, 'reply_to': None}, extra_data={'task_ids': task_ids})
def schedule(self, *args, **kwargs):
    """Issue the 'schedule' command through ``self.control_with_reply``.

    All positional and keyword arguments are forwarded unchanged, and
    whatever ``control_with_reply`` returns is returned to the caller.
    """
    return self.control_with_reply('schedule', *args, **kwargs)

View File

@ -89,8 +89,9 @@ class AWXConsumerBase(object):
if task_ids and not msg:
logger.info(f'Could not locate running tasks to cancel with ids={task_ids}')
with pg_bus_conn() as conn:
conn.notify(reply_queue, json.dumps(msg))
if reply_queue is not None:
with pg_bus_conn() as conn:
conn.notify(reply_queue, json.dumps(msg))
elif control == 'reload':
for worker in self.pool.workers:
worker.quit()

View File

@ -1439,6 +1439,11 @@ class UnifiedJob(
if not self.celery_task_id:
return
canceled = []
if not connection.get_autocommit():
# this condition is purpose-written for the task manager, when it cancels jobs in workflows
ControlDispatcher('dispatcher', self.controller_node).cancel([self.celery_task_id], with_reply=False)
return True # task manager itself needs to act under assumption that cancel was received
try:
# Use control and reply mechanism to cancel and obtain confirmation
timeout = 5