Refactor canceling to work through messaging and signals, not database

If canceled attempted before, still allow attempting another cancel
in this case, attempt to send the sigterm signal again.
Keep clicking, you might help!

Replace other cancel_callbacks with sigterm watcher
  adapt special inventory mechanism for this too

Get rid of the cancel_watcher method with exception in main thread

Handle academic case of sigterm race condition

Process cancelation as control signal

Fully connect cancel method and run_dispatcher to control

Never transition workflows directly to canceled, add logs
This commit is contained in:
Alan Rominger
2021-11-08 16:57:12 -05:00
parent f9428c10b9
commit c59bbdecdb
13 changed files with 137 additions and 91 deletions

View File

@@ -6,17 +6,16 @@ import os
import stat
# Django
from django.utils.timezone import now
from django.conf import settings
from django_guid import get_guid
from django.utils.functional import cached_property
from django.db import connections
# AWX
from awx.main.redact import UriCleaner
from awx.main.constants import MINIMAL_EVENTS, ANSIBLE_RUNNER_NEEDS_UPDATE_MESSAGE
from awx.main.utils.update_model import update_model
from awx.main.queue import CallbackQueueDispatcher
from awx.main.tasks.signals import signal_callback
logger = logging.getLogger('awx.main.tasks.callback')
@@ -175,28 +174,6 @@ class RunnerCallback:
return False
def cancel_callback(self):
"""
Ansible runner callback to tell the job when/if it is canceled
"""
unified_job_id = self.instance.pk
if signal_callback():
return True
try:
self.instance = self.update_model(unified_job_id)
except Exception:
logger.exception(f'Encountered error during cancel check for {unified_job_id}, canceling now')
return True
if not self.instance:
logger.error('unified job {} was deleted while running, canceling'.format(unified_job_id))
return True
if self.instance.cancel_flag or self.instance.status == 'canceled':
cancel_wait = (now() - self.instance.modified).seconds if self.instance.modified else 0
if cancel_wait > 5:
logger.warning('Request to cancel {} took {} seconds to complete.'.format(self.instance.log_format, cancel_wait))
return True
return False
def finished_callback(self, runner_obj):
"""
Ansible runner callback triggered on finished run
@@ -227,6 +204,8 @@ class RunnerCallback:
with disable_activity_stream():
self.instance = self.update_model(self.instance.pk, job_args=json.dumps(runner_config.command), job_cwd=runner_config.cwd, job_env=job_env)
# We opened a connection just for that save, close it here now
connections.close_all()
elif status_data['status'] == 'failed':
# For encrypted ssh_key_data, ansible-runner worker will open and write the
# ssh_key_data to a named pipe. Then, once the podman container starts, ssh-agent will