mirror of
https://github.com/ansible/awx.git
synced 2026-03-20 18:37:39 -02:30
Merge pull request #11745 from AlanCoding/cancel_rework_no_close
Close database connections while processing job output
This commit is contained in:
@@ -37,18 +37,24 @@ class Control(object):
|
|||||||
def running(self, *args, **kwargs):
|
def running(self, *args, **kwargs):
|
||||||
return self.control_with_reply('running', *args, **kwargs)
|
return self.control_with_reply('running', *args, **kwargs)
|
||||||
|
|
||||||
|
def cancel(self, task_ids, *args, **kwargs):
|
||||||
|
return self.control_with_reply('cancel', *args, extra_data={'task_ids': task_ids}, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def generate_reply_queue_name(cls):
|
def generate_reply_queue_name(cls):
|
||||||
return f"reply_to_{str(uuid.uuid4()).replace('-','_')}"
|
return f"reply_to_{str(uuid.uuid4()).replace('-','_')}"
|
||||||
|
|
||||||
def control_with_reply(self, command, timeout=5):
|
def control_with_reply(self, command, timeout=5, extra_data=None):
|
||||||
logger.warning('checking {} {} for {}'.format(self.service, command, self.queuename))
|
logger.warning('checking {} {} for {}'.format(self.service, command, self.queuename))
|
||||||
reply_queue = Control.generate_reply_queue_name()
|
reply_queue = Control.generate_reply_queue_name()
|
||||||
self.result = None
|
self.result = None
|
||||||
|
|
||||||
with pg_bus_conn(new_connection=True) as conn:
|
with pg_bus_conn(new_connection=True) as conn:
|
||||||
conn.listen(reply_queue)
|
conn.listen(reply_queue)
|
||||||
conn.notify(self.queuename, json.dumps({'control': command, 'reply_to': reply_queue}))
|
send_data = {'control': command, 'reply_to': reply_queue}
|
||||||
|
if extra_data:
|
||||||
|
send_data.update(extra_data)
|
||||||
|
conn.notify(self.queuename, json.dumps(send_data))
|
||||||
|
|
||||||
for reply in conn.events(select_timeout=timeout, yield_timeouts=True):
|
for reply in conn.events(select_timeout=timeout, yield_timeouts=True):
|
||||||
if reply is None:
|
if reply is None:
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ class AWXConsumerBase(object):
|
|||||||
def control(self, body):
|
def control(self, body):
|
||||||
logger.warning(f'Received control signal:\n{body}')
|
logger.warning(f'Received control signal:\n{body}')
|
||||||
control = body.get('control')
|
control = body.get('control')
|
||||||
if control in ('status', 'running'):
|
if control in ('status', 'running', 'cancel'):
|
||||||
reply_queue = body['reply_to']
|
reply_queue = body['reply_to']
|
||||||
if control == 'status':
|
if control == 'status':
|
||||||
msg = '\n'.join([self.listening_on, self.pool.debug()])
|
msg = '\n'.join([self.listening_on, self.pool.debug()])
|
||||||
@@ -72,6 +72,17 @@ class AWXConsumerBase(object):
|
|||||||
for worker in self.pool.workers:
|
for worker in self.pool.workers:
|
||||||
worker.calculate_managed_tasks()
|
worker.calculate_managed_tasks()
|
||||||
msg.extend(worker.managed_tasks.keys())
|
msg.extend(worker.managed_tasks.keys())
|
||||||
|
elif control == 'cancel':
|
||||||
|
msg = []
|
||||||
|
task_ids = set(body['task_ids'])
|
||||||
|
for worker in self.pool.workers:
|
||||||
|
task = worker.current_task
|
||||||
|
if task and task['uuid'] in task_ids:
|
||||||
|
logger.warn(f'Sending SIGTERM to task id={task["uuid"]}, task={task.get("task")}, args={task.get("args")}')
|
||||||
|
os.kill(worker.pid, signal.SIGTERM)
|
||||||
|
msg.append(task['uuid'])
|
||||||
|
if task_ids and not msg:
|
||||||
|
logger.info(f'Could not locate running tasks to cancel with ids={task_ids}')
|
||||||
|
|
||||||
with pg_bus_conn() as conn:
|
with pg_bus_conn() as conn:
|
||||||
conn.notify(reply_queue, json.dumps(msg))
|
conn.notify(reply_queue, json.dumps(msg))
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
# Copyright (c) 2015 Ansible, Inc.
|
# Copyright (c) 2015 Ansible, Inc.
|
||||||
# All Rights Reserved.
|
# All Rights Reserved.
|
||||||
import logging
|
import logging
|
||||||
|
import yaml
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.cache import cache as django_cache
|
from django.core.cache import cache as django_cache
|
||||||
@@ -30,7 +31,16 @@ class Command(BaseCommand):
|
|||||||
'--reload',
|
'--reload',
|
||||||
dest='reload',
|
dest='reload',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
help=('cause the dispatcher to recycle all of its worker processes;' 'running jobs will run to completion first'),
|
help=('cause the dispatcher to recycle all of its worker processes; running jobs will run to completion first'),
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--cancel',
|
||||||
|
dest='cancel',
|
||||||
|
help=(
|
||||||
|
'Cancel a particular task id. Takes either a single id string, or a JSON list of multiple ids. '
|
||||||
|
'Can take in output from the --running argument as input to cancel all tasks. '
|
||||||
|
'Only running tasks can be canceled, queued tasks must be started before they can be canceled.'
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle(self, *arg, **options):
|
def handle(self, *arg, **options):
|
||||||
@@ -42,6 +52,16 @@ class Command(BaseCommand):
|
|||||||
return
|
return
|
||||||
if options.get('reload'):
|
if options.get('reload'):
|
||||||
return Control('dispatcher').control({'control': 'reload'})
|
return Control('dispatcher').control({'control': 'reload'})
|
||||||
|
if options.get('cancel'):
|
||||||
|
cancel_str = options.get('cancel')
|
||||||
|
try:
|
||||||
|
cancel_data = yaml.safe_load(cancel_str)
|
||||||
|
except Exception:
|
||||||
|
cancel_data = [cancel_str]
|
||||||
|
if not isinstance(cancel_data, list):
|
||||||
|
cancel_data = [cancel_str]
|
||||||
|
print(Control('dispatcher').cancel(cancel_data))
|
||||||
|
return
|
||||||
|
|
||||||
# It's important to close these because we're _about_ to fork, and we
|
# It's important to close these because we're _about_ to fork, and we
|
||||||
# don't want the forked processes to inherit the open sockets
|
# don't want the forked processes to inherit the open sockets
|
||||||
|
|||||||
@@ -412,6 +412,11 @@ class TaskManagerJobMixin(TaskManagerUnifiedJobMixin):
|
|||||||
class Meta:
|
class Meta:
|
||||||
abstract = True
|
abstract = True
|
||||||
|
|
||||||
|
def get_jobs_fail_chain(self):
|
||||||
|
if self.project_update_id:
|
||||||
|
return [self.project_update]
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
class TaskManagerUpdateOnLaunchMixin(TaskManagerUnifiedJobMixin):
|
class TaskManagerUpdateOnLaunchMixin(TaskManagerUnifiedJobMixin):
|
||||||
class Meta:
|
class Meta:
|
||||||
|
|||||||
@@ -1395,23 +1395,6 @@ class UnifiedJob(
|
|||||||
# Done!
|
# Done!
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@property
|
|
||||||
def actually_running(self):
|
|
||||||
# returns True if the job is running in the appropriate dispatcher process
|
|
||||||
running = False
|
|
||||||
if all([self.status == 'running', self.celery_task_id, self.execution_node]):
|
|
||||||
# If the job is marked as running, but the dispatcher
|
|
||||||
# doesn't know about it (or the dispatcher doesn't reply),
|
|
||||||
# then cancel the job
|
|
||||||
timeout = 5
|
|
||||||
try:
|
|
||||||
running = self.celery_task_id in ControlDispatcher('dispatcher', self.controller_node or self.execution_node).running(timeout=timeout)
|
|
||||||
except socket.timeout:
|
|
||||||
logger.error('could not reach dispatcher on {} within {}s'.format(self.execution_node, timeout))
|
|
||||||
except Exception:
|
|
||||||
logger.exception("error encountered when checking task status")
|
|
||||||
return running
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def can_cancel(self):
|
def can_cancel(self):
|
||||||
return bool(self.status in CAN_CANCEL)
|
return bool(self.status in CAN_CANCEL)
|
||||||
@@ -1421,27 +1404,61 @@ class UnifiedJob(
|
|||||||
return 'Previous Task Canceled: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (self.model_to_str(), self.name, self.id)
|
return 'Previous Task Canceled: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (self.model_to_str(), self.name, self.id)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def fallback_cancel(self):
|
||||||
|
if not self.celery_task_id:
|
||||||
|
self.refresh_from_db(fields=['celery_task_id'])
|
||||||
|
self.cancel_dispatcher_process()
|
||||||
|
|
||||||
|
def cancel_dispatcher_process(self):
|
||||||
|
"""Returns True if dispatcher running this job acknowledged request and sent SIGTERM"""
|
||||||
|
if not self.celery_task_id:
|
||||||
|
return
|
||||||
|
canceled = []
|
||||||
|
try:
|
||||||
|
# Use control and reply mechanism to cancel and obtain confirmation
|
||||||
|
timeout = 5
|
||||||
|
canceled = ControlDispatcher('dispatcher', self.controller_node).cancel([self.celery_task_id])
|
||||||
|
except socket.timeout:
|
||||||
|
logger.error(f'could not reach dispatcher on {self.controller_node} within {timeout}s')
|
||||||
|
except Exception:
|
||||||
|
logger.exception("error encountered when checking task status")
|
||||||
|
return bool(self.celery_task_id in canceled) # True or False, whether confirmation was obtained
|
||||||
|
|
||||||
def cancel(self, job_explanation=None, is_chain=False):
|
def cancel(self, job_explanation=None, is_chain=False):
|
||||||
if self.can_cancel:
|
if self.can_cancel:
|
||||||
if not is_chain:
|
if not is_chain:
|
||||||
for x in self.get_jobs_fail_chain():
|
for x in self.get_jobs_fail_chain():
|
||||||
x.cancel(job_explanation=self._build_job_explanation(), is_chain=True)
|
x.cancel(job_explanation=self._build_job_explanation(), is_chain=True)
|
||||||
|
|
||||||
|
cancel_fields = []
|
||||||
if not self.cancel_flag:
|
if not self.cancel_flag:
|
||||||
self.cancel_flag = True
|
self.cancel_flag = True
|
||||||
self.start_args = '' # blank field to remove encrypted passwords
|
self.start_args = '' # blank field to remove encrypted passwords
|
||||||
cancel_fields = ['cancel_flag', 'start_args']
|
cancel_fields.extend(['cancel_flag', 'start_args'])
|
||||||
if self.status in ('pending', 'waiting', 'new'):
|
connection.on_commit(lambda: self.websocket_emit_status("canceled"))
|
||||||
self.status = 'canceled'
|
|
||||||
cancel_fields.append('status')
|
|
||||||
if self.status == 'running' and not self.actually_running:
|
|
||||||
self.status = 'canceled'
|
|
||||||
cancel_fields.append('status')
|
|
||||||
if job_explanation is not None:
|
if job_explanation is not None:
|
||||||
self.job_explanation = job_explanation
|
self.job_explanation = job_explanation
|
||||||
cancel_fields.append('job_explanation')
|
cancel_fields.append('job_explanation')
|
||||||
self.save(update_fields=cancel_fields)
|
|
||||||
self.websocket_emit_status("canceled")
|
controller_notified = False
|
||||||
|
if self.celery_task_id:
|
||||||
|
controller_notified = self.cancel_dispatcher_process()
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Avoid race condition where we have stale model from pending state but job has already started,
|
||||||
|
# its checking signal but not cancel_flag, so re-send signal after this database commit
|
||||||
|
connection.on_commit(self.fallback_cancel)
|
||||||
|
|
||||||
|
# If a SIGTERM signal was sent to the control process, and acked by the dispatcher
|
||||||
|
# then we want to let its own cleanup change status, otherwise change status now
|
||||||
|
if not controller_notified:
|
||||||
|
if self.status != 'canceled':
|
||||||
|
self.status = 'canceled'
|
||||||
|
cancel_fields.append('status')
|
||||||
|
|
||||||
|
self.save(update_fields=cancel_fields)
|
||||||
|
|
||||||
return self.cancel_flag
|
return self.cancel_flag
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -723,11 +723,10 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
|
|||||||
def preferred_instance_groups(self):
|
def preferred_instance_groups(self):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
@property
|
def cancel_dispatcher_process(self):
|
||||||
def actually_running(self):
|
|
||||||
# WorkflowJobs don't _actually_ run anything in the dispatcher, so
|
# WorkflowJobs don't _actually_ run anything in the dispatcher, so
|
||||||
# there's no point in asking the dispatcher if it knows about this task
|
# there's no point in asking the dispatcher if it knows about this task
|
||||||
return self.status == 'running'
|
return True
|
||||||
|
|
||||||
|
|
||||||
class WorkflowApprovalTemplate(UnifiedJobTemplate, RelatedJobsMixin):
|
class WorkflowApprovalTemplate(UnifiedJobTemplate, RelatedJobsMixin):
|
||||||
|
|||||||
@@ -6,17 +6,16 @@ import os
|
|||||||
import stat
|
import stat
|
||||||
|
|
||||||
# Django
|
# Django
|
||||||
from django.utils.timezone import now
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django_guid import get_guid
|
from django_guid import get_guid
|
||||||
from django.utils.functional import cached_property
|
from django.utils.functional import cached_property
|
||||||
|
from django.db import connections
|
||||||
|
|
||||||
# AWX
|
# AWX
|
||||||
from awx.main.redact import UriCleaner
|
from awx.main.redact import UriCleaner
|
||||||
from awx.main.constants import MINIMAL_EVENTS, ANSIBLE_RUNNER_NEEDS_UPDATE_MESSAGE
|
from awx.main.constants import MINIMAL_EVENTS, ANSIBLE_RUNNER_NEEDS_UPDATE_MESSAGE
|
||||||
from awx.main.utils.update_model import update_model
|
from awx.main.utils.update_model import update_model
|
||||||
from awx.main.queue import CallbackQueueDispatcher
|
from awx.main.queue import CallbackQueueDispatcher
|
||||||
from awx.main.tasks.signals import signal_callback
|
|
||||||
|
|
||||||
logger = logging.getLogger('awx.main.tasks.callback')
|
logger = logging.getLogger('awx.main.tasks.callback')
|
||||||
|
|
||||||
@@ -175,28 +174,6 @@ class RunnerCallback:
|
|||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def cancel_callback(self):
|
|
||||||
"""
|
|
||||||
Ansible runner callback to tell the job when/if it is canceled
|
|
||||||
"""
|
|
||||||
unified_job_id = self.instance.pk
|
|
||||||
if signal_callback():
|
|
||||||
return True
|
|
||||||
try:
|
|
||||||
self.instance = self.update_model(unified_job_id)
|
|
||||||
except Exception:
|
|
||||||
logger.exception(f'Encountered error during cancel check for {unified_job_id}, canceling now')
|
|
||||||
return True
|
|
||||||
if not self.instance:
|
|
||||||
logger.error('unified job {} was deleted while running, canceling'.format(unified_job_id))
|
|
||||||
return True
|
|
||||||
if self.instance.cancel_flag or self.instance.status == 'canceled':
|
|
||||||
cancel_wait = (now() - self.instance.modified).seconds if self.instance.modified else 0
|
|
||||||
if cancel_wait > 5:
|
|
||||||
logger.warning('Request to cancel {} took {} seconds to complete.'.format(self.instance.log_format, cancel_wait))
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def finished_callback(self, runner_obj):
|
def finished_callback(self, runner_obj):
|
||||||
"""
|
"""
|
||||||
Ansible runner callback triggered on finished run
|
Ansible runner callback triggered on finished run
|
||||||
@@ -227,6 +204,8 @@ class RunnerCallback:
|
|||||||
|
|
||||||
with disable_activity_stream():
|
with disable_activity_stream():
|
||||||
self.instance = self.update_model(self.instance.pk, job_args=json.dumps(runner_config.command), job_cwd=runner_config.cwd, job_env=job_env)
|
self.instance = self.update_model(self.instance.pk, job_args=json.dumps(runner_config.command), job_cwd=runner_config.cwd, job_env=job_env)
|
||||||
|
# We opened a connection just for that save, close it here now
|
||||||
|
connections.close_all()
|
||||||
elif status_data['status'] == 'failed':
|
elif status_data['status'] == 'failed':
|
||||||
# For encrypted ssh_key_data, ansible-runner worker will open and write the
|
# For encrypted ssh_key_data, ansible-runner worker will open and write the
|
||||||
# ssh_key_data to a named pipe. Then, once the podman container starts, ssh-agent will
|
# ssh_key_data to a named pipe. Then, once the podman container starts, ssh-agent will
|
||||||
|
|||||||
@@ -487,6 +487,7 @@ class BaseTask(object):
|
|||||||
self.instance.log_lifecycle("preparing_playbook")
|
self.instance.log_lifecycle("preparing_playbook")
|
||||||
if self.instance.cancel_flag or signal_callback():
|
if self.instance.cancel_flag or signal_callback():
|
||||||
self.instance = self.update_model(self.instance.pk, status='canceled')
|
self.instance = self.update_model(self.instance.pk, status='canceled')
|
||||||
|
|
||||||
if self.instance.status != 'running':
|
if self.instance.status != 'running':
|
||||||
# Stop the task chain and prevent starting the job if it has
|
# Stop the task chain and prevent starting the job if it has
|
||||||
# already been canceled.
|
# already been canceled.
|
||||||
@@ -589,7 +590,7 @@ class BaseTask(object):
|
|||||||
event_handler=self.runner_callback.event_handler,
|
event_handler=self.runner_callback.event_handler,
|
||||||
finished_callback=self.runner_callback.finished_callback,
|
finished_callback=self.runner_callback.finished_callback,
|
||||||
status_handler=self.runner_callback.status_handler,
|
status_handler=self.runner_callback.status_handler,
|
||||||
cancel_callback=self.runner_callback.cancel_callback,
|
cancel_callback=signal_callback,
|
||||||
**params,
|
**params,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -1626,7 +1627,7 @@ class RunInventoryUpdate(SourceControlMixin, BaseTask):
|
|||||||
|
|
||||||
handler = SpecialInventoryHandler(
|
handler = SpecialInventoryHandler(
|
||||||
self.runner_callback.event_handler,
|
self.runner_callback.event_handler,
|
||||||
self.runner_callback.cancel_callback,
|
signal_callback,
|
||||||
verbosity=inventory_update.verbosity,
|
verbosity=inventory_update.verbosity,
|
||||||
job_timeout=self.get_instance_timeout(self.instance),
|
job_timeout=self.get_instance_timeout(self.instance),
|
||||||
start_time=inventory_update.started,
|
start_time=inventory_update.started,
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import yaml
|
|||||||
|
|
||||||
# Django
|
# Django
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.db import connections
|
||||||
|
|
||||||
# Runner
|
# Runner
|
||||||
import ansible_runner
|
import ansible_runner
|
||||||
@@ -25,6 +26,7 @@ from awx.main.utils.common import (
|
|||||||
cleanup_new_process,
|
cleanup_new_process,
|
||||||
)
|
)
|
||||||
from awx.main.constants import MAX_ISOLATED_PATH_COLON_DELIMITER
|
from awx.main.constants import MAX_ISOLATED_PATH_COLON_DELIMITER
|
||||||
|
from awx.main.tasks.signals import signal_state, signal_callback, SignalExit
|
||||||
|
|
||||||
# Receptorctl
|
# Receptorctl
|
||||||
from receptorctl.socket_interface import ReceptorControl
|
from receptorctl.socket_interface import ReceptorControl
|
||||||
@@ -335,24 +337,32 @@ class AWXReceptorJob:
|
|||||||
shutil.rmtree(artifact_dir)
|
shutil.rmtree(artifact_dir)
|
||||||
|
|
||||||
resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True)
|
resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True)
|
||||||
# Both "processor" and "cancel_watcher" are spawned in separate threads.
|
|
||||||
# We wait for the first one to return. If cancel_watcher returns first,
|
|
||||||
# we yank the socket out from underneath the processor, which will cause it
|
|
||||||
# to exit. A reference to the processor_future is passed into the cancel_watcher_future,
|
|
||||||
# Which exits if the job has finished normally. The context manager ensures we do not
|
|
||||||
# leave any threads laying around.
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
|
||||||
processor_future = executor.submit(self.processor, resultfile)
|
|
||||||
cancel_watcher_future = executor.submit(self.cancel_watcher, processor_future)
|
|
||||||
futures = [processor_future, cancel_watcher_future]
|
|
||||||
first_future = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
|
|
||||||
|
|
||||||
res = list(first_future.done)[0].result()
|
connections.close_all()
|
||||||
if res.status == 'canceled':
|
|
||||||
|
# "processor" and the main thread will be separate threads.
|
||||||
|
# If a cancel happens, the main thread will encounter an exception, in which case
|
||||||
|
# we yank the socket out from underneath the processor, which will cause it to exit.
|
||||||
|
# The ThreadPoolExecutor context manager ensures we do not leave any threads laying around.
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
||||||
|
processor_future = executor.submit(self.processor, resultfile)
|
||||||
|
|
||||||
|
try:
|
||||||
|
signal_state.raise_exception = True
|
||||||
|
# address race condition where SIGTERM was issued after this dispatcher task started
|
||||||
|
if signal_callback():
|
||||||
|
raise SignalExit()
|
||||||
|
res = processor_future.result()
|
||||||
|
except SignalExit:
|
||||||
receptor_ctl.simple_command(f"work cancel {self.unit_id}")
|
receptor_ctl.simple_command(f"work cancel {self.unit_id}")
|
||||||
resultsock.shutdown(socket.SHUT_RDWR)
|
resultsock.shutdown(socket.SHUT_RDWR)
|
||||||
resultfile.close()
|
resultfile.close()
|
||||||
elif res.status == 'error':
|
result = namedtuple('result', ['status', 'rc'])
|
||||||
|
res = result('canceled', 1)
|
||||||
|
finally:
|
||||||
|
signal_state.raise_exception = False
|
||||||
|
|
||||||
|
if res.status == 'error':
|
||||||
# If ansible-runner ran, but an error occured at runtime, the traceback information
|
# If ansible-runner ran, but an error occured at runtime, the traceback information
|
||||||
# is saved via the status_handler passed in to the processor.
|
# is saved via the status_handler passed in to the processor.
|
||||||
if 'result_traceback' in self.task.runner_callback.extra_update_fields:
|
if 'result_traceback' in self.task.runner_callback.extra_update_fields:
|
||||||
@@ -446,18 +456,6 @@ class AWXReceptorJob:
|
|||||||
return 'local'
|
return 'local'
|
||||||
return 'ansible-runner'
|
return 'ansible-runner'
|
||||||
|
|
||||||
@cleanup_new_process
|
|
||||||
def cancel_watcher(self, processor_future):
|
|
||||||
while True:
|
|
||||||
if processor_future.done():
|
|
||||||
return processor_future.result()
|
|
||||||
|
|
||||||
if self.task.runner_callback.cancel_callback():
|
|
||||||
result = namedtuple('result', ['status', 'rc'])
|
|
||||||
return result('canceled', 1)
|
|
||||||
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pod_definition(self):
|
def pod_definition(self):
|
||||||
ee = self.task.instance.execution_environment
|
ee = self.task.instance.execution_environment
|
||||||
|
|||||||
@@ -9,12 +9,17 @@ logger = logging.getLogger('awx.main.tasks.signals')
|
|||||||
__all__ = ['with_signal_handling', 'signal_callback']
|
__all__ = ['with_signal_handling', 'signal_callback']
|
||||||
|
|
||||||
|
|
||||||
|
class SignalExit(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class SignalState:
|
class SignalState:
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.sigterm_flag = False
|
self.sigterm_flag = False
|
||||||
self.is_active = False
|
self.is_active = False
|
||||||
self.original_sigterm = None
|
self.original_sigterm = None
|
||||||
self.original_sigint = None
|
self.original_sigint = None
|
||||||
|
self.raise_exception = False
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.reset()
|
self.reset()
|
||||||
@@ -22,6 +27,9 @@ class SignalState:
|
|||||||
def set_flag(self, *args):
|
def set_flag(self, *args):
|
||||||
"""Method to pass into the python signal.signal method to receive signals"""
|
"""Method to pass into the python signal.signal method to receive signals"""
|
||||||
self.sigterm_flag = True
|
self.sigterm_flag = True
|
||||||
|
if self.raise_exception:
|
||||||
|
self.raise_exception = False # so it is not raised a second time in error handling
|
||||||
|
raise SignalExit()
|
||||||
|
|
||||||
def connect_signals(self):
|
def connect_signals(self):
|
||||||
self.original_sigterm = signal.getsignal(signal.SIGTERM)
|
self.original_sigterm = signal.getsignal(signal.SIGTERM)
|
||||||
|
|||||||
@@ -244,7 +244,7 @@ class TestAutoScaling:
|
|||||||
assert not self.pool.should_grow
|
assert not self.pool.should_grow
|
||||||
alive_pid = self.pool.workers[1].pid
|
alive_pid = self.pool.workers[1].pid
|
||||||
self.pool.workers[0].process.terminate()
|
self.pool.workers[0].process.terminate()
|
||||||
time.sleep(1) # wait a moment for sigterm
|
time.sleep(2) # wait a moment for sigterm
|
||||||
|
|
||||||
# clean up and the dead worker
|
# clean up and the dead worker
|
||||||
self.pool.cleanup()
|
self.pool.cleanup()
|
||||||
|
|||||||
@@ -22,6 +22,10 @@ def test_unified_job_workflow_attributes():
|
|||||||
assert job.workflow_job_id == 1
|
assert job.workflow_job_id == 1
|
||||||
|
|
||||||
|
|
||||||
|
def mock_on_commit(f):
|
||||||
|
f()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def unified_job(mocker):
|
def unified_job(mocker):
|
||||||
mocker.patch.object(UnifiedJob, 'can_cancel', return_value=True)
|
mocker.patch.object(UnifiedJob, 'can_cancel', return_value=True)
|
||||||
@@ -30,12 +34,14 @@ def unified_job(mocker):
|
|||||||
j.cancel_flag = None
|
j.cancel_flag = None
|
||||||
j.save = mocker.MagicMock()
|
j.save = mocker.MagicMock()
|
||||||
j.websocket_emit_status = mocker.MagicMock()
|
j.websocket_emit_status = mocker.MagicMock()
|
||||||
|
j.fallback_cancel = mocker.MagicMock()
|
||||||
return j
|
return j
|
||||||
|
|
||||||
|
|
||||||
def test_cancel(unified_job):
|
def test_cancel(unified_job):
|
||||||
|
|
||||||
unified_job.cancel()
|
with mock.patch('awx.main.models.unified_jobs.connection.on_commit', wraps=mock_on_commit):
|
||||||
|
unified_job.cancel()
|
||||||
|
|
||||||
assert unified_job.cancel_flag is True
|
assert unified_job.cancel_flag is True
|
||||||
assert unified_job.status == 'canceled'
|
assert unified_job.status == 'canceled'
|
||||||
@@ -50,10 +56,11 @@ def test_cancel(unified_job):
|
|||||||
def test_cancel_job_explanation(unified_job):
|
def test_cancel_job_explanation(unified_job):
|
||||||
job_explanation = 'giggity giggity'
|
job_explanation = 'giggity giggity'
|
||||||
|
|
||||||
unified_job.cancel(job_explanation=job_explanation)
|
with mock.patch('awx.main.models.unified_jobs.connection.on_commit'):
|
||||||
|
unified_job.cancel(job_explanation=job_explanation)
|
||||||
|
|
||||||
assert unified_job.job_explanation == job_explanation
|
assert unified_job.job_explanation == job_explanation
|
||||||
unified_job.save.assert_called_with(update_fields=['cancel_flag', 'start_args', 'status', 'job_explanation'])
|
unified_job.save.assert_called_with(update_fields=['cancel_flag', 'start_args', 'job_explanation', 'status'])
|
||||||
|
|
||||||
|
|
||||||
def test_organization_copy_to_jobs():
|
def test_organization_copy_to_jobs():
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ class SpecialInventoryHandler(logging.Handler):
|
|||||||
def emit(self, record):
|
def emit(self, record):
|
||||||
# check cancel and timeout status regardless of log level
|
# check cancel and timeout status regardless of log level
|
||||||
this_time = now()
|
this_time = now()
|
||||||
if (this_time - self.last_check).total_seconds() > 0.5: # cancel callback is expensive
|
if (this_time - self.last_check).total_seconds() > 0.1:
|
||||||
self.last_check = this_time
|
self.last_check = this_time
|
||||||
if self.cancel_callback():
|
if self.cancel_callback():
|
||||||
raise PostRunError('Inventory update has been canceled', status='canceled')
|
raise PostRunError('Inventory update has been canceled', status='canceled')
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ In AWX, a task of a certain job type is kicked off (_i.e._, RunJob, RunProjectUp
|
|||||||
|
|
||||||
The callbacks and handlers are:
|
The callbacks and handlers are:
|
||||||
* `event_handler`: Called each time a new event is created in `ansible-runner`. AWX will dispatch the event to `redis` to be processed on the other end by the callback receiver.
|
* `event_handler`: Called each time a new event is created in `ansible-runner`. AWX will dispatch the event to `redis` to be processed on the other end by the callback receiver.
|
||||||
* `cancel_callback`: Called periodically by `ansible-runner`; this is so that AWX can inform `ansible-runner` if the job should be canceled or not.
|
* `cancel_callback`: Called periodically by `ansible-runner`; this is so that AWX can inform `ansible-runner` if the job should be canceled or not. Only applies for system jobs now, and other jobs are canceled via receptor.
|
||||||
* `finished_callback`: Called once by `ansible-runner` to denote that the process that was asked to run is finished. AWX will construct the special control event, `EOF`, with the associated total number of events that it observed.
|
* `finished_callback`: Called once by `ansible-runner` to denote that the process that was asked to run is finished. AWX will construct the special control event, `EOF`, with the associated total number of events that it observed.
|
||||||
* `status_handler`: Called by `ansible-runner` as the process transitions state internally. AWX uses the `starting` status to know that `ansible-runner` has made all of its decisions around the process that it will launch. AWX gathers and associates these decisions with the Job for historical observation.
|
* `status_handler`: Called by `ansible-runner` as the process transitions state internally. AWX uses the `starting` status to know that `ansible-runner` has made all of its decisions around the process that it will launch. AWX gathers and associates these decisions with the Job for historical observation.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user