diff --git a/awx/main/exceptions.py b/awx/main/exceptions.py index 6a9bb7ece4..2cd9a44418 100644 --- a/awx/main/exceptions.py +++ b/awx/main/exceptions.py @@ -36,3 +36,7 @@ class PostRunError(Exception): self.status = status self.tb = tb super(PostRunError, self).__init__(msg) + + +class ReceptorNodeNotFound(RuntimeError): + pass diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 9ff1120ea5..94d348400e 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -85,7 +85,7 @@ from awx.main.models import ( build_safe_env, ) from awx.main.constants import ACTIVE_STATES -from awx.main.exceptions import AwxTaskError, PostRunError +from awx.main.exceptions import AwxTaskError, PostRunError, ReceptorNodeNotFound from awx.main.queue import CallbackQueueDispatcher from awx.main.dispatch.publish import task from awx.main.dispatch import get_local_queuename, reaper @@ -1546,6 +1546,8 @@ class BaseTask(object): # ensure failure notification sends even if playbook_on_stats event is not triggered handle_success_and_failure_notifications.apply_async([self.instance.job.id]) + except ReceptorNodeNotFound as exc: + extra_update_fields['job_explanation'] = str(exc) except Exception: # this could catch programming or file system errors extra_update_fields['result_traceback'] = traceback.format_exc() @@ -3069,7 +3071,7 @@ class AWXReceptorJob: receptor_ctl.simple_command(f"work release {self.unit_id}") # If an error occured without the job itself failing, it could be a broken instance if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): - execution_node_health_check(self.task.instance.execution_node) + execution_node_health_check.delay(self.task.instance.execution_node) @property def sign_work(self): diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py index cadf51a1b1..edc3887587 100644 --- a/awx/main/utils/receptor.py +++ b/awx/main/utils/receptor.py @@ -1,12 +1,14 @@ import logging import yaml import time +from enum import Enum, unique from receptorctl.socket_interface import ReceptorControl +from awx.main.exceptions import ReceptorNodeNotFound + from django.conf import settings -from enum import Enum, unique logger = logging.getLogger('awx.main.utils.receptor') @@ -63,6 +65,7 @@ def get_conn_type(node_name, receptor_ctl): for node in all_nodes: if node.get('NodeID') == node_name: return ReceptorConnectionType(node.get('ConnType')) + raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh') def administrative_workunit_reaper(work_list=None): @@ -183,6 +186,9 @@ def worker_info(node_name, work_type='ansible-runner'): else: error_list.append(details) + except ReceptorNodeNotFound as exc: + error_list.append(str(exc)) + # If we have a connection error, missing keys would be trivial consequence of that if not data['errors']: # see tasks.py usage of keys