Error handling when node is missing from mesh for jobs and checks

This commit is contained in:
Alan Rominger
2021-10-15 11:23:38 -04:00
committed by Shane McDonald
parent 206c85778e
commit f34c96ecf5
3 changed files with 15 additions and 3 deletions

View File

@@ -36,3 +36,7 @@ class PostRunError(Exception):
self.status = status self.status = status
self.tb = tb self.tb = tb
super(PostRunError, self).__init__(msg) super(PostRunError, self).__init__(msg)
class ReceptorNodeNotFound(RuntimeError):
pass

View File

@@ -85,7 +85,7 @@ from awx.main.models import (
build_safe_env, build_safe_env,
) )
from awx.main.constants import ACTIVE_STATES from awx.main.constants import ACTIVE_STATES
from awx.main.exceptions import AwxTaskError, PostRunError from awx.main.exceptions import AwxTaskError, PostRunError, ReceptorNodeNotFound
from awx.main.queue import CallbackQueueDispatcher from awx.main.queue import CallbackQueueDispatcher
from awx.main.dispatch.publish import task from awx.main.dispatch.publish import task
from awx.main.dispatch import get_local_queuename, reaper from awx.main.dispatch import get_local_queuename, reaper
@@ -1546,6 +1546,8 @@ class BaseTask(object):
# ensure failure notification sends even if playbook_on_stats event is not triggered # ensure failure notification sends even if playbook_on_stats event is not triggered
handle_success_and_failure_notifications.apply_async([self.instance.job.id]) handle_success_and_failure_notifications.apply_async([self.instance.job.id])
except ReceptorNodeNotFound as exc:
extra_update_fields['job_explanation'] = str(exc)
except Exception: except Exception:
# this could catch programming or file system errors # this could catch programming or file system errors
extra_update_fields['result_traceback'] = traceback.format_exc() extra_update_fields['result_traceback'] = traceback.format_exc()
@@ -3069,7 +3071,7 @@ class AWXReceptorJob:
receptor_ctl.simple_command(f"work release {self.unit_id}") receptor_ctl.simple_command(f"work release {self.unit_id}")
# If an error occurred without the job itself failing, it could be a broken instance # If an error occurred without the job itself failing, it could be a broken instance
if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)):
execution_node_health_check(self.task.instance.execution_node) execution_node_health_check.delay(self.task.instance.execution_node)
@property @property
def sign_work(self): def sign_work(self):

View File

@@ -1,12 +1,14 @@
import logging import logging
import yaml import yaml
import time import time
from enum import Enum, unique
from receptorctl.socket_interface import ReceptorControl from receptorctl.socket_interface import ReceptorControl
from awx.main.exceptions import ReceptorNodeNotFound
from django.conf import settings from django.conf import settings
from enum import Enum, unique
logger = logging.getLogger('awx.main.utils.receptor') logger = logging.getLogger('awx.main.utils.receptor')
@@ -63,6 +65,7 @@ def get_conn_type(node_name, receptor_ctl):
for node in all_nodes: for node in all_nodes:
if node.get('NodeID') == node_name: if node.get('NodeID') == node_name:
return ReceptorConnectionType(node.get('ConnType')) return ReceptorConnectionType(node.get('ConnType'))
raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh')
def administrative_workunit_reaper(work_list=None): def administrative_workunit_reaper(work_list=None):
@@ -183,6 +186,9 @@ def worker_info(node_name, work_type='ansible-runner'):
else: else:
error_list.append(details) error_list.append(details)
except ReceptorNodeNotFound as exc:
error_list.append(str(exc))
# If we have a connection error, missing keys would be trivial consequence of that # If we have a connection error, missing keys would be trivial consequence of that
if not data['errors']: if not data['errors']:
# see tasks.py usage of keys # see tasks.py usage of keys