Error handling when node is missing from mesh for jobs and checks

This commit is contained in:
Alan Rominger 2021-10-15 11:23:38 -04:00 committed by Shane McDonald
parent 206c85778e
commit f34c96ecf5
No known key found for this signature in database
GPG Key ID: 6F374AF6E9EB9374
3 changed files with 15 additions and 3 deletions

View File

@ -36,3 +36,7 @@ class PostRunError(Exception):
self.status = status
self.tb = tb
super(PostRunError, self).__init__(msg)
class ReceptorNodeNotFound(RuntimeError):
    """Raised when a named instance is not present in the receptor mesh."""

    pass

View File

@ -85,7 +85,7 @@ from awx.main.models import (
build_safe_env,
)
from awx.main.constants import ACTIVE_STATES
from awx.main.exceptions import AwxTaskError, PostRunError
from awx.main.exceptions import AwxTaskError, PostRunError, ReceptorNodeNotFound
from awx.main.queue import CallbackQueueDispatcher
from awx.main.dispatch.publish import task
from awx.main.dispatch import get_local_queuename, reaper
@ -1546,6 +1546,8 @@ class BaseTask(object):
# ensure failure notification sends even if playbook_on_stats event is not triggered
handle_success_and_failure_notifications.apply_async([self.instance.job.id])
except ReceptorNodeNotFound as exc:
extra_update_fields['job_explanation'] = str(exc)
except Exception:
# this could catch programming or file system errors
extra_update_fields['result_traceback'] = traceback.format_exc()
@ -3069,7 +3071,7 @@ class AWXReceptorJob:
receptor_ctl.simple_command(f"work release {self.unit_id}")
# If an error occurred without the job itself failing, it could be a broken instance
if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)):
execution_node_health_check(self.task.instance.execution_node)
execution_node_health_check.delay(self.task.instance.execution_node)
@property
def sign_work(self):

View File

@ -1,12 +1,14 @@
import logging
import yaml
import time
from enum import Enum, unique
from receptorctl.socket_interface import ReceptorControl
from awx.main.exceptions import ReceptorNodeNotFound
from django.conf import settings
from enum import Enum, unique
logger = logging.getLogger('awx.main.utils.receptor')
@ -63,6 +65,7 @@ def get_conn_type(node_name, receptor_ctl):
for node in all_nodes:
if node.get('NodeID') == node_name:
return ReceptorConnectionType(node.get('ConnType'))
raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh')
def administrative_workunit_reaper(work_list=None):
@ -183,6 +186,9 @@ def worker_info(node_name, work_type='ansible-runner'):
else:
error_list.append(details)
except ReceptorNodeNotFound as exc:
error_list.append(str(exc))
# If we have a connection error, missing keys would be a trivial consequence of that
if not data['errors']:
# see tasks.py usage of keys