mirror of
https://github.com/ansible/awx.git
synced 2026-02-28 16:28:43 -03:30
Error handling when node is missing from mesh for jobs and checks
This commit is contained in:
committed by
Shane McDonald
parent
206c85778e
commit
f34c96ecf5
@@ -36,3 +36,7 @@ class PostRunError(Exception):
|
|||||||
self.status = status
|
self.status = status
|
||||||
self.tb = tb
|
self.tb = tb
|
||||||
super(PostRunError, self).__init__(msg)
|
super(PostRunError, self).__init__(msg)
|
||||||
|
|
||||||
|
|
||||||
|
class ReceptorNodeNotFound(RuntimeError):
|
||||||
|
pass
|
||||||
|
|||||||
@@ -85,7 +85,7 @@ from awx.main.models import (
|
|||||||
build_safe_env,
|
build_safe_env,
|
||||||
)
|
)
|
||||||
from awx.main.constants import ACTIVE_STATES
|
from awx.main.constants import ACTIVE_STATES
|
||||||
from awx.main.exceptions import AwxTaskError, PostRunError
|
from awx.main.exceptions import AwxTaskError, PostRunError, ReceptorNodeNotFound
|
||||||
from awx.main.queue import CallbackQueueDispatcher
|
from awx.main.queue import CallbackQueueDispatcher
|
||||||
from awx.main.dispatch.publish import task
|
from awx.main.dispatch.publish import task
|
||||||
from awx.main.dispatch import get_local_queuename, reaper
|
from awx.main.dispatch import get_local_queuename, reaper
|
||||||
@@ -1546,6 +1546,8 @@ class BaseTask(object):
|
|||||||
# ensure failure notification sends even if playbook_on_stats event is not triggered
|
# ensure failure notification sends even if playbook_on_stats event is not triggered
|
||||||
handle_success_and_failure_notifications.apply_async([self.instance.job.id])
|
handle_success_and_failure_notifications.apply_async([self.instance.job.id])
|
||||||
|
|
||||||
|
except ReceptorNodeNotFound as exc:
|
||||||
|
extra_update_fields['job_explanation'] = str(exc)
|
||||||
except Exception:
|
except Exception:
|
||||||
# this could catch programming or file system errors
|
# this could catch programming or file system errors
|
||||||
extra_update_fields['result_traceback'] = traceback.format_exc()
|
extra_update_fields['result_traceback'] = traceback.format_exc()
|
||||||
@@ -3069,7 +3071,7 @@ class AWXReceptorJob:
|
|||||||
receptor_ctl.simple_command(f"work release {self.unit_id}")
|
receptor_ctl.simple_command(f"work release {self.unit_id}")
|
||||||
# If an error occurred without the job itself failing, it could be a broken instance
|
# If an error occurred without the job itself failing, it could be a broken instance
|
||||||
if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)):
|
if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)):
|
||||||
execution_node_health_check(self.task.instance.execution_node)
|
execution_node_health_check.delay(self.task.instance.execution_node)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sign_work(self):
|
def sign_work(self):
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
import yaml
|
import yaml
|
||||||
import time
|
import time
|
||||||
|
from enum import Enum, unique
|
||||||
|
|
||||||
from receptorctl.socket_interface import ReceptorControl
|
from receptorctl.socket_interface import ReceptorControl
|
||||||
|
|
||||||
|
from awx.main.exceptions import ReceptorNodeNotFound
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from enum import Enum, unique
|
|
||||||
|
|
||||||
logger = logging.getLogger('awx.main.utils.receptor')
|
logger = logging.getLogger('awx.main.utils.receptor')
|
||||||
|
|
||||||
@@ -63,6 +65,7 @@ def get_conn_type(node_name, receptor_ctl):
|
|||||||
for node in all_nodes:
|
for node in all_nodes:
|
||||||
if node.get('NodeID') == node_name:
|
if node.get('NodeID') == node_name:
|
||||||
return ReceptorConnectionType(node.get('ConnType'))
|
return ReceptorConnectionType(node.get('ConnType'))
|
||||||
|
raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh')
|
||||||
|
|
||||||
|
|
||||||
def administrative_workunit_reaper(work_list=None):
|
def administrative_workunit_reaper(work_list=None):
|
||||||
@@ -183,6 +186,9 @@ def worker_info(node_name, work_type='ansible-runner'):
|
|||||||
else:
|
else:
|
||||||
error_list.append(details)
|
error_list.append(details)
|
||||||
|
|
||||||
|
except ReceptorNodeNotFound as exc:
|
||||||
|
error_list.append(str(exc))
|
||||||
|
|
||||||
# If we have a connection error, missing keys would be a trivial consequence of that
|
# If we have a connection error, missing keys would be a trivial consequence of that
|
||||||
if not data['errors']:
|
if not data['errors']:
|
||||||
# see tasks.py usage of keys
|
# see tasks.py usage of keys
|
||||||
|
|||||||
Reference in New Issue
Block a user