mirror of
https://github.com/ansible/awx.git
synced 2026-05-15 21:37:42 -02:30
Allow manually running a health check, and make other adjustments to the health check trigger (#11002)
* Full finalize the planned work for health checks of execution nodes * Implementation of instance health_check endpoint * Also do version conditional to node_type * Do not use receptor mesh to check main cluster nodes health * Fix bugs from testing health check of cluster nodes, add doc * Add a few fields to health check serializer missed before * Light refactoring of error field processing * Fix errors clearing error, write more unit tests * Update health check info in docs * Bump migration of health check after rebase * Mark string for translation * Add related health_check link for system auditors too * Handle health_check cluster node timeout, add errors for peer judgement
This commit is contained in:
@@ -177,7 +177,7 @@ def dispatch_startup():
|
||||
def inform_cluster_of_shutdown():
|
||||
try:
|
||||
this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
|
||||
this_inst.mark_offline(update_last_seen=True) # No thank you to new jobs while shut down
|
||||
this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal'))
|
||||
try:
|
||||
reaper.reap(this_inst)
|
||||
except Exception:
|
||||
@@ -408,14 +408,37 @@ def cleanup_execution_environment_images():
|
||||
logger.debug(f"Failed to delete image {image_name}")
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def cluster_node_health_check(node):
|
||||
'''
|
||||
Used for the health check endpoint, refreshes the status of the instance, but must be ran on target node
|
||||
'''
|
||||
if node == '':
|
||||
logger.warn('Local health check incorrectly called with blank string')
|
||||
return
|
||||
elif node != settings.CLUSTER_HOST_ID:
|
||||
logger.warn(f'Local health check for {node} incorrectly sent to {settings.CLUSTER_HOST_ID}')
|
||||
return
|
||||
try:
|
||||
this_inst = Instance.objects.me()
|
||||
except Instance.DoesNotExist:
|
||||
logger.warn(f'Instance record for {node} missing, could not check capacity.')
|
||||
return
|
||||
this_inst.local_health_check()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def execution_node_health_check(node):
|
||||
if node == '':
|
||||
logger.warn('Remote health check incorrectly called with blank string')
|
||||
return
|
||||
try:
|
||||
instance = Instance.objects.get(hostname=node)
|
||||
except Instance.DoesNotExist:
|
||||
logger.warn(f'Instance record for {node} missing, could not check capacity.')
|
||||
return
|
||||
data = worker_info(node)
|
||||
|
||||
data = worker_info(node, work_type='ansible-runner' if instance.node_type == 'execution' else 'local')
|
||||
|
||||
prior_capacity = instance.capacity
|
||||
|
||||
@@ -424,7 +447,7 @@ def execution_node_health_check(node):
|
||||
cpu=data.get('cpu_count', 0),
|
||||
memory=data.get('mem_in_bytes', 0),
|
||||
uuid=data.get('uuid'),
|
||||
has_error=bool(data.get('errors')),
|
||||
errors='\n'.join(data.get('errors', [])),
|
||||
)
|
||||
|
||||
if data['errors']:
|
||||
@@ -436,6 +459,8 @@ def execution_node_health_check(node):
|
||||
else:
|
||||
logger.info('Set capacity of execution node {} to {}, worker info data:\n{}'.format(node, instance.capacity, json.dumps(data, indent=2)))
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def inspect_execution_nodes(instance_list):
|
||||
with advisory_lock('inspect_execution_nodes_lock', wait=False):
|
||||
@@ -488,10 +513,12 @@ def inspect_execution_nodes(instance_list):
|
||||
logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
|
||||
execution_node_health_check.apply_async([hostname])
|
||||
elif instance.capacity == 0:
|
||||
# Periodically re-run the health check of errored nodes, in case someone fixed it
|
||||
# TODO: perhaps decrease the frequency of these checks
|
||||
logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
|
||||
execution_node_health_check.apply_async([hostname])
|
||||
# nodes with proven connection but need remediation run health checks are reduced frequency
|
||||
if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS:
|
||||
# Periodically re-run the health check of errored nodes, in case someone fixed it
|
||||
# TODO: perhaps decrease the frequency of these checks
|
||||
logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
|
||||
execution_node_health_check.apply_async([hostname])
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
@@ -556,7 +583,7 @@ def cluster_node_heartbeat():
|
||||
# If auto deprovisining is on, don't bother setting the capacity to 0
|
||||
# since we will delete the node anyway.
|
||||
if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
||||
other_inst.mark_offline()
|
||||
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
|
||||
logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
|
||||
elif settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
||||
deprovision_hostname = other_inst.hostname
|
||||
@@ -3028,12 +3055,17 @@ class AWXReceptorJob:
|
||||
# We establish a connection to the Receptor socket
|
||||
receptor_ctl = get_receptor_ctl()
|
||||
|
||||
res = None
|
||||
try:
|
||||
return self._run_internal(receptor_ctl)
|
||||
res = self._run_internal(receptor_ctl)
|
||||
return res
|
||||
finally:
|
||||
# Make sure to always release the work unit if we established it
|
||||
if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK:
|
||||
receptor_ctl.simple_command(f"work release {self.unit_id}")
|
||||
# If an error occured without the job itself failing, it could be a broken instance
|
||||
if self.work_type == 'ansible-runner' and res is None or getattr(res, 'rc', None) is None:
|
||||
execution_node_health_check(self.task.instance.execution_node)
|
||||
|
||||
def _run_internal(self, receptor_ctl):
|
||||
# Create a socketpair. Where the left side will be used for writing our payload
|
||||
|
||||
Reference in New Issue
Block a user