mirror of
https://github.com/ansible/awx.git
synced 2026-03-06 11:11:07 -03:30
Make sure that the health checks handle the state transitions properly
- nodes with states Provisioning, Provisioning Fail, Deprovisioning, and Deprovisioning Fail should bypass health checks and should never transition due to the existing machinery
- nodes with states Unavailable and Installed can transition to Ready if they check out as healthy
- nodes in the Ready state should transition to Unavailable if they fail a check
This commit is contained in:
@@ -441,6 +441,7 @@ class InstanceHealthCheck(GenericAPIView):
|
|||||||
def post(self, request, *args, **kwargs):
|
def post(self, request, *args, **kwargs):
|
||||||
obj = self.get_object()
|
obj = self.get_object()
|
||||||
|
|
||||||
|
# Note: hop nodes are already excluded by the get_queryset method
|
||||||
if obj.node_type == 'execution':
|
if obj.node_type == 'execution':
|
||||||
from awx.main.tasks.system import execution_node_health_check
|
from awx.main.tasks.system import execution_node_health_check
|
||||||
|
|
||||||
|
|||||||
@@ -242,15 +242,18 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
|||||||
return self.last_seen < ref_time - timedelta(seconds=grace_period)
|
return self.last_seen < ref_time - timedelta(seconds=grace_period)
|
||||||
|
|
||||||
def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
|
def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
|
||||||
if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (not update_last_seen):
|
if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
|
||||||
return
|
return
|
||||||
|
if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
|
||||||
|
return
|
||||||
|
self.node_state = Instance.States.UNAVAILABLE
|
||||||
self.cpu_capacity = self.mem_capacity = self.capacity = 0
|
self.cpu_capacity = self.mem_capacity = self.capacity = 0
|
||||||
self.errors = errors
|
self.errors = errors
|
||||||
if update_last_seen:
|
if update_last_seen:
|
||||||
self.last_seen = now()
|
self.last_seen = now()
|
||||||
|
|
||||||
if perform_save:
|
if perform_save:
|
||||||
update_fields = ['capacity', 'cpu_capacity', 'mem_capacity', 'errors']
|
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
|
||||||
if update_last_seen:
|
if update_last_seen:
|
||||||
update_fields += ['last_seen']
|
update_fields += ['last_seen']
|
||||||
self.save(update_fields=update_fields)
|
self.save(update_fields=update_fields)
|
||||||
@@ -307,6 +310,9 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
|||||||
if not errors:
|
if not errors:
|
||||||
self.refresh_capacity_fields()
|
self.refresh_capacity_fields()
|
||||||
self.errors = ''
|
self.errors = ''
|
||||||
|
if self.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
|
||||||
|
self.node_state = Instance.States.READY
|
||||||
|
update_fields.append('node_state')
|
||||||
else:
|
else:
|
||||||
self.mark_offline(perform_save=False, errors=errors)
|
self.mark_offline(perform_save=False, errors=errors)
|
||||||
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
|
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
|
||||||
@@ -325,7 +331,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
|||||||
# playbook event data; we should consider this a zero capacity event
|
# playbook event data; we should consider this a zero capacity event
|
||||||
redis.Redis.from_url(settings.BROKER_URL).ping()
|
redis.Redis.from_url(settings.BROKER_URL).ping()
|
||||||
except redis.ConnectionError:
|
except redis.ConnectionError:
|
||||||
errors = _('Failed to connect ot Redis')
|
errors = _('Failed to connect to Redis')
|
||||||
|
|
||||||
self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors)
|
self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors)
|
||||||
|
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ def inform_cluster_of_shutdown():
|
|||||||
reaper.reap_waiting(this_inst, grace_period=0)
|
reaper.reap_waiting(this_inst, grace_period=0)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception('failed to reap waiting jobs for {}'.format(this_inst.hostname))
|
logger.exception('failed to reap waiting jobs for {}'.format(this_inst.hostname))
|
||||||
logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname))
|
logger.warning('Normal shutdown signal for instance {}, removed self from capacity pool.'.format(this_inst.hostname))
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception('Encountered problem with normal shutdown signal.')
|
logger.exception('Encountered problem with normal shutdown signal.')
|
||||||
|
|
||||||
@@ -407,6 +407,9 @@ def execution_node_health_check(node):
|
|||||||
if instance.node_type != 'execution':
|
if instance.node_type != 'execution':
|
||||||
raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')
|
raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')
|
||||||
|
|
||||||
|
if instance.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
|
||||||
|
raise RuntimeError(f"Execution node health check ran against node {instance.hostname} in state {instance.node_state}")
|
||||||
|
|
||||||
data = worker_info(node)
|
data = worker_info(node)
|
||||||
|
|
||||||
prior_capacity = instance.capacity
|
prior_capacity = instance.capacity
|
||||||
@@ -463,7 +466,7 @@ def inspect_execution_nodes(instance_list):
|
|||||||
|
|
||||||
# Only execution nodes should be dealt with by execution_node_health_check
|
# Only execution nodes should be dealt with by execution_node_health_check
|
||||||
if instance.node_type == 'hop':
|
if instance.node_type == 'hop':
|
||||||
if was_lost and (not instance.is_lost(ref_time=nowtime)):
|
if was_lost:
|
||||||
logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
|
logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
|
||||||
instance.save_health_data(errors='')
|
instance.save_health_data(errors='')
|
||||||
continue
|
continue
|
||||||
@@ -487,7 +490,7 @@ def inspect_execution_nodes(instance_list):
|
|||||||
def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
|
def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
|
||||||
logger.debug("Cluster node heartbeat task.")
|
logger.debug("Cluster node heartbeat task.")
|
||||||
nowtime = now()
|
nowtime = now()
|
||||||
instance_list = list(Instance.objects.all())
|
instance_list = list(Instance.objects.filter(node_state__in=(Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED)))
|
||||||
this_inst = None
|
this_inst = None
|
||||||
lost_instances = []
|
lost_instances = []
|
||||||
|
|
||||||
@@ -551,9 +554,9 @@ def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
|
|||||||
try:
|
try:
|
||||||
if settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
if settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
||||||
deprovision_hostname = other_inst.hostname
|
deprovision_hostname = other_inst.hostname
|
||||||
other_inst.delete()
|
other_inst.delete() # FIXME: what about associated inbound links?
|
||||||
logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
|
logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
|
||||||
elif other_inst.capacity != 0 or (not other_inst.errors):
|
elif other_inst.node_state == Instance.States.READY:
|
||||||
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
|
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
|
||||||
logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
|
logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user