Make sure that the health checks handle the state transitions properly

- nodes in the Provisioning, Provisioning Fail, Deprovisioning,
  and Deprovisioning Fail states should bypass health checks and should
  never be transitioned by the existing machinery
- nodes with states Unavailable and Installed can transition to Ready
  if they check out as healthy
- nodes in the Ready state should transition to Unavailable if they
  fail a check (see the sketch after this list)
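
The intended behavior, summarized as a rough sketch (illustrative only, not
code from this commit; the lowercase state values and the helper name are
assumptions, while the rule itself comes from the list above):

    # not an existing helper -- just the transition rule in one place
    HEALTH_CHECKABLE = {'ready', 'unavailable', 'installed'}

    def next_node_state(current, healthy):
        """What a health check is allowed to do to node_state."""
        if current not in HEALTH_CHECKABLE:
            # provisioning / provision-fail / deprovisioning / deprovision-fail:
            # skip the check entirely and never touch the state
            return current
        return 'ready' if healthy else 'unavailable'
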
commit 3bcd539b3d
parent 81e68cb9bf
Jeff Bradberry, 2022-07-27 17:17:13 -04:00

3 changed files with 18 additions and 8 deletions

@@ -441,6 +441,7 @@ class InstanceHealthCheck(GenericAPIView):
     def post(self, request, *args, **kwargs):
         obj = self.get_object()
+        # Note: hop nodes are already excluded by the get_queryset method
         if obj.node_type == 'execution':
             from awx.main.tasks.system import execution_node_health_check

@@ -242,15 +242,18 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         return self.last_seen < ref_time - timedelta(seconds=grace_period)

     def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
-        if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (not update_last_seen):
+        if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+            return
+        if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
             return
+        self.node_state = Instance.States.UNAVAILABLE
         self.cpu_capacity = self.mem_capacity = self.capacity = 0
         self.errors = errors
         if update_last_seen:
             self.last_seen = now()
         if perform_save:
-            update_fields = ['capacity', 'cpu_capacity', 'mem_capacity', 'errors']
+            update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
             if update_last_seen:
                 update_fields += ['last_seen']
             self.save(update_fields=update_fields)
@@ -307,6 +310,9 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         if not errors:
             self.refresh_capacity_fields()
             self.errors = ''
+            if self.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+                self.node_state = Instance.States.READY
+                update_fields.append('node_state')
         else:
             self.mark_offline(perform_save=False, errors=errors)
         update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
@@ -325,7 +331,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
             # playbook event data; we should consider this a zero capacity event
             redis.Redis.from_url(settings.BROKER_URL).ping()
         except redis.ConnectionError:
-            errors = _('Failed to connect ot Redis')
+            errors = _('Failed to connect to Redis')
         self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors)

@@ -122,7 +122,7 @@ def inform_cluster_of_shutdown():
             reaper.reap_waiting(this_inst, grace_period=0)
         except Exception:
             logger.exception('failed to reap waiting jobs for {}'.format(this_inst.hostname))
-        logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname))
+        logger.warning('Normal shutdown signal for instance {}, removed self from capacity pool.'.format(this_inst.hostname))
     except Exception:
         logger.exception('Encountered problem with normal shutdown signal.')
@@ -407,6 +407,9 @@ def execution_node_health_check(node):
     if instance.node_type != 'execution':
         raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')
+    if instance.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+        raise RuntimeError(f"Execution node health check ran against node {instance.hostname} in state {instance.node_state}")

     data = worker_info(node)

     prior_capacity = instance.capacity
@@ -463,7 +466,7 @@ def inspect_execution_nodes(instance_list):
             # Only execution nodes should be dealt with by execution_node_health_check
             if instance.node_type == 'hop':
-                if was_lost and (not instance.is_lost(ref_time=nowtime)):
+                if was_lost:
                     logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
                     instance.save_health_data(errors='')
                 continue
@@ -487,7 +490,7 @@
 def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
     logger.debug("Cluster node heartbeat task.")
     nowtime = now()
-    instance_list = list(Instance.objects.all())
+    instance_list = list(Instance.objects.filter(node_state__in=(Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED)))
     this_inst = None
     lost_instances = []
@@ -551,9 +554,9 @@ def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
             try:
                 if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                     deprovision_hostname = other_inst.hostname
-                    other_inst.delete()
+                    other_inst.delete()  # FIXME: what about associated inbound links?
                     logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
-                elif other_inst.capacity != 0 or (not other_inst.errors):
+                elif other_inst.node_state == Instance.States.READY:
                     other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                     logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))