Make sure that the health checks handle the state transitions properly

- nodes in the Provisioning, Provisioning Fail, Deprovisioning,
  and Deprovisioning Fail states should bypass health checks and should
  never be transitioned by the existing machinery
- nodes with states Unavailable and Installed can transition to Ready
  if they check out as healthy
- nodes in the Ready state should transition to Unavailable if they
  fail a check (see the sketch after this list)
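
The intended behavior, summarized as a rough sketch (illustrative only, not
code from this commit; the lowercase state values and the helper name are
assumptions, while the rule itself comes from the list above):

    # not an existing helper -- just the transition rule in one place
    HEALTH_CHECKABLE = {'ready', 'unavailable', 'installed'}

    def next_node_state(current, healthy):
        """What a health check is allowed to do to node_state."""
        if current not in HEALTH_CHECKABLE:
            # provisioning / provision-fail / deprovisioning / deprovision-fail:
            # skip the check entirely and never touch the state
            return current
        return 'ready' if healthy else 'unavailable'
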
commit 3bcd539b3d
parent 81e68cb9bf
Jeff Bradberry, 2022-07-27 17:17:13 -04:00

3 changed files with 18 additions and 8 deletions

@@ -441,6 +441,7 @@ class InstanceHealthCheck(GenericAPIView):
     def post(self, request, *args, **kwargs):
         obj = self.get_object()
+        # Note: hop nodes are already excluded by the get_queryset method
         if obj.node_type == 'execution':
             from awx.main.tasks.system import execution_node_health_check

@@ -242,15 +242,18 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         return self.last_seen < ref_time - timedelta(seconds=grace_period)

     def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
-        if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (not update_last_seen):
+        if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+            return
+        if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
             return
+        self.node_state = Instance.States.UNAVAILABLE
         self.cpu_capacity = self.mem_capacity = self.capacity = 0
         self.errors = errors
         if update_last_seen:
             self.last_seen = now()
         if perform_save:
-            update_fields = ['capacity', 'cpu_capacity', 'mem_capacity', 'errors']
+            update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
             if update_last_seen:
                 update_fields += ['last_seen']
             self.save(update_fields=update_fields)
@@ -307,6 +310,9 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         if not errors:
             self.refresh_capacity_fields()
             self.errors = ''
+            if self.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+                self.node_state = Instance.States.READY
+                update_fields.append('node_state')
         else:
             self.mark_offline(perform_save=False, errors=errors)
         update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
@@ -325,7 +331,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
             # playbook event data; we should consider this a zero capacity event
             redis.Redis.from_url(settings.BROKER_URL).ping()
         except redis.ConnectionError:
-            errors = _('Failed to connect ot Redis')
+            errors = _('Failed to connect to Redis')
         self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors)

@@ -122,7 +122,7 @@ def inform_cluster_of_shutdown():
             reaper.reap_waiting(this_inst, grace_period=0)
         except Exception:
             logger.exception('failed to reap waiting jobs for {}'.format(this_inst.hostname))
-        logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname))
+        logger.warning('Normal shutdown signal for instance {}, removed self from capacity pool.'.format(this_inst.hostname))
     except Exception:
         logger.exception('Encountered problem with normal shutdown signal.')
@@ -407,6 +407,9 @@ def execution_node_health_check(node):
     if instance.node_type != 'execution':
         raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')
+    if instance.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+        raise RuntimeError(f"Execution node health check ran against node {instance.hostname} in state {instance.node_state}")

     data = worker_info(node)

     prior_capacity = instance.capacity
@@ -463,7 +466,7 @@ def inspect_execution_nodes(instance_list):
             # Only execution nodes should be dealt with by execution_node_health_check
             if instance.node_type == 'hop':
-                if was_lost and (not instance.is_lost(ref_time=nowtime)):
+                if was_lost:
                     logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
                     instance.save_health_data(errors='')
                 continue
@@ -487,7 +490,7 @@
 def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
     logger.debug("Cluster node heartbeat task.")
     nowtime = now()
-    instance_list = list(Instance.objects.all())
+    instance_list = list(Instance.objects.filter(node_state__in=(Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED)))
     this_inst = None
     lost_instances = []
@@ -551,9 +554,9 @@ def cluster_node_heartbeat(dispatch_time=None, worker_tasks=None):
             try:
                 if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                     deprovision_hostname = other_inst.hostname
-                    other_inst.delete()
+                    other_inst.delete()  # FIXME: what about associated inbound links?
                     logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
-                elif other_inst.capacity != 0 or (not other_inst.errors):
+                elif other_inst.node_state == Instance.States.READY:
                     other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                     logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))