Fill in errors for hop nodes when Last Seen is out of date, and clear them when not (#11714)

* Process unresponsive and newly responsive hop nodes * Use more natural way to zero hop node capacity, add test * Use warning as opposed to warn for log messages
2026-03-01 16:58:46 -03:30 · 2022-03-09 13:21:32 -05:00
parent 4c9d028a35
commit 99bbc347ec
3 changed files with 38 additions and 22 deletions
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -233,13 +233,19 @@ class Instance(HasPolicyEditsMixin, BaseModel):

    def refresh_capacity_fields(self):
        """Update derived capacity fields from cpu and memory (no save)"""
-        self.cpu_capacity = get_cpu_effective_capacity(self.cpu)
-        self.mem_capacity = get_mem_effective_capacity(self.memory)
+        if self.node_type == 'hop':
+            self.cpu_capacity = 0
+            self.mem_capacity = 0  # formula has a non-zero offset, so we make sure it is 0 for hop nodes
+        else:
+            self.cpu_capacity = get_cpu_effective_capacity(self.cpu)
+            self.mem_capacity = get_mem_effective_capacity(self.memory)
        self.set_capacity_value()

-    def save_health_data(self, version, cpu, memory, uuid=None, update_last_seen=False, errors=''):
-        self.last_health_check = now()
-        update_fields = ['last_health_check']
+    def save_health_data(self, version=None, cpu=0, memory=0, uuid=None, update_last_seen=False, errors=''):
+        update_fields = ['errors']
+        if self.node_type != 'hop':
+            self.last_health_check = now()
+            update_fields.append('last_health_check')

        if update_last_seen:
            self.last_seen = self.last_health_check
@@ -251,7 +257,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
            self.uuid = uuid
            update_fields.append('uuid')

-        if self.version != version:
+        if version is not None and self.version != version:
            self.version = version
            update_fields.append('version')

@@ -270,7 +276,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
            self.errors = ''
        else:
            self.mark_offline(perform_save=False, errors=errors)
-        update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity', 'errors'])
+        update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])

        # disabling activity stream will avoid extra queries, which is important for heatbeat actions
        from awx.main.signals import disable_activity_stream
--- a/awx/main/tasks/system.py
+++ b/awx/main/tasks/system.py
@@ -436,7 +436,6 @@ def inspect_execution_nodes(instance_list):
        workers = mesh_status['Advertisements']
        for ad in workers:
            hostname = ad['NodeID']
-            changed = False

            if hostname in node_lookup:
                instance = node_lookup[hostname]
@@ -458,11 +457,11 @@ def inspect_execution_nodes(instance_list):

            # Only execution nodes should be dealt with by execution_node_health_check
            if instance.node_type == 'hop':
+                logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
+                instance.save_health_data(errors='')
                continue

-            if changed:
-                execution_node_health_check.apply_async([hostname])
-            elif was_lost:
+            if was_lost:
                # if the instance *was* lost, but has appeared again,
                # attempt to re-establish the initial capacity and version
                # check
@@ -534,20 +533,14 @@ def cluster_node_heartbeat():
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
-            # Capacity could already be 0 because:
-            #  * It's a new node and it never had a heartbeat
-            #  * It was set to 0 by another tower node running this method
-            #  * It was set to 0 by this node, but auto deprovisioning is off
-            #
-            # If auto deprovisioning is on, don't bother setting the capacity to 0
-            # since we will delete the node anyway.
-            if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES:
-                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
-                logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
-            elif settings.AWX_AUTO_DEPROVISION_INSTANCES:
+            if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                deprovision_hostname = other_inst.hostname
                other_inst.delete()
                logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
+            elif other_inst.capacity != 0 or (not other_inst.errors):
+                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
+                logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
+
        except DatabaseError as e:
            if 'did not affect any rows' in str(e):
                logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
--- a/awx/main/tests/functional/test_instances.py
+++ b/awx/main/tests/functional/test_instances.py
@@ -363,6 +363,23 @@ def test_health_check_oh_no():
    assert instance.errors == 'This it not a real instance!'


+@pytest.mark.django_db
+def test_errors_field_alone():
+    instance = Instance.objects.create(hostname='foo-1', enabled=True, node_type='hop')
+
+    instance.save_health_data(errors='Node went missing!')
+    assert instance.errors == 'Node went missing!'
+    assert instance.capacity == 0
+    assert instance.memory == instance.mem_capacity == 0
+    assert instance.cpu == instance.cpu_capacity == 0
+
+    instance.save_health_data(errors='')
+    assert not instance.errors
+    assert instance.capacity == 0
+    assert instance.memory == instance.mem_capacity == 0
+    assert instance.cpu == instance.cpu_capacity == 0
+
+
@pytest.mark.django_db
 class TestInstanceGroupOrdering:
    def test_ad_hoc_instance_groups(self, instance_group_factory, inventory, default_instance_group):