Handle receptorctl advertisements for hop nodes

Count these advertisements towards the hop nodes' heartbeat. Also, leave off the link to the health check endpoint from hop node Instances.
2026-02-26 15:36:04 -03:30 · 2022-01-24 15:39:54 -05:00
parent 56f8f8d3f4
commit 334c33ca07
3 changed files with 12 additions and 9 deletions
--- a/awx/api/serializers.py
+++ b/awx/api/serializers.py
@@ -4833,7 +4833,8 @@ class InstanceSerializer(BaseSerializer):
        res['jobs'] = self.reverse('api:instance_unified_jobs_list', kwargs={'pk': obj.pk})
        res['instance_groups'] = self.reverse('api:instance_instance_groups_list', kwargs={'pk': obj.pk})
        if self.context['request'].user.is_superuser or self.context['request'].user.is_system_auditor:
-            res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
+            if obj.node_type != 'hop':
+                res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
        return res

    def get_consumed_capacity(self, obj):
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -195,7 +195,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
        if ref_time is None:
            ref_time = now()
        grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
-        if self.node_type == 'execution':
+        if self.node_type in ('execution', 'hop'):
            grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
        return self.last_seen < ref_time - timedelta(seconds=grace_period)

--- a/awx/main/tasks/system.py
+++ b/awx/main/tasks/system.py
@@ -436,14 +436,11 @@ def inspect_execution_nodes(instance_list):
        workers = mesh_status['Advertisements']
        for ad in workers:
            hostname = ad['NodeID']
-            if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
-                continue
-
            changed = False
            if hostname in node_lookup:
                instance = node_lookup[hostname]
            else:
-                logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}")
+                logger.warn(f"Unrecognized node advertising on mesh: {hostname}")
                continue

            was_lost = instance.is_lost(ref_time=nowtime)
@@ -454,6 +451,10 @@ def inspect_execution_nodes(instance_list):
            instance.last_seen = last_seen
            instance.save(update_fields=['last_seen'])

+            # Make sure that hop nodes don't fall through and have the execution_node_health_check task applied
+            if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
+                continue
+
            if changed:
                execution_node_health_check.apply_async([hostname])
            elif was_lost:
@@ -482,7 +483,6 @@ def cluster_node_heartbeat():
    for inst in instance_list:
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
-            instance_list.remove(inst)
            break
    else:
        (changed, this_inst) = Instance.objects.get_or_register()
@@ -506,7 +506,9 @@ def cluster_node_heartbeat():
        raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
    # IFF any node has a greater version than we do, then we'll shutdown services
    for other_inst in instance_list:
-        if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution':
+        if other_inst.node_type in ('execution', 'hop'):
+            continue
+        if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
            continue
        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error(
@@ -530,7 +532,7 @@ def cluster_node_heartbeat():
            #  * It was set to 0 by another tower node running this method
            #  * It was set to 0 by this node, but auto deprovisioning is off
            #
-            # If auto deprovisining is on, don't bother setting the capacity to 0
+            # If auto deprovisioning is on, don't bother setting the capacity to 0
            # since we will delete the node anyway.
            if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES:
                other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))