diff --git a/awx/api/serializers.py b/awx/api/serializers.py index e38efff349..31d1103786 100644 --- a/awx/api/serializers.py +++ b/awx/api/serializers.py @@ -4833,7 +4833,8 @@ class InstanceSerializer(BaseSerializer): res['jobs'] = self.reverse('api:instance_unified_jobs_list', kwargs={'pk': obj.pk}) res['instance_groups'] = self.reverse('api:instance_instance_groups_list', kwargs={'pk': obj.pk}) if self.context['request'].user.is_superuser or self.context['request'].user.is_system_auditor: - res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk}) + if obj.node_type != 'hop': + res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk}) return res def get_consumed_capacity(self, obj): diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 653b4f1814..03b6dbfe79 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -195,7 +195,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): if ref_time is None: ref_time = now() grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2 - if self.node_type == 'execution': + if self.node_type in ('execution', 'hop'): grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD return self.last_seen < ref_time - timedelta(seconds=grace_period) diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index e596668f89..c14ac8d7d9 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -436,14 +436,11 @@ def inspect_execution_nodes(instance_list): workers = mesh_status['Advertisements'] for ad in workers: hostname = ad['NodeID'] - if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []): - continue - changed = False if hostname in node_lookup: instance = node_lookup[hostname] else: - logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}") + logger.warn(f"Unrecognized node advertising on mesh: {hostname}") continue was_lost = instance.is_lost(ref_time=nowtime) @@ -454,6 +451,10 @@ def inspect_execution_nodes(instance_list): instance.last_seen = last_seen instance.save(update_fields=['last_seen']) + # Make sure that hop nodes don't fall through and have the execution_node_health_check task applied + if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []): + continue + if changed: execution_node_health_check.apply_async([hostname]) elif was_lost: @@ -482,7 +483,6 @@ def cluster_node_heartbeat(): for inst in instance_list: if inst.hostname == settings.CLUSTER_HOST_ID: this_inst = inst - instance_list.remove(inst) break else: (changed, this_inst) = Instance.objects.get_or_register() @@ -506,7 +506,9 @@ def cluster_node_heartbeat(): raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) # IFF any node has a greater version than we do, then we'll shutdown services for other_inst in instance_list: - if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution': + if other_inst.node_type in ('execution', 'hop'): + continue + if other_inst.version == "" or other_inst.version.startswith('ansible-runner'): continue if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG: logger.error( @@ -530,7 +532,7 @@ def cluster_node_heartbeat(): # * It was set to 0 by another tower node running this method # * It was set to 0 by this node, but auto deprovisioning is off # - # If auto deprovisining is on, don't bother setting the capacity to 0 + # If auto deprovisioning is on, don't bother setting the capacity to 0 # since we will delete the node anyway. if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES: other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))