Handle receptorctl advertisements for hop nodes

Count each such advertisement towards the hop node's heartbeat.  Also,
leave off the link to the health check endpoint from hop node Instances.
This commit is contained in:
Jeff Bradberry 2022-01-24 15:39:54 -05:00
parent 56f8f8d3f4
commit 334c33ca07
3 changed files with 12 additions and 9 deletions

View File

@ -4833,7 +4833,8 @@ class InstanceSerializer(BaseSerializer):
res['jobs'] = self.reverse('api:instance_unified_jobs_list', kwargs={'pk': obj.pk})
res['instance_groups'] = self.reverse('api:instance_instance_groups_list', kwargs={'pk': obj.pk})
if self.context['request'].user.is_superuser or self.context['request'].user.is_system_auditor:
res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
if obj.node_type != 'hop':
res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
return res
def get_consumed_capacity(self, obj):

View File

@ -195,7 +195,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
if ref_time is None:
ref_time = now()
grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
if self.node_type == 'execution':
if self.node_type in ('execution', 'hop'):
grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
return self.last_seen < ref_time - timedelta(seconds=grace_period)

View File

@ -436,14 +436,11 @@ def inspect_execution_nodes(instance_list):
workers = mesh_status['Advertisements']
for ad in workers:
hostname = ad['NodeID']
if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
continue
changed = False
if hostname in node_lookup:
instance = node_lookup[hostname]
else:
logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}")
logger.warn(f"Unrecognized node advertising on mesh: {hostname}")
continue
was_lost = instance.is_lost(ref_time=nowtime)
@ -454,6 +451,10 @@ def inspect_execution_nodes(instance_list):
instance.last_seen = last_seen
instance.save(update_fields=['last_seen'])
# Make sure that hop nodes don't fall through and have the execution_node_health_check task applied
if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
continue
if changed:
execution_node_health_check.apply_async([hostname])
elif was_lost:
@ -482,7 +483,6 @@ def cluster_node_heartbeat():
for inst in instance_list:
if inst.hostname == settings.CLUSTER_HOST_ID:
this_inst = inst
instance_list.remove(inst)
break
else:
(changed, this_inst) = Instance.objects.get_or_register()
@ -506,7 +506,9 @@ def cluster_node_heartbeat():
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
# IFF any node has a greater version than we do, then we'll shutdown services
for other_inst in instance_list:
if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution':
if other_inst.node_type in ('execution', 'hop'):
continue
if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
continue
if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
logger.error(
@ -530,7 +532,7 @@ def cluster_node_heartbeat():
# * It was set to 0 by another tower node running this method
# * It was set to 0 by this node, but auto deprovisioning is off
#
# If auto deprovisining is on, don't bother setting the capacity to 0
# If auto deprovisioning is on, don't bother setting the capacity to 0
# since we will delete the node anyway.
if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES:
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))