mirror of
https://github.com/ansible/awx.git
synced 2026-01-12 02:19:58 -03:30
Handle receptorctl advertisements for hop nodes
counting it towards their heartbeat. Also, leave off the link to the health check endpoint from hop node Instances.
This commit is contained in:
parent
56f8f8d3f4
commit
334c33ca07
@ -4833,7 +4833,8 @@ class InstanceSerializer(BaseSerializer):
|
||||
res['jobs'] = self.reverse('api:instance_unified_jobs_list', kwargs={'pk': obj.pk})
|
||||
res['instance_groups'] = self.reverse('api:instance_instance_groups_list', kwargs={'pk': obj.pk})
|
||||
if self.context['request'].user.is_superuser or self.context['request'].user.is_system_auditor:
|
||||
res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
|
||||
if obj.node_type != 'hop':
|
||||
res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
|
||||
return res
|
||||
|
||||
def get_consumed_capacity(self, obj):
|
||||
|
||||
@ -195,7 +195,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
||||
if ref_time is None:
|
||||
ref_time = now()
|
||||
grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
|
||||
if self.node_type == 'execution':
|
||||
if self.node_type in ('execution', 'hop'):
|
||||
grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
|
||||
return self.last_seen < ref_time - timedelta(seconds=grace_period)
|
||||
|
||||
|
||||
@ -436,14 +436,11 @@ def inspect_execution_nodes(instance_list):
|
||||
workers = mesh_status['Advertisements']
|
||||
for ad in workers:
|
||||
hostname = ad['NodeID']
|
||||
if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
|
||||
continue
|
||||
|
||||
changed = False
|
||||
if hostname in node_lookup:
|
||||
instance = node_lookup[hostname]
|
||||
else:
|
||||
logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}")
|
||||
logger.warn(f"Unrecognized node advertising on mesh: {hostname}")
|
||||
continue
|
||||
|
||||
was_lost = instance.is_lost(ref_time=nowtime)
|
||||
@ -454,6 +451,10 @@ def inspect_execution_nodes(instance_list):
|
||||
instance.last_seen = last_seen
|
||||
instance.save(update_fields=['last_seen'])
|
||||
|
||||
# Make sure that hop nodes don't fall through and have the execution_node_health_check task applied
|
||||
if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
|
||||
continue
|
||||
|
||||
if changed:
|
||||
execution_node_health_check.apply_async([hostname])
|
||||
elif was_lost:
|
||||
@ -482,7 +483,6 @@ def cluster_node_heartbeat():
|
||||
for inst in instance_list:
|
||||
if inst.hostname == settings.CLUSTER_HOST_ID:
|
||||
this_inst = inst
|
||||
instance_list.remove(inst)
|
||||
break
|
||||
else:
|
||||
(changed, this_inst) = Instance.objects.get_or_register()
|
||||
@ -506,7 +506,9 @@ def cluster_node_heartbeat():
|
||||
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
|
||||
# IFF any node has a greater version than we do, then we'll shutdown services
|
||||
for other_inst in instance_list:
|
||||
if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution':
|
||||
if other_inst.node_type in ('execution', 'hop'):
|
||||
continue
|
||||
if other_inst.version == "" or other_inst.version.startswith('ansible-runner'):
|
||||
continue
|
||||
if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
|
||||
logger.error(
|
||||
@ -530,7 +532,7 @@ def cluster_node_heartbeat():
|
||||
# * It was set to 0 by another tower node running this method
|
||||
# * It was set to 0 by this node, but auto deprovisioning is off
|
||||
#
|
||||
# If auto deprovisining is on, don't bother setting the capacity to 0
|
||||
# If auto deprovisioning is on, don't bother setting the capacity to 0
|
||||
# since we will delete the node anyway.
|
||||
if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
||||
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user