mirror of
https://github.com/ansible/awx.git
synced 2026-03-01 00:38:45 -03:30
add auto-discovered nodes to default IG
* add advisory_lock to avoid IG update race logic * update IG by way of policy_instance_list
This commit is contained in:
@@ -429,47 +429,57 @@ def execution_node_health_check(node):
|
|||||||
|
|
||||||
|
|
||||||
def inspect_execution_nodes(instance_list):
|
def inspect_execution_nodes(instance_list):
|
||||||
node_lookup = {}
|
with advisory_lock('inspect_execution_nodes_lock', wait=True):
|
||||||
for inst in instance_list:
|
node_lookup = {}
|
||||||
if inst.node_type == 'execution':
|
for inst in instance_list:
|
||||||
node_lookup[inst.hostname] = inst
|
if inst.node_type == 'execution':
|
||||||
|
node_lookup[inst.hostname] = inst
|
||||||
|
|
||||||
ctl = get_receptor_ctl()
|
ctl = get_receptor_ctl()
|
||||||
connections = ctl.simple_command('status')['Advertisements']
|
connections = ctl.simple_command('status')['Advertisements']
|
||||||
nowtime = now()
|
nowtime = now()
|
||||||
for ad in connections:
|
for ad in connections:
|
||||||
hostname = ad['NodeID']
|
hostname = ad['NodeID']
|
||||||
commands = ad.get('WorkCommands') or []
|
commands = ad.get('WorkCommands') or []
|
||||||
if 'ansible-runner' not in commands:
|
if 'ansible-runner' not in commands:
|
||||||
continue
|
continue
|
||||||
changed = False
|
changed = False
|
||||||
if hostname in node_lookup:
|
if hostname in node_lookup:
|
||||||
instance = node_lookup[hostname]
|
instance = node_lookup[hostname]
|
||||||
else:
|
else:
|
||||||
defaults = dict(enabled=False)
|
defaults = dict(enabled=False)
|
||||||
(changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution', defaults=defaults)
|
(changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution', defaults=defaults)
|
||||||
was_lost = instance.is_lost(ref_time=nowtime)
|
logger.warn("Registered execution node '{}' (marked as disabled by default)".format(hostname))
|
||||||
last_seen = parse_date(ad['Time'])
|
|
||||||
|
|
||||||
if instance.last_seen and instance.last_seen >= last_seen:
|
default_ig = InstanceGroup.objects.get(name='default')
|
||||||
continue
|
if instance.hostname not in default_ig.policy_instance_list:
|
||||||
instance.last_seen = last_seen
|
default_ig.policy_instance_list += [instance.hostname]
|
||||||
instance.save(update_fields=['last_seen'])
|
default_ig.save()
|
||||||
|
logger.warn("Updated `default` instance group's policy_instance_list to include execution node '{}'".format(hostname))
|
||||||
|
else:
|
||||||
|
logger.warn("`default` instance group's policy_instance_list already listed execution node '{}'".format(hostname))
|
||||||
|
|
||||||
if changed:
|
was_lost = instance.is_lost(ref_time=nowtime)
|
||||||
logger.warn("Registered execution node '{}'".format(hostname))
|
last_seen = parse_date(ad['Time'])
|
||||||
execution_node_health_check.apply_async([hostname])
|
|
||||||
elif was_lost:
|
if instance.last_seen and instance.last_seen >= last_seen:
|
||||||
# if the instance *was* lost, but has appeared again,
|
continue
|
||||||
# attempt to re-establish the initial capacity and version
|
instance.last_seen = last_seen
|
||||||
# check
|
instance.save(update_fields=['last_seen'])
|
||||||
logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
|
|
||||||
execution_node_health_check.apply_async([hostname])
|
if changed:
|
||||||
elif instance.capacity == 0:
|
execution_node_health_check.apply_async([hostname])
|
||||||
# Periodically re-run the health check of errored nodes, in case someone fixed it
|
elif was_lost:
|
||||||
# TODO: perhaps decrease the frequency of these checks
|
# if the instance *was* lost, but has appeared again,
|
||||||
logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
|
# attempt to re-establish the initial capacity and version
|
||||||
execution_node_health_check.apply_async([hostname])
|
# check
|
||||||
|
logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
|
||||||
|
execution_node_health_check.apply_async([hostname])
|
||||||
|
elif instance.capacity == 0:
|
||||||
|
# Periodically re-run the health check of errored nodes, in case someone fixed it
|
||||||
|
# TODO: perhaps decrease the frequency of these checks
|
||||||
|
logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
|
||||||
|
execution_node_health_check.apply_async([hostname])
|
||||||
|
|
||||||
|
|
||||||
@task(queue=get_local_queuename)
|
@task(queue=get_local_queuename)
|
||||||
|
|||||||
Reference in New Issue
Block a user