add auto-discovered nodes to default IG

* add advisory_lock to avoid IG update race logic * update IG by way of policy_instance_list
2026-04-29 21:55:27 -02:30 · 2021-08-24 17:55:09 -07:00
parent 561fc289fb
commit f317fca9e4
1 changed files with 48 additions and 38 deletions
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -429,47 +429,57 @@ def execution_node_health_check(node):
 def inspect_execution_nodes(instance_list):
-    node_lookup = {}
+    with advisory_lock('inspect_execution_nodes_lock', wait=True):
-    for inst in instance_list:
+        node_lookup = {}
-        if inst.node_type == 'execution':
+        for inst in instance_list:
-            node_lookup[inst.hostname] = inst
+            if inst.node_type == 'execution':
                node_lookup[inst.hostname] = inst
-    ctl = get_receptor_ctl()
+        ctl = get_receptor_ctl()
-    connections = ctl.simple_command('status')['Advertisements']
+        connections = ctl.simple_command('status')['Advertisements']
-    nowtime = now()
+        nowtime = now()
-    for ad in connections:
+        for ad in connections:
-        hostname = ad['NodeID']
+            hostname = ad['NodeID']
-        commands = ad.get('WorkCommands') or []
+            commands = ad.get('WorkCommands') or []
-        if 'ansible-runner' not in commands:
+            if 'ansible-runner' not in commands:
-            continue
+                continue
-        changed = False
+            changed = False
-        if hostname in node_lookup:
+            if hostname in node_lookup:
-            instance = node_lookup[hostname]
+                instance = node_lookup[hostname]
-        else:
+            else:
-            defaults = dict(enabled=False)
+                defaults = dict(enabled=False)
-            (changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution', defaults=defaults)
+                (changed, instance) = Instance.objects.register(hostname=hostname, node_type='execution', defaults=defaults)
-        was_lost = instance.is_lost(ref_time=nowtime)
+                logger.warn("Registered execution node '{}' (marked as disabled by default)".format(hostname))
        last_seen = parse_date(ad['Time'])
-        if instance.last_seen and instance.last_seen >= last_seen:
+                default_ig = InstanceGroup.objects.get(name='default')
-            continue
+                if instance.hostname not in default_ig.policy_instance_list:
-        instance.last_seen = last_seen
+                    default_ig.policy_instance_list += [instance.hostname]
-        instance.save(update_fields=['last_seen'])
+                    default_ig.save()
                    logger.warn("Updated `default` instance group's policy_instance_list to include execution node '{}'".format(hostname))
                else:
                    logger.warn("`default` instance group's policy_instance_list already listed execution node '{}'".format(hostname))
-        if changed:
+            was_lost = instance.is_lost(ref_time=nowtime)
-            logger.warn("Registered execution node '{}'".format(hostname))
+            last_seen = parse_date(ad['Time'])
-            execution_node_health_check.apply_async([hostname])
+
-        elif was_lost:
+            if instance.last_seen and instance.last_seen >= last_seen:
-            # if the instance *was* lost, but has appeared again,
+                continue
-            # attempt to re-establish the initial capacity and version
+            instance.last_seen = last_seen
-            # check
+            instance.save(update_fields=['last_seen'])
-            logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
+
-            execution_node_health_check.apply_async([hostname])
+            if changed:
-        elif instance.capacity == 0:
+                execution_node_health_check.apply_async([hostname])
-            # Periodically re-run the health check of errored nodes, in case someone fixed it
+            elif was_lost:
-            # TODO: perhaps decrease the frequency of these checks
+                # if the instance *was* lost, but has appeared again,
-            logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
+                # attempt to re-establish the initial capacity and version
-            execution_node_health_check.apply_async([hostname])
+                # check
                logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
                execution_node_health_check.apply_async([hostname])
            elif instance.capacity == 0:
                # Periodically re-run the health check of errored nodes, in case someone fixed it
                # TODO: perhaps decrease the frequency of these checks
                logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
                execution_node_health_check.apply_async([hostname])
@task(queue=get_local_queuename)