Control the visibility and use of hop node Instances

- the list, detail, and health check API views should not include them - the Instance-InstanceGroup association views should not allow them to be changed - the ping view excludes them - list_instances management command excludes them - Instance.set_capacity_value sets hop nodes to 0 capacity - TaskManager will exclude them from the nodes available for job execution - TaskManager.reap_jobs_from_orphaned_instances will consider hop nodes to be an orphaned instance - The apply_cluster_membership_policies task will not manipulate hop nodes - get_broadcast_hosts will ignore hop nodes - active_count also will ignore hop nodes
2026-05-12 20:07:37 -02:30 · 2021-12-16 15:53:15 -05:00
parent c8f1e714e1
commit f340f491dc
9 changed files with 25 additions and 16 deletions
--- a/awx/api/views/init.py
+++ b/awx/api/views/init.py
@@ -364,11 +364,14 @@ class InstanceList(ListAPIView):
    serializer_class = serializers.InstanceSerializer
    search_fields = ('hostname',)
    def get_queryset(self):
        return super().get_queryset().exclude(node_type='hop')
 class InstanceDetail(RetrieveUpdateAPIView):
    name = _("Instance Detail")
-    model = models.Instance
+    queryset = models.Instance.objects.exclude(node_type='hop')
    serializer_class = serializers.InstanceSerializer
    def update(self, request, *args, **kwargs):
@@ -406,13 +409,15 @@ class InstanceInstanceGroupsList(InstanceGroupMembershipMixin, SubListCreateAtta
    def is_valid_relation(self, parent, sub, created=False):
        if parent.node_type == 'control':
            return {'msg': _(f"Cannot change instance group membership of control-only node: {parent.hostname}.")}
        if parent.node_type == 'hop':
            return {'msg': _(f"Cannot change instance group membership of hop node: {parent.hostname}.")}
        return None
 class InstanceHealthCheck(GenericAPIView):
    name = _('Instance Health Check')
-    model = models.Instance
+    queryset = models.Instance.objects.exclude(node_type='hop')
    serializer_class = serializers.InstanceHealthCheckSerializer
    permission_classes = (IsSystemAdminOrAuditor,)
@@ -503,6 +508,8 @@ class InstanceGroupInstanceList(InstanceGroupMembershipMixin, SubListAttachDetac
    def is_valid_relation(self, parent, sub, created=False):
        if sub.node_type == 'control':
            return {'msg': _(f"Cannot change instance group membership of control-only node: {sub.hostname}.")}
        if sub.node_type == 'hop':
            return {'msg': _(f"Cannot change instance group membership of hop node: {sub.hostname}.")}
        return None
--- a/awx/api/views/root.py
+++ b/awx/api/views/root.py
@@ -149,7 +149,7 @@ class ApiV2PingView(APIView):
        response = {'ha': is_ha_environment(), 'version': get_awx_version(), 'active_node': settings.CLUSTER_HOST_ID, 'install_uuid': settings.INSTALL_UUID}
        response['instances'] = []
-        for instance in Instance.objects.all():
+        for instance in Instance.objects.exclude(node_type='hop'):
            response['instances'].append(
                dict(
                    node=instance.hostname,
--- a/awx/main/management/commands/list_instances.py
+++ b/awx/main/management/commands/list_instances.py
@@ -13,7 +13,7 @@ class Ungrouped(object):
    @property
    def instances(self):
-        return Instance.objects.filter(rampart_groups__isnull=True)
+        return Instance.objects.filter(rampart_groups__isnull=True).exclude(node_type='hop')
    @property
    def capacity(self):
--- a/awx/main/managers.py
+++ b/awx/main/managers.py
@@ -188,7 +188,7 @@ class InstanceManager(models.Manager):
    def active_count(self):
        """Return count of active Tower nodes for licensing."""
-        return self.all().count()
+        return self.exclude(node_type='hop').count()
 class InstanceGroupManager(models.Manager):
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -29,7 +29,7 @@ from awx.main.models.mixins import RelatedJobsMixin
 # ansible-runner
 from ansible_runner.utils.capacity import get_cpu_count, get_mem_in_bytes
-__all__ = ('Instance', 'InstanceGroup', 'TowerScheduleState')
+__all__ = ('Instance', 'InstanceGroup', 'InstanceLink', 'TowerScheduleState')
 logger = logging.getLogger('awx.main.models.ha')
@@ -215,7 +215,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
    def set_capacity_value(self):
        """Sets capacity according to capacity adjustment rule (no save)"""
-        if self.enabled:
+        if self.enabled and self.node_type != 'hop':
            lower_cap = min(self.mem_capacity, self.cpu_capacity)
            higher_cap = max(self.mem_capacity, self.cpu_capacity)
            self.capacity = lower_cap + (higher_cap - lower_cap) * self.capacity_adjustment
@@ -320,7 +320,7 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin):
    @property
    def capacity(self):
-        return sum([inst.capacity for inst in self.instances.all()])
+        return sum(inst.capacity for inst in self.instances.all())
    @property
    def jobs_running(self):
--- a/awx/main/scheduler/task_manager.py
+++ b/awx/main/scheduler/task_manager.py
@@ -13,7 +13,6 @@ from django.db import transaction, connection
 from django.utils.translation import ugettext_lazy as _, gettext_noop
 from django.utils.timezone import now as tz_now
 from django.conf import settings
 from django.db.models import Q
 # AWX
 from awx.main.dispatch.reaper import reap_job
@@ -69,7 +68,7 @@ class TaskManager:
        """
        Init AFTER we know this instance of the task manager will run because the lock is acquired.
        """
-        instances = Instance.objects.filter(~Q(hostname=None), enabled=True)
+        instances = Instance.objects.filter(hostname__isnull=False, enabled=True).exclude(node_type='hop')
        self.real_instances = {i.hostname: i for i in instances}
        instances_partial = [
@@ -484,7 +483,7 @@ class TaskManager:
        return created_dependencies
    def process_pending_tasks(self, pending_tasks):
-        running_workflow_templates = set([wf.unified_job_template_id for wf in self.get_running_workflow_jobs()])
+        running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
        tasks_to_update_job_explanation = []
        for task in pending_tasks:
            if self.start_task_limit <= 0:
@@ -593,7 +592,7 @@ class TaskManager:
        # elsewhere
        for j in UnifiedJob.objects.filter(
            status__in=['pending', 'waiting', 'running'],
-        ).exclude(execution_node__in=Instance.objects.values_list('hostname', flat=True)):
+        ).exclude(execution_node__in=Instance.objects.exclude(node_type='hop').values_list('hostname', flat=True)):
            if j.execution_node and not j.is_container_group_task:
                logger.error(f'{j.execution_node} is not a registered instance; reaping {j.log_format}')
                reap_job(j, 'failed')
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -202,7 +202,8 @@ def apply_cluster_membership_policies():
            to_log = logger.debug
        to_log('Waited {} seconds to obtain lock name: cluster_policy_lock'.format(lock_time))
        started_compute = time.time()
-        all_instances = list(Instance.objects.order_by('id'))
+        # Hop nodes should never get assigned to an InstanceGroup.
        all_instances = list(Instance.objects.exclude(node_type='hop').order_by('id'))
        all_groups = list(InstanceGroup.objects.prefetch_related('instances'))
        total_instances = len(all_instances)
@@ -253,7 +254,7 @@ def apply_cluster_membership_policies():
        # Finally, process instance policy percentages
        for g in sorted(actual_groups, key=lambda x: len(x.instances)):
            exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
-            candidate_pool_ct = len([i for i in actual_instances if i.obj.node_type != exclude_type])
+            candidate_pool_ct = sum(1 for i in actual_instances if i.obj.node_type != exclude_type)
            if not candidate_pool_ct:
                continue
            policy_per_added = []
--- a/awx/main/tests/functional/test_instances.py
+++ b/awx/main/tests/functional/test_instances.py
@@ -84,8 +84,9 @@ def test_instance_dup(org_admin, organization, project, instance_factory, instan
    list_response2 = get(reverse('api:instance_list'), user=org_admin)
    api_num_instances_oa = list(list_response2.data.items())[0][1]
-    assert actual_num_instances == api_num_instances_auditor
+    assert api_num_instances_auditor == actual_num_instances
-    # Note: The org_admin will not see the default 'tower' node (instance fixture) because it is not in its group, as expected
+    # Note: The org_admin will not see the default 'tower' node
    # (instance fixture) because it is not in its group, as expected
    assert api_num_instances_oa == (actual_num_instances - 1)
--- a/awx/main/wsbroadcast.py
+++ b/awx/main/wsbroadcast.py
@@ -35,6 +35,7 @@ def get_broadcast_hosts():
    instances = (
        Instance.objects.exclude(hostname=Instance.objects.me().hostname)
        .exclude(node_type='execution')
        .exclude(node_type='hop')
        .order_by('hostname')
        .values('hostname', 'ip_address')
        .distinct()