Add max concurrent jobs and max forks per ig

The intention of this feature is primarily to provide some notion of max capacity of container groups, but the logic I've left generic. Default is 0, which will be interpereted as no maximum number of jobs or forks. Includes refactor of variable and method names for clarity. instances_by_hostname is an internal attribute of TaskManagerInstances. Clarify when we are expecting the actual TaskManagerInstances object. Unify how we process running tasks and consume capacity. This has the effect that we do less expensive work in after_lock_init and have 1 less loop over all the running tasks. Previously we looped for both building the dependency graph as well as for calculating the starting capacity of all the instances and instance groups. Now we acheive both tasks in the same loop. Because of how this changes the somewhat subtle "do-si-do" of how to initialize the Task Manager models, introduce a wrapper class that tries to take some of that burden off of other areas where we re-use this like in the serializer and the metrics. Also use this wrapper class to handle nicities of how to track capacity consumption on instances and instance groups. Add tests for max_forks and max_concurrent_jobs Fixup tests that use TaskManagerModels to accomodate changes. assign ig before call to consume capacity if we don't do it in that order, then we don't correctly account for the container group jobs we are starting in the middle of the task manager run
2026-06-04 06:27:58 -02:30 · 2022-10-23 23:20:41 -04:00
parent 65c3db8cb8
commit 86856f242a
11 changed files with 490 additions and 185 deletions
--- a/awx/api/serializers.py
+++ b/awx/api/serializers.py
@@ -113,7 +113,7 @@ from awx.main.utils import (
 )
 from awx.main.utils.filters import SmartFilter
 from awx.main.utils.named_url_graph import reset_counters
-from awx.main.scheduler.task_manager_models import TaskManagerInstanceGroups, TaskManagerInstances
+from awx.main.scheduler.task_manager_models import TaskManagerModels
 from awx.main.redact import UriCleaner, REPLACE_STR

 from awx.main.validators import vars_validate_or_raise
@@ -5071,6 +5071,22 @@ class InstanceGroupSerializer(BaseSerializer):
        label=_('Policy Instance Minimum'),
        help_text=_("Static minimum number of Instances that will be automatically assign to " "this group when new instances come online."),
    )
+    max_concurrent_jobs = serializers.IntegerField(
+        default=0,
+        min_value=0,
+        required=False,
+        initial=0,
+        label=_('Max Concurrent Jobs'),
+        help_text=_("Maximum number of concurrent jobs to run on a group. When set to zero, no maximum is enforced."),
+    )
+    max_forks = serializers.IntegerField(
+        default=0,
+        min_value=0,
+        required=False,
+        initial=0,
+        label=_('Max Forks'),
+        help_text=_("Maximum number of forks to execute concurrently on a group. When set to zero, no maximum is enforced."),
+    )
    policy_instance_list = serializers.ListField(
        child=serializers.CharField(),
        required=False,
@@ -5092,6 +5108,8 @@ class InstanceGroupSerializer(BaseSerializer):
            "consumed_capacity",
            "percent_capacity_remaining",
            "jobs_running",
+            "max_concurrent_jobs",
+            "max_forks",
            "jobs_total",
            "instances",
            "is_container_group",
@@ -5173,14 +5191,15 @@ class InstanceGroupSerializer(BaseSerializer):
        # Store capacity values (globally computed) in the context
        if 'task_manager_igs' not in self.context:
            instance_groups_queryset = None
-            jobs_qs = UnifiedJob.objects.filter(status__in=('running', 'waiting'))
            if self.parent:  # Is ListView:
                instance_groups_queryset = self.parent.instance

-            instances = TaskManagerInstances(jobs_qs)
-            instance_groups = TaskManagerInstanceGroups(instances_by_hostname=instances, instance_groups_queryset=instance_groups_queryset)
+            tm_models = TaskManagerModels.init_with_consumed_capacity(
+                instance_fields=['uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'enabled'],
+                instance_groups_queryset=instance_groups_queryset,
+            )

-            self.context['task_manager_igs'] = instance_groups
+            self.context['task_manager_igs'] = tm_models.instance_groups
        return self.context['task_manager_igs']

    def get_consumed_capacity(self, obj):