Apply capacity algorithm changes

* This also adds fields to the instance view for tracking cpu and memory usage as well as information on what the capacity ranges are * Also adds a flag for enabling/disabling instances which removes them from all queues and has them stop processing new work * The capacity is now based almost exclusively on some value relative to forks * capacity_adjustment allows you to commit an instance to a certain amount of forks, cpu focused or memory focused * Each job run adds a single fork overhead (that's the reasoning behind the +1)
2026-06-24 16:17:51 -02:30 · 2018-01-11 13:33:35 -05:00
parent 6a85fc38dd
commit 70bf78e29f
17 changed files with 248 additions and 76 deletions
--- a/awx/api/serializers.py
+++ b/awx/api/serializers.py
@@ -3977,8 +3977,10 @@ class InstanceSerializer(BaseSerializer):
    class Meta:
        model = Instance
-        fields = ("id", "type", "url", "related", "uuid", "hostname", "created", "modified",
+        read_only_fields = ('uuid', 'hostname', 'version')
-                  "version", "capacity", "consumed_capacity", "percent_capacity_remaining", "jobs_running")
+        fields = ("id", "type", "url", "related", "uuid", "hostname", "created", "modified", 'capacity_adjustment',
                  "version", "capacity", "consumed_capacity", "percent_capacity_remaining", "jobs_running",
                  "cpu", "memory", "cpu_capacity", "mem_capacity", "enabled")
    def get_related(self, obj):
        res = super(InstanceSerializer, self).get_related(obj)
--- a/awx/api/views.py
+++ b/awx/api/views.py
@@ -57,7 +57,7 @@ import pytz
 from wsgiref.util import FileWrapper
 # AWX
-from awx.main.tasks import send_notifications
+from awx.main.tasks import send_notifications, handle_ha_toplogy_changes
 from awx.main.access import get_user_queryset
 from awx.main.ha import is_ha_environment
 from awx.api.authentication import TokenGetAuthentication
@@ -560,7 +560,7 @@ class InstanceList(ListAPIView):
    new_in_320 = True
-class InstanceDetail(RetrieveAPIView):
+class InstanceDetail(RetrieveUpdateAPIView):
    view_name = _("Instance Detail")
    model = Instance
@@ -568,6 +568,20 @@ class InstanceDetail(RetrieveAPIView):
    new_in_320 = True
    def update(self, request, *args, **kwargs):
        r = super(InstanceDetail, self).update(request, *args, **kwargs)
        if status.is_success(r.status_code):
            obj = self.get_object()
            if obj.enabled:
                obj.refresh_capacity()
            else:
                obj.capacity = 0
            obj.save()
            handle_ha_toplogy_changes.apply_async()
            r.data = InstanceSerializer(obj, context=self.get_serializer_context()).to_representation(obj)
        return r
 class InstanceUnifiedJobsList(SubListAPIView):
    view_name = _("Instance Running Jobs")
--- a/awx/main/managers.py
+++ b/awx/main/managers.py
@@ -2,12 +2,9 @@
 # All Rights Reserved.
 import sys
 from datetime import timedelta
 import logging
 from django.db import models
 from django.utils.timezone import now
 from django.db.models import Sum
 from django.conf import settings
 from awx.main.utils.filters import SmartFilter
--- a/awx/main/migrations/0018_v330_instancegroup_policies.py
+++ b/awx/main/migrations/0018_v330_instancegroup_policies.py
@@ -1,31 +0,0 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 from django.db import migrations, models
 import awx.main.fields
 class Migration(migrations.Migration):
    dependencies = [
        ('main', '0017_v330_move_deprecated_stdout'),
    ]
    operations = [
        migrations.AddField(
            model_name='instancegroup',
            name='policy_instance_list',
            field=awx.main.fields.JSONField(default=[], help_text='List of exact-match Instances that will always be automatically assigned to this group',
                                            blank=True),
        ),
        migrations.AddField(
            model_name='instancegroup',
            name='policy_instance_minimum',
            field=models.IntegerField(default=0, help_text='Static minimum number of Instances to automatically assign to this group'),
        ),
        migrations.AddField(
            model_name='instancegroup',
            name='policy_instance_percentage',
            field=models.IntegerField(default=0, help_text='Percentage of Instances to automatically assign to this group'),
        ),
    ]
--- a/awx/main/migrations/0020_v330_instancegroup_policies.py
+++ b/awx/main/migrations/0020_v330_instancegroup_policies.py
@@ -0,0 +1,62 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 from django.db import migrations, models
 from decimal import Decimal
 import awx.main.fields
 class Migration(migrations.Migration):
    dependencies = [
        ('main', '0019_v330_custom_virtualenv'),
    ]
    operations = [
        migrations.AddField(
            model_name='instancegroup',
            name='policy_instance_list',
            field=awx.main.fields.JSONField(default=[], help_text='List of exact-match Instances that will always be automatically assigned to this group',
                                            blank=True),
        ),
        migrations.AddField(
            model_name='instancegroup',
            name='policy_instance_minimum',
            field=models.IntegerField(default=0, help_text='Static minimum number of Instances to automatically assign to this group'),
        ),
        migrations.AddField(
            model_name='instancegroup',
            name='policy_instance_percentage',
            field=models.IntegerField(default=0, help_text='Percentage of Instances to automatically assign to this group'),
        ),
        migrations.AddField(
            model_name='instance',
            name='capacity_adjustment',
            field=models.DecimalField(decimal_places=2, default=Decimal('1.0'), max_digits=3),
        ),
        migrations.AddField(
            model_name='instance',
            name='cpu',
            field=models.IntegerField(default=0, editable=False)
        ),
        migrations.AddField(
            model_name='instance',
            name='memory',
            field=models.BigIntegerField(default=0, editable=False)
        ),
        migrations.AddField(
            model_name='instance',
            name='cpu_capacity',
            field=models.IntegerField(default=0, editable=False)
        ),
        migrations.AddField(
            model_name='instance',
            name='mem_capacity',
            field=models.IntegerField(default=0, editable=False)
        ),
        migrations.AddField(
            model_name='instance',
            name='enabled',
            field=models.BooleanField(default=True)
        )
    ]
--- a/awx/main/models/ad_hoc_commands.py
+++ b/awx/main/models/ad_hoc_commands.py
@@ -184,7 +184,7 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
        # NOTE: We sorta have to assume the host count matches and that forks default to 5
        from awx.main.models.inventory import Host
        count_hosts = Host.objects.filter( enabled=True, inventory__ad_hoc_commands__pk=self.pk).count()
-        return min(count_hosts, 5 if self.forks == 0 else self.forks) * 10
+        return min(count_hosts, 5 if self.forks == 0 else self.forks) + 1
    def copy(self):
        data = {}
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -1,6 +1,8 @@
 # Copyright (c) 2015 Ansible, Inc.
 # All Rights Reserved.
 from decimal import Decimal
 from django.db import models, connection
 from django.db.models.signals import post_save, post_delete
 from django.dispatch import receiver
@@ -10,6 +12,7 @@ from django.utils.timezone import now, timedelta
 from solo.models import SingletonModel
 from awx import __version__ as awx_application_version
 from awx.api.versioning import reverse
 from awx.main.managers import InstanceManager, InstanceGroupManager
 from awx.main.fields import JSONField
@@ -17,6 +20,7 @@ from awx.main.models.inventory import InventoryUpdate
 from awx.main.models.jobs import Job
 from awx.main.models.projects import ProjectUpdate
 from awx.main.models.unified_jobs import UnifiedJob
 from awx.main.utils import get_cpu_capacity, get_mem_capacity, get_system_task_capacity
 __all__ = ('Instance', 'InstanceGroup', 'JobOrigin', 'TowerScheduleState',)
@@ -39,6 +43,30 @@ class Instance(models.Model):
        default=100,
        editable=False,
    )
    capacity_adjustment = models.DecimalField(
        default=Decimal(1.0),
        max_digits=3,
        decimal_places=2,
    )
    enabled = models.BooleanField(
        default=True
    )
    cpu = models.IntegerField(
        default=0,
        editable=False,
    )
    memory = models.BigIntegerField(
        default=0,
        editable=False,
    )
    cpu_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    mem_capacity = models.IntegerField(
        default=0,
        editable=False,
    )
    class Meta:
        app_label = 'main'
@@ -68,6 +96,20 @@ class Instance(models.Model):
        return Instance.objects.filter(rampart_groups__controller__instances=self).exists()
    def refresh_capacity(self):
        cpu = get_cpu_capacity()
        mem = get_mem_capacity()
        self.capacity = get_system_task_capacity(self.capacity_adjustment)
        self.cpu = cpu[0]
        self.memory = mem[0]
        self.cpu_capacity = cpu[1]
        self.mem_capacity = mem[1]
        self.version = awx_application_version
        self.save(update_fields=['capacity', 'version', 'modified', 'cpu',
                                 'memory', 'cpu_capacity', 'mem_capacity'])
 class InstanceGroup(models.Model):
    """A model representing a Queue/Group of AWX Instances."""
    objects = InstanceGroupManager()
--- a/awx/main/models/inventory.py
+++ b/awx/main/models/inventory.py
@@ -1602,7 +1602,7 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin,
    @property
    def task_impact(self):
-        return 50
+        return 1
    # InventoryUpdate credential required
    # Custom and SCM InventoryUpdate credential not required
--- a/awx/main/models/jobs.py
+++ b/awx/main/models/jobs.py
@@ -623,7 +623,7 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
            count_hosts = 1
        else:
            count_hosts = Host.objects.filter(inventory__jobs__pk=self.pk).count()
-        return min(count_hosts, 5 if self.forks == 0 else self.forks) * 10
+        return min(count_hosts, 5 if self.forks == 0 else self.forks) + 1
    @property
    def successful_hosts(self):
@@ -1190,7 +1190,7 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin):
    @property
    def task_impact(self):
-        return 150
+        return 5
    @property
    def preferred_instance_groups(self):
--- a/awx/main/models/projects.py
+++ b/awx/main/models/projects.py
@@ -492,7 +492,7 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin, TaskManage
    @property
    def task_impact(self):
-        return 0 if self.job_type == 'run' else 20
+        return 0 if self.job_type == 'run' else 1
    @property
    def result_stdout(self):
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -2,7 +2,6 @@
 # All Rights Reserved.
 # Python
 import codecs
 from collections import OrderedDict, namedtuple
 import ConfigParser
 import cStringIO
@@ -54,9 +53,8 @@ from awx.main.queue import CallbackQueueDispatcher
 from awx.main.expect import run, isolated_manager
 from awx.main.utils import (get_ansible_version, get_ssh_version, decrypt_field, update_scm_url,
                            check_proot_installed, build_proot_temp_dir, get_licenser,
-                            wrap_args_with_proot, get_system_task_capacity, OutputEventFilter,
+                            wrap_args_with_proot, OutputEventFilter, ignore_inventory_computed_fields,
-                            ignore_inventory_computed_fields, ignore_inventory_group_removal,
+                            ignore_inventory_group_removal, get_type_for_model, extract_ansible_vars)
                            get_type_for_model, extract_ansible_vars)
 from awx.main.utils.reload import restart_local_services, stop_local_services
 from awx.main.utils.pglock import advisory_lock
 from awx.main.utils.ha import update_celery_worker_routes, register_celery_worker_queues
@@ -307,6 +305,7 @@ def cluster_node_heartbeat(self):
    instance_list = list(Instance.objects.filter(rampart_groups__controller__isnull=True).distinct())
    this_inst = None
    lost_instances = []
    for inst in list(instance_list):
        if inst.hostname == settings.CLUSTER_HOST_ID:
            this_inst = inst
@@ -316,11 +315,15 @@ def cluster_node_heartbeat(self):
            instance_list.remove(inst)
    if this_inst:
        startup_event = this_inst.is_lost(ref_time=nowtime)
-        if this_inst.capacity == 0:
+        if this_inst.capacity == 0 and this_inst.enabled:
            logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
-        this_inst.capacity = get_system_task_capacity()
+        if this_inst.enabled:
-        this_inst.version = awx_application_version
+            this_inst.refresh_capacity()
-        this_inst.save(update_fields=['capacity', 'version', 'modified'])
+            handle_ha_toplogy_changes.apply_async()
        elif this_inst.capacity != 0 and not this_inst.enabled:
            this_inst.capacity = 0
            this_inst.save(update_fields=['capacity'])
            handle_ha_toplogy_changes.apply_async()
        if startup_event:
            return
    else:
@@ -329,7 +332,7 @@ def cluster_node_heartbeat(self):
    for other_inst in instance_list:
        if other_inst.version == "":
            continue
-        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version) and not settings.DEBUG:
+        if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
            logger.error("Host {} reports version {}, but this node {} is at {}, shutting down".format(other_inst.hostname,
                                                                                                       other_inst.version,
                                                                                                       this_inst.hostname,
--- a/awx/main/tests/functional/test_jobs.py
+++ b/awx/main/tests/functional/test_jobs.py
@@ -1,9 +1,11 @@
 from awx.main.models import Job, Instance
 from django.test.utils import override_settings
 import pytest
-
+import mock
 import json
 from awx.main.models import Job, Instance
 from awx.main.tasks import cluster_node_heartbeat
 from django.test.utils import override_settings
@pytest.mark.django_db
 def test_orphan_unified_job_creation(instance, inventory):
@@ -17,13 +19,19 @@ def test_orphan_unified_job_creation(instance, inventory):
@pytest.mark.django_db
@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2,8))
@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000,62))
@mock.patch('awx.main.tasks.handle_ha_toplogy_changes.apply_async', lambda: True)
 def test_job_capacity_and_with_inactive_node():
-    Instance.objects.create(hostname='test-1', capacity=50)
+    i = Instance.objects.create(hostname='test-1')
-    assert Instance.objects.total_capacity() == 50
+    i.refresh_capacity()
-    Instance.objects.create(hostname='test-2', capacity=50)
+    assert i.capacity == 62
-    assert Instance.objects.total_capacity() == 100
+    i.enabled = False
-    with override_settings(AWX_ACTIVE_NODE_TIME=0):
+    i.save()
-        assert Instance.objects.total_capacity() < 100
+    with override_settings(CLUSTER_HOST_ID=i.hostname):
        cluster_node_heartbeat()
        i = Instance.objects.get(id=i.id)
        assert i.capacity == 0
@pytest.mark.django_db
--- a/awx/main/tests/unit/utils/test_ha.py
+++ b/awx/main/tests/unit/utils/test_ha.py
@@ -60,6 +60,7 @@ class TestAddRemoveCeleryWorkerQueues():
                                                   static_queues, _worker_queues, 
                                                   groups, hostname,
                                                   added_expected, removed_expected):
        added_expected.append('tower_instance_router')
        instance = instance_generator(groups=groups, hostname=hostname)
        worker_queues = worker_queues_generator(_worker_queues)
        with mock.patch('awx.main.utils.ha.settings.AWX_CELERY_QUEUES_STATIC', static_queues):
--- a/awx/main/utils/common.py
+++ b/awx/main/utils/common.py
@@ -20,6 +20,8 @@ import six
 import psutil
 from StringIO import StringIO
 from decimal import Decimal
 # Decorator
 from decorator import decorator
@@ -45,7 +47,7 @@ __all__ = ['get_object_or_400', 'get_object_or_403', 'camelcase_to_underscore',
           'ignore_inventory_computed_fields', 'ignore_inventory_group_removal',
           '_inventory_updates', 'get_pk_from_dict', 'getattrd', 'NoDefaultProvided',
           'get_current_apps', 'set_current_apps', 'OutputEventFilter',
-           'extract_ansible_vars', 'get_search_fields', 'get_system_task_capacity',
+           'extract_ansible_vars', 'get_search_fields', 'get_system_task_capacity', 'get_cpu_capacity', 'get_mem_capacity',
           'wrap_args_with_proot', 'build_proot_temp_dir', 'check_proot_installed', 'model_to_dict',
           'model_instance_diff', 'timestamp_apiformat', 'parse_yaml_or_json', 'RequireDebugTrueOrTest',
           'has_model_field_prefetched', 'set_environ', 'IllegalArgumentError', 'get_custom_venv_choices']
@@ -632,19 +634,52 @@ def parse_yaml_or_json(vars_str, silent_failure=True):
    return vars_dict
-@memoize()
+def get_cpu_capacity():
-def get_system_task_capacity():
+    from django.conf import settings
    settings_forkcpu = getattr(settings, 'SYSTEM_TASK_FORKS_CPU', None)
    env_forkcpu = os.getenv('SYSTEM_TASK_FORKS_CPU', None)
    cpu = psutil.cpu_count()
    if env_forkcpu:
        forkcpu = int(env_forkcpu)
    elif settings_forkcpu:
        forkcpu = int(settings_forkcpu)
    else:
        forkcpu = 4
    return (cpu, cpu * forkcpu)
 def get_mem_capacity():
    from django.conf import settings
    settings_forkmem = getattr(settings, 'SYSTEM_TASK_FORKS_MEM', None)
    env_forkmem = os.getenv('SYSTEM_TASK_FORKS_MEM', None)
    if env_forkmem:
        forkmem = int(env_forkmem)
    elif settings_forkmem:
        forkmem = int(settings_forkmem)
    else:
        forkmem = 100
    mem = psutil.virtual_memory().total
    return (mem, max(1, ((mem / 1024 / 1024) - 2048) / forkmem))
 def get_system_task_capacity(scale=Decimal(1.0)):
    '''
    Measure system memory and use it as a baseline for determining the system's capacity
    '''
    from django.conf import settings
-    if hasattr(settings, 'SYSTEM_TASK_CAPACITY'):
+    settings_forks = getattr(settings, 'SYSTEM_TASK_FORKS_CAPACITY', None)
-        return settings.SYSTEM_TASK_CAPACITY
+    env_forks = os.getenv('SYSTEM_TASK_FORKS_CAPACITY', None)
-    mem = psutil.virtual_memory()
+
-    total_mem_value = mem.total / 1024 / 1024
+    if env_forks:
-    if total_mem_value <= 2048:
+        return int(env_forks)
-        return 50
+    elif settings_forks:
-    return 50 + ((total_mem_value / 1024) - 2) * 75
+        return int(settings_forks)
    _, cpu_cap = get_cpu_capacity()
    _, mem_cap = get_mem_capacity()
    return min(mem_cap, cpu_cap) + ((max(mem_cap, cpu_cap) - min(mem_cap, cpu_cap)) * scale)
 _inventory_updates = threading.local()
--- a/awx/main/utils/ha.py
+++ b/awx/main/utils/ha.py
@@ -24,7 +24,7 @@ def _add_remove_celery_worker_queues(app, instance, worker_queues, worker_name):
                queue['alias'] in settings.AWX_CELERY_QUEUES_STATIC:
            continue
-        if queue['name'] not in ig_names | set([instance.hostname]):
+        if queue['name'] not in ig_names | set([instance.hostname]) or not instance.enabled:
            app.control.cancel_consumer(queue['name'], reply=True, destination=[worker_name])
            removed_queues.append(queue['name'])
@@ -43,7 +43,6 @@ def update_celery_worker_routes(instance, conf):
        'awx.main.tasks.purge_old_stdout_files',
    ]
    routes_updated = {}
    # Instance is, effectively, a controller node
    if instance.is_controller():
        tasks.append('awx.main.tasks.awx_isolated_heartbeat')
--- a/awx/settings/defaults.py
+++ b/awx/settings/defaults.py
@@ -639,9 +639,6 @@ AWX_PROOT_BASE_PATH = "/tmp"
 # Note: This setting may be overridden by database settings.
 AWX_ANSIBLE_CALLBACK_PLUGINS = ""
 # Time at which an HA node is considered active
 AWX_ACTIVE_NODE_TIME = 7200
 # Automatically remove nodes that have missed their heartbeats after some time
 AWX_AUTO_DEPROVISION_INSTANCES = False
--- a/docs/clustering.md
+++ b/docs/clustering.md
@@ -28,6 +28,8 @@ It's important to point out a few existing things:
 * Existing old-style HA deployments will be transitioned automatically to the new HA system during the upgrade process to 3.1.
 * Manual projects will need to be synced to all instances by the customer
 Ansible Tower 3.3 adds support for container-based clusters using Openshift or Kubernetes
 ## Important Changes
 * There is no concept of primary/secondary in the new Tower system. *All* systems are primary.
@@ -226,6 +228,47 @@ show up in api endpoints and stats monitoring. These groups can be removed with
 $ awx-manage unregister_queue --queuename=<name>
 ```
 ### Configuring Instances and Instance Groups from the API
 Instance Groups can be created by posting to `/api/v2/instance_groups` as a System Admin.
 Once created, `Instances` can be associated with an Instance Group with:
 ```
 HTTP POST /api/v2/instance_groups/x/instances/ {'id': y}`
 ```
 An `Instance` that is added to an `InstanceGroup` will automatically reconfigure itself to listen on the group's work queue. See the following
 section `Instance Group Policies` for more details.
 ### Instance Group Policies
 Tower `Instances` can be configured to automatically join `Instance Groups` when they come online by defining a policy. These policies are evaluated for
 every new Instance that comes online.
 Instance Group Policies are controlled by 3 optional fields on an `Instance Group`:
 * `policy_instance_percentage`: This is a number between 0 - 100. It gaurantees that this percentage of active Tower instances will be added
  to this `Instance Group`. As new instances come online, if the number of Instances in this group relative to the total number of instances
  is less than the given percentage then new ones will be added until the percentage condition is satisfied.
 * `policy_instance_minimum`: This policy attempts to keep at least this many `Instances` in the `Instance Group`. If the number of
  available instances is lower than this minimum then all `Instances` will be placed in this `Instance Group`.
 * `policy_instance_list`: This is a fixed list of `Instance` names. These `Instances` will *always* be added to this `Instance Group`.
  Further, by adding Instances to this list you are declaring that you will manually manage those Instances and they will not be eligible under any other
  policy. This means they will not be automatically added to any other `Instance Group` even if the policy would cause them to be matched.
 > NOTES
 * `Instances` that are assigned directly to `Instance Groups` by posting to `/api/v2/instance_groups/x/instances` or
  `/api/v2/instances/x/instance_groups` are automatically added to the `policy_instance_list`. This means they are subject to the
  normal caveats for `policy_instance_list` and must be manually managed.
 * `policy_instance_percentage` and `policy_instance_minimum` work together. For example, if you have a `policy_instance_percentage` of
  50% and a `policy_instance_minimum` of 2 and you start 6 `Instances`. 3 of them would be assigned to the `Instance Group`. If you reduce the number
  of `Instances` to 2 then both of them would be assigned to the `Instance Group` to satisfy `policy_instance_minimum`. In this way, you can set a lower
  bound on the amount of available resources.
 * Policies don't actively prevent `Instances` from being associated with multiple `Instance Groups` but this can effectively be achieved by making the percentages
  sum to 100. If you have 4 `Instance Groups` assign each a percentage value of 25 and the `Instances` will be distributed among them with no overlap.
 ### Status and Monitoring
 Tower itself reports as much status as it can via the api at `/api/v2/ping` in order to provide validation of the health