mirror of
https://github.com/ansible/awx.git
synced 2026-03-23 20:05:03 -02:30
Apply capacity algorithm changes
* This also adds fields to the instance view for tracking cpu and memory usage as well as information on what the capacity ranges are * Also adds a flag for enabling/disabling instances which removes them from all queues and has them stop processing new work * The capacity is now based almost exclusively on some value relative to forks * capacity_adjustment allows you to commit an instance to a certain amount of forks, cpu focused or memory focused * Each job run adds a single fork overhead (that's the reasoning behind the +1)
This commit is contained in:
@@ -3977,8 +3977,10 @@ class InstanceSerializer(BaseSerializer):
|
|||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
model = Instance
|
model = Instance
|
||||||
fields = ("id", "type", "url", "related", "uuid", "hostname", "created", "modified",
|
read_only_fields = ('uuid', 'hostname', 'version')
|
||||||
"version", "capacity", "consumed_capacity", "percent_capacity_remaining", "jobs_running")
|
fields = ("id", "type", "url", "related", "uuid", "hostname", "created", "modified", 'capacity_adjustment',
|
||||||
|
"version", "capacity", "consumed_capacity", "percent_capacity_remaining", "jobs_running",
|
||||||
|
"cpu", "memory", "cpu_capacity", "mem_capacity", "enabled")
|
||||||
|
|
||||||
def get_related(self, obj):
|
def get_related(self, obj):
|
||||||
res = super(InstanceSerializer, self).get_related(obj)
|
res = super(InstanceSerializer, self).get_related(obj)
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ import pytz
|
|||||||
from wsgiref.util import FileWrapper
|
from wsgiref.util import FileWrapper
|
||||||
|
|
||||||
# AWX
|
# AWX
|
||||||
from awx.main.tasks import send_notifications
|
from awx.main.tasks import send_notifications, handle_ha_toplogy_changes
|
||||||
from awx.main.access import get_user_queryset
|
from awx.main.access import get_user_queryset
|
||||||
from awx.main.ha import is_ha_environment
|
from awx.main.ha import is_ha_environment
|
||||||
from awx.api.authentication import TokenGetAuthentication
|
from awx.api.authentication import TokenGetAuthentication
|
||||||
@@ -560,7 +560,7 @@ class InstanceList(ListAPIView):
|
|||||||
new_in_320 = True
|
new_in_320 = True
|
||||||
|
|
||||||
|
|
||||||
class InstanceDetail(RetrieveAPIView):
|
class InstanceDetail(RetrieveUpdateAPIView):
|
||||||
|
|
||||||
view_name = _("Instance Detail")
|
view_name = _("Instance Detail")
|
||||||
model = Instance
|
model = Instance
|
||||||
@@ -568,6 +568,20 @@ class InstanceDetail(RetrieveAPIView):
|
|||||||
new_in_320 = True
|
new_in_320 = True
|
||||||
|
|
||||||
|
|
||||||
|
def update(self, request, *args, **kwargs):
|
||||||
|
r = super(InstanceDetail, self).update(request, *args, **kwargs)
|
||||||
|
if status.is_success(r.status_code):
|
||||||
|
obj = self.get_object()
|
||||||
|
if obj.enabled:
|
||||||
|
obj.refresh_capacity()
|
||||||
|
else:
|
||||||
|
obj.capacity = 0
|
||||||
|
obj.save()
|
||||||
|
handle_ha_toplogy_changes.apply_async()
|
||||||
|
r.data = InstanceSerializer(obj, context=self.get_serializer_context()).to_representation(obj)
|
||||||
|
return r
|
||||||
|
|
||||||
|
|
||||||
class InstanceUnifiedJobsList(SubListAPIView):
|
class InstanceUnifiedJobsList(SubListAPIView):
|
||||||
|
|
||||||
view_name = _("Instance Running Jobs")
|
view_name = _("Instance Running Jobs")
|
||||||
|
|||||||
@@ -2,12 +2,9 @@
|
|||||||
# All Rights Reserved.
|
# All Rights Reserved.
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from datetime import timedelta
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.utils.timezone import now
|
|
||||||
from django.db.models import Sum
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
|
||||||
from awx.main.utils.filters import SmartFilter
|
from awx.main.utils.filters import SmartFilter
|
||||||
|
|||||||
@@ -1,31 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from django.db import migrations, models
|
|
||||||
import awx.main.fields
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('main', '0017_v330_move_deprecated_stdout'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.AddField(
|
|
||||||
model_name='instancegroup',
|
|
||||||
name='policy_instance_list',
|
|
||||||
field=awx.main.fields.JSONField(default=[], help_text='List of exact-match Instances that will always be automatically assigned to this group',
|
|
||||||
blank=True),
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
|
||||||
model_name='instancegroup',
|
|
||||||
name='policy_instance_minimum',
|
|
||||||
field=models.IntegerField(default=0, help_text='Static minimum number of Instances to automatically assign to this group'),
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
|
||||||
model_name='instancegroup',
|
|
||||||
name='policy_instance_percentage',
|
|
||||||
field=models.IntegerField(default=0, help_text='Percentage of Instances to automatically assign to this group'),
|
|
||||||
),
|
|
||||||
]
|
|
||||||
62
awx/main/migrations/0020_v330_instancegroup_policies.py
Normal file
62
awx/main/migrations/0020_v330_instancegroup_policies.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
from decimal import Decimal
|
||||||
|
import awx.main.fields
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('main', '0019_v330_custom_virtualenv'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instancegroup',
|
||||||
|
name='policy_instance_list',
|
||||||
|
field=awx.main.fields.JSONField(default=[], help_text='List of exact-match Instances that will always be automatically assigned to this group',
|
||||||
|
blank=True),
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instancegroup',
|
||||||
|
name='policy_instance_minimum',
|
||||||
|
field=models.IntegerField(default=0, help_text='Static minimum number of Instances to automatically assign to this group'),
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instancegroup',
|
||||||
|
name='policy_instance_percentage',
|
||||||
|
field=models.IntegerField(default=0, help_text='Percentage of Instances to automatically assign to this group'),
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instance',
|
||||||
|
name='capacity_adjustment',
|
||||||
|
field=models.DecimalField(decimal_places=2, default=Decimal('1.0'), max_digits=3),
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instance',
|
||||||
|
name='cpu',
|
||||||
|
field=models.IntegerField(default=0, editable=False)
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instance',
|
||||||
|
name='memory',
|
||||||
|
field=models.BigIntegerField(default=0, editable=False)
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instance',
|
||||||
|
name='cpu_capacity',
|
||||||
|
field=models.IntegerField(default=0, editable=False)
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instance',
|
||||||
|
name='mem_capacity',
|
||||||
|
field=models.IntegerField(default=0, editable=False)
|
||||||
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instance',
|
||||||
|
name='enabled',
|
||||||
|
field=models.BooleanField(default=True)
|
||||||
|
)
|
||||||
|
]
|
||||||
@@ -184,7 +184,7 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
|
|||||||
# NOTE: We sorta have to assume the host count matches and that forks default to 5
|
# NOTE: We sorta have to assume the host count matches and that forks default to 5
|
||||||
from awx.main.models.inventory import Host
|
from awx.main.models.inventory import Host
|
||||||
count_hosts = Host.objects.filter( enabled=True, inventory__ad_hoc_commands__pk=self.pk).count()
|
count_hosts = Host.objects.filter( enabled=True, inventory__ad_hoc_commands__pk=self.pk).count()
|
||||||
return min(count_hosts, 5 if self.forks == 0 else self.forks) * 10
|
return min(count_hosts, 5 if self.forks == 0 else self.forks) + 1
|
||||||
|
|
||||||
def copy(self):
|
def copy(self):
|
||||||
data = {}
|
data = {}
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
# Copyright (c) 2015 Ansible, Inc.
|
# Copyright (c) 2015 Ansible, Inc.
|
||||||
# All Rights Reserved.
|
# All Rights Reserved.
|
||||||
|
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
from django.db import models, connection
|
from django.db import models, connection
|
||||||
from django.db.models.signals import post_save, post_delete
|
from django.db.models.signals import post_save, post_delete
|
||||||
from django.dispatch import receiver
|
from django.dispatch import receiver
|
||||||
@@ -10,6 +12,7 @@ from django.utils.timezone import now, timedelta
|
|||||||
|
|
||||||
from solo.models import SingletonModel
|
from solo.models import SingletonModel
|
||||||
|
|
||||||
|
from awx import __version__ as awx_application_version
|
||||||
from awx.api.versioning import reverse
|
from awx.api.versioning import reverse
|
||||||
from awx.main.managers import InstanceManager, InstanceGroupManager
|
from awx.main.managers import InstanceManager, InstanceGroupManager
|
||||||
from awx.main.fields import JSONField
|
from awx.main.fields import JSONField
|
||||||
@@ -17,6 +20,7 @@ from awx.main.models.inventory import InventoryUpdate
|
|||||||
from awx.main.models.jobs import Job
|
from awx.main.models.jobs import Job
|
||||||
from awx.main.models.projects import ProjectUpdate
|
from awx.main.models.projects import ProjectUpdate
|
||||||
from awx.main.models.unified_jobs import UnifiedJob
|
from awx.main.models.unified_jobs import UnifiedJob
|
||||||
|
from awx.main.utils import get_cpu_capacity, get_mem_capacity, get_system_task_capacity
|
||||||
|
|
||||||
__all__ = ('Instance', 'InstanceGroup', 'JobOrigin', 'TowerScheduleState',)
|
__all__ = ('Instance', 'InstanceGroup', 'JobOrigin', 'TowerScheduleState',)
|
||||||
|
|
||||||
@@ -39,6 +43,30 @@ class Instance(models.Model):
|
|||||||
default=100,
|
default=100,
|
||||||
editable=False,
|
editable=False,
|
||||||
)
|
)
|
||||||
|
capacity_adjustment = models.DecimalField(
|
||||||
|
default=Decimal(1.0),
|
||||||
|
max_digits=3,
|
||||||
|
decimal_places=2,
|
||||||
|
)
|
||||||
|
enabled = models.BooleanField(
|
||||||
|
default=True
|
||||||
|
)
|
||||||
|
cpu = models.IntegerField(
|
||||||
|
default=0,
|
||||||
|
editable=False,
|
||||||
|
)
|
||||||
|
memory = models.BigIntegerField(
|
||||||
|
default=0,
|
||||||
|
editable=False,
|
||||||
|
)
|
||||||
|
cpu_capacity = models.IntegerField(
|
||||||
|
default=0,
|
||||||
|
editable=False,
|
||||||
|
)
|
||||||
|
mem_capacity = models.IntegerField(
|
||||||
|
default=0,
|
||||||
|
editable=False,
|
||||||
|
)
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
app_label = 'main'
|
app_label = 'main'
|
||||||
@@ -68,6 +96,20 @@ class Instance(models.Model):
|
|||||||
return Instance.objects.filter(rampart_groups__controller__instances=self).exists()
|
return Instance.objects.filter(rampart_groups__controller__instances=self).exists()
|
||||||
|
|
||||||
|
|
||||||
|
def refresh_capacity(self):
|
||||||
|
cpu = get_cpu_capacity()
|
||||||
|
mem = get_mem_capacity()
|
||||||
|
self.capacity = get_system_task_capacity(self.capacity_adjustment)
|
||||||
|
self.cpu = cpu[0]
|
||||||
|
self.memory = mem[0]
|
||||||
|
self.cpu_capacity = cpu[1]
|
||||||
|
self.mem_capacity = mem[1]
|
||||||
|
self.version = awx_application_version
|
||||||
|
self.save(update_fields=['capacity', 'version', 'modified', 'cpu',
|
||||||
|
'memory', 'cpu_capacity', 'mem_capacity'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class InstanceGroup(models.Model):
|
class InstanceGroup(models.Model):
|
||||||
"""A model representing a Queue/Group of AWX Instances."""
|
"""A model representing a Queue/Group of AWX Instances."""
|
||||||
objects = InstanceGroupManager()
|
objects = InstanceGroupManager()
|
||||||
|
|||||||
@@ -1602,7 +1602,7 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin,
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def task_impact(self):
|
def task_impact(self):
|
||||||
return 50
|
return 1
|
||||||
|
|
||||||
# InventoryUpdate credential required
|
# InventoryUpdate credential required
|
||||||
# Custom and SCM InventoryUpdate credential not required
|
# Custom and SCM InventoryUpdate credential not required
|
||||||
|
|||||||
@@ -623,7 +623,7 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
|
|||||||
count_hosts = 1
|
count_hosts = 1
|
||||||
else:
|
else:
|
||||||
count_hosts = Host.objects.filter(inventory__jobs__pk=self.pk).count()
|
count_hosts = Host.objects.filter(inventory__jobs__pk=self.pk).count()
|
||||||
return min(count_hosts, 5 if self.forks == 0 else self.forks) * 10
|
return min(count_hosts, 5 if self.forks == 0 else self.forks) + 1
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def successful_hosts(self):
|
def successful_hosts(self):
|
||||||
@@ -1190,7 +1190,7 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin):
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def task_impact(self):
|
def task_impact(self):
|
||||||
return 150
|
return 5
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def preferred_instance_groups(self):
|
def preferred_instance_groups(self):
|
||||||
|
|||||||
@@ -492,7 +492,7 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin, TaskManage
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def task_impact(self):
|
def task_impact(self):
|
||||||
return 0 if self.job_type == 'run' else 20
|
return 0 if self.job_type == 'run' else 1
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def result_stdout(self):
|
def result_stdout(self):
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
# All Rights Reserved.
|
# All Rights Reserved.
|
||||||
|
|
||||||
# Python
|
# Python
|
||||||
import codecs
|
|
||||||
from collections import OrderedDict, namedtuple
|
from collections import OrderedDict, namedtuple
|
||||||
import ConfigParser
|
import ConfigParser
|
||||||
import cStringIO
|
import cStringIO
|
||||||
@@ -54,9 +53,8 @@ from awx.main.queue import CallbackQueueDispatcher
|
|||||||
from awx.main.expect import run, isolated_manager
|
from awx.main.expect import run, isolated_manager
|
||||||
from awx.main.utils import (get_ansible_version, get_ssh_version, decrypt_field, update_scm_url,
|
from awx.main.utils import (get_ansible_version, get_ssh_version, decrypt_field, update_scm_url,
|
||||||
check_proot_installed, build_proot_temp_dir, get_licenser,
|
check_proot_installed, build_proot_temp_dir, get_licenser,
|
||||||
wrap_args_with_proot, get_system_task_capacity, OutputEventFilter,
|
wrap_args_with_proot, OutputEventFilter, ignore_inventory_computed_fields,
|
||||||
ignore_inventory_computed_fields, ignore_inventory_group_removal,
|
ignore_inventory_group_removal, get_type_for_model, extract_ansible_vars)
|
||||||
get_type_for_model, extract_ansible_vars)
|
|
||||||
from awx.main.utils.reload import restart_local_services, stop_local_services
|
from awx.main.utils.reload import restart_local_services, stop_local_services
|
||||||
from awx.main.utils.pglock import advisory_lock
|
from awx.main.utils.pglock import advisory_lock
|
||||||
from awx.main.utils.ha import update_celery_worker_routes, register_celery_worker_queues
|
from awx.main.utils.ha import update_celery_worker_routes, register_celery_worker_queues
|
||||||
@@ -307,6 +305,7 @@ def cluster_node_heartbeat(self):
|
|||||||
instance_list = list(Instance.objects.filter(rampart_groups__controller__isnull=True).distinct())
|
instance_list = list(Instance.objects.filter(rampart_groups__controller__isnull=True).distinct())
|
||||||
this_inst = None
|
this_inst = None
|
||||||
lost_instances = []
|
lost_instances = []
|
||||||
|
|
||||||
for inst in list(instance_list):
|
for inst in list(instance_list):
|
||||||
if inst.hostname == settings.CLUSTER_HOST_ID:
|
if inst.hostname == settings.CLUSTER_HOST_ID:
|
||||||
this_inst = inst
|
this_inst = inst
|
||||||
@@ -316,11 +315,15 @@ def cluster_node_heartbeat(self):
|
|||||||
instance_list.remove(inst)
|
instance_list.remove(inst)
|
||||||
if this_inst:
|
if this_inst:
|
||||||
startup_event = this_inst.is_lost(ref_time=nowtime)
|
startup_event = this_inst.is_lost(ref_time=nowtime)
|
||||||
if this_inst.capacity == 0:
|
if this_inst.capacity == 0 and this_inst.enabled:
|
||||||
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
|
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
|
||||||
this_inst.capacity = get_system_task_capacity()
|
if this_inst.enabled:
|
||||||
this_inst.version = awx_application_version
|
this_inst.refresh_capacity()
|
||||||
this_inst.save(update_fields=['capacity', 'version', 'modified'])
|
handle_ha_toplogy_changes.apply_async()
|
||||||
|
elif this_inst.capacity != 0 and not this_inst.enabled:
|
||||||
|
this_inst.capacity = 0
|
||||||
|
this_inst.save(update_fields=['capacity'])
|
||||||
|
handle_ha_toplogy_changes.apply_async()
|
||||||
if startup_event:
|
if startup_event:
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
@@ -329,7 +332,7 @@ def cluster_node_heartbeat(self):
|
|||||||
for other_inst in instance_list:
|
for other_inst in instance_list:
|
||||||
if other_inst.version == "":
|
if other_inst.version == "":
|
||||||
continue
|
continue
|
||||||
if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version) and not settings.DEBUG:
|
if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
|
||||||
logger.error("Host {} reports version {}, but this node {} is at {}, shutting down".format(other_inst.hostname,
|
logger.error("Host {} reports version {}, but this node {} is at {}, shutting down".format(other_inst.hostname,
|
||||||
other_inst.version,
|
other_inst.version,
|
||||||
this_inst.hostname,
|
this_inst.hostname,
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
from awx.main.models import Job, Instance
|
|
||||||
from django.test.utils import override_settings
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import mock
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
from awx.main.models import Job, Instance
|
||||||
|
from awx.main.tasks import cluster_node_heartbeat
|
||||||
|
from django.test.utils import override_settings
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
def test_orphan_unified_job_creation(instance, inventory):
|
def test_orphan_unified_job_creation(instance, inventory):
|
||||||
@@ -17,13 +19,19 @@ def test_orphan_unified_job_creation(instance, inventory):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
|
@mock.patch('awx.main.utils.common.get_cpu_capacity', lambda: (2,8))
|
||||||
|
@mock.patch('awx.main.utils.common.get_mem_capacity', lambda: (8000,62))
|
||||||
|
@mock.patch('awx.main.tasks.handle_ha_toplogy_changes.apply_async', lambda: True)
|
||||||
def test_job_capacity_and_with_inactive_node():
|
def test_job_capacity_and_with_inactive_node():
|
||||||
Instance.objects.create(hostname='test-1', capacity=50)
|
i = Instance.objects.create(hostname='test-1')
|
||||||
assert Instance.objects.total_capacity() == 50
|
i.refresh_capacity()
|
||||||
Instance.objects.create(hostname='test-2', capacity=50)
|
assert i.capacity == 62
|
||||||
assert Instance.objects.total_capacity() == 100
|
i.enabled = False
|
||||||
with override_settings(AWX_ACTIVE_NODE_TIME=0):
|
i.save()
|
||||||
assert Instance.objects.total_capacity() < 100
|
with override_settings(CLUSTER_HOST_ID=i.hostname):
|
||||||
|
cluster_node_heartbeat()
|
||||||
|
i = Instance.objects.get(id=i.id)
|
||||||
|
assert i.capacity == 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
|
|||||||
@@ -60,6 +60,7 @@ class TestAddRemoveCeleryWorkerQueues():
|
|||||||
static_queues, _worker_queues,
|
static_queues, _worker_queues,
|
||||||
groups, hostname,
|
groups, hostname,
|
||||||
added_expected, removed_expected):
|
added_expected, removed_expected):
|
||||||
|
added_expected.append('tower_instance_router')
|
||||||
instance = instance_generator(groups=groups, hostname=hostname)
|
instance = instance_generator(groups=groups, hostname=hostname)
|
||||||
worker_queues = worker_queues_generator(_worker_queues)
|
worker_queues = worker_queues_generator(_worker_queues)
|
||||||
with mock.patch('awx.main.utils.ha.settings.AWX_CELERY_QUEUES_STATIC', static_queues):
|
with mock.patch('awx.main.utils.ha.settings.AWX_CELERY_QUEUES_STATIC', static_queues):
|
||||||
|
|||||||
@@ -20,6 +20,8 @@ import six
|
|||||||
import psutil
|
import psutil
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
|
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
# Decorator
|
# Decorator
|
||||||
from decorator import decorator
|
from decorator import decorator
|
||||||
|
|
||||||
@@ -45,7 +47,7 @@ __all__ = ['get_object_or_400', 'get_object_or_403', 'camelcase_to_underscore',
|
|||||||
'ignore_inventory_computed_fields', 'ignore_inventory_group_removal',
|
'ignore_inventory_computed_fields', 'ignore_inventory_group_removal',
|
||||||
'_inventory_updates', 'get_pk_from_dict', 'getattrd', 'NoDefaultProvided',
|
'_inventory_updates', 'get_pk_from_dict', 'getattrd', 'NoDefaultProvided',
|
||||||
'get_current_apps', 'set_current_apps', 'OutputEventFilter',
|
'get_current_apps', 'set_current_apps', 'OutputEventFilter',
|
||||||
'extract_ansible_vars', 'get_search_fields', 'get_system_task_capacity',
|
'extract_ansible_vars', 'get_search_fields', 'get_system_task_capacity', 'get_cpu_capacity', 'get_mem_capacity',
|
||||||
'wrap_args_with_proot', 'build_proot_temp_dir', 'check_proot_installed', 'model_to_dict',
|
'wrap_args_with_proot', 'build_proot_temp_dir', 'check_proot_installed', 'model_to_dict',
|
||||||
'model_instance_diff', 'timestamp_apiformat', 'parse_yaml_or_json', 'RequireDebugTrueOrTest',
|
'model_instance_diff', 'timestamp_apiformat', 'parse_yaml_or_json', 'RequireDebugTrueOrTest',
|
||||||
'has_model_field_prefetched', 'set_environ', 'IllegalArgumentError', 'get_custom_venv_choices']
|
'has_model_field_prefetched', 'set_environ', 'IllegalArgumentError', 'get_custom_venv_choices']
|
||||||
@@ -632,19 +634,52 @@ def parse_yaml_or_json(vars_str, silent_failure=True):
|
|||||||
return vars_dict
|
return vars_dict
|
||||||
|
|
||||||
|
|
||||||
@memoize()
|
def get_cpu_capacity():
|
||||||
def get_system_task_capacity():
|
from django.conf import settings
|
||||||
|
settings_forkcpu = getattr(settings, 'SYSTEM_TASK_FORKS_CPU', None)
|
||||||
|
env_forkcpu = os.getenv('SYSTEM_TASK_FORKS_CPU', None)
|
||||||
|
cpu = psutil.cpu_count()
|
||||||
|
|
||||||
|
if env_forkcpu:
|
||||||
|
forkcpu = int(env_forkcpu)
|
||||||
|
elif settings_forkcpu:
|
||||||
|
forkcpu = int(settings_forkcpu)
|
||||||
|
else:
|
||||||
|
forkcpu = 4
|
||||||
|
return (cpu, cpu * forkcpu)
|
||||||
|
|
||||||
|
|
||||||
|
def get_mem_capacity():
|
||||||
|
from django.conf import settings
|
||||||
|
settings_forkmem = getattr(settings, 'SYSTEM_TASK_FORKS_MEM', None)
|
||||||
|
env_forkmem = os.getenv('SYSTEM_TASK_FORKS_MEM', None)
|
||||||
|
if env_forkmem:
|
||||||
|
forkmem = int(env_forkmem)
|
||||||
|
elif settings_forkmem:
|
||||||
|
forkmem = int(settings_forkmem)
|
||||||
|
else:
|
||||||
|
forkmem = 100
|
||||||
|
|
||||||
|
mem = psutil.virtual_memory().total
|
||||||
|
return (mem, max(1, ((mem / 1024 / 1024) - 2048) / forkmem))
|
||||||
|
|
||||||
|
|
||||||
|
def get_system_task_capacity(scale=Decimal(1.0)):
|
||||||
'''
|
'''
|
||||||
Measure system memory and use it as a baseline for determining the system's capacity
|
Measure system memory and use it as a baseline for determining the system's capacity
|
||||||
'''
|
'''
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
if hasattr(settings, 'SYSTEM_TASK_CAPACITY'):
|
settings_forks = getattr(settings, 'SYSTEM_TASK_FORKS_CAPACITY', None)
|
||||||
return settings.SYSTEM_TASK_CAPACITY
|
env_forks = os.getenv('SYSTEM_TASK_FORKS_CAPACITY', None)
|
||||||
mem = psutil.virtual_memory()
|
|
||||||
total_mem_value = mem.total / 1024 / 1024
|
if env_forks:
|
||||||
if total_mem_value <= 2048:
|
return int(env_forks)
|
||||||
return 50
|
elif settings_forks:
|
||||||
return 50 + ((total_mem_value / 1024) - 2) * 75
|
return int(settings_forks)
|
||||||
|
|
||||||
|
_, cpu_cap = get_cpu_capacity()
|
||||||
|
_, mem_cap = get_mem_capacity()
|
||||||
|
return min(mem_cap, cpu_cap) + ((max(mem_cap, cpu_cap) - min(mem_cap, cpu_cap)) * scale)
|
||||||
|
|
||||||
|
|
||||||
_inventory_updates = threading.local()
|
_inventory_updates = threading.local()
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ def _add_remove_celery_worker_queues(app, instance, worker_queues, worker_name):
|
|||||||
queue['alias'] in settings.AWX_CELERY_QUEUES_STATIC:
|
queue['alias'] in settings.AWX_CELERY_QUEUES_STATIC:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if queue['name'] not in ig_names | set([instance.hostname]):
|
if queue['name'] not in ig_names | set([instance.hostname]) or not instance.enabled:
|
||||||
app.control.cancel_consumer(queue['name'], reply=True, destination=[worker_name])
|
app.control.cancel_consumer(queue['name'], reply=True, destination=[worker_name])
|
||||||
removed_queues.append(queue['name'])
|
removed_queues.append(queue['name'])
|
||||||
|
|
||||||
@@ -43,7 +43,6 @@ def update_celery_worker_routes(instance, conf):
|
|||||||
'awx.main.tasks.purge_old_stdout_files',
|
'awx.main.tasks.purge_old_stdout_files',
|
||||||
]
|
]
|
||||||
routes_updated = {}
|
routes_updated = {}
|
||||||
|
|
||||||
# Instance is, effectively, a controller node
|
# Instance is, effectively, a controller node
|
||||||
if instance.is_controller():
|
if instance.is_controller():
|
||||||
tasks.append('awx.main.tasks.awx_isolated_heartbeat')
|
tasks.append('awx.main.tasks.awx_isolated_heartbeat')
|
||||||
|
|||||||
@@ -639,9 +639,6 @@ AWX_PROOT_BASE_PATH = "/tmp"
|
|||||||
# Note: This setting may be overridden by database settings.
|
# Note: This setting may be overridden by database settings.
|
||||||
AWX_ANSIBLE_CALLBACK_PLUGINS = ""
|
AWX_ANSIBLE_CALLBACK_PLUGINS = ""
|
||||||
|
|
||||||
# Time at which an HA node is considered active
|
|
||||||
AWX_ACTIVE_NODE_TIME = 7200
|
|
||||||
|
|
||||||
# Automatically remove nodes that have missed their heartbeats after some time
|
# Automatically remove nodes that have missed their heartbeats after some time
|
||||||
AWX_AUTO_DEPROVISION_INSTANCES = False
|
AWX_AUTO_DEPROVISION_INSTANCES = False
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ It's important to point out a few existing things:
|
|||||||
* Existing old-style HA deployments will be transitioned automatically to the new HA system during the upgrade process to 3.1.
|
* Existing old-style HA deployments will be transitioned automatically to the new HA system during the upgrade process to 3.1.
|
||||||
* Manual projects will need to be synced to all instances by the customer
|
* Manual projects will need to be synced to all instances by the customer
|
||||||
|
|
||||||
|
Ansible Tower 3.3 adds support for container-based clusters using Openshift or Kubernetes
|
||||||
|
|
||||||
## Important Changes
|
## Important Changes
|
||||||
|
|
||||||
* There is no concept of primary/secondary in the new Tower system. *All* systems are primary.
|
* There is no concept of primary/secondary in the new Tower system. *All* systems are primary.
|
||||||
@@ -226,6 +228,47 @@ show up in api endpoints and stats monitoring. These groups can be removed with
|
|||||||
$ awx-manage unregister_queue --queuename=<name>
|
$ awx-manage unregister_queue --queuename=<name>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Configuring Instances and Instance Groups from the API
|
||||||
|
|
||||||
|
Instance Groups can be created by posting to `/api/v2/instance_groups` as a System Admin.
|
||||||
|
|
||||||
|
Once created, `Instances` can be associated with an Instance Group with:
|
||||||
|
|
||||||
|
```
|
||||||
|
HTTP POST /api/v2/instance_groups/x/instances/ {'id': y}`
|
||||||
|
```
|
||||||
|
|
||||||
|
An `Instance` that is added to an `InstanceGroup` will automatically reconfigure itself to listen on the group's work queue. See the following
|
||||||
|
section `Instance Group Policies` for more details.
|
||||||
|
|
||||||
|
### Instance Group Policies
|
||||||
|
|
||||||
|
Tower `Instances` can be configured to automatically join `Instance Groups` when they come online by defining a policy. These policies are evaluated for
|
||||||
|
every new Instance that comes online.
|
||||||
|
|
||||||
|
Instance Group Policies are controlled by 3 optional fields on an `Instance Group`:
|
||||||
|
|
||||||
|
* `policy_instance_percentage`: This is a number between 0 - 100. It gaurantees that this percentage of active Tower instances will be added
|
||||||
|
to this `Instance Group`. As new instances come online, if the number of Instances in this group relative to the total number of instances
|
||||||
|
is less than the given percentage then new ones will be added until the percentage condition is satisfied.
|
||||||
|
* `policy_instance_minimum`: This policy attempts to keep at least this many `Instances` in the `Instance Group`. If the number of
|
||||||
|
available instances is lower than this minimum then all `Instances` will be placed in this `Instance Group`.
|
||||||
|
* `policy_instance_list`: This is a fixed list of `Instance` names. These `Instances` will *always* be added to this `Instance Group`.
|
||||||
|
Further, by adding Instances to this list you are declaring that you will manually manage those Instances and they will not be eligible under any other
|
||||||
|
policy. This means they will not be automatically added to any other `Instance Group` even if the policy would cause them to be matched.
|
||||||
|
|
||||||
|
> NOTES
|
||||||
|
|
||||||
|
* `Instances` that are assigned directly to `Instance Groups` by posting to `/api/v2/instance_groups/x/instances` or
|
||||||
|
`/api/v2/instances/x/instance_groups` are automatically added to the `policy_instance_list`. This means they are subject to the
|
||||||
|
normal caveats for `policy_instance_list` and must be manually managed.
|
||||||
|
* `policy_instance_percentage` and `policy_instance_minimum` work together. For example, if you have a `policy_instance_percentage` of
|
||||||
|
50% and a `policy_instance_minimum` of 2 and you start 6 `Instances`. 3 of them would be assigned to the `Instance Group`. If you reduce the number
|
||||||
|
of `Instances` to 2 then both of them would be assigned to the `Instance Group` to satisfy `policy_instance_minimum`. In this way, you can set a lower
|
||||||
|
bound on the amount of available resources.
|
||||||
|
* Policies don't actively prevent `Instances` from being associated with multiple `Instance Groups` but this can effectively be achieved by making the percentages
|
||||||
|
sum to 100. If you have 4 `Instance Groups` assign each a percentage value of 25 and the `Instances` will be distributed among them with no overlap.
|
||||||
|
|
||||||
### Status and Monitoring
|
### Status and Monitoring
|
||||||
|
|
||||||
Tower itself reports as much status as it can via the api at `/api/v2/ping` in order to provide validation of the health
|
Tower itself reports as much status as it can via the api at `/api/v2/ping` in order to provide validation of the health
|
||||||
|
|||||||
Reference in New Issue
Block a user