Use global capacity algorithm in serializer

The task manager was doing work to compute the currently consumed
capacity; this computation is moved into the manager and applied in the
same form to the instance group list.
This commit is contained in:
AlanCoding
2017-08-27 13:14:35 -04:00
parent ce3c969c08
commit 5327a4c622
8 changed files with 216 additions and 81 deletions

View File

@@ -3564,6 +3564,7 @@ class InstanceSerializer(BaseSerializer):
class InstanceGroupSerializer(BaseSerializer):
consumed_capacity = serializers.SerializerMethodField()
percent_capacity_remaining = serializers.SerializerMethodField()
jobs_running = serializers.SerializerMethodField()
instances = serializers.SerializerMethodField()
@@ -3581,17 +3582,37 @@ class InstanceGroupSerializer(BaseSerializer):
res['controller'] = self.reverse('api:instance_group_detail', kwargs={'pk': obj.controller_id})
return res
def get_jobs_qs(self):
    """Return the queryset of running/waiting jobs, shared via context.

    The queryset is cached on the serializer context so that in a
    ListView every InstanceGroup row reuses the same query.
    """
    try:
        return self.context['running_jobs']
    except KeyError:
        running = UnifiedJob.objects.filter(status__in=('running', 'waiting'))
        self.context['running_jobs'] = running
        return running
def get_capacity_dict(self):
    """Return the globally computed per-group capacity values.

    Computed once and cached on the serializer context so a ListView
    shares a single computation across all serialized groups.
    """
    try:
        return self.context['capacity_map']
    except KeyError:
        # self.parent is set when serializing a list; restrict the
        # computation to the queryset being listed.
        ig_qs = self.parent.instance if self.parent else None
        capacity_map = InstanceGroup.objects.capacity_values(
            qs=ig_qs, tasks=self.get_jobs_qs(), breakdown=True)
        self.context['capacity_map'] = capacity_map
        return capacity_map
def get_consumed_capacity(self, obj):
return obj.consumed_capacity
return self.get_capacity_dict()[obj.name]['consumed_capacity']
def get_percent_capacity_remaining(self, obj):
if not obj.capacity or obj.consumed_capacity == obj.capacity:
if not obj.capacity:
return 0.0
else:
return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100))
return float("{0:.2f}".format(
((float(obj.capacity) - float(self.get_consumed_capacity(obj))) / (float(obj.capacity))) * 100)
)
def get_jobs_running(self, obj):
return UnifiedJob.objects.filter(instance_group=obj, status__in=('running', 'waiting',)).count()
jobs_qs = self.get_jobs_qs()
return sum(1 for job in jobs_qs if job.instance_group_id == obj.id)
def get_instances(self, obj):
    # Number of instances that are members of this instance group.
    return obj.instances.count()

View File

@@ -377,9 +377,11 @@ class InstanceAccess(BaseAccess):
def get_queryset(self):
if self.user.is_superuser or self.user.is_system_auditor:
return Instance.objects.all().distinct()
qs = Instance.objects.all().distinct()
else:
return Instance.objects.filter(rampart_groups__in=self.user.get_queryset(InstanceGroup)).distinct()
qs = Instance.objects.filter(
rampart_groups__in=self.user.get_queryset(InstanceGroup)).distinct()
return qs.prefetch_related('rampart_groups')
def can_add(self, data):
return False
@@ -397,9 +399,11 @@ class InstanceGroupAccess(BaseAccess):
def get_queryset(self):
if self.user.is_superuser or self.user.is_system_auditor:
return InstanceGroup.objects.all()
qs = InstanceGroup.objects.all()
else:
return InstanceGroup.objects.filter(organization__in=Organization.accessible_objects(self.user, 'admin_role'))
qs = InstanceGroup.objects.filter(
organization__in=Organization.accessible_pk_qs(self.user, 'admin_role'))
return qs.prefetch_related('instances')
def can_add(self, data):
return False

View File

@@ -3,6 +3,7 @@
import sys
from datetime import timedelta
import logging
from django.db import models
from django.utils.timezone import now
@@ -11,7 +12,9 @@ from django.conf import settings
from awx.main.utils.filters import SmartFilter
___all__ = ['HostManager', 'InstanceManager']
___all__ = ['HostManager', 'InstanceManager', 'InstanceGroupManager']
logger = logging.getLogger('awx.main.managers')
class HostManager(models.Manager):
@@ -48,6 +51,17 @@ class HostManager(models.Manager):
return qs
def get_ig_ig_mapping(ig_instance_mapping, instance_ig_mapping):
    """Map each group name to the union of all groups its instances belong to.

    Two groups end up in each other's sets whenever they share at least
    one instance, which is what global capacity accounting needs.
    """
    return {
        group_name: set().union(
            *(instance_ig_mapping[hostname] for hostname in hostnames)
        )
        for group_name, hostnames in ig_instance_mapping.items()
    }
class InstanceManager(models.Manager):
"""A custom manager class for the Instance model.
@@ -80,7 +94,7 @@ class InstanceManager(models.Manager):
# NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing
return "tower"
def capacity_mapping(self):
def capacity_mapping(self, qs=None):
"""
Returns tuple of two dictionaries that shows mutual connections by name
for global accounting of capacity
@@ -88,24 +102,90 @@ class InstanceManager(models.Manager):
instance_ig_mapping: {'instance_name': <set of group names instance is member of>}
ig_ig_mapping: {'group_name': <set of group names that share instances>}
"""
qs = self.all().prefetch_related('rampart_groups')
if qs is None:
qs = self.all().prefetch_related('rampart_groups')
instance_ig_mapping = {}
ig_instance_mapping = {}
# Create simple dictionary of instance IG memberships
for instance in qs.all():
# Create dictionaries that represent basic m2m memberships
for instance in qs:
if instance.capacity == 0:
continue
instance_ig_mapping[instance.hostname] = set()
instance_ig_mapping[instance.hostname] = set(
group.name for group in instance.rampart_groups.all()
)
for group in instance.rampart_groups.all():
ig_instance_mapping.setdefault(group.name, set())
ig_instance_mapping[group.name].add(instance.hostname)
instance_ig_mapping[instance.hostname].add(group.name)
# Create IG mapping by union of all groups their instances are members of
ig_ig_mapping = {}
for group_name in ig_instance_mapping.keys():
ig_ig_set = set()
for instance_hostname in ig_instance_mapping[group_name]:
ig_ig_set |= instance_ig_mapping[instance_hostname]
ig_ig_mapping[group_name] = ig_ig_set
# Get IG capacity overlap mapping
ig_ig_mapping = get_ig_ig_mapping(ig_instance_mapping, instance_ig_mapping)
return instance_ig_mapping, ig_ig_mapping
class InstanceGroupManager(models.Manager):
    """A custom manager class for the InstanceGroup model.

    Used for global capacity calculations.
    """

    def capacity_mapping(self, qs=None):
        """Build the mutual-membership mappings used for capacity accounting.

        Counterpart to the Instance manager method by the same name, but
        iterating groups instead of instances.

        Returns a tuple of two dicts:
        instance_ig_mapping: {'instance_name': <set of group names instance is member of>}
        ig_ig_mapping: {'group_name': <set of group names that share instances>}
        """
        if qs is None:
            qs = self.all().prefetch_related('instances')
        instance_ig_mapping = {}
        ig_instance_mapping = {}
        # Create dictionaries that represent basic m2m memberships;
        # instances with zero capacity are excluded from accounting.
        for group in qs:
            ig_instance_mapping[group.name] = set(
                instance.hostname for instance in group.instances.all() if
                instance.capacity != 0
            )
            for inst in group.instances.all():
                if inst.capacity == 0:
                    continue
                instance_ig_mapping.setdefault(inst.hostname, set())
                instance_ig_mapping[inst.hostname].add(group.name)
        # Get IG capacity overlap mapping
        ig_ig_mapping = get_ig_ig_mapping(ig_instance_mapping, instance_ig_mapping)
        return instance_ig_mapping, ig_ig_mapping

    def capacity_values(self, qs=None, tasks=None, breakdown=False, graph=None):
        """Return a dictionary of consumed-capacity values for all IGs.

        qs: optional InstanceGroup queryset to restrict the computation.
        tasks: optional iterable of jobs; defaults to all running/waiting jobs.
        breakdown: when True, also track 'committed_capacity' (waiting) and
            'running_capacity' separately from the combined 'consumed_capacity'.
        graph: optional pre-built {group_name: {...}} dict to populate
            (the task manager passes its own graph here).
        """
        if qs is None:
            qs = self.all().prefetch_related('instances')
        instance_ig_mapping, ig_ig_mapping = self.capacity_mapping(qs=qs)
        if tasks is None:
            # Resolve UnifiedJob through the reverse relation, presumably to
            # avoid a circular import — TODO confirm.
            tasks = self.model.unifiedjob_set.related.related_model.objects.filter(
                status__in=('running', 'waiting'))
        if graph is None:
            graph = {group.name: {} for group in qs}
        # Zero out counters before accumulating.
        for group_name in graph:
            graph[group_name]['consumed_capacity'] = 0
            if breakdown:
                graph[group_name]['committed_capacity'] = 0
                graph[group_name]['running_capacity'] = 0
        for t in tasks:
            # TODO: dock capacity for isolated job management tasks running in queue
            impact = t.task_impact
            if t.status == 'waiting' or not t.execution_node:
                # Subtract capacity from any peer groups that share instances
                for group_name in ig_ig_mapping[t.instance_group.name]:
                    graph[group_name]['consumed_capacity'] += impact
                    if breakdown:
                        graph[group_name]['committed_capacity'] += impact
            elif t.status == 'running':
                # Subtract capacity from all groups that contain the instance
                for group_name in instance_ig_mapping[t.execution_node]:
                    graph[group_name]['consumed_capacity'] += impact
                    if breakdown:
                        graph[group_name]['running_capacity'] += impact
            else:
                logger.error('Programming error, %s not in ["running", "waiting"]', t.log_format)
        return graph

View File

@@ -11,7 +11,7 @@ from django.utils.timezone import now, timedelta
from solo.models import SingletonModel
from awx.api.versioning import reverse
from awx.main.managers import InstanceManager
from awx.main.managers import InstanceManager, InstanceGroupManager
from awx.main.models.inventory import InventoryUpdate
from awx.main.models.jobs import Job
from awx.main.models.projects import ProjectUpdate
@@ -66,6 +66,8 @@ class Instance(models.Model):
class InstanceGroup(models.Model):
"""A model representing a Queue/Group of AWX Instances."""
objects = InstanceGroupManager()
name = models.CharField(max_length=250, unique=True)
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
@@ -89,12 +91,7 @@ class InstanceGroup(models.Model):
@property
def capacity(self):
return sum([x[0] for x in self.instances.values_list('capacity')])
@property
def consumed_capacity(self):
return sum(x.task_impact for x in UnifiedJob.objects.filter(instance_group=self,
status__in=('running', 'waiting')))
return sum([inst.capacity for inst in self.instances.all()])
class Meta:
app_label = 'main'

View File

@@ -41,7 +41,7 @@ class TaskManager():
for rampart_group in InstanceGroup.objects.all():
self.graph[rampart_group.name] = dict(graph=DependencyGraph(rampart_group.name),
capacity_total=rampart_group.capacity,
capacity_used=0)
consumed_capacity=0)
def is_job_blocked(self, task):
# TODO: I'm not happy with this, I think blocking behavior should be decided outside of the dependency graph
@@ -492,25 +492,11 @@ class TaskManager():
self.fail_jobs_if_not_in_celery(node_jobs, active_tasks, celery_task_start_time)
def calculate_capacity_used(self, tasks):
for rampart_group in self.graph:
self.graph[rampart_group]['capacity_used'] = 0
instance_ig_mapping, ig_ig_mapping = Instance.objects.capacity_mapping()
for t in tasks:
# TODO: dock capacity for isolated job management tasks running in queue
if t.status == 'waiting':
# Subtract capacity from any peer groups that share instances
for instance_group_name in ig_ig_mapping[t.instance_group.name]:
self.graph[instance_group_name]['capacity_used'] += t.task_impact
elif t.status == 'running':
# Subtract capacity from all groups that contain the instance
for instance_group_name in instance_ig_mapping[t.execution_node]:
self.graph[instance_group_name]['capacity_used'] += t.task_impact
else:
logger.error('Programming error, %s not in ["running", "waiting"]', t.log_format)
def calculate_capacity_consumed(self, tasks):
self.graph = InstanceGroup.objects.capacity_values(tasks=tasks, graph=self.graph)
def would_exceed_capacity(self, task, instance_group):
current_capacity = self.graph[instance_group]['capacity_used']
current_capacity = self.graph[instance_group]['consumed_capacity']
capacity_total = self.graph[instance_group]['capacity_total']
if current_capacity == 0:
return False
@@ -519,16 +505,16 @@ class TaskManager():
def consume_capacity(self, task, instance_group):
logger.debug('%s consumed %s capacity units from %s with prior total of %s',
task.log_format, task.task_impact, instance_group,
self.graph[instance_group]['capacity_used'])
self.graph[instance_group]['capacity_used'] += task.task_impact
self.graph[instance_group]['consumed_capacity'])
self.graph[instance_group]['consumed_capacity'] += task.task_impact
def get_remaining_capacity(self, instance_group):
return (self.graph[instance_group]['capacity_total'] - self.graph[instance_group]['capacity_used'])
return (self.graph[instance_group]['capacity_total'] - self.graph[instance_group]['consumed_capacity'])
def process_tasks(self, all_sorted_tasks):
running_tasks = filter(lambda t: t.status in ['waiting', 'running'], all_sorted_tasks)
self.calculate_capacity_used(running_tasks)
self.calculate_capacity_consumed(running_tasks)
self.process_running_tasks(running_tasks)

View File

@@ -3,7 +3,7 @@
import logging
# Celery
from celery import task
from celery import Task, task
# AWX
from awx.main.scheduler import TaskManager
@@ -15,6 +15,12 @@ logger = logging.getLogger('awx.main.scheduler')
# updated model, the call to schedule() may get stale data.
class LogErrorsTask(Task):
    """Celery base task that logs a full traceback whenever a task fails."""

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        # Record the exception with traceback, then defer to the default
        # celery failure handling.
        logger.exception('Task {} encountered exception.'.format(self.name), exc_info=exc)
        super(LogErrorsTask, self).on_failure(exc, task_id, args, kwargs, einfo)
@task
def run_job_launch(job_id):
    # A job launch triggers a full task-manager scheduling pass.
    TaskManager().schedule()
@@ -25,7 +31,7 @@ def run_job_complete(job_id):
TaskManager().schedule()
@task
@task(base=LogErrorsTask)
def run_task_manager():
logger.debug("Running Tower task manager.")
TaskManager().schedule()

View File

@@ -0,0 +1,69 @@
import pytest
import mock
from django.test import TransactionTestCase
from awx.main.models import (
Job,
Instance,
InstanceGroup,
)
@pytest.mark.django_db
class TestCapacityMapping(TransactionTestCase):
    """Tests for global instance-group capacity mapping and accounting."""

    def sample_cluster(self):
        # Topology: i2 is shared between ig_large and tower, so those two
        # groups overlap for capacity-accounting purposes; ig_small is isolated.
        ig_small = InstanceGroup.objects.create(name='ig_small')
        ig_large = InstanceGroup.objects.create(name='ig_large')
        tower = InstanceGroup.objects.create(name='tower')
        i1 = Instance.objects.create(hostname='i1', capacity=200)
        i2 = Instance.objects.create(hostname='i2', capacity=200)
        i3 = Instance.objects.create(hostname='i3', capacity=200)
        ig_small.instances.add(i1)
        ig_large.instances.add(i2, i3)
        tower.instances.add(i2)
        return [tower, ig_large, ig_small]

    def test_mapping(self):
        self.sample_cluster()
        # The whole mapping must be computed in a constant number of queries.
        with self.assertNumQueries(2):
            inst_map, ig_map = Instance.objects.capacity_mapping()
        assert inst_map['i1'] == set(['ig_small'])
        assert inst_map['i2'] == set(['ig_large', 'tower'])
        assert ig_map['ig_small'] == set(['ig_small'])
        assert ig_map['ig_large'] == set(['ig_large', 'tower'])
        assert ig_map['tower'] == set(['ig_large', 'tower'])

    def test_committed_capacity(self):
        tower, ig_large, ig_small = self.sample_cluster()
        tasks = [
            Job(status='waiting', instance_group=tower),
            Job(status='waiting', instance_group=ig_large),
            Job(status='waiting', instance_group=ig_small)
        ]
        with mock.patch.object(Job, 'task_impact', new=mock.PropertyMock(return_value=43)):
            capacities = InstanceGroup.objects.capacity_values(
                tasks=tasks, breakdown=True
            )
        # Jobs submitted to either tower or ig_large must count toward both,
        # because they share instance i2.
        assert capacities['tower']['committed_capacity'] == 43 * 2
        assert capacities['ig_large']['committed_capacity'] == 43 * 2
        assert capacities['ig_small']['committed_capacity'] == 43

    def test_running_capacity(self):
        tower, ig_large, ig_small = self.sample_cluster()
        tasks = [
            Job(status='running', execution_node='i1'),
            Job(status='running', execution_node='i2'),
            Job(status='running', execution_node='i3')
        ]
        with mock.patch.object(Job, 'task_impact', new=mock.PropertyMock(return_value=43)):
            capacities = InstanceGroup.objects.capacity_values(
                tasks=tasks, breakdown=True
            )
        # Tower is only given 1 instance
        assert capacities['tower']['running_capacity'] == 43
        # Large IG has 2 instances
        assert capacities['ig_large']['running_capacity'] == 43 * 2
        assert capacities['ig_small']['running_capacity'] == 43

View File

@@ -4,43 +4,15 @@ from datetime import timedelta, datetime
from django.core.cache import cache
from django.utils.timezone import now as tz_now
from django.test import TransactionTestCase
from awx.main.scheduler import TaskManager
from awx.main.models import (
Job,
Instance,
InstanceGroup,
WorkflowJob,
)
@pytest.mark.django_db
class TestCapacityMapping(TransactionTestCase):
def sample_cluster(self):
ig_small = InstanceGroup.objects.create(name='ig_small')
ig_large = InstanceGroup.objects.create(name='ig_large')
tower = InstanceGroup.objects.create(name='tower')
i1 = Instance.objects.create(hostname='i1', capacity=200)
i2 = Instance.objects.create(hostname='i2', capacity=200)
i3 = Instance.objects.create(hostname='i3', capacity=200)
ig_small.instances.add(i1)
ig_large.instances.add(i2, i3)
tower.instances.add(i2)
def test_something(self):
self.sample_cluster()
with self.assertNumQueries(2):
inst_map, ig_map = Instance.objects.capacity_mapping()
assert inst_map['i1'] == set(['ig_small'])
assert inst_map['i2'] == set(['ig_large', 'tower'])
assert ig_map['ig_small'] == set(['ig_small'])
assert ig_map['ig_large'] == set(['ig_large', 'tower'])
assert ig_map['tower'] == set(['ig_large', 'tower'])
@pytest.mark.django_db
def test_single_job_scheduler_launch(default_instance_group, job_template_factory, mocker):
objects = job_template_factory('jt', organization='org1', project='proj',