Merge pull request #3786 from chrismeyersfsu/improvement-inventory_update

New Task Manager
This commit is contained in:
Chris Meyers 2016-11-01 14:19:31 -05:00 committed by GitHub
commit 6cc2dec7c7
24 changed files with 1576 additions and 473 deletions

View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('main', '0046_v310_job_event_stdout'),
]
operations = [
migrations.CreateModel(
name='TowerScheduleState',
fields=[
('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)),
('schedule_last_run', models.DateTimeField(auto_now_add=True)),
],
options={
'abstract': False,
},
),
]

View File

@ -4,7 +4,6 @@
# Python
import datetime
import hmac
import json
import logging
from urlparse import urljoin
@ -24,7 +23,6 @@ from jsonfield import JSONField
# AWX
from awx.main.models.base import * # noqa
from awx.main.models.unified_jobs import * # noqa
from awx.main.utils import decrypt_field
from awx.main.models.notifications import JobNotificationMixin
logger = logging.getLogger('awx.main.models.ad_hoc_commands')
@ -181,13 +179,6 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
def get_passwords_needed_to_start(self):
return self.passwords_needed_to_start
def is_blocked_by(self, obj):
from awx.main.models import InventoryUpdate
if type(obj) == InventoryUpdate:
if self.inventory == obj.inventory_source.inventory:
return True
return False
@property
def task_impact(self):
# NOTE: We sorta have to assume the host count matches and that forks default to 5
@ -195,35 +186,6 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
count_hosts = Host.objects.filter( enabled=True, inventory__ad_hoc_commands__pk=self.pk).count()
return min(count_hosts, 5 if self.forks == 0 else self.forks) * 10
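A quick worked example of the impact formula above (host count and fork values are hypothetical):
count_hosts = 12
forks = 0                                                      # 0 means "use the default of 5"
impact = min(count_hosts, 5 if forks == 0 else forks) * 10    # min(12, 5) * 10 == 50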
def generate_dependencies(self, active_tasks):
from awx.main.models import InventoryUpdate
if not self.inventory:
return []
inventory_sources = self.inventory.inventory_sources.filter( update_on_launch=True)
inventory_sources_found = []
dependencies = []
for obj in active_tasks:
if type(obj) == InventoryUpdate:
if obj.inventory_source in inventory_sources:
inventory_sources_found.append(obj.inventory_source)
# Skip updating any inventory sources that were already updated before
# running this job (via callback inventory refresh).
try:
start_args = json.loads(decrypt_field(self, 'start_args'))
except Exception:
start_args = None
start_args = start_args or {}
inventory_sources_already_updated = start_args.get('inventory_sources_already_updated', [])
if inventory_sources_already_updated:
for source in inventory_sources.filter(pk__in=inventory_sources_already_updated):
if source not in inventory_sources_found:
inventory_sources_found.append(source)
if inventory_sources.count(): # and not has_setup_failures? Probably handled as an error scenario in the task runner
for source in inventory_sources:
if source not in inventory_sources_found and source.needs_update_on_launch:
dependencies.append(source.create_inventory_update(launch_type='dependency'))
return dependencies
def copy(self):
data = {}
for field in ('job_type', 'inventory_id', 'limit', 'credential_id',

View File

@ -5,13 +5,15 @@ from django.db import models
from django.db.models.signals import post_save
from django.dispatch import receiver
from solo.models import SingletonModel
from awx.main.managers import InstanceManager
from awx.main.models.inventory import InventoryUpdate
from awx.main.models.jobs import Job
from awx.main.models.projects import ProjectUpdate
from awx.main.models.unified_jobs import UnifiedJob
__all__ = ('Instance', 'JobOrigin')
__all__ = ('Instance', 'JobOrigin', 'TowerScheduleState',)
class Instance(models.Model):
@ -33,6 +35,8 @@ class Instance(models.Model):
# NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing
return "tower"
class TowerScheduleState(SingletonModel):
schedule_last_run = models.DateTimeField(auto_now_add=True)
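TowerScheduleState is a django-solo singleton that replaces the on-disk schedule metadata file (see the tasks.py hunk below). A minimal usage sketch, assuming the model is re-exported through awx.main.models as the __all__ change implies:
from django.utils.timezone import now
from awx.main.models import TowerScheduleState

state = TowerScheduleState.get_solo()    # django-solo creates the single row on first access
last_run = state.schedule_last_run       # when the periodic scheduler last ran
state.schedule_last_run = now()
state.save()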
class JobOrigin(models.Model):
"""A model representing the relationship between a unified job and

View File

@ -22,7 +22,6 @@ from awx.main.constants import CLOUD_PROVIDERS
from awx.main.fields import AutoOneToOneField, ImplicitRoleField
from awx.main.managers import HostManager
from awx.main.models.base import * # noqa
from awx.main.models.jobs import Job
from awx.main.models.unified_jobs import * # noqa
from awx.main.models.mixins import ResourceMixin
from awx.main.models.notifications import (
@ -1089,7 +1088,7 @@ class InventorySource(UnifiedJobTemplate, InventorySourceOptions):
def _get_unified_job_field_names(cls):
return ['name', 'description', 'source', 'source_path', 'source_script', 'source_vars', 'schedule',
'credential', 'source_regions', 'instance_filters', 'group_by', 'overwrite', 'overwrite_vars',
'timeout']
'timeout', 'launch_type',]
def save(self, *args, **kwargs):
# If update_fields has been specified, add our field names to it,
@ -1250,15 +1249,6 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin):
def get_ui_url(self):
return urljoin(settings.TOWER_URL_BASE, "/#/inventory_sync/{}".format(self.pk))
def is_blocked_by(self, obj):
if type(obj) == InventoryUpdate:
if self.inventory_source.inventory == obj.inventory_source.inventory:
return True
if type(obj) == Job:
if self.inventory_source.inventory == obj.inventory:
return True
return False
@property
def task_impact(self):
return 50

View File

@ -33,7 +33,7 @@ from awx.main.models.notifications import (
NotificationTemplate,
JobNotificationMixin,
)
from awx.main.utils import decrypt_field, ignore_inventory_computed_fields
from awx.main.utils import ignore_inventory_computed_fields
from awx.main.redact import PlainTextCleaner
from awx.main.fields import ImplicitRoleField
from awx.main.models.mixins import ResourceMixin
@ -646,29 +646,6 @@ class Job(UnifiedJob, JobOptions, JobNotificationMixin):
kwargs['job_host_summaries__job__pk'] = self.pk
return Host.objects.filter(**kwargs)
def is_blocked_by(self, obj):
from awx.main.models import InventoryUpdate, ProjectUpdate
if type(obj) == Job:
if obj.job_template is not None and obj.inventory is not None:
if obj.job_template == self.job_template and \
obj.inventory == self.inventory:
if self.allow_simultaneous:
return False
if obj.launch_type == 'callback' and self.launch_type == 'callback' and \
obj.limit != self.limit:
return False
return True
return False
if type(obj) == InventoryUpdate:
if self.inventory == obj.inventory_source.inventory:
return True
return False
if type(obj) == ProjectUpdate:
if obj.project == self.project:
return True
return False
return False
@property
def task_impact(self):
# NOTE: We sorta have to assume the host count matches and that forks default to 5
@ -707,39 +684,6 @@ class Job(UnifiedJob, JobOptions, JobNotificationMixin):
def processed_hosts(self):
return self._get_hosts(job_host_summaries__processed__gt=0)
def generate_dependencies(self, active_tasks):
from awx.main.models import InventoryUpdate, ProjectUpdate
inventory_sources = self.inventory.inventory_sources.filter(update_on_launch=True)
project_found = False
inventory_sources_found = []
dependencies = []
for obj in active_tasks:
if type(obj) == ProjectUpdate and self.project is not None:
if obj.project == self.project:
project_found = True
if type(obj) == InventoryUpdate:
if obj.inventory_source in inventory_sources:
inventory_sources_found.append(obj.inventory_source)
# Skip updating any inventory sources that were already updated before
# running this job (via callback inventory refresh).
try:
start_args = json.loads(decrypt_field(self, 'start_args'))
except Exception:
start_args = None
start_args = start_args or {}
inventory_sources_already_updated = start_args.get('inventory_sources_already_updated', [])
if inventory_sources_already_updated:
for source in inventory_sources.filter(pk__in=inventory_sources_already_updated):
if source not in inventory_sources_found:
inventory_sources_found.append(source)
if not project_found and self.project is not None and self.project.needs_update_on_launch:
dependencies.append(self.project.create_project_update(launch_type='dependency'))
if inventory_sources.count(): # and not has_setup_failures? Probably handled as an error scenario in the task runner
for source in inventory_sources:
if source not in inventory_sources_found and source.needs_update_on_launch:
dependencies.append(source.create_inventory_update(launch_type='dependency'))
return dependencies
def notification_data(self, block=5):
data = super(Job, self).notification_data()
all_hosts = {}
@ -1526,9 +1470,6 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin):
def get_ui_url(self):
return urljoin(settings.TOWER_URL_BASE, "/#/management_jobs/{}".format(self.pk))
def is_blocked_by(self, obj):
return True
def handle_extra_data(self, extra_data):
extra_vars = {}
if isinstance(extra_data, dict):

View File

@ -22,7 +22,6 @@ from django.utils.timezone import now, make_aware, get_default_timezone
# AWX
from awx.main.models.base import * # noqa
from awx.main.models.jobs import Job
from awx.main.models.notifications import (
NotificationTemplate,
JobNotificationMixin,
@ -275,7 +274,7 @@ class Project(UnifiedJobTemplate, ProjectOptions, ResourceMixin):
def _get_unified_job_field_names(cls):
return ['name', 'description', 'local_path', 'scm_type', 'scm_url',
'scm_branch', 'scm_clean', 'scm_delete_on_update',
'credential', 'schedule', 'timeout']
'credential', 'schedule', 'timeout', 'launch_type',]
def save(self, *args, **kwargs):
new_instance = not bool(self.pk)
@ -424,15 +423,6 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin):
from awx.main.tasks import RunProjectUpdate
return RunProjectUpdate
def is_blocked_by(self, obj):
if type(obj) == ProjectUpdate:
if self.project == obj.project:
return True
if type(obj) == Job:
if self.project == obj.project:
return True
return False
def websocket_emit_data(self):
return dict(project_id=self.project.id)

View File

@ -13,7 +13,7 @@ from StringIO import StringIO
# Django
from django.conf import settings
from django.db import models
from django.db import models, connection
from django.core.exceptions import NON_FIELD_ERRORS
from django.utils.translation import ugettext_lazy as _
from django.utils.timezone import now
@ -778,10 +778,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
def task_impact(self):
raise NotImplementedError # Implement in subclass.
def is_blocked_by(self, task_object):
''' Given another task object determine if this task would be blocked by it '''
raise NotImplementedError # Implement in subclass.
def websocket_emit_data(self):
''' Return extra data that should be included when submitting data to the browser over the websocket connection '''
return {}
@ -792,11 +788,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
status_data['group_name'] = 'jobs'
emit_channel_notification('jobs-status_changed', status_data)
def generate_dependencies(self, active_tasks):
''' Generate any tasks that the current task might be dependent on given a list of active
tasks that might preclude creating one'''
return []
def notification_data(self):
return dict(id=self.id,
name=self.name,
@ -835,14 +826,17 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
return (True, opts)
def start_celery_task(self, opts, error_callback, success_callback):
task_class = self._get_task_class()
task_class().apply_async((self.pk,), opts, link_error=error_callback, link=success_callback)
def start(self, error_callback, success_callback, **kwargs):
'''
Start the task running via Celery.
'''
task_class = self._get_task_class()
(res, opts) = self.pre_start(**kwargs)
if res:
task_class().apply_async((self.pk,), opts, link_error=error_callback, link=success_callback)
self.start_celery_task(opts, error_callback, success_callback)
return res
def signal_start(self, **kwargs):
@ -871,7 +865,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
self.websocket_emit_status("pending")
from awx.main.scheduler.tasks import run_job_launch
run_job_launch.delay(self.id)
connection.on_commit(lambda: run_job_launch.delay(self.id))
# Each type of unified job has a different Task class; get the
# appropriate one.
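The switch to connection.on_commit above addresses the race that the old scheduler task worked around by polling for the job row (see the removed retry loop in the scheduler's tasks module further down): the Celery message is now sent only after the transaction that created the job has committed. A minimal sketch of the ordering guarantee, not tied to any particular model:
from django.db import connection, transaction

log = []
with transaction.atomic():
    log.append('inside transaction')
    connection.on_commit(lambda: log.append('after commit'))
    log.append('still inside transaction')
# Once the block exits and the transaction commits, the callback runs:
# log == ['inside transaction', 'still inside transaction', 'after commit']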

View File

@ -393,11 +393,6 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow
def _get_parent_field_name(cls):
return 'workflow_job_template'
@classmethod
def _get_task_class(cls):
from awx.main.tasks import RunWorkflowJob
return RunWorkflowJob
def _has_failed(self):
return False
@ -411,9 +406,6 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow
#def get_ui_url(self):
# return urlparse.urljoin(tower_settings.TOWER_URL_BASE, "/#/workflow_jobs/{}".format(self.pk))
def is_blocked_by(self, obj):
return True
@property
def task_impact(self):
return 0
@ -426,11 +418,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow
def get_notification_friendly_name(self):
return "Workflow Job"
def start(self, *args, **kwargs):
(res, opts) = self.pre_start(**kwargs)
if res:
self.status = 'running'
self.save()
self.websocket_emit_status("running")
return res
'''
A WorkflowJob is a virtual job. It doesn't result in a celery task.
'''
def start_celery_task(self, opts, error_callback, success_callback):
return None
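Because start_celery_task() is now a no-op for workflow jobs, "starting" one only flips its status; the task manager later spawns the real jobs for each runnable node (see spawn_workflow_graph_jobs in the scheduler changes below). A rough sketch of that flow, with some_id as a placeholder:
some_id = 42                                    # placeholder primary key
wf_job = WorkflowJob.objects.get(pk=some_id)
wf_job.status = 'running'
wf_job.save()
# No Celery task is dispatched for the workflow job itself; the scheduler's
# spawn_workflow_graph_jobs([wf_job]) creates and signals the node jobs instead.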

View File

@ -2,256 +2,356 @@
# All Rights Reserved
# Python
import datetime
from datetime import timedelta
import logging
from sets import Set
# Django
from django.conf import settings
from django.db import transaction
from django.db import transaction, connection
from django.db.utils import DatabaseError
# AWX
from awx.main.models import * # noqa
from awx.main.utils import get_system_task_capacity
from awx.main.scheduler.dag_simple import SimpleDAG
#from awx.main.scheduler.dag_simple import SimpleDAG
from awx.main.scheduler.dag_workflow import WorkflowDAG
from awx.main.scheduler.dependency_graph import DependencyGraph
from awx.main.scheduler.partial import (
JobDict,
ProjectUpdateDict,
ProjectUpdateLatestDict,
InventoryUpdateDict,
InventoryUpdateLatestDict,
InventorySourceDict,
SystemJobDict,
AdHocCommandDict,
WorkflowJobDict,
)
# Celery
from celery.task.control import inspect
logger = logging.getLogger('awx.main.scheduler')
def get_tasks():
"""Fetch all Tower tasks that are relevant to the task management
system.
"""
RELEVANT_JOBS = ('pending', 'waiting', 'running')
# TODO: Replace this when we can grab all objects in a sane way.
graph_jobs = [j for j in Job.objects.filter(status__in=RELEVANT_JOBS)]
graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(status__in=RELEVANT_JOBS)]
graph_inventory_updates = [iu for iu in
InventoryUpdate.objects.filter(status__in=RELEVANT_JOBS)]
graph_project_updates = [pu for pu in
ProjectUpdate.objects.filter(status__in=RELEVANT_JOBS)]
graph_system_jobs = [sj for sj in
SystemJob.objects.filter(status__in=RELEVANT_JOBS)]
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status__in=RELEVANT_JOBS)]
all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates +
graph_project_updates + graph_system_jobs +
graph_workflow_jobs,
key=lambda task: task.created)
return all_actions
class TaskManager():
def __init__(self):
self.graph = DependencyGraph()
self.capacity_total = 200
self.capacity_used = 0
def get_running_workflow_jobs():
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status='running')]
return graph_workflow_jobs
def get_tasks(self):
status_list = ('pending', 'waiting', 'running')
def spawn_workflow_graph_jobs(workflow_jobs):
# TODO: Consider using transaction.atomic
for workflow_job in workflow_jobs:
dag = WorkflowDAG(workflow_job)
spawn_nodes = dag.bfs_nodes_to_run()
for spawn_node in spawn_nodes:
kv = spawn_node.get_job_kwargs()
job = spawn_node.unified_job_template.create_unified_job(**kv)
spawn_node.job = job
spawn_node.save()
can_start = job.signal_start(**kv)
if not can_start:
job.status = 'failed'
job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials"
job.save(update_fields=['status', 'job_explanation'])
job.websocket_emit_status("failed")
jobs = JobDict.filter_partial(status=status_list)
inventory_updates = InventoryUpdateDict.filter_partial(status=status_list)
project_updates = ProjectUpdateDict.filter_partial(status=status_list)
system_jobs = SystemJobDict.filter_partial(status=status_list)
ad_hoc_commands = AdHocCommandDict.filter_partial(status=status_list)
workflow_jobs = WorkflowJobDict.filter_partial(status=status_list)
# TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ?
#emit_websocket_notification('/socket.io/jobs', '', dict(id=))
all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands + workflow_jobs,
key=lambda task: task['created'])
return all_actions
# See comment in tasks.py::RunWorkflowJob::run()
def process_finished_workflow_jobs(workflow_jobs):
for workflow_job in workflow_jobs:
dag = WorkflowDAG(workflow_job)
if dag.is_workflow_done():
with transaction.atomic():
'''
Tasks that are running and SHOULD have a celery task.
'''
def get_running_tasks(self):
status_list = ('running',)
jobs = JobDict.filter_partial(status=status_list)
inventory_updates = InventoryUpdateDict.filter_partial(status=status_list)
project_updates = ProjectUpdateDict.filter_partial(status=status_list)
system_jobs = SystemJobDict.filter_partial(status=status_list)
ad_hoc_commands = AdHocCommandDict.filter_partial(status=status_list)
all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands,
key=lambda task: task['created'])
return all_actions
# TODO: Consider a database query for this logic
def get_latest_project_update_tasks(self, all_sorted_tasks):
project_ids = Set()
for task in all_sorted_tasks:
if type(task) == JobDict:
project_ids.add(task['project_id'])
return ProjectUpdateLatestDict.filter_partial(list(project_ids))
# TODO: Consider a database query for this logic
def get_latest_inventory_update_tasks(self, all_sorted_tasks):
inventory_ids = Set()
for task in all_sorted_tasks:
if type(task) == JobDict:
inventory_ids.add(task['inventory_id'])
return InventoryUpdateLatestDict.filter_partial(list(inventory_ids))
def get_running_workflow_jobs(self):
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status='running')]
return graph_workflow_jobs
# TODO: Consider a database query for this logic
def get_inventory_source_tasks(self, all_sorted_tasks):
inventory_ids = Set()
results = []
for task in all_sorted_tasks:
if type(task) is JobDict:
inventory_ids.add(task['inventory_id'])
for inventory_id in inventory_ids:
results.append((inventory_id, InventorySourceDict.filter_partial(inventory_id)))
return results
def spawn_workflow_graph_jobs(self, workflow_jobs):
for workflow_job in workflow_jobs:
dag = WorkflowDAG(workflow_job)
spawn_nodes = dag.bfs_nodes_to_run()
for spawn_node in spawn_nodes:
kv = spawn_node.get_job_kwargs()
job = spawn_node.unified_job_template.create_unified_job(**kv)
spawn_node.job = job
spawn_node.save()
can_start = job.signal_start(**kv)
if not can_start:
job.status = 'failed'
job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials"
job.save(update_fields=['status', 'job_explanation'])
connection.on_commit(lambda: job.websocket_emit_status('failed'))
# TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ?
#emit_websocket_notification('/socket.io/jobs', '', dict(id=))
# See comment in tasks.py::RunWorkflowJob::run()
def process_finished_workflow_jobs(self, workflow_jobs):
for workflow_job in workflow_jobs:
dag = WorkflowDAG(workflow_job)
if dag.is_workflow_done():
if workflow_job._has_failed():
workflow_job.status = 'failed'
else:
workflow_job.status = 'successful'
workflow_job.save()
workflow_job.websocket_emit_status(workflow_job.status)
connection.on_commit(lambda: workflow_job.websocket_emit_status(workflow_job.status))
def rebuild_graph():
"""Regenerate the task graph by refreshing known tasks from Tower, purging
orphaned running tasks, and creating dependencies for new tasks before
generating directed edge relationships between those tasks.
"""
'''
# Sanity check: Only do this on the primary node.
if Instance.objects.my_role() == 'secondary':
return None
'''
def get_active_tasks(self):
inspector = inspect()
if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'):
active_task_queues = inspector.active()
else:
logger.warn("Ignoring celery task inspector")
active_task_queues = None
inspector = inspect()
if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'):
active_task_queues = inspector.active()
else:
logger.warn("Ignoring celery task inspector")
active_task_queues = None
active_tasks = set()
if active_task_queues is not None:
for queue in active_task_queues:
map(lambda at: active_tasks.add(at['id']), active_task_queues[queue])
else:
if not hasattr(settings, 'CELERY_UNIT_TEST'):
return None
all_sorted_tasks = get_tasks()
if not len(all_sorted_tasks):
return None
return active_tasks
active_tasks = []
if active_task_queues is not None:
for queue in active_task_queues:
active_tasks += [at['id'] for at in active_task_queues[queue]]
else:
logger.error("Could not communicate with celery!")
# TODO: Something needs to be done here to signal to the system
# as a whole that celery appears to be down.
if not hasattr(settings, 'CELERY_UNIT_TEST'):
return None
def start_task(self, task, dependent_tasks=[]):
from awx.main.tasks import handle_work_error, handle_work_success
running_tasks = filter(lambda t: t.status == 'running', all_sorted_tasks)
running_celery_tasks = filter(lambda t: type(t) != WorkflowJob, running_tasks)
waiting_tasks = filter(lambda t: t.status != 'running', all_sorted_tasks)
new_tasks = filter(lambda t: t.status == 'pending', all_sorted_tasks)
task_actual = {
'type':task.get_job_type_str(),
'id': task['id'],
}
dependencies = [{'type': t.get_job_type_str(), 'id': t['id']} for t in dependent_tasks]
error_handler = handle_work_error.s(subtasks=[task_actual] + dependencies)
success_handler = handle_work_success.s(task_actual=task_actual)
job_obj = task.get_full()
job_obj.status = 'waiting'
# Check running tasks and make sure they are active in celery
logger.debug("Active celery tasks: " + str(active_tasks))
for task in list(running_celery_tasks):
if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
# NOTE: Pull status again and make sure it didn't finish in
# the meantime?
task.status = 'failed'
task.job_explanation += ' '.join((
'Task was marked as running in Tower but was not present in',
'Celery, so it has been marked as failed.',
))
task.save()
task.websocket_emit_status("failed")
running_tasks.pop(running_tasks.index(task))
logger.error("Task %s appears orphaned... marking as failed" % task)
(start_status, opts) = job_obj.pre_start()
if not start_status:
job_obj.status = 'failed'
if job_obj.job_explanation:
job_obj.job_explanation += ' '
job_obj.job_explanation += 'Task failed pre-start check.'
job_obj.save()
# TODO: run error handler to fail sub-tasks and send notifications
else:
if type(job_obj) is WorkflowJob:
job_obj.status = 'running'
# Create and process dependencies for new tasks
for task in new_tasks:
logger.debug("Checking dependencies for: %s" % str(task))
try:
task_dependencies = task.generate_dependencies(running_tasks + waiting_tasks)
except Exception, e:
logger.error("Failed processing dependencies for {}: {}".format(task, e))
task.status = 'failed'
task.job_explanation += 'Task failed to generate dependencies: {}'.format(e)
task.save()
task.websocket_emit_status("failed")
continue
logger.debug("New dependencies: %s" % str(task_dependencies))
for dep in task_dependencies:
# We recalculate the created time for the moment to ensure the
# dependencies are always sorted in the right order relative to
# the dependent task.
time_delt = len(task_dependencies) - task_dependencies.index(dep)
dep.created = task.created - datetime.timedelta(seconds=1 + time_delt)
dep.status = 'waiting'
dep.save()
waiting_tasks.insert(waiting_tasks.index(task), dep)
if not hasattr(settings, 'UNIT_TEST_IGNORE_TASK_WAIT'):
task.status = 'waiting'
task.save()
job_obj.save()
# Rebuild graph
graph = SimpleDAG()
for task in running_tasks:
graph.add_node(task)
for wait_task in waiting_tasks[:50]:
node_dependencies = []
for node in graph:
if wait_task.is_blocked_by(node['node_object']):
node_dependencies.append(node['node_object'])
graph.add_node(wait_task)
for dependency in node_dependencies:
graph.add_edge(wait_task, dependency)
if settings.DEBUG:
graph.generate_graphviz_plot()
return graph
self.consume_capacity(task)
def process_graph(graph, task_capacity):
"""Given a task dependency graph, start and manage tasks given their
priority and weight.
"""
from awx.main.tasks import handle_work_error, handle_work_success
def post_commit():
job_obj.websocket_emit_status(job_obj.status)
if job_obj.status != 'failed':
job_obj.start_celery_task(opts, error_callback=error_handler, success_callback=success_handler)
connection.on_commit(post_commit)
leaf_nodes = graph.get_leaf_nodes()
running_nodes = filter(lambda x: x['node_object'].status == 'running', leaf_nodes)
running_impact = sum([t['node_object'].task_impact for t in running_nodes])
ready_nodes = filter(lambda x: x['node_object'].status != 'running', leaf_nodes)
remaining_volume = task_capacity - running_impact
logger.info('Running Nodes: %s; Capacity: %s; Running Impact: %s; '
'Remaining Capacity: %s' %
(str(running_nodes), str(task_capacity),
str(running_impact), str(remaining_volume)))
logger.info("Ready Nodes: %s" % str(ready_nodes))
for task_node in ready_nodes:
node_obj = task_node['node_object']
# NOTE: This could be used to pass metadata through the task system
# node_args = task_node['metadata']
impact = node_obj.task_impact
if impact <= remaining_volume or running_impact == 0:
node_dependencies = graph.get_dependents(node_obj)
# Allow other tasks to continue if a job fails, even if they are
# other jobs.
def process_runnable_tasks(self, runnable_tasks):
map(lambda task: self.graph.add_job(task), runnable_tasks)
node_type = graph.get_node_type(node_obj)
if node_type == 'job':
# clear dependencies because a job can block (not necessarily
# depend) on other jobs that share the same job template
node_dependencies = []
def create_project_update(self, task):
dep = Project.objects.get(id=task['project_id']).create_project_update(launch_type='dependency')
# Make the workflow_job look like it's started by setting status to
# running, but don't make a celery Task for it.
# Introduce jobs from the workflow so they are candidates to run.
# Call process_graph() again to allow choosing for run, the
# created candidate jobs.
elif node_type == 'workflow_job':
node_obj.start()
spawn_workflow_graph_jobs([node_obj])
return process_graph(graph, task_capacity)
# Project update created 1 second behind the job
dep.created = task['created'] - timedelta(seconds=1)
dep.status = 'pending'
dep.save()
dependent_nodes = [{'type': graph.get_node_type(node_obj), 'id': node_obj.id}] + \
[{'type': graph.get_node_type(n['node_object']),
'id': n['node_object'].id} for n in node_dependencies]
error_handler = handle_work_error.s(subtasks=dependent_nodes)
success_handler = handle_work_success.s(task_actual={'type': graph.get_node_type(node_obj),
'id': node_obj.id})
with transaction.atomic():
start_status = node_obj.start(error_callback=error_handler, success_callback=success_handler)
if not start_status:
node_obj.status = 'failed'
if node_obj.job_explanation:
node_obj.job_explanation += ' '
node_obj.job_explanation += 'Task failed pre-start check.'
node_obj.save()
continue
remaining_volume -= impact
running_impact += impact
logger.info('Started Node: %s (capacity hit: %s) '
'Remaining Capacity: %s' %
(str(node_obj), str(impact), str(remaining_volume)))
project_task = ProjectUpdateDict.get_partial(dep.id)
def schedule():
with transaction.atomic():
# Lock
Instance.objects.select_for_update().all()[0]
return project_task
task_capacity = get_system_task_capacity()
def create_inventory_update(self, task, inventory_source_task):
dep = InventorySource.objects.get(id=inventory_source_task['id']).create_inventory_update(launch_type='dependency')
workflow_jobs = get_running_workflow_jobs()
process_finished_workflow_jobs(workflow_jobs)
spawn_workflow_graph_jobs(workflow_jobs)
dep.created = task['created'] - timedelta(seconds=2)
dep.status = 'pending'
dep.save()
inventory_task = InventoryUpdateDict.get_partial(dep.id)
return inventory_task
def generate_dependencies(self, task):
dependencies = []
# TODO: What if the project is null ?
if type(task) is JobDict:
if task['project__scm_update_on_launch'] is True and \
self.graph.should_update_related_project(task):
project_task = self.create_project_update(task)
dependencies.append(project_task)
# Inventory update created 2 seconds behind the job
for inventory_source_task in self.graph.get_inventory_sources(task['inventory_id']):
if self.graph.should_update_related_inventory_source(task, inventory_source_task['id']):
inventory_task = self.create_inventory_update(task, inventory_source_task)
dependencies.append(inventory_task)
return dependencies
def process_latest_project_updates(self, latest_project_updates):
map(lambda task: self.graph.add_latest_project_update(task), latest_project_updates)
def process_latest_inventory_updates(self, latest_inventory_updates):
map(lambda task: self.graph.add_latest_inventory_update(task), latest_inventory_updates)
def process_inventory_sources(self, inventory_id_sources):
map(lambda (inventory_id, inventory_sources): self.graph.add_inventory_sources(inventory_id, inventory_sources), inventory_id_sources)
def process_dependencies(self, dependent_task, dependency_tasks):
for task in dependency_tasks:
# ProjectUpdate or InventoryUpdate may be blocked by another of
# the same type.
if not self.graph.is_job_blocked(task):
self.graph.add_job(task)
if not self.would_exceed_capacity(task):
self.start_task(task, [dependent_task])
else:
self.graph.add_job(task)
def process_pending_tasks(self, pending_tasks):
for task in pending_tasks:
# Stop processing tasks if we know we are out of capacity
if self.get_remaining_capacity() <= 0:
return
if not self.graph.is_job_blocked(task):
dependencies = self.generate_dependencies(task)
self.process_dependencies(task, dependencies)
# Spawning deps might have blocked us
if not self.graph.is_job_blocked(task):
self.graph.add_job(task)
if not self.would_exceed_capacity(task):
self.start_task(task)
else:
self.graph.add_job(task)
def process_celery_tasks(self, active_tasks, all_running_sorted_tasks):
'''
Reconcile inconsistencies between the Tower database's and Celery's view of job state
'''
for task in all_running_sorted_tasks:
if (task['celery_task_id'] not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
# NOTE: Pull status again and make sure it didn't finish in
# the meantime?
# TODO: try catch the getting of the job. The job COULD have been deleted
task_obj = task.get_full()
task_obj.status = 'failed'
task_obj.job_explanation += ' '.join((
'Task was marked as running in Tower but was not present in',
'Celery, so it has been marked as failed.',
))
task_obj.save()
print("Going to fail %s" % task_obj.id)
connection.on_commit(lambda: task_obj.websocket_emit_status('failed'))
logger.error("Task %s appears orphaned... marking as failed" % task)
def calculate_capacity_used(self, tasks):
self.capacity_used = 0
for t in tasks:
self.capacity_used += t.task_impact()
def would_exceed_capacity(self, task):
return (task.task_impact() + self.capacity_used > self.capacity_total)
def consume_capacity(self, task):
self.capacity_used += task.task_impact()
def get_remaining_capacity(self):
return (self.capacity_total - self.capacity_used)
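A worked example of the capacity bookkeeping above, using the impact values defined in awx.main.scheduler.partial (a default-forks job is 50, an inventory update is 20) and the capacity_total of 200 set in __init__; the mix of tasks is illustrative:
capacity_total = 200
capacity_used = 50 + 20              # one running default-forks job plus one inventory update
next_task_impact = 50                # another default-forks job
would_exceed = next_task_impact + capacity_used > capacity_total    # 120 > 200 is False, so it can start
remaining = capacity_total - capacity_used                          # 130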
def process_tasks(self, all_sorted_tasks):
running_tasks = filter(lambda t: t['status'] == 'running', all_sorted_tasks)
runnable_tasks = filter(lambda t: t['status'] in ['waiting', 'running'], all_sorted_tasks)
self.calculate_capacity_used(running_tasks)
self.process_runnable_tasks(runnable_tasks)
pending_tasks = filter(lambda t: t['status'] in 'pending', all_sorted_tasks)
self.process_pending_tasks(pending_tasks)
def _schedule(self):
all_sorted_tasks = self.get_tasks()
if len(all_sorted_tasks) > 0:
#self.process_celery_tasks(active_tasks, all_sorted_tasks)
latest_project_updates = self.get_latest_project_update_tasks(all_sorted_tasks)
self.process_latest_project_updates(latest_project_updates)
latest_inventory_updates = self.get_latest_inventory_update_tasks(all_sorted_tasks)
self.process_latest_inventory_updates(latest_inventory_updates)
inventory_id_sources = self.get_inventory_source_tasks(all_sorted_tasks)
self.process_inventory_sources(inventory_id_sources)
running_workflow_tasks = self.get_running_workflow_jobs()
self.process_finished_workflow_jobs(running_workflow_tasks)
self.spawn_workflow_graph_jobs(running_workflow_tasks)
self.process_tasks(all_sorted_tasks)
def schedule(self):
with transaction.atomic():
# Lock
try:
Instance.objects.select_for_update(nowait=True).all()[0]
except DatabaseError:
return
self._schedule()
graph = rebuild_graph()
if graph:
process_graph(graph, task_capacity)
# Unlock, due to transaction ending
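Both TaskManager.schedule() above and run_fail_inconsistent_running_jobs in the scheduler's tasks module below serialize themselves with the same short-lived advisory lock on the first Instance row. A minimal sketch of the pattern, assuming at least one Instance row exists and run_pass stands in for the protected work:
from django.db import transaction
from django.db.utils import DatabaseError
from awx.main.models import Instance

def run_pass():
    pass    # placeholder for the protected work, e.g. TaskManager()._schedule()

with transaction.atomic():
    try:
        Instance.objects.select_for_update(nowait=True).all()[0]    # raises DatabaseError if already locked
    except DatabaseError:
        pass        # another scheduling pass is in progress; skip this one
    else:
        run_pass()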

View File

@ -0,0 +1,203 @@
from datetime import timedelta
from django.utils.timezone import now as tz_now
from awx.main.scheduler.partial import (
JobDict,
ProjectUpdateDict,
InventoryUpdateDict,
SystemJobDict,
AdHocCommandDict,
WorkflowJobDict,
)
class DependencyGraph(object):
PROJECT_UPDATES = 'project_updates'
INVENTORY_UPDATES = 'inventory_updates'
JOB_TEMPLATE_JOBS = 'job_template_jobs'
SYSTEM_JOB = 'system_job'
INVENTORY_SOURCE_UPDATES = 'inventory_source_updates'
WORKFLOW_JOB_TEMPLATES_JOBS = 'workflow_job_template_jobs'
LATEST_PROJECT_UPDATES = 'latest_project_updates'
LATEST_INVENTORY_UPDATES = 'latest_inventory_updates'
INVENTORY_SOURCES = 'inventory_source_ids'
def __init__(self, *args, **kwargs):
self.data = {}
# project_id -> True / False
self.data[self.PROJECT_UPDATES] = {}
# inventory_id -> True / False
self.data[self.INVENTORY_UPDATES] = {}
# job_template_id -> True / False
self.data[self.JOB_TEMPLATE_JOBS] = {}
# inventory_source_id -> True / False
self.data[self.INVENTORY_SOURCE_UPDATES] = {}
# True / False
self.data[self.SYSTEM_JOB] = True
# workflow_job_template_id -> True / False
self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS] = {}
# project_id -> latest ProjectUpdateLatestDict
self.data[self.LATEST_PROJECT_UPDATES] = {}
# inventory_source_id -> latest InventoryUpdateLatestDict
self.data[self.LATEST_INVENTORY_UPDATES] = {}
# inventory_id -> [inventory_source_ids]
self.data[self.INVENTORY_SOURCES] = {}
def add_latest_project_update(self, job):
self.data[self.LATEST_PROJECT_UPDATES][job['project_id']] = job
def add_latest_inventory_update(self, job):
self.data[self.LATEST_INVENTORY_UPDATES][job['inventory_source_id']] = job
def add_inventory_sources(self, inventory_id, inventory_sources):
self.data[self.INVENTORY_SOURCES][inventory_id] = inventory_sources
def get_inventory_sources(self, inventory_id):
return self.data[self.INVENTORY_SOURCES].get(inventory_id, [])
def get_now(self):
return tz_now()
'''
JobDict
Presume that the job is related to a project that is set to update on launch
'''
def should_update_related_project(self, job):
now = self.get_now()
latest_project_update = self.data[self.LATEST_PROJECT_UPDATES].get(job['project_id'], None)
if not latest_project_update:
return True
# TODO: Other finished, failed cases? i.e. error ?
if latest_project_update['status'] in ['failed', 'canceled']:
return True
'''
This is a bit of fuzzy logic.
If the latest project update has a created time == job_created_time-1
then consider the project update found. This is so we don't enter an infinite loop
of updating the project when cache timeout is 0.
'''
if latest_project_update['project__scm_update_cache_timeout'] == 0 and \
latest_project_update['launch_type'] == 'dependency' and \
latest_project_update['created'] == job['created'] - timedelta(seconds=1):
return False
'''
Normal, expected, cache timeout logic
'''
timeout_seconds = timedelta(seconds=latest_project_update['project__scm_update_cache_timeout'])
if (latest_project_update['finished'] + timeout_seconds) < now:
return True
return False
def should_update_related_inventory_source(self, job, inventory_source_id):
now = self.get_now()
latest_inventory_update = self.data[self.LATEST_INVENTORY_UPDATES].get(inventory_source_id, None)
if not latest_inventory_update:
return True
# TODO: Other finished, failed cases? i.e. error ?
if latest_inventory_update['status'] in ['failed', 'canceled']:
return True
'''
This is a bit of fuzzy logic.
If the latest inventory update has a created time == job_created_time-2
then consider the inventory update found. This is so we don't enter an infinite loop
of updating the inventory source when the cache timeout is 0.
'''
if latest_inventory_update['inventory_source__update_cache_timeout'] == 0 and \
latest_inventory_update['launch_type'] == 'dependency' and \
latest_inventory_update['created'] == job['created'] - timedelta(seconds=2):
return False
'''
Normal, expected, cache timeout logic
'''
timeout_seconds = timedelta(seconds=latest_inventory_update['inventory_source__update_cache_timeout'])
if (latest_inventory_update['finished'] + timeout_seconds) < now:
return True
return False
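A small worked example of the cache-timeout comparison used by both methods above (times are hypothetical):
from datetime import datetime, timedelta

now = datetime(2016, 11, 1, 12, 0, 0)
finished = now - timedelta(seconds=90)       # when the latest dependency update finished
cache_timeout = timedelta(seconds=120)       # project__scm_update_cache_timeout (or the inventory equivalent)

needs_new_update = (finished + cache_timeout) < now    # False: the cached update is still fresh
# With cache_timeout = timedelta(seconds=60) the sum falls before `now`, so a new
# dependency update would be created.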
def mark_system_job(self):
self.data[self.SYSTEM_JOB] = False
def mark_project_update(self, job):
self.data[self.PROJECT_UPDATES][job['project_id']] = False
def mark_inventory_update(self, inventory_id):
self.data[self.INVENTORY_UPDATES][inventory_id] = False
def mark_inventory_source_update(self, inventory_source_id):
self.data[self.INVENTORY_SOURCE_UPDATES][inventory_source_id] = False
def mark_job_template_job(self, job):
self.data[self.INVENTORY_UPDATES][job['inventory_id']] = False
self.data[self.PROJECT_UPDATES][job['project_id']] = False
self.data[self.JOB_TEMPLATE_JOBS][job['job_template_id']] = False
def mark_workflow_job(self, job):
self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS][job['workflow_job_template_id']] = False
def can_project_update_run(self, job):
return self.data[self.PROJECT_UPDATES].get(job['project_id'], True)
def can_inventory_update_run(self, job):
return self.data[self.INVENTORY_SOURCE_UPDATES].get(job['inventory_source_id'], True)
def can_job_run(self, job):
if self.can_project_update_run(job) is True and \
self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True) is True:
if job['allow_simultaneous'] is False:
return self.data[self.JOB_TEMPLATE_JOBS].get(job['job_template_id'], True)
else:
return True
return False
def can_workflow_job_run(self, job):
return self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS].get(job['workflow_job_template_id'], True)
def can_system_job_run(self):
return self.data[self.SYSTEM_JOB]
def can_ad_hoc_command_run(self, job):
return self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True)
def is_job_blocked(self, job):
if type(job) is ProjectUpdateDict:
return not self.can_project_update_run(job)
elif type(job) is InventoryUpdateDict:
return not self.can_inventory_update_run(job)
elif type(job) is JobDict:
return not self.can_job_run(job)
elif type(job) is SystemJobDict:
return not self.can_system_job_run()
elif type(job) is AdHocCommandDict:
return not self.can_ad_hoc_command_run(job)
elif type(job) is WorkflowJobDict:
return not self.can_workflow_job_run(job)
def add_job(self, job):
if type(job) is ProjectUpdateDict:
self.mark_project_update(job)
elif type(job) is InventoryUpdateDict:
self.mark_inventory_update(job['inventory_source__inventory_id'])
self.mark_inventory_source_update(job['inventory_source_id'])
elif type(job) is JobDict:
self.mark_job_template_job(job)
elif type(job) is WorkflowJobDict:
self.mark_workflow_job(job)
elif type(job) is SystemJobDict:
self.mark_system_job()
elif type(job) is AdHocCommandDict:
self.mark_inventory_update(job['inventory_id'])
def add_jobs(self, jobs):
map(lambda j: self.add_job(j), jobs)
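A rough sketch of how the task manager drives this graph during a scheduling pass (mirroring process_runnable_tasks and process_pending_tasks in this PR; the task lists here are placeholders):
from awx.main.scheduler.dependency_graph import DependencyGraph

graph = DependencyGraph()
runnable_tasks = []        # placeholder: waiting/running partial dicts from filter_partial()
pending_tasks = []         # placeholder: pending partial dicts
for task in runnable_tasks:
    graph.add_job(task)    # claim the task's project/inventory/job-template slots
for task in pending_tasks:
    if not graph.is_job_blocked(task):
        graph.add_job(task)
        # TaskManager.start_task(task) would dispatch it here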

View File

@ -0,0 +1,220 @@
# AWX
from awx.main.models import (
Job,
ProjectUpdate,
InventoryUpdate,
InventorySource,
SystemJob,
AdHocCommand,
WorkflowJob,
)
class PartialModelDict(object):
FIELDS = ()
model = None
data = None
def __init__(self, data):
if type(data) is not dict:
raise RuntimeError("Expected data to be of type dict not %s" % type(data))
self.data = data
def __getitem__(self, index):
return self.data[index]
def __setitem__(self, key, value):
self.data[key] = value
def get(self, key, **kwargs):
return self.data.get(key, **kwargs)
def get_full(self):
return self.model.objects.get(id=self.data['id'])
def refresh_partial(self):
return self.__class__(self.model.objects.filter(id=self.data['id']).values(*self.__class__.get_db_values())[0])
@classmethod
def get_partial(cls, id):
return cls(cls.model.objects.filter(id=id).values(*cls.get_db_values())[0])
@classmethod
def get_db_values(cls):
return cls.FIELDS
@classmethod
def filter_partial(cls, status=[]):
kv = {
'status__in': status
}
return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())]
def get_job_type_str(self):
raise RuntimeError("Inherit and implement me")
def task_impact(self):
raise RuntimeError("Inherit and implement me")
class JobDict(PartialModelDict):
FIELDS = (
'id', 'status', 'job_template_id', 'inventory_id', 'project_id',
'launch_type', 'limit', 'allow_simultaneous', 'created',
'job_type', 'celery_task_id', 'project__scm_update_on_launch',
'forks', 'inventory__inventory_sources',
)
model = Job
def get_job_type_str(self):
return 'job'
def task_impact(self):
return (5 if self.data['forks'] == 0 else self.data['forks']) * 10
class ProjectUpdateDict(PartialModelDict):
FIELDS = (
'id', 'status', 'project_id', 'created', 'celery_task_id',
'launch_type', 'project__scm_update_cache_timeout',
'project__scm_update_on_launch',
)
model = ProjectUpdate
def get_job_type_str(self):
return 'project_update'
def task_impact(self):
return 10
@classmethod
def filter_partial(cls, status=[]):
kv = {
'status__in': status,
'job_type': 'check',
}
return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())]
class ProjectUpdateLatestDict(ProjectUpdateDict):
FIELDS = (
'id', 'status', 'project_id', 'created', 'finished',
'project__scm_update_cache_timeout',
'launch_type', 'project__scm_update_on_launch',
)
model = ProjectUpdate
@classmethod
def filter_partial(cls, project_ids):
# TODO: This can surely be made more efficient
# * shouldn't have to do a query per project_id
# * shouldn't have to call .values() on all the results, only to get the first result
results = []
for project_id in project_ids:
qs = cls.model.objects.filter(project_id=project_id, status__in=['waiting', 'successful', 'failed']).order_by('-finished', '-started', '-created',)
if qs.count() > 0:
results.append(cls(cls.model.objects.filter(id=qs[0].id).values(*cls.get_db_values())[0]))
return results
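One possible shape for the single query the TODO above asks for, shown only as a sketch (it relies on PostgreSQL's DISTINCT ON support and is not what this PR implements):
project_ids = [1, 2]    # placeholder ids
latest = (ProjectUpdate.objects
          .filter(project_id__in=project_ids,
                  status__in=['waiting', 'successful', 'failed'])
          .order_by('project_id', '-finished', '-created')
          .distinct('project_id')
          .values(*ProjectUpdateLatestDict.get_db_values()))
results = [ProjectUpdateLatestDict(row) for row in latest]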
class InventoryUpdateDict(PartialModelDict):
#'inventory_source__update_on_launch',
#'inventory_source__update_cache_timeout',
FIELDS = (
'id', 'status', 'created', 'celery_task_id', 'inventory_source_id', 'inventory_source__inventory_id',
)
model = InventoryUpdate
def get_job_type_str(self):
return 'inventory_update'
def task_impact(self):
return 20
class InventoryUpdateLatestDict(InventoryUpdateDict):
#'inventory_source__update_on_launch',
#'inventory_source__update_cache_timeout',
FIELDS = (
'id', 'status', 'created', 'celery_task_id', 'inventory_source_id',
'finished', 'inventory_source__update_cache_timeout', 'launch_type',
)
model = InventoryUpdate
@classmethod
def filter_partial(cls, inventory_ids):
# TODO: This can surely be made more efficient
# * shouldn't have to do a query per inventory_id nor per inventory_source_id
# * shouldn't have to call .values() on all the results, only to get the first result
results = []
for inventory_id in inventory_ids:
inventory_source_ids = InventorySource.objects.filter(inventory_id=inventory_id,
update_on_launch=True).values_list('id', flat=True)
# Find the most recent inventory update for each inventory source
for inventory_source_id in inventory_source_ids:
qs = cls.model.objects.filter(inventory_source_id=inventory_source_id,
status__in=['waiting', 'successful', 'failed'],
inventory_source__update_on_launch=True).order_by('-finished', '-started', '-created')
if qs.count() > 0:
results.append(cls(cls.model.objects.filter(id=qs[0].id).values(*cls.get_db_values())[0]))
return results
class InventorySourceDict(PartialModelDict):
FIELDS = (
'id',
)
model = InventorySource
def get_job_type_str(self):
return 'inventory_source'
def task_impact(self):
return 20
@classmethod
# TODO: Optimize this to run the query once
def filter_partial(cls, inventory_id):
kv = {
'inventory_id': inventory_id,
'update_on_launch': True,
}
return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())]
class SystemJobDict(PartialModelDict):
FIELDS = (
'id', 'created', 'status',
)
model = SystemJob
def get_job_type_str(self):
return 'system_job'
def task_impact(self):
return 20
@classmethod
def filter_partial(cls, status=[]):
kv = {
'status__in': status
}
return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())]
class AdHocCommandDict(PartialModelDict):
FIELDS = (
'id', 'created', 'status', 'inventory_id',
)
model = AdHocCommand
def get_job_type_str(self):
return 'ad_hoc_command'
def task_impact(self):
return 20
class WorkflowJobDict(PartialModelDict):
FIELDS = (
'id', 'created', 'status', 'workflow_job_template_id',
)
model = WorkflowJob
def get_job_type_str(self):
return 'workflow_job'
def task_impact(self):
return 10

View File

@ -1,14 +1,17 @@
# Python
import logging
import time
# Django
from django.db import transaction
from django.db.utils import DatabaseError
# Celery
from celery import task
# AWX
from awx.main.models import UnifiedJob
from awx.main.scheduler import schedule
from awx.main.models import Instance
from awx.main.scheduler import TaskManager
logger = logging.getLogger('awx.main.scheduler')
@ -18,62 +21,31 @@ logger = logging.getLogger('awx.main.scheduler')
@task
def run_job_launch(job_id):
# Wait for job to exist.
# The job is created in a transaction then the message is created, but
# the transaction may not have completed.
# FIXME: We could generate the message in a Django signal handler.
# OR, we could call an explicit commit in the view and then send the
# message.
retries = 10
retry = 0
while not UnifiedJob.objects.filter(id=job_id).exists():
time.sleep(0.3)
if retry >= retries:
logger.error("Failed to process 'job_launch' message for job %d" % job_id)
# ack the message so we don't build up the queue.
#
# The job can still be chosen to run during tower startup or
# when another job is started or completes
return
retry += 1
# "Safe" to get the job now since it exists.
# Really, there is a race condition from exists to get
# TODO: while not loop should call get wrapped in a try except
#job = UnifiedJob.objects.get(id=job_id)
schedule()
TaskManager().schedule()
@task
def run_job_complete(job_id):
# TODO: use list of finished status from jobs.py or unified_jobs.py
finished_status = ['successful', 'error', 'failed', 'completed']
q = UnifiedJob.objects.filter(id=job_id)
TaskManager().schedule()
# Ensure that the job is updated in the database before we call to
# schedule the next job.
retries = 10
retry = 0
while True:
# Job not found, most likely deleted. That's fine
if not q.exists():
logger.warn("Failed to find job '%d' while processing 'job_complete' message. Presume that it was deleted." % job_id)
break
@task
def run_task_manager():
TaskManager().schedule()
job = q[0]
if job.status in finished_status:
break
@task
def run_fail_inconsistent_running_jobs():
with transaction.atomic():
# Lock
try:
Instance.objects.select_for_update(nowait=True).all()[0]
scheduler = TaskManager()
active_tasks = scheduler.get_active_tasks()
time.sleep(0.3)
if retry >= retries:
logger.error("Expected job status '%s' to be one of '%s' while processing 'job_complete' message." % (job.status, finished_status))
if active_tasks is None:
# TODO: Failed to contact celery. We should surface this.
return None
all_running_sorted_tasks = scheduler.get_running_tasks()
scheduler.process_celery_tasks(active_tasks, all_running_sorted_tasks)
except DatabaseError:
return
retry += 1
schedule()

View File

@ -21,7 +21,6 @@ import traceback
import urlparse
import uuid
from distutils.version import LooseVersion as Version
import dateutil.parser
import yaml
try:
import psutil
@ -137,30 +136,12 @@ def cluster_node_heartbeat(self):
@task(bind=True, queue='default')
def tower_periodic_scheduler(self):
def get_last_run():
if not os.path.exists(settings.SCHEDULE_METADATA_LOCATION):
return None
fd = open(settings.SCHEDULE_METADATA_LOCATION)
try:
last_run = dateutil.parser.parse(fd.read())
return last_run
except Exception as exc:
logger.error("get_last_run failed: {}".format(exc))
return None
def write_last_run(last_run):
fd = open(settings.SCHEDULE_METADATA_LOCATION, 'w')
fd.write(last_run.isoformat())
fd.close()
run_now = now()
last_run = get_last_run()
if not last_run:
logger.debug("First run time")
write_last_run(run_now)
return
state = TowerScheduleState.get_solo()
last_run = state.schedule_last_run
logger.debug("Last run was: %s", last_run)
write_last_run(run_now)
state.schedule_last_run = run_now
state.save()
old_schedules = Schedule.objects.enabled().before(last_run)
for schedule in old_schedules:
@ -180,6 +161,7 @@ def tower_periodic_scheduler(self):
new_unified_job.save(update_fields=['status', 'job_explanation'])
new_unified_job.websocket_emit_status("failed")
emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules"))
state.save()
def _send_notification_templates(instance, status_str):
if status_str not in ['succeeded', 'failed']:
@ -1756,7 +1738,7 @@ class RunAdHocCommand(BaseTask):
'''
Hook for actions to run after ad hoc command has completed.
'''
super(RunAdHocCommand, self).post_run_hook(ad_hoc_command, **kwargs)
super(RunAdHocCommand, self).post_run_hook(ad_hoc_command, status, **kwargs)
class RunSystemJob(BaseTask):

View File

@ -2,40 +2,6 @@ from awx.main.models import Job
import pytest
@pytest.mark.django_db
def test_job_blocking(get, post, job_template, inventory, inventory_factory):
j1 = Job.objects.create(job_template=job_template,
inventory=inventory)
j2 = Job.objects.create(job_template=job_template,
inventory=inventory)
assert j1.is_blocked_by(j2)
j2.inventory = inventory_factory(name='test-different-inventory')
assert not j1.is_blocked_by(j2)
j_callback_1 = Job.objects.create(job_template=job_template,
inventory=inventory,
launch_type='callback',
limit='a')
j_callback_2 = Job.objects.create(job_template=job_template,
inventory=inventory,
launch_type='callback',
limit='a')
assert j_callback_1.is_blocked_by(j_callback_2)
j_callback_2.limit = 'b'
assert not j_callback_1.is_blocked_by(j_callback_2)
@pytest.mark.django_db
def test_job_blocking_allow_simul(get, post, job_template, inventory):
job_template.allow_simultaneous = True
j1 = Job.objects.create(job_template=job_template,
inventory=inventory)
j2 = Job.objects.create(job_template=job_template,
inventory=inventory)
assert not j1.is_blocked_by(j2)
assert not j2.is_blocked_by(j1)
job_template.allow_simultaneous = False
assert j1.is_blocked_by(j2)
assert j2.is_blocked_by(j1)
@pytest.mark.django_db
def test_orphan_unified_job_creation(instance, inventory):
job = Job.objects.create(job_template=None, inventory=inventory, name='hi world')

View File

@ -0,0 +1,120 @@
# Python
import pytest
from django.utils.timezone import now as tz_now
from datetime import timedelta
# AWX
from awx.main.models import (
Organization,
Inventory,
Group,
Project,
ProjectUpdate,
InventoryUpdate,
InventorySource,
)
from awx.main.scheduler.partial import (
ProjectUpdateLatestDict,
InventoryUpdateDict,
InventoryUpdateLatestDict,
)
@pytest.fixture
def org():
return Organization.objects.create(name="org1")
class TestProjectUpdateLatestDictDict():
@pytest.fixture
def successful_project_update(self):
p = Project.objects.create(name="proj1")
pu = ProjectUpdate.objects.create(project=p, status='successful', finished=tz_now() - timedelta(seconds=20))
return (p, pu)
# Failed project updates newer than successful ones
@pytest.fixture
def multiple_project_updates(self):
p = Project.objects.create(name="proj1")
epoch = tz_now()
successful_pus = [ProjectUpdate.objects.create(project=p,
status='successful',
finished=epoch - timedelta(seconds=100 + i)) for i in xrange(0, 5)]
failed_pus = [ProjectUpdate.objects.create(project=p,
status='failed',
finished=epoch - timedelta(seconds=100 - len(successful_pus) + i)) for i in xrange(0, 5)]
return (p, failed_pus, successful_pus)
@pytest.mark.django_db
class TestFilterPartial():
def test_project_update_successful(self, successful_project_update):
(project, project_update) = successful_project_update
tasks = ProjectUpdateLatestDict.filter_partial(project_ids=[project.id])
assert 1 == len(tasks)
assert project_update.id == tasks[0]['id']
def test_correct_project_update(self, multiple_project_updates):
(project, failed_pus, successful_pus) = multiple_project_updates
tasks = ProjectUpdateLatestDict.filter_partial(project_ids=[project.id])
assert 1 == len(tasks)
assert failed_pus[0].id == tasks[0]['id']
class TestInventoryUpdateDict():
@pytest.fixture
def waiting_inventory_update(self, org):
i = Inventory.objects.create(name='inv1', organization=org)
g = Group.objects.create(name='group1', inventory=i)
#Inventory.groups.add(g)
inv_src = InventorySource.objects.create(group=g)
iu = InventoryUpdate.objects.create(inventory_source=inv_src, status='waiting')
return iu
@pytest.mark.django_db
class TestFilterPartial():
def test_simple(self, waiting_inventory_update):
tasks = InventoryUpdateDict.filter_partial(status=['waiting'])
assert 1 == len(tasks)
assert waiting_inventory_update.id == tasks[0]['id']
class TestInventoryUpdateLatestDict():
@pytest.fixture
def inventory(self, org):
i = Inventory.objects.create(name='inv1', organization=org)
return i
@pytest.fixture
def inventory_updates(self, inventory):
g1 = Group.objects.create(name='group1', inventory=inventory)
g2 = Group.objects.create(name='group2', inventory=inventory)
g3 = Group.objects.create(name='group3', inventory=inventory)
inv_src1 = InventorySource.objects.create(group=g1, update_on_launch=True, inventory=inventory)
inv_src2 = InventorySource.objects.create(group=g2, update_on_launch=False, inventory=inventory)
inv_src3 = InventorySource.objects.create(group=g3, update_on_launch=True, inventory=inventory)
iu1 = InventoryUpdate.objects.create(inventory_source=inv_src1, status='successful')
iu2 = InventoryUpdate.objects.create(inventory_source=inv_src2, status='waiting')
iu3 = InventoryUpdate.objects.create(inventory_source=inv_src3, status='waiting')
return [iu1, iu2, iu3]
@pytest.mark.django_db
def test_filter_partial(self, inventory, inventory_updates):
tasks = InventoryUpdateLatestDict.filter_partial([inventory.id])
inventory_updates_expected = [inventory_updates[0], inventory_updates[2]]
assert 2 == len(tasks)
for i, inventory_update in enumerate(inventory_updates_expected):
assert inventory_update.id == tasks[i]['id']

View File

@ -0,0 +1,240 @@
# Python
import pytest
from datetime import timedelta
# Django
from django.utils.timezone import now as tz_now
# awx
from awx.main.scheduler.partial import (
JobDict,
ProjectUpdateDict,
InventoryUpdateDict,
InventorySourceDict,
)
from awx.main.scheduler import TaskManager
@pytest.fixture
def epoch():
return tz_now()
@pytest.fixture
def scheduler_factory(mocker, epoch):
def fn(tasks=[], inventory_sources=[], latest_project_updates=[], latest_inventory_updates=[], create_project_update=None, create_inventory_update=None):
sched = TaskManager()
sched.capacity_total = 999999999
sched.graph.get_now = lambda: epoch
def no_create_inventory_update(task, ignore):
raise RuntimeError("create_inventory_update should not be called")
def no_create_project_update(task):
raise RuntimeError("create_project_update should not be called")
mocker.patch.object(sched, 'get_tasks', return_value=tasks)
mocker.patch.object(sched, 'get_running_workflow_jobs', return_value=[])
mocker.patch.object(sched, 'get_inventory_source_tasks', return_value=inventory_sources)
mocker.patch.object(sched, 'get_latest_project_update_tasks', return_value=latest_project_updates)
mocker.patch.object(sched, 'get_latest_inventory_update_tasks', return_value=latest_inventory_updates)
create_project_update_mock = mocker.patch.object(sched, 'create_project_update', return_value=create_project_update)
create_inventory_update_mock = mocker.patch.object(sched, 'create_inventory_update', return_value=create_inventory_update)
mocker.patch.object(sched, 'start_task')
if not create_project_update:
create_project_update_mock.side_effect = no_create_project_update
if not create_inventory_update:
create_inventory_update_mock.side_effect = no_create_inventory_update
return sched
return fn
@pytest.fixture
def project_update_factory(epoch):
def fn():
return ProjectUpdateDict({
'id': 1,
'created': epoch - timedelta(seconds=100),
'project_id': 1,
'project__scm_update_cache_timeout': 0,
'celery_task_id': '',
'launch_type': 'dependency',
'project__scm_update_on_launch': True,
})
return fn
@pytest.fixture
def pending_project_update(project_update_factory):
project_update = project_update_factory()
project_update['status'] = 'pending'
return project_update
@pytest.fixture
def waiting_project_update(epoch, project_update_factory):
project_update = project_update_factory()
project_update['status'] = 'waiting'
return project_update
@pytest.fixture
def running_project_update(epoch, project_update_factory):
project_update = project_update_factory()
project_update['status'] = 'running'
return project_update
@pytest.fixture
def successful_project_update(epoch, project_update_factory):
project_update = project_update_factory()
project_update['finished'] = epoch - timedelta(seconds=90)
project_update['status'] = 'successful'
return project_update
@pytest.fixture
def successful_project_update_cache_expired(epoch, project_update_factory):
project_update = project_update_factory()
project_update['status'] = 'successful'
project_update['created'] = epoch - timedelta(seconds=120)
project_update['finished'] = epoch - timedelta(seconds=110)
project_update['project__scm_update_cache_timeout'] = 1
return project_update
@pytest.fixture
def failed_project_update(epoch, project_update_factory):
project_update = project_update_factory()
project_update['finished'] = epoch - timedelta(seconds=90)
project_update['status'] = 'failed'
return project_update
@pytest.fixture
def inventory_update_factory(epoch):
def fn():
return InventoryUpdateDict({
'id': 1,
'created': epoch - timedelta(seconds=101),
'inventory_id': 1,
'celery_task_id': '',
'status': 'pending',
'launch_type': 'dependency',
'inventory_source_id': 1,
'inventory_source__inventory_id': 1,
})
return fn
@pytest.fixture
def inventory_update_latest_factory(epoch):
def fn():
return InventoryUpdateDict({
'id': 1,
'created': epoch - timedelta(seconds=101),
'inventory_id': 1,
'celery_task_id': '',
'status': 'pending',
'launch_type': 'dependency',
'inventory_source_id': 1,
'finished': None,
})
return fn
@pytest.fixture
def inventory_update_latest(inventory_update_latest_factory):
return inventory_update_latest_factory()
@pytest.fixture
def successful_inventory_update_latest(inventory_update_latest_factory):
iu = inventory_update_latest_factory()
iu['status'] = 'successful'
iu['finished'] = iu['created'] + timedelta(seconds=10)
return iu
@pytest.fixture
def failed_inventory_update_latest(inventory_update_latest_factory):
iu = inventory_update_latest_factory()
iu['status'] = 'failed'
return iu
@pytest.fixture
def pending_inventory_update(epoch, inventory_update_factory):
inventory_update = inventory_update_factory()
inventory_update['status'] = 'pending'
return inventory_update
@pytest.fixture
def waiting_inventory_update(epoch, inventory_update_factory):
inventory_update = inventory_update_factory()
inventory_update['status'] = 'waiting'
return inventory_update
@pytest.fixture
def failed_inventory_update(epoch, inventory_update_factory):
inventory_update = inventory_update_factory()
inventory_update['status'] = 'failed'
return inventory_update
@pytest.fixture
def running_inventory_update(epoch, inventory_update_factory):
inventory_update = inventory_update_factory()
inventory_update['status'] = 'running'
return inventory_update
@pytest.fixture
def successful_inventory_update(epoch, inventory_update_factory):
inventory_update = inventory_update_factory()
inventory_update['finished'] = epoch - timedelta(seconds=90)
inventory_update['status'] = 'successful'
return inventory_update
'''
Job
'''
@pytest.fixture
def job_factory(epoch):
def fn(project__scm_update_on_launch=True, inventory__inventory_sources=[]):
return JobDict({
'id': 1,
'status': 'pending',
'job_template_id': 1,
'project_id': 1,
'inventory_id': 1,
'launch_type': 'manual',
'allow_simultaneous': False,
'created': epoch - timedelta(seconds=99),
'celery_task_id': '',
'project__scm_update_on_launch': project__scm_update_on_launch,
'inventory__inventory_sources': inventory__inventory_sources,
'forks': 5
})
return fn
@pytest.fixture
def pending_job(job_factory):
job = job_factory()
job['status'] = 'pending'
return job
@pytest.fixture
def running_job(job_factory):
job = job_factory()
job['status'] = 'running'
return job
'''
Inventory id -> [InventorySourceDict, ...]
'''
@pytest.fixture
def inventory_source_factory():
def fn(id=1):
return InventorySourceDict({
'id': id,
})
return fn
@pytest.fixture
def inventory_id_sources(inventory_source_factory):
return [
(1, [
inventory_source_factory(id=1),
inventory_source_factory(id=2),
]),
]

View File

@ -0,0 +1,121 @@
# Python
import pytest
from datetime import timedelta
# Django
from django.utils.timezone import now as tz_now
# AWX
from awx.main.scheduler.dependency_graph import DependencyGraph
from awx.main.scheduler.partial import ProjectUpdateDict
@pytest.fixture
def graph():
return DependencyGraph()
@pytest.fixture
def job():
return dict(project_id=1)
@pytest.fixture
def unsuccessful_last_project(graph, job):
pu = ProjectUpdateDict(dict(id=1,
project__scm_update_cache_timeout=999999,
project_id=1,
status='failed',
created='3',
finished='3',))
graph.add_latest_project_update(pu)
return graph
@pytest.fixture
def last_dependent_project(graph):
now = tz_now()
job = {
'project_id': 1,
'created': now,
}
pu = ProjectUpdateDict(dict(id=1, project_id=1, status='waiting',
project__scm_update_cache_timeout=0,
launch_type='dependency',
created=now - timedelta(seconds=1),))
graph.add_latest_project_update(pu)
return (graph, job)
@pytest.fixture
def timedout_project_update(graph, job):
now = tz_now()
job = {
'project_id': 1,
'created': now,
}
pu = ProjectUpdateDict(dict(id=1, project_id=1, status='successful',
project__scm_update_cache_timeout=10,
launch_type='dependency',
created=now - timedelta(seconds=100),
finished=now - timedelta(seconds=11),))
graph.add_latest_project_update(pu)
return (graph, job)
@pytest.fixture
def not_timedout_project_update(graph, job):
now = tz_now()
job = {
'project_id': 1,
'created': now,
}
pu = ProjectUpdateDict(dict(id=1, project_id=1, status='successful',
project__scm_update_cache_timeout=3600,
launch_type='dependency',
created=now - timedelta(seconds=100),
finished=now - timedelta(seconds=11),))
graph.add_latest_project_update(pu)
return (graph, job)
class TestShouldUpdateRelatedProject():
def test_no_project_updates(self, graph, job):
actual = graph.should_update_related_project(job)
assert True is actual
def test_timedout_project_update(self, timedout_project_update):
(graph, job) = timedout_project_update
actual = graph.should_update_related_project(job)
assert True is actual
def test_not_timedout_project_update(self, not_timedout_project_update):
(graph, job) = not_timedout_project_update
actual = graph.should_update_related_project(job)
assert False is actual
def test_unsuccessful_last_project(self, unsuccessful_last_project, job):
graph = unsuccessful_last_project
actual = graph.should_update_related_project(job)
assert True is actual
def test_last_dependent_project(self, last_dependent_project):
(graph, job) = last_dependent_project
actual = graph.should_update_related_project(job)
assert False is actual

View File

@ -0,0 +1,85 @@
# Python
import pytest
from datetime import timedelta
@pytest.fixture
def pending_job(job_factory):
return job_factory(project__scm_update_on_launch=False, inventory__inventory_sources=['1'])
@pytest.fixture
def successful_inventory_update_latest(inventory_update_latest_factory):
iu = inventory_update_latest_factory()
iu['inventory_source__update_cache_timeout'] = 100
iu['status'] = 'successful'
iu['finished'] = iu['created'] + timedelta(seconds=10)
return iu
@pytest.fixture
def successful_inventory_update_latest_cache_expired(inventory_update_latest_factory):
iu = inventory_update_latest_factory()
iu['inventory_source__update_cache_timeout'] = 1
iu['finished'] = iu['created'] + timedelta(seconds=2)
return iu
class TestStartInventoryUpdate():
def test_pending(self, scheduler_factory, pending_inventory_update):
scheduler = scheduler_factory(tasks=[pending_inventory_update])
scheduler._schedule()
scheduler.start_task.assert_called_with(pending_inventory_update)
class TestInventoryUpdateBlocked():
def test_running_inventory_update(self, epoch, scheduler_factory, running_inventory_update, pending_inventory_update):
running_inventory_update['created'] = epoch - timedelta(seconds=100)
pending_inventory_update['created'] = epoch - timedelta(seconds=90)
scheduler = scheduler_factory(tasks=[running_inventory_update, pending_inventory_update])
scheduler._schedule()
def test_waiting_inventory_update(self, epoch, scheduler_factory, waiting_inventory_update, pending_inventory_update):
waiting_inventory_update['created'] = epoch - timedelta(seconds=100)
pending_inventory_update['created'] = epoch - timedelta(seconds=90)
scheduler = scheduler_factory(tasks=[waiting_inventory_update, pending_inventory_update])
scheduler._schedule()
class TestCreateDependentInventoryUpdate():
def test(self, scheduler_factory, pending_job, waiting_inventory_update, inventory_id_sources):
scheduler = scheduler_factory(tasks=[pending_job],
create_inventory_update=waiting_inventory_update,
inventory_sources=inventory_id_sources)
scheduler._schedule()
scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job])
def test_cache_hit(self, scheduler_factory, pending_job, successful_inventory_update, successful_inventory_update_latest):
scheduler = scheduler_factory(tasks=[successful_inventory_update, pending_job],
latest_inventory_updates=[successful_inventory_update_latest])
scheduler._schedule()
scheduler.start_task.assert_called_with(pending_job)
def test_cache_miss(self, scheduler_factory, pending_job, successful_inventory_update, successful_inventory_update_latest_cache_expired, waiting_inventory_update, inventory_id_sources):
scheduler = scheduler_factory(tasks=[successful_inventory_update, pending_job],
latest_inventory_updates=[successful_inventory_update_latest_cache_expired],
create_inventory_update=waiting_inventory_update,
inventory_sources=inventory_id_sources)
scheduler._schedule()
scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job])
def test_last_update_failed(self, scheduler_factory, pending_job, failed_inventory_update, failed_inventory_update_latest, waiting_inventory_update, inventory_id_sources):
scheduler = scheduler_factory(tasks=[failed_inventory_update, pending_job],
latest_inventory_updates=[failed_inventory_update_latest],
create_inventory_update=waiting_inventory_update,
inventory_sources=inventory_id_sources)
scheduler._schedule()
scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job])

View File

@ -0,0 +1,56 @@
# Python
import pytest
from datetime import timedelta
class TestJobBlocked():
def test_inventory_update_waiting(self, scheduler_factory, waiting_inventory_update, pending_job):
scheduler = scheduler_factory(tasks=[waiting_inventory_update, pending_job])
scheduler._schedule()
scheduler.start_task.assert_not_called()
def test_inventory_update_running(self, scheduler_factory, running_inventory_update, pending_job, inventory_source_factory, inventory_id_sources):
scheduler = scheduler_factory(tasks=[running_inventory_update, pending_job],
inventory_sources=inventory_id_sources)
scheduler._schedule()
scheduler.start_task.assert_not_called()
def test_project_update_running(self, scheduler_factory, pending_job, running_project_update):
scheduler = scheduler_factory(tasks=[running_project_update, pending_job])
scheduler._schedule()
scheduler.start_task.assert_not_called()
assert scheduler.create_project_update.call_count == 0
def test_project_update_waiting(self, scheduler_factory, pending_job, waiting_project_update):
scheduler = scheduler_factory(tasks=[waiting_project_update, pending_job],
latest_project_updates=[waiting_project_update])
scheduler._schedule()
scheduler.start_task.assert_not_called()
assert scheduler.create_project_update.call_count == 0
class TestJob():
@pytest.fixture
def successful_project_update(self, project_update_factory):
project_update = project_update_factory()
project_update['status'] = 'successful'
project_update['finished'] = project_update['created'] + timedelta(seconds=10)
project_update['project__scm_update_cache_timeout'] = 3600
return project_update
def test_existing_dependencies_finished(self, scheduler_factory, successful_project_update, successful_inventory_update_latest, pending_job):
scheduler = scheduler_factory(tasks=[successful_project_update, pending_job],
latest_project_updates=[successful_project_update],
latest_inventory_updates=[successful_inventory_update_latest])
scheduler._schedule()
scheduler.start_task.assert_called_with(pending_job)

View File

@ -0,0 +1,75 @@
# TODO: wherever get_latest_project_update_tasks() is stubbed and returns a
# ProjectUpdateDict, we should instead return a ProjectUpdateLatestDict().
# For now this is ok, since the fields don't deviate that much.
class TestStartProjectUpdate():
def test(self, scheduler_factory, pending_project_update):
scheduler = scheduler_factory(tasks=[pending_project_update])
scheduler._schedule()
scheduler.start_task.assert_called_with(pending_project_update)
assert scheduler.create_project_update.call_count == 0
'''
Explicit project updates should always run; they should not use the cache logic.
'''
def test_cache_oblivious(self, scheduler_factory, successful_project_update, pending_project_update):
scheduler = scheduler_factory(tasks=[pending_project_update],
latest_project_updates=[successful_project_update])
scheduler._schedule()
scheduler.start_task.assert_called_with(pending_project_update)
assert scheduler.create_project_update.call_count == 0
class TestCreateDependentProjectUpdate():
def test(self, scheduler_factory, pending_job, waiting_project_update):
scheduler = scheduler_factory(tasks=[pending_job],
create_project_update=waiting_project_update)
scheduler._schedule()
scheduler.start_task.assert_called_with(waiting_project_update, [pending_job])
def test_cache_hit(self, scheduler_factory, pending_job, successful_project_update):
scheduler = scheduler_factory(tasks=[successful_project_update, pending_job],
latest_project_updates=[successful_project_update])
scheduler._schedule()
scheduler.start_task.assert_called_with(pending_job)
def test_cache_miss(self, scheduler_factory, pending_job, successful_project_update_cache_expired, waiting_project_update):
scheduler = scheduler_factory(tasks=[successful_project_update_cache_expired, pending_job],
latest_project_updates=[successful_project_update_cache_expired],
create_project_update=waiting_project_update)
scheduler._schedule()
scheduler.start_task.assert_called_with(waiting_project_update, [pending_job])
def test_last_update_failed(self, scheduler_factory, pending_job, failed_project_update, waiting_project_update):
scheduler = scheduler_factory(tasks=[failed_project_update, pending_job],
latest_project_updates=[failed_project_update],
create_project_update=waiting_project_update)
scheduler._schedule()
scheduler.start_task.assert_called_with(waiting_project_update, [pending_job])
class TestProjectUpdateBlocked():
def test_project_update_running(self, scheduler_factory, running_project_update, pending_project_update):
scheduler = scheduler_factory(tasks=[running_project_update, pending_project_update])
scheduler._schedule()
scheduler.start_task.assert_not_called()
assert scheduler.create_project_update.call_count == 0
def test_job_running(self, scheduler_factory, running_job, pending_project_update):
scheduler = scheduler_factory(tasks=[running_job, pending_project_update])
scheduler._schedule()
scheduler.start_task.assert_not_called()

View File

@ -201,6 +201,7 @@ INSTALLED_APPS = (
'awx.ui',
'awx.fact',
'awx.sso',
'solo',
)
INTERNAL_IPS = ('127.0.0.1',)
@ -392,6 +393,14 @@ CELERYBEAT_SCHEDULE = {
'task': 'awx.main.tasks.cluster_node_heartbeat',
'schedule': timedelta(seconds=60)
},
'task_manager': {
'task': 'awx.main.scheduler.tasks.run_task_manager',
'schedule': timedelta(seconds=20)
},
'task_fail_inconsistent_running_jobs': {
'task': 'awx.main.scheduler.tasks.run_fail_inconsistent_running_jobs',
'schedule': timedelta(seconds=30)
},
}
# Django Caching Configuration

View File

@ -0,0 +1,57 @@
# Task Manager Overview
The task manager is responsible for deciding when jobs should be introduced to celery for running. When choosing a task to run, the considerations are: (1) creation time, (2) job dependencies, (3) capacity.
Independent jobs are run in order of creation time, earliest first. Jobs with dependencies are also run in creation-time order within their group of dependencies. Capacity is the final consideration when deciding whether to release a job to be run by celery.
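As a rough illustration of that ordering, the candidate tasks could be gathered in creation order with a query along these lines (a sketch only; the real `get_tasks()` used by the scheduler may filter and order differently):

```python
# Hypothetical sketch of gathering non-completed work in creation order;
# the statuses and ordering shown here are assumptions, not the exact query.
from awx.main.models import UnifiedJob

def get_candidate_tasks():
    active_statuses = ('pending', 'waiting', 'running')
    return list(UnifiedJob.objects.filter(status__in=active_statuses)
                                  .order_by('created'))
```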
## Task Manager Architecture
The task manager has a single entry point, `Scheduler().schedule()`. The method may be called in parallel, at any time, as many times as the user wants. The `schedule()` function first tries to acquire a single, global lock using the first record of the Instance table in the database. If the lock cannot be acquired, the method returns; failing to acquire the lock indicates that another instance is currently running `schedule()`.
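A minimal sketch of that locking pattern, assuming it is implemented with `select_for_update(nowait=True)` on the first `Instance` row (the real implementation may use a different mechanism):

```python
# Hypothetical sketch of the single global lock described above.
from django.db import DatabaseError, transaction

from awx.main.models import Instance


def schedule():
    with transaction.atomic():
        try:
            # Lock the first Instance row; nowait=True makes a concurrent
            # caller fail immediately instead of waiting for the lock holder.
            Instance.objects.select_for_update(nowait=True).order_by('id').first()
        except DatabaseError:
            # Another process is already running schedule(); bail out.
            return
        _schedule()  # run the actual scheduling pass while the lock is held
```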
### Hybrid Scheduler: Periodic + Event
The `schedule()` function is run (a) periodically by a celery task and (b) on job creation or completion. The task manager would behave correctly if run exclusively via either (a) or (b). We chose to trigger `schedule()` via both mechanisms because of their complementary properties: (b) reduces the time from launch to running, resulting in a better user experience, while (a) is a fail-safe in case we miss code paths, present or future, that change the three scheduling considerations and should therefore call `schedule()` (e.g. adding new nodes to Tower changes the capacity, or obscure job error handling fails a job).
Empirically, the periodic task manager has served us well in the past, and we will continue to rely on it alongside the added event-triggered `schedule()`.
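A minimal sketch of the two triggers, using `TaskManager` as imported in the tests above; the periodic side mirrors the `CELERYBEAT_SCHEDULE` entry added in settings, while the signal hook is only an assumption about where the event trigger lives:

```python
# Illustrative only; the exact hook points for the event trigger are assumptions.
from celery import shared_task
from django.db.models.signals import post_save
from django.dispatch import receiver

from awx.main.models.jobs import Job
from awx.main.scheduler import TaskManager


@shared_task
def run_task_manager():
    # (a) periodic trigger: celery beat invokes this every 20 seconds
    TaskManager().schedule()


@receiver(post_save, sender=Job)
def schedule_on_job_change(sender, instance, created, **kwargs):
    # (b) event trigger: a job was just created or changed state,
    # so kick off an asynchronous scheduling pass right away
    run_task_manager.apply_async()
```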
### Scheduler Algorithm
* Get all non-completed jobs, `all_tasks`
* Generate the hash tables from `all_tasks`:
* `<job_template_id, True/False>` indicates a job is running
* `<project_id, True/False>` indicates a project update is running
* `<inventory_id, True/False>` indicates a job template or inventory update is running
* `<inventory_source_id, True/False>` indicates an inventory update is running
* `<workflow_job_template_id, True/False>` indicates a workflow job is running
* `<project_id, latest_project_update_partial>` used to determine cache timeout
* `<inventory_id, [ inventory_source_partial, ... ]>` used to determine cache timeout and dependencies to spawn
* `<inventory_source_id, latest_inventory_update_partial>` used to determine cache timeout
* Detect finished workflow jobs
* Spawn next workflow jobs if needed
* For each pending job, starting with the oldest-created job and stopping when remaining capacity reaches 0 (a sketch of this loop follows the list):
* If the job is not blocked (determined using the generated hash tables) and there is capacity, then mark it as `waiting` and submit the job to celery.
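A minimal sketch of that final loop, using `start_task()` as exercised by the tests above; `is_job_blocked()` and `get_remaining_capacity()` are assumed helper names for readability:

```python
# Illustrative sketch only; helper names are assumptions.
def process_pending_tasks(self, pending_tasks):
    for task in pending_tasks:                   # oldest 'created' first
        if self.is_job_blocked(task):            # consult the hash tables above
            continue
        if self.get_remaining_capacity() <= 0:   # out of capacity: stop this pass
            break
        self.start_task(task)                    # mark 'waiting', submit to celery
```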
### Job Lifecycle
| Job Status | State |
|:----------:|:------------------------------------------------------------------------------------------------------------------:|
| pending | Job launched. <br>1. Hasn't yet been seen by the scheduler <br>2. Is blocked by another task <br>3. Not enough capacity |
| waiting | Job submitted to celery. |
| running | Job running in celery. |
| successful | Job finished with ansible-playbook return code 0. |
| failed | Job finished with ansible-playbook return code other than 0. |
| error | System failure. |
## todo
## Code Composition
* partials
*
## Acceptance Tests
* assimilate with this .md and trim the fat https://docs.google.com/a/redhat.com/document/d/1AOvKiTMSV0A2RHykHW66BZKBuaJ_l0SJ-VbMwvu-5Gk/edit?usp=sharing

View File

@ -24,7 +24,9 @@ django-polymorphic==0.7.2
django-radius==1.0.0
djangorestframework==3.3.2
djangorestframework-yaml==1.0.2
django-solo==1.1.2
django-split-settings==0.1.1
django-transaction-hooks==0.2
django-taggit==0.17.6
git+https://github.com/matburt/dm.xmlsec.binding.git@master#egg=dm.xmlsec.binding
dogpile.core==0.4.1