diff --git a/awx/main/migrations/0047_v310_tower_state.py b/awx/main/migrations/0047_v310_tower_state.py new file mode 100644 index 0000000000..941dfd0ba2 --- /dev/null +++ b/awx/main/migrations/0047_v310_tower_state.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0046_v310_job_event_stdout'), + ] + + operations = [ + migrations.CreateModel( + name='TowerScheduleState', + fields=[ + ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('schedule_last_run', models.DateTimeField(auto_now_add=True)), + ], + options={ + 'abstract': False, + }, + ), + ] diff --git a/awx/main/models/ad_hoc_commands.py b/awx/main/models/ad_hoc_commands.py index 65f40427b0..aadd34c190 100644 --- a/awx/main/models/ad_hoc_commands.py +++ b/awx/main/models/ad_hoc_commands.py @@ -4,7 +4,6 @@ # Python import datetime import hmac -import json import logging from urlparse import urljoin @@ -24,7 +23,6 @@ from jsonfield import JSONField # AWX from awx.main.models.base import * # noqa from awx.main.models.unified_jobs import * # noqa -from awx.main.utils import decrypt_field from awx.main.models.notifications import JobNotificationMixin logger = logging.getLogger('awx.main.models.ad_hoc_commands') @@ -181,13 +179,6 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin): def get_passwords_needed_to_start(self): return self.passwords_needed_to_start - def is_blocked_by(self, obj): - from awx.main.models import InventoryUpdate - if type(obj) == InventoryUpdate: - if self.inventory == obj.inventory_source.inventory: - return True - return False - @property def task_impact(self): # NOTE: We sorta have to assume the host count matches and that forks default to 5 @@ -195,35 +186,6 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin): count_hosts = Host.objects.filter( enabled=True, inventory__ad_hoc_commands__pk=self.pk).count() return min(count_hosts, 5 if self.forks == 0 else self.forks) * 10 - def generate_dependencies(self, active_tasks): - from awx.main.models import InventoryUpdate - if not self.inventory: - return [] - inventory_sources = self.inventory.inventory_sources.filter( update_on_launch=True) - inventory_sources_found = [] - dependencies = [] - for obj in active_tasks: - if type(obj) == InventoryUpdate: - if obj.inventory_source in inventory_sources: - inventory_sources_found.append(obj.inventory_source) - # Skip updating any inventory sources that were already updated before - # running this job (via callback inventory refresh). - try: - start_args = json.loads(decrypt_field(self, 'start_args')) - except Exception: - start_args = None - start_args = start_args or {} - inventory_sources_already_updated = start_args.get('inventory_sources_already_updated', []) - if inventory_sources_already_updated: - for source in inventory_sources.filter(pk__in=inventory_sources_already_updated): - if source not in inventory_sources_found: - inventory_sources_found.append(source) - if inventory_sources.count(): # and not has_setup_failures? Probably handled as an error scenario in the task runner - for source in inventory_sources: - if source not in inventory_sources_found and source.needs_update_on_launch: - dependencies.append(source.create_inventory_update(launch_type='dependency')) - return dependencies - def copy(self): data = {} for field in ('job_type', 'inventory_id', 'limit', 'credential_id', diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 3f92aebc12..691faf6305 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -5,13 +5,15 @@ from django.db import models from django.db.models.signals import post_save from django.dispatch import receiver +from solo.models import SingletonModel + from awx.main.managers import InstanceManager from awx.main.models.inventory import InventoryUpdate from awx.main.models.jobs import Job from awx.main.models.projects import ProjectUpdate from awx.main.models.unified_jobs import UnifiedJob -__all__ = ('Instance', 'JobOrigin') +__all__ = ('Instance', 'JobOrigin', 'TowerScheduleState',) class Instance(models.Model): @@ -33,6 +35,8 @@ class Instance(models.Model): # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing return "tower" +class TowerScheduleState(SingletonModel): + schedule_last_run = models.DateTimeField(auto_now_add=True) class JobOrigin(models.Model): """A model representing the relationship between a unified job and diff --git a/awx/main/models/inventory.py b/awx/main/models/inventory.py index 6fb3e2f992..662b1702f9 100644 --- a/awx/main/models/inventory.py +++ b/awx/main/models/inventory.py @@ -22,7 +22,6 @@ from awx.main.constants import CLOUD_PROVIDERS from awx.main.fields import AutoOneToOneField, ImplicitRoleField from awx.main.managers import HostManager from awx.main.models.base import * # noqa -from awx.main.models.jobs import Job from awx.main.models.unified_jobs import * # noqa from awx.main.models.mixins import ResourceMixin from awx.main.models.notifications import ( @@ -1089,7 +1088,7 @@ class InventorySource(UnifiedJobTemplate, InventorySourceOptions): def _get_unified_job_field_names(cls): return ['name', 'description', 'source', 'source_path', 'source_script', 'source_vars', 'schedule', 'credential', 'source_regions', 'instance_filters', 'group_by', 'overwrite', 'overwrite_vars', - 'timeout'] + 'timeout', 'launch_type',] def save(self, *args, **kwargs): # If update_fields has been specified, add our field names to it, @@ -1250,15 +1249,6 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin): def get_ui_url(self): return urljoin(settings.TOWER_URL_BASE, "/#/inventory_sync/{}".format(self.pk)) - def is_blocked_by(self, obj): - if type(obj) == InventoryUpdate: - if self.inventory_source.inventory == obj.inventory_source.inventory: - return True - if type(obj) == Job: - if self.inventory_source.inventory == obj.inventory: - return True - return False - @property def task_impact(self): return 50 diff --git a/awx/main/models/jobs.py b/awx/main/models/jobs.py index 3377968eba..8ed723626b 100644 --- a/awx/main/models/jobs.py +++ b/awx/main/models/jobs.py @@ -33,7 +33,7 @@ from awx.main.models.notifications import ( NotificationTemplate, JobNotificationMixin, ) -from awx.main.utils import decrypt_field, ignore_inventory_computed_fields +from awx.main.utils import ignore_inventory_computed_fields from awx.main.redact import PlainTextCleaner from awx.main.fields import ImplicitRoleField from awx.main.models.mixins import ResourceMixin @@ -646,29 +646,6 @@ class Job(UnifiedJob, JobOptions, JobNotificationMixin): kwargs['job_host_summaries__job__pk'] = self.pk return Host.objects.filter(**kwargs) - def is_blocked_by(self, obj): - from awx.main.models import InventoryUpdate, ProjectUpdate - if type(obj) == Job: - if obj.job_template is not None and obj.inventory is not None: - if obj.job_template == self.job_template and \ - obj.inventory == self.inventory: - if self.allow_simultaneous: - return False - if obj.launch_type == 'callback' and self.launch_type == 'callback' and \ - obj.limit != self.limit: - return False - return True - return False - if type(obj) == InventoryUpdate: - if self.inventory == obj.inventory_source.inventory: - return True - return False - if type(obj) == ProjectUpdate: - if obj.project == self.project: - return True - return False - return False - @property def task_impact(self): # NOTE: We sorta have to assume the host count matches and that forks default to 5 @@ -707,39 +684,6 @@ class Job(UnifiedJob, JobOptions, JobNotificationMixin): def processed_hosts(self): return self._get_hosts(job_host_summaries__processed__gt=0) - def generate_dependencies(self, active_tasks): - from awx.main.models import InventoryUpdate, ProjectUpdate - inventory_sources = self.inventory.inventory_sources.filter(update_on_launch=True) - project_found = False - inventory_sources_found = [] - dependencies = [] - for obj in active_tasks: - if type(obj) == ProjectUpdate and self.project is not None: - if obj.project == self.project: - project_found = True - if type(obj) == InventoryUpdate: - if obj.inventory_source in inventory_sources: - inventory_sources_found.append(obj.inventory_source) - # Skip updating any inventory sources that were already updated before - # running this job (via callback inventory refresh). - try: - start_args = json.loads(decrypt_field(self, 'start_args')) - except Exception: - start_args = None - start_args = start_args or {} - inventory_sources_already_updated = start_args.get('inventory_sources_already_updated', []) - if inventory_sources_already_updated: - for source in inventory_sources.filter(pk__in=inventory_sources_already_updated): - if source not in inventory_sources_found: - inventory_sources_found.append(source) - if not project_found and self.project is not None and self.project.needs_update_on_launch: - dependencies.append(self.project.create_project_update(launch_type='dependency')) - if inventory_sources.count(): # and not has_setup_failures? Probably handled as an error scenario in the task runner - for source in inventory_sources: - if source not in inventory_sources_found and source.needs_update_on_launch: - dependencies.append(source.create_inventory_update(launch_type='dependency')) - return dependencies - def notification_data(self, block=5): data = super(Job, self).notification_data() all_hosts = {} @@ -1526,9 +1470,6 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin): def get_ui_url(self): return urljoin(settings.TOWER_URL_BASE, "/#/management_jobs/{}".format(self.pk)) - def is_blocked_by(self, obj): - return True - def handle_extra_data(self, extra_data): extra_vars = {} if isinstance(extra_data, dict): diff --git a/awx/main/models/projects.py b/awx/main/models/projects.py index 1c693a6398..c3763ff34f 100644 --- a/awx/main/models/projects.py +++ b/awx/main/models/projects.py @@ -22,7 +22,6 @@ from django.utils.timezone import now, make_aware, get_default_timezone # AWX from awx.main.models.base import * # noqa -from awx.main.models.jobs import Job from awx.main.models.notifications import ( NotificationTemplate, JobNotificationMixin, @@ -275,7 +274,7 @@ class Project(UnifiedJobTemplate, ProjectOptions, ResourceMixin): def _get_unified_job_field_names(cls): return ['name', 'description', 'local_path', 'scm_type', 'scm_url', 'scm_branch', 'scm_clean', 'scm_delete_on_update', - 'credential', 'schedule', 'timeout'] + 'credential', 'schedule', 'timeout', 'launch_type',] def save(self, *args, **kwargs): new_instance = not bool(self.pk) @@ -424,15 +423,6 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin): from awx.main.tasks import RunProjectUpdate return RunProjectUpdate - def is_blocked_by(self, obj): - if type(obj) == ProjectUpdate: - if self.project == obj.project: - return True - if type(obj) == Job: - if self.project == obj.project: - return True - return False - def websocket_emit_data(self): return dict(project_id=self.project.id) diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 674bedbffe..fa989ad60c 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -13,7 +13,7 @@ from StringIO import StringIO # Django from django.conf import settings -from django.db import models +from django.db import models, connection from django.core.exceptions import NON_FIELD_ERRORS from django.utils.translation import ugettext_lazy as _ from django.utils.timezone import now @@ -778,10 +778,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique def task_impact(self): raise NotImplementedError # Implement in subclass. - def is_blocked_by(self, task_object): - ''' Given another task object determine if this task would be blocked by it ''' - raise NotImplementedError # Implement in subclass. - def websocket_emit_data(self): ''' Return extra data that should be included when submitting data to the browser over the websocket connection ''' return {} @@ -792,11 +788,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique status_data['group_name'] = 'jobs' emit_channel_notification('jobs-status_changed', status_data) - def generate_dependencies(self, active_tasks): - ''' Generate any tasks that the current task might be dependent on given a list of active - tasks that might preclude creating one''' - return [] - def notification_data(self): return dict(id=self.id, name=self.name, @@ -835,14 +826,17 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique return (True, opts) + def start_celery_task(self, opts, error_callback, success_callback): + task_class = self._get_task_class() + task_class().apply_async((self.pk,), opts, link_error=error_callback, link=success_callback) + def start(self, error_callback, success_callback, **kwargs): ''' Start the task running via Celery. ''' - task_class = self._get_task_class() (res, opts) = self.pre_start(**kwargs) if res: - task_class().apply_async((self.pk,), opts, link_error=error_callback, link=success_callback) + self.start_celery_task(opts, error_callback, success_callback) return res def signal_start(self, **kwargs): @@ -871,7 +865,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique self.websocket_emit_status("pending") from awx.main.scheduler.tasks import run_job_launch - run_job_launch.delay(self.id) + connection.on_commit(lambda: run_job_launch.delay(self.id)) # Each type of unified job has a different Task class; get the # appropirate one. diff --git a/awx/main/models/workflow.py b/awx/main/models/workflow.py index a4f02deef2..b267343ea3 100644 --- a/awx/main/models/workflow.py +++ b/awx/main/models/workflow.py @@ -393,11 +393,6 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow def _get_parent_field_name(cls): return 'workflow_job_template' - @classmethod - def _get_task_class(cls): - from awx.main.tasks import RunWorkflowJob - return RunWorkflowJob - def _has_failed(self): return False @@ -411,9 +406,6 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow #def get_ui_url(self): # return urlparse.urljoin(tower_settings.TOWER_URL_BASE, "/#/workflow_jobs/{}".format(self.pk)) - def is_blocked_by(self, obj): - return True - @property def task_impact(self): return 0 @@ -426,11 +418,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow def get_notification_friendly_name(self): return "Workflow Job" - def start(self, *args, **kwargs): - (res, opts) = self.pre_start(**kwargs) - if res: - self.status = 'running' - self.save() - self.websocket_emit_status("running") - return res + ''' + A WorkflowJob is a virtual job. It doesn't result in a celery task. + ''' + def start_celery_task(self, opts, error_callback, success_callback): + return None diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 6ecdc09b37..c0a680c7d4 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -2,256 +2,356 @@ # All Rights Reserved # Python -import datetime +from datetime import timedelta import logging +from sets import Set # Django from django.conf import settings -from django.db import transaction +from django.db import transaction, connection +from django.db.utils import DatabaseError # AWX from awx.main.models import * # noqa -from awx.main.utils import get_system_task_capacity -from awx.main.scheduler.dag_simple import SimpleDAG +#from awx.main.scheduler.dag_simple import SimpleDAG from awx.main.scheduler.dag_workflow import WorkflowDAG +from awx.main.scheduler.dependency_graph import DependencyGraph +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, + ProjectUpdateLatestDict, + InventoryUpdateDict, + InventoryUpdateLatestDict, + InventorySourceDict, + SystemJobDict, + AdHocCommandDict, + WorkflowJobDict, +) + # Celery from celery.task.control import inspect logger = logging.getLogger('awx.main.scheduler') -def get_tasks(): - """Fetch all Tower tasks that are relevant to the task management - system. - """ - RELEVANT_JOBS = ('pending', 'waiting', 'running') - # TODO: Replace this when we can grab all objects in a sane way. - graph_jobs = [j for j in Job.objects.filter(status__in=RELEVANT_JOBS)] - graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(status__in=RELEVANT_JOBS)] - graph_inventory_updates = [iu for iu in - InventoryUpdate.objects.filter(status__in=RELEVANT_JOBS)] - graph_project_updates = [pu for pu in - ProjectUpdate.objects.filter(status__in=RELEVANT_JOBS)] - graph_system_jobs = [sj for sj in - SystemJob.objects.filter(status__in=RELEVANT_JOBS)] - graph_workflow_jobs = [wf for wf in - WorkflowJob.objects.filter(status__in=RELEVANT_JOBS)] - all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates + - graph_project_updates + graph_system_jobs + - graph_workflow_jobs, - key=lambda task: task.created) - return all_actions +class TaskManager(): + def __init__(self): + self.graph = DependencyGraph() + self.capacity_total = 200 + self.capacity_used = 0 -def get_running_workflow_jobs(): - graph_workflow_jobs = [wf for wf in - WorkflowJob.objects.filter(status='running')] - return graph_workflow_jobs + def get_tasks(self): + status_list = ('pending', 'waiting', 'running') -def spawn_workflow_graph_jobs(workflow_jobs): - # TODO: Consider using transaction.atomic - for workflow_job in workflow_jobs: - dag = WorkflowDAG(workflow_job) - spawn_nodes = dag.bfs_nodes_to_run() - for spawn_node in spawn_nodes: - kv = spawn_node.get_job_kwargs() - job = spawn_node.unified_job_template.create_unified_job(**kv) - spawn_node.job = job - spawn_node.save() - can_start = job.signal_start(**kv) - if not can_start: - job.status = 'failed' - job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials" - job.save(update_fields=['status', 'job_explanation']) - job.websocket_emit_status("failed") + jobs = JobDict.filter_partial(status=status_list) + inventory_updates = InventoryUpdateDict.filter_partial(status=status_list) + project_updates = ProjectUpdateDict.filter_partial(status=status_list) + system_jobs = SystemJobDict.filter_partial(status=status_list) + ad_hoc_commands = AdHocCommandDict.filter_partial(status=status_list) + workflow_jobs = WorkflowJobDict.filter_partial(status=status_list) - # TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ? - #emit_websocket_notification('/socket.io/jobs', '', dict(id=)) + all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands + workflow_jobs, + key=lambda task: task['created']) + return all_actions -# See comment in tasks.py::RunWorkflowJob::run() -def process_finished_workflow_jobs(workflow_jobs): - for workflow_job in workflow_jobs: - dag = WorkflowDAG(workflow_job) - if dag.is_workflow_done(): - with transaction.atomic(): + ''' + Tasks that are running and SHOULD have a celery task. + ''' + def get_running_tasks(self): + status_list = ('running',) + + jobs = JobDict.filter_partial(status=status_list) + inventory_updates = InventoryUpdateDict.filter_partial(status=status_list) + project_updates = ProjectUpdateDict.filter_partial(status=status_list) + system_jobs = SystemJobDict.filter_partial(status=status_list) + ad_hoc_commands = AdHocCommandDict.filter_partial(status=status_list) + + all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands, + key=lambda task: task['created']) + return all_actions + + # TODO: Consider a database query for this logic + def get_latest_project_update_tasks(self, all_sorted_tasks): + project_ids = Set() + for task in all_sorted_tasks: + if type(task) == JobDict: + project_ids.add(task['project_id']) + + return ProjectUpdateLatestDict.filter_partial(list(project_ids)) + + # TODO: Consider a database query for this logic + def get_latest_inventory_update_tasks(self, all_sorted_tasks): + inventory_ids = Set() + for task in all_sorted_tasks: + if type(task) == JobDict: + inventory_ids.add(task['inventory_id']) + + return InventoryUpdateLatestDict.filter_partial(list(inventory_ids)) + + + def get_running_workflow_jobs(self): + graph_workflow_jobs = [wf for wf in + WorkflowJob.objects.filter(status='running')] + return graph_workflow_jobs + + # TODO: Consider a database query for this logic + def get_inventory_source_tasks(self, all_sorted_tasks): + inventory_ids = Set() + results = [] + for task in all_sorted_tasks: + if type(task) is JobDict: + inventory_ids.add(task['inventory_id']) + + for inventory_id in inventory_ids: + results.append((inventory_id, InventorySourceDict.filter_partial(inventory_id))) + + return results + + def spawn_workflow_graph_jobs(self, workflow_jobs): + for workflow_job in workflow_jobs: + dag = WorkflowDAG(workflow_job) + spawn_nodes = dag.bfs_nodes_to_run() + for spawn_node in spawn_nodes: + kv = spawn_node.get_job_kwargs() + job = spawn_node.unified_job_template.create_unified_job(**kv) + spawn_node.job = job + spawn_node.save() + can_start = job.signal_start(**kv) + if not can_start: + job.status = 'failed' + job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials" + job.save(update_fields=['status', 'job_explanation']) + connection.on_commit(lambda: job.websocket_emit_status('failed')) + + # TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ? + #emit_websocket_notification('/socket.io/jobs', '', dict(id=)) + + # See comment in tasks.py::RunWorkflowJob::run() + def process_finished_workflow_jobs(self, workflow_jobs): + for workflow_job in workflow_jobs: + dag = WorkflowDAG(workflow_job) + if dag.is_workflow_done(): if workflow_job._has_failed(): workflow_job.status = 'failed' else: workflow_job.status = 'successful' workflow_job.save() - workflow_job.websocket_emit_status(workflow_job.status) + connection.on_commit(lambda: workflow_job.websocket_emit_status(workflow_job.status)) -def rebuild_graph(): - """Regenerate the task graph by refreshing known tasks from Tower, purging - orphaned running tasks, and creating dependencies for new tasks before - generating directed edge relationships between those tasks. - """ - ''' - # Sanity check: Only do this on the primary node. - if Instance.objects.my_role() == 'secondary': - return None - ''' + def get_active_tasks(self): + inspector = inspect() + if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'): + active_task_queues = inspector.active() + else: + logger.warn("Ignoring celery task inspector") + active_task_queues = None - inspector = inspect() - if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'): - active_task_queues = inspector.active() - else: - logger.warn("Ignoring celery task inspector") - active_task_queues = None + active_tasks = set() + if active_task_queues is not None: + for queue in active_task_queues: + map(lambda at: active_tasks.add(at['id']), active_task_queues[queue]) + else: + if not hasattr(settings, 'CELERY_UNIT_TEST'): + return None - all_sorted_tasks = get_tasks() - if not len(all_sorted_tasks): - return None + return active_tasks - active_tasks = [] - if active_task_queues is not None: - for queue in active_task_queues: - active_tasks += [at['id'] for at in active_task_queues[queue]] - else: - logger.error("Could not communicate with celery!") - # TODO: Something needs to be done here to signal to the system - # as a whole that celery appears to be down. - if not hasattr(settings, 'CELERY_UNIT_TEST'): - return None + def start_task(self, task, dependent_tasks=[]): + from awx.main.tasks import handle_work_error, handle_work_success - running_tasks = filter(lambda t: t.status == 'running', all_sorted_tasks) - running_celery_tasks = filter(lambda t: type(t) != WorkflowJob, running_tasks) - waiting_tasks = filter(lambda t: t.status != 'running', all_sorted_tasks) - new_tasks = filter(lambda t: t.status == 'pending', all_sorted_tasks) + task_actual = { + 'type':task.get_job_type_str(), + 'id': task['id'], + } + dependencies = [{'type': t.get_job_type_str(), 'id': t['id']} for t in dependent_tasks] + + error_handler = handle_work_error.s(subtasks=[task_actual] + dependencies) + success_handler = handle_work_success.s(task_actual=task_actual) + + job_obj = task.get_full() + job_obj.status = 'waiting' - # Check running tasks and make sure they are active in celery - logger.debug("Active celery tasks: " + str(active_tasks)) - for task in list(running_celery_tasks): - if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')): - # NOTE: Pull status again and make sure it didn't finish in - # the meantime? - task.status = 'failed' - task.job_explanation += ' '.join(( - 'Task was marked as running in Tower but was not present in', - 'Celery, so it has been marked as failed.', - )) - task.save() - task.websocket_emit_status("failed") - running_tasks.pop(running_tasks.index(task)) - logger.error("Task %s appears orphaned... marking as failed" % task) + (start_status, opts) = job_obj.pre_start() + if not start_status: + job_obj.status = 'failed' + if job_obj.job_explanation: + job_obj.job_explanation += ' ' + job_obj.job_explanation += 'Task failed pre-start check.' + job_obj.save() + # TODO: run error handler to fail sub-tasks and send notifications + else: + if type(job_obj) is WorkflowJob: + job_obj.status = 'running' - # Create and process dependencies for new tasks - for task in new_tasks: - logger.debug("Checking dependencies for: %s" % str(task)) - try: - task_dependencies = task.generate_dependencies(running_tasks + waiting_tasks) - except Exception, e: - logger.error("Failed processing dependencies for {}: {}".format(task, e)) - task.status = 'failed' - task.job_explanation += 'Task failed to generate dependencies: {}'.format(e) - task.save() - task.websocket_emit_status("failed") - continue - logger.debug("New dependencies: %s" % str(task_dependencies)) - for dep in task_dependencies: - # We recalculate the created time for the moment to ensure the - # dependencies are always sorted in the right order relative to - # the dependent task. - time_delt = len(task_dependencies) - task_dependencies.index(dep) - dep.created = task.created - datetime.timedelta(seconds=1 + time_delt) - dep.status = 'waiting' - dep.save() - waiting_tasks.insert(waiting_tasks.index(task), dep) - if not hasattr(settings, 'UNIT_TEST_IGNORE_TASK_WAIT'): - task.status = 'waiting' - task.save() + job_obj.save() - # Rebuild graph - graph = SimpleDAG() - for task in running_tasks: - graph.add_node(task) - for wait_task in waiting_tasks[:50]: - node_dependencies = [] - for node in graph: - if wait_task.is_blocked_by(node['node_object']): - node_dependencies.append(node['node_object']) - graph.add_node(wait_task) - for dependency in node_dependencies: - graph.add_edge(wait_task, dependency) - if settings.DEBUG: - graph.generate_graphviz_plot() - return graph + self.consume_capacity(task) -def process_graph(graph, task_capacity): - """Given a task dependency graph, start and manage tasks given their - priority and weight. - """ - from awx.main.tasks import handle_work_error, handle_work_success + def post_commit(): + job_obj.websocket_emit_status(job_obj.status) + if job_obj.status != 'failed': + job_obj.start_celery_task(opts, error_callback=error_handler, success_callback=success_handler) + + connection.on_commit(post_commit) - leaf_nodes = graph.get_leaf_nodes() - running_nodes = filter(lambda x: x['node_object'].status == 'running', leaf_nodes) - running_impact = sum([t['node_object'].task_impact for t in running_nodes]) - ready_nodes = filter(lambda x: x['node_object'].status != 'running', leaf_nodes) - remaining_volume = task_capacity - running_impact - logger.info('Running Nodes: %s; Capacity: %s; Running Impact: %s; ' - 'Remaining Capacity: %s' % - (str(running_nodes), str(task_capacity), - str(running_impact), str(remaining_volume))) - logger.info("Ready Nodes: %s" % str(ready_nodes)) - for task_node in ready_nodes: - node_obj = task_node['node_object'] - # NOTE: This could be used to pass metadata through the task system - # node_args = task_node['metadata'] - impact = node_obj.task_impact - if impact <= remaining_volume or running_impact == 0: - node_dependencies = graph.get_dependents(node_obj) - # Allow other tasks to continue if a job fails, even if they are - # other jobs. + def process_runnable_tasks(self, runnable_tasks): + map(lambda task: self.graph.add_job(task), runnable_tasks) - node_type = graph.get_node_type(node_obj) - if node_type == 'job': - # clear dependencies because a job can block (not necessarily - # depend) on other jobs that share the same job template - node_dependencies = [] + def create_project_update(self, task): + dep = Project.objects.get(id=task['project_id']).create_project_update(launch_type='dependency') - # Make the workflow_job look like it's started by setting status to - # running, but don't make a celery Task for it. - # Introduce jobs from the workflow so they are candidates to run. - # Call process_graph() again to allow choosing for run, the - # created candidate jobs. - elif node_type == 'workflow_job': - node_obj.start() - spawn_workflow_graph_jobs([node_obj]) - return process_graph(graph, task_capacity) + # Project created 1 seconds behind + dep.created = task['created'] - timedelta(seconds=1) + dep.status = 'pending' + dep.save() - dependent_nodes = [{'type': graph.get_node_type(node_obj), 'id': node_obj.id}] + \ - [{'type': graph.get_node_type(n['node_object']), - 'id': n['node_object'].id} for n in node_dependencies] - error_handler = handle_work_error.s(subtasks=dependent_nodes) - success_handler = handle_work_success.s(task_actual={'type': graph.get_node_type(node_obj), - 'id': node_obj.id}) - with transaction.atomic(): - start_status = node_obj.start(error_callback=error_handler, success_callback=success_handler) - if not start_status: - node_obj.status = 'failed' - if node_obj.job_explanation: - node_obj.job_explanation += ' ' - node_obj.job_explanation += 'Task failed pre-start check.' - node_obj.save() - continue - remaining_volume -= impact - running_impact += impact - logger.info('Started Node: %s (capacity hit: %s) ' - 'Remaining Capacity: %s' % - (str(node_obj), str(impact), str(remaining_volume))) + project_task = ProjectUpdateDict.get_partial(dep.id) -def schedule(): - with transaction.atomic(): - # Lock - Instance.objects.select_for_update().all()[0] + return project_task - task_capacity = get_system_task_capacity() + def create_inventory_update(self, task, inventory_source_task): + dep = InventorySource.objects.get(id=inventory_source_task['id']).create_inventory_update(launch_type='dependency') - workflow_jobs = get_running_workflow_jobs() - process_finished_workflow_jobs(workflow_jobs) - spawn_workflow_graph_jobs(workflow_jobs) + dep.created = task['created'] - timedelta(seconds=2) + dep.status = 'pending' + dep.save() + + inventory_task = InventoryUpdateDict.get_partial(dep.id) + + return inventory_task + + def generate_dependencies(self, task): + dependencies = [] + # TODO: What if the project is null ? + if type(task) is JobDict: + if task['project__scm_update_on_launch'] is True and \ + self.graph.should_update_related_project(task): + project_task = self.create_project_update(task) + dependencies.append(project_task) + # Inventory created 2 seconds behind job + + for inventory_source_task in self.graph.get_inventory_sources(task['inventory_id']): + if self.graph.should_update_related_inventory_source(task, inventory_source_task['id']): + inventory_task = self.create_inventory_update(task, inventory_source_task) + dependencies.append(inventory_task) + return dependencies + + def process_latest_project_updates(self, latest_project_updates): + map(lambda task: self.graph.add_latest_project_update(task), latest_project_updates) + + def process_latest_inventory_updates(self, latest_inventory_updates): + map(lambda task: self.graph.add_latest_inventory_update(task), latest_inventory_updates) + + def process_inventory_sources(self, inventory_id_sources): + map(lambda (inventory_id, inventory_sources): self.graph.add_inventory_sources(inventory_id, inventory_sources), inventory_id_sources) + + def process_dependencies(self, dependent_task, dependency_tasks): + for task in dependency_tasks: + # ProjectUpdate or InventoryUpdate may be blocked by another of + # the same type. + if not self.graph.is_job_blocked(task): + self.graph.add_job(task) + if not self.would_exceed_capacity(task): + self.start_task(task, [dependent_task]) + else: + self.graph.add_job(task) + + def process_pending_tasks(self, pending_tasks): + for task in pending_tasks: + # Stop processing tasks if we know we are out of capacity + if self.get_remaining_capacity() <= 0: + return + + if not self.graph.is_job_blocked(task): + dependencies = self.generate_dependencies(task) + self.process_dependencies(task, dependencies) + + # Spawning deps might have blocked us + if not self.graph.is_job_blocked(task): + self.graph.add_job(task) + if not self.would_exceed_capacity(task): + self.start_task(task) + else: + self.graph.add_job(task) + + def process_celery_tasks(self, active_tasks, all_running_sorted_tasks): + ''' + Rectify tower db <-> celery inconsistent view of jobs state + ''' + for task in all_running_sorted_tasks: + + if (task['celery_task_id'] not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')): + # NOTE: Pull status again and make sure it didn't finish in + # the meantime? + # TODO: try catch the getting of the job. The job COULD have been deleted + task_obj = task.get_full() + task_obj.status = 'failed' + task_obj.job_explanation += ' '.join(( + 'Task was marked as running in Tower but was not present in', + 'Celery, so it has been marked as failed.', + )) + task_obj.save() + print("Going to fail %s" % task_obj.id) + connection.on_commit(lambda: task_obj.websocket_emit_status('failed')) + + logger.error("Task %s appears orphaned... marking as failed" % task) + + + def calculate_capacity_used(self, tasks): + self.capacity_used = 0 + for t in tasks: + self.capacity_used += t.task_impact() + + def would_exceed_capacity(self, task): + return (task.task_impact() + self.capacity_used > self.capacity_total) + + def consume_capacity(self, task): + self.capacity_used += task.task_impact() + + def get_remaining_capacity(self): + return (self.capacity_total - self.capacity_used) + + def process_tasks(self, all_sorted_tasks): + + running_tasks = filter(lambda t: t['status'] == 'running', all_sorted_tasks) + runnable_tasks = filter(lambda t: t['status'] in ['waiting', 'running'], all_sorted_tasks) + + self.calculate_capacity_used(running_tasks) + + self.process_runnable_tasks(runnable_tasks) + + pending_tasks = filter(lambda t: t['status'] in 'pending', all_sorted_tasks) + self.process_pending_tasks(pending_tasks) + + def _schedule(self): + all_sorted_tasks = self.get_tasks() + if len(all_sorted_tasks) > 0: + #self.process_celery_tasks(active_tasks, all_sorted_tasks) + + latest_project_updates = self.get_latest_project_update_tasks(all_sorted_tasks) + self.process_latest_project_updates(latest_project_updates) + + latest_inventory_updates = self.get_latest_inventory_update_tasks(all_sorted_tasks) + self.process_latest_inventory_updates(latest_inventory_updates) + + inventory_id_sources = self.get_inventory_source_tasks(all_sorted_tasks) + self.process_inventory_sources(inventory_id_sources) + + running_workflow_tasks = self.get_running_workflow_jobs() + self.process_finished_workflow_jobs(running_workflow_tasks) + + self.spawn_workflow_graph_jobs(running_workflow_tasks) + + self.process_tasks(all_sorted_tasks) + + def schedule(self): + with transaction.atomic(): + # Lock + try: + Instance.objects.select_for_update(nowait=True).all()[0] + except DatabaseError: + return + + self._schedule() - graph = rebuild_graph() - if graph: - process_graph(graph, task_capacity) - # Unlock, due to transaction ending diff --git a/awx/main/scheduler/dependency_graph.py b/awx/main/scheduler/dependency_graph.py new file mode 100644 index 0000000000..edd49c98a9 --- /dev/null +++ b/awx/main/scheduler/dependency_graph.py @@ -0,0 +1,203 @@ +from datetime import timedelta +from django.utils.timezone import now as tz_now + +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, + InventoryUpdateDict, + SystemJobDict, + AdHocCommandDict, + WorkflowJobDict, +) +class DependencyGraph(object): + PROJECT_UPDATES = 'project_updates' + INVENTORY_UPDATES = 'inventory_updates' + JOB_TEMPLATE_JOBS = 'job_template_jobs' + SYSTEM_JOB = 'system_job' + INVENTORY_SOURCE_UPDATES = 'inventory_source_updates' + WORKFLOW_JOB_TEMPLATES_JOBS = 'workflow_job_template_jobs' + + LATEST_PROJECT_UPDATES = 'latest_project_updates' + LATEST_INVENTORY_UPDATES = 'latest_inventory_updates' + + INVENTORY_SOURCES = 'inventory_source_ids' + + def __init__(self, *args, **kwargs): + self.data = {} + # project_id -> True / False + self.data[self.PROJECT_UPDATES] = {} + # inventory_id -> True / False + self.data[self.INVENTORY_UPDATES] = {} + # job_template_id -> True / False + self.data[self.JOB_TEMPLATE_JOBS] = {} + # inventory_source_id -> True / False + self.data[self.INVENTORY_SOURCE_UPDATES] = {} + # True / False + self.data[self.SYSTEM_JOB] = True + # workflow_job_template_id -> True / False + self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS] = {} + + # project_id -> latest ProjectUpdateLatestDict + self.data[self.LATEST_PROJECT_UPDATES] = {} + # inventory_source_id -> latest InventoryUpdateLatestDict + self.data[self.LATEST_INVENTORY_UPDATES] = {} + + # inventory_id -> [inventory_source_ids] + self.data[self.INVENTORY_SOURCES] = {} + + def add_latest_project_update(self, job): + self.data[self.LATEST_PROJECT_UPDATES][job['project_id']] = job + + def add_latest_inventory_update(self, job): + self.data[self.LATEST_INVENTORY_UPDATES][job['inventory_source_id']] = job + + def add_inventory_sources(self, inventory_id, inventory_sources): + self.data[self.INVENTORY_SOURCES][inventory_id] = inventory_sources + + def get_inventory_sources(self, inventory_id): + return self.data[self.INVENTORY_SOURCES].get(inventory_id, []) + + def get_now(self): + return tz_now() + + ''' + JobDict + + Presume that job is related to a project that is update on launch + ''' + def should_update_related_project(self, job): + now = self.get_now() + latest_project_update = self.data[self.LATEST_PROJECT_UPDATES].get(job['project_id'], None) + if not latest_project_update: + return True + + # TODO: Other finished, failed cases? i.e. error ? + if latest_project_update['status'] in ['failed', 'canceled']: + return True + + ''' + This is a bit of fuzzy logic. + If the latest project update has a created time == job_created_time-1 + then consider the project update found. This is so we don't enter an infinite loop + of updating the project when cache timeout is 0. + ''' + if latest_project_update['project__scm_update_cache_timeout'] == 0 and \ + latest_project_update['launch_type'] == 'dependency' and \ + latest_project_update['created'] == job['created'] - timedelta(seconds=1): + return False + + ''' + Normal, expected, cache timeout logic + ''' + timeout_seconds = timedelta(seconds=latest_project_update['project__scm_update_cache_timeout']) + if (latest_project_update['finished'] + timeout_seconds) < now: + return True + + return False + + def should_update_related_inventory_source(self, job, inventory_source_id): + now = self.get_now() + latest_inventory_update = self.data[self.LATEST_INVENTORY_UPDATES].get(inventory_source_id, None) + if not latest_inventory_update: + return True + + # TODO: Other finished, failed cases? i.e. error ? + if latest_inventory_update['status'] in ['failed', 'canceled']: + return True + + ''' + This is a bit of fuzzy logic. + If the latest inventory update has a created time == job_created_time-2 + then consider the inventory update found. This is so we don't enter an infinite loop + of updating the project when cache timeout is 0. + ''' + if latest_inventory_update['inventory_source__update_cache_timeout'] == 0 and \ + latest_inventory_update['launch_type'] == 'dependency' and \ + latest_inventory_update['created'] == job['created'] - timedelta(seconds=2): + return False + + ''' + Normal, expected, cache timeout logic + ''' + timeout_seconds = timedelta(seconds=latest_inventory_update['inventory_source__update_cache_timeout']) + if (latest_inventory_update['finished'] + timeout_seconds) < now: + return True + + return False + + def mark_system_job(self): + self.data[self.SYSTEM_JOB] = False + + def mark_project_update(self, job): + self.data[self.PROJECT_UPDATES][job['project_id']] = False + + def mark_inventory_update(self, inventory_id): + self.data[self.INVENTORY_UPDATES][inventory_id] = False + + def mark_inventory_source_update(self, inventory_source_id): + self.data[self.INVENTORY_SOURCE_UPDATES][inventory_source_id] = False + + def mark_job_template_job(self, job): + self.data[self.INVENTORY_UPDATES][job['inventory_id']] = False + self.data[self.PROJECT_UPDATES][job['project_id']] = False + self.data[self.JOB_TEMPLATE_JOBS][job['job_template_id']] = False + + def mark_workflow_job(self, job): + self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS][job['workflow_job_template_id']] = False + + def can_project_update_run(self, job): + return self.data[self.PROJECT_UPDATES].get(job['project_id'], True) + + def can_inventory_update_run(self, job): + return self.data[self.INVENTORY_SOURCE_UPDATES].get(job['inventory_source_id'], True) + + def can_job_run(self, job): + if self.can_project_update_run(job) is True and \ + self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True) is True: + if job['allow_simultaneous'] is False: + return self.data[self.JOB_TEMPLATE_JOBS].get(job['job_template_id'], True) + else: + return True + return False + + def can_workflow_job_run(self, job): + return self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS].get(job['workflow_job_template_id'], True) + + def can_system_job_run(self): + return self.data[self.SYSTEM_JOB] + + def can_ad_hoc_command_run(self, job): + return self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True) + + def is_job_blocked(self, job): + if type(job) is ProjectUpdateDict: + return not self.can_project_update_run(job) + elif type(job) is InventoryUpdateDict: + return not self.can_inventory_update_run(job) + elif type(job) is JobDict: + return not self.can_job_run(job) + elif type(job) is SystemJobDict: + return not self.can_system_job_run() + elif type(job) is AdHocCommandDict: + return not self.can_ad_hoc_command_run(job) + elif type(job) is WorkflowJobDict: + return not self.can_workflow_job_run(job) + + def add_job(self, job): + if type(job) is ProjectUpdateDict: + self.mark_project_update(job) + elif type(job) is InventoryUpdateDict: + self.mark_inventory_update(job['inventory_source__inventory_id']) + self.mark_inventory_source_update(job['inventory_source_id']) + elif type(job) is JobDict: + self.mark_job_template_job(job) + elif type(job) is WorkflowJobDict: + self.mark_workflow_job(job) + elif type(job) is SystemJobDict: + self.mark_system_job() + elif type(job) is AdHocCommandDict: + self.mark_inventory_update(job['inventory_id']) + + def add_jobs(self, jobs): + map(lambda j: self.add_job(j), jobs) + diff --git a/awx/main/scheduler/partial.py b/awx/main/scheduler/partial.py new file mode 100644 index 0000000000..576b66a9c3 --- /dev/null +++ b/awx/main/scheduler/partial.py @@ -0,0 +1,220 @@ + +# AWX +from awx.main.models import ( + Job, + ProjectUpdate, + InventoryUpdate, + InventorySource, + SystemJob, + AdHocCommand, + WorkflowJob, +) + +class PartialModelDict(object): + FIELDS = () + model = None + data = None + + def __init__(self, data): + if type(data) is not dict: + raise RuntimeError("Expected data to be of type dict not %s" % type(data)) + self.data = data + + def __getitem__(self, index): + return self.data[index] + + def __setitem__(self, key, value): + self.data[key] = value + + def get(self, key, **kwargs): + return self.data.get(key, **kwargs) + + def get_full(self): + return self.model.objects.get(id=self.data['id']) + + def refresh_partial(self): + return self.__class__(self.model.objects.filter(id=self.data['id']).values(*self.__class__.get_db_values())[0]) + + @classmethod + def get_partial(cls, id): + return cls(cls.model.objects.filter(id=id).values(*cls.get_db_values())[0]) + + @classmethod + def get_db_values(cls): + return cls.FIELDS + + @classmethod + def filter_partial(cls, status=[]): + kv = { + 'status__in': status + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + + def get_job_type_str(self): + raise RuntimeError("Inherit and implement me") + + def task_impact(self): + raise RuntimeError("Inherit and implement me") + +class JobDict(PartialModelDict): + FIELDS = ( + 'id', 'status', 'job_template_id', 'inventory_id', 'project_id', + 'launch_type', 'limit', 'allow_simultaneous', 'created', + 'job_type', 'celery_task_id', 'project__scm_update_on_launch', + 'forks', 'inventory__inventory_sources', + ) + model = Job + + def get_job_type_str(self): + return 'job' + + def task_impact(self): + return (5 if self.data['forks'] == 0 else self.data['forks']) * 10 + +class ProjectUpdateDict(PartialModelDict): + FIELDS = ( + 'id', 'status', 'project_id', 'created', 'celery_task_id', + 'launch_type', 'project__scm_update_cache_timeout', + 'project__scm_update_on_launch', + ) + model = ProjectUpdate + + def get_job_type_str(self): + return 'project_update' + + def task_impact(self): + return 10 + + @classmethod + def filter_partial(cls, status=[]): + kv = { + 'status__in': status, + 'job_type': 'check', + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + +class ProjectUpdateLatestDict(ProjectUpdateDict): + FIELDS = ( + 'id', 'status', 'project_id', 'created', 'finished', + 'project__scm_update_cache_timeout', + 'launch_type', 'project__scm_update_on_launch', + ) + model = ProjectUpdate + + @classmethod + def filter_partial(cls, project_ids): + # TODO: This can shurley be made more efficient + # * shouldn't have to do a query per inventory_id + # * shouldn't have to call .values() on all the results, only to get the first result + results = [] + for project_id in project_ids: + qs = cls.model.objects.filter(project_id=project_id, status__in=['waiting', 'successful', 'failed']).order_by('-finished', '-started', '-created',) + if qs.count() > 0: + results.append(cls(cls.model.objects.filter(id=qs[0].id).values(*cls.get_db_values())[0])) + return results + +class InventoryUpdateDict(PartialModelDict): + #'inventory_source__update_on_launch', + #'inventory_source__update_cache_timeout', + FIELDS = ( + 'id', 'status', 'created', 'celery_task_id', 'inventory_source_id', 'inventory_source__inventory_id', + ) + model = InventoryUpdate + + def get_job_type_str(self): + return 'inventory_update' + + def task_impact(self): + return 20 + +class InventoryUpdateLatestDict(InventoryUpdateDict): + #'inventory_source__update_on_launch', + #'inventory_source__update_cache_timeout', + FIELDS = ( + 'id', 'status', 'created', 'celery_task_id', 'inventory_source_id', + 'finished', 'inventory_source__update_cache_timeout', 'launch_type', + ) + model = InventoryUpdate + + @classmethod + def filter_partial(cls, inventory_ids): + # TODO: This can shurley be made more efficient + # * shouldn't have to do a query per inventory_id nor per inventory_source_id + # * shouldn't have to call .values() on all the results, only to get the first result + results = [] + for inventory_id in inventory_ids: + inventory_source_ids = InventorySource.objects.filter(inventory_id=inventory_id, + update_on_launch=True).values_list('id', flat=True) + # Find the most recent inventory update for each inventory source + for inventory_source_id in inventory_source_ids: + qs = cls.model.objects.filter(inventory_source_id=inventory_source_id, + status__in=['waiting', 'successful', 'failed'], + inventory_source__update_on_launch=True).order_by('-finished', '-started', '-created') + if qs.count() > 0: + results.append(cls(cls.model.objects.filter(id=qs[0].id).values(*cls.get_db_values())[0])) + return results + +class InventorySourceDict(PartialModelDict): + FIELDS = ( + 'id', + ) + model = InventorySource + + def get_job_type_str(self): + return 'inventory_source' + + def task_impact(self): + return 20 + + @classmethod + # TODO: Optimize this to run the query once + def filter_partial(cls, inventory_id): + kv = { + 'inventory_id': inventory_id, + 'update_on_launch': True, + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + +class SystemJobDict(PartialModelDict): + FIELDS = ( + 'id', 'created', 'status', + ) + model = SystemJob + + def get_job_type_str(self): + return 'system_job' + + def task_impact(self): + return 20 + + @classmethod + def filter_partial(cls, status=[]): + kv = { + 'status__in': status + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + +class AdHocCommandDict(PartialModelDict): + FIELDS = ( + 'id', 'created', 'status', 'inventory_id', + ) + model = AdHocCommand + + def get_job_type_str(self): + return 'ad_hoc_command' + + def task_impact(self): + return 20 + +class WorkflowJobDict(PartialModelDict): + FIELDS = ( + 'id', 'created', 'status', 'workflow_job_template_id', + ) + model = WorkflowJob + + def get_job_type_str(self): + return 'workflow_job' + + def task_impact(self): + return 10 + diff --git a/awx/main/scheduler/tasks.py b/awx/main/scheduler/tasks.py index 343bdd1546..622876a44e 100644 --- a/awx/main/scheduler/tasks.py +++ b/awx/main/scheduler/tasks.py @@ -1,14 +1,17 @@ # Python import logging -import time + +# Django +from django.db import transaction +from django.db.utils import DatabaseError # Celery from celery import task # AWX -from awx.main.models import UnifiedJob -from awx.main.scheduler import schedule +from awx.main.models import Instance +from awx.main.scheduler import TaskManager logger = logging.getLogger('awx.main.scheduler') @@ -18,62 +21,31 @@ logger = logging.getLogger('awx.main.scheduler') @task def run_job_launch(job_id): - # Wait for job to exist. - # The job is created in a transaction then the message is created, but - # the transaction may not have completed. - - # FIXME: We could generate the message in a Django signal handler. - # OR, we could call an explicit commit in the view and then send the - # message. - - retries = 10 - retry = 0 - while not UnifiedJob.objects.filter(id=job_id).exists(): - time.sleep(0.3) - - if retry >= retries: - logger.error("Failed to process 'job_launch' message for job %d" % job_id) - # ack the message so we don't build up the queue. - # - # The job can still be chosen to run during tower startup or - # when another job is started or completes - return - retry += 1 - - # "Safe" to get the job now since it exists. - # Really, there is a race condition from exists to get - - # TODO: while not loop should call get wrapped in a try except - #job = UnifiedJob.objects.get(id=job_id) - - schedule() + TaskManager().schedule() @task def run_job_complete(job_id): - # TODO: use list of finished status from jobs.py or unified_jobs.py - finished_status = ['successful', 'error', 'failed', 'completed'] - q = UnifiedJob.objects.filter(id=job_id) + TaskManager().schedule() - # Ensure that the job is updated in the database before we call to - # schedule the next job. - retries = 10 - retry = 0 - while True: - # Job not found, most likely deleted. That's fine - if not q.exists(): - logger.warn("Failed to find job '%d' while processing 'job_complete' message. Presume that it was deleted." % job_id) - break +@task +def run_task_manager(): + TaskManager().schedule() - job = q[0] - if job.status in finished_status: - break +@task +def run_fail_inconsistent_running_jobs(): + with transaction.atomic(): + # Lock + try: + Instance.objects.select_for_update(nowait=True).all()[0] + scheduler = TaskManager() + active_tasks = scheduler.get_active_tasks() - time.sleep(0.3) - - if retry >= retries: - logger.error("Expected job status '%s' to be one of '%s' while processing 'job_complete' message." % (job.status, finished_status)) + if active_tasks is None: + # TODO: Failed to contact celery. We should surface this. + return None + + all_running_sorted_tasks = scheduler.get_running_tasks() + scheduler.process_celery_tasks(active_tasks, all_running_sorted_tasks) + except DatabaseError: return - retry += 1 - - schedule() diff --git a/awx/main/tasks.py b/awx/main/tasks.py index b76bcb48a6..7f187c7bce 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -21,7 +21,6 @@ import traceback import urlparse import uuid from distutils.version import LooseVersion as Version -import dateutil.parser import yaml try: import psutil @@ -137,30 +136,12 @@ def cluster_node_heartbeat(self): @task(bind=True, queue='default') def tower_periodic_scheduler(self): - def get_last_run(): - if not os.path.exists(settings.SCHEDULE_METADATA_LOCATION): - return None - fd = open(settings.SCHEDULE_METADATA_LOCATION) - try: - last_run = dateutil.parser.parse(fd.read()) - return last_run - except Exception as exc: - logger.error("get_last_run failed: {}".format(exc)) - return None - - def write_last_run(last_run): - fd = open(settings.SCHEDULE_METADATA_LOCATION, 'w') - fd.write(last_run.isoformat()) - fd.close() - run_now = now() - last_run = get_last_run() - if not last_run: - logger.debug("First run time") - write_last_run(run_now) - return + state = TowerScheduleState.get_solo() + last_run = state.schedule_last_run logger.debug("Last run was: %s", last_run) - write_last_run(run_now) + state.schedule_last_run = run_now + state.save() old_schedules = Schedule.objects.enabled().before(last_run) for schedule in old_schedules: @@ -180,6 +161,7 @@ def tower_periodic_scheduler(self): new_unified_job.save(update_fields=['status', 'job_explanation']) new_unified_job.websocket_emit_status("failed") emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules")) + state.save() def _send_notification_templates(instance, status_str): if status_str not in ['succeeded', 'failed']: @@ -1756,7 +1738,7 @@ class RunAdHocCommand(BaseTask): ''' Hook for actions to run after ad hoc command has completed. ''' - super(RunAdHocCommand, self).post_run_hook(ad_hoc_command, **kwargs) + super(RunAdHocCommand, self).post_run_hook(ad_hoc_command, status, **kwargs) class RunSystemJob(BaseTask): diff --git a/awx/main/tests/functional/test_jobs.py b/awx/main/tests/functional/test_jobs.py index 83302e7400..55b5d428c9 100644 --- a/awx/main/tests/functional/test_jobs.py +++ b/awx/main/tests/functional/test_jobs.py @@ -2,40 +2,6 @@ from awx.main.models import Job import pytest -@pytest.mark.django_db -def test_job_blocking(get, post, job_template, inventory, inventory_factory): - j1 = Job.objects.create(job_template=job_template, - inventory=inventory) - j2 = Job.objects.create(job_template=job_template, - inventory=inventory) - assert j1.is_blocked_by(j2) - j2.inventory = inventory_factory(name='test-different-inventory') - assert not j1.is_blocked_by(j2) - j_callback_1 = Job.objects.create(job_template=job_template, - inventory=inventory, - launch_type='callback', - limit='a') - j_callback_2 = Job.objects.create(job_template=job_template, - inventory=inventory, - launch_type='callback', - limit='a') - assert j_callback_1.is_blocked_by(j_callback_2) - j_callback_2.limit = 'b' - assert not j_callback_1.is_blocked_by(j_callback_2) - -@pytest.mark.django_db -def test_job_blocking_allow_simul(get, post, job_template, inventory): - job_template.allow_simultaneous = True - j1 = Job.objects.create(job_template=job_template, - inventory=inventory) - j2 = Job.objects.create(job_template=job_template, - inventory=inventory) - assert not j1.is_blocked_by(j2) - assert not j2.is_blocked_by(j1) - job_template.allow_simultaneous = False - assert j1.is_blocked_by(j2) - assert j2.is_blocked_by(j1) - @pytest.mark.django_db def test_orphan_unified_job_creation(instance, inventory): job = Job.objects.create(job_template=None, inventory=inventory, name='hi world') diff --git a/awx/main/tests/functional/test_partial.py b/awx/main/tests/functional/test_partial.py new file mode 100644 index 0000000000..0ab84dc901 --- /dev/null +++ b/awx/main/tests/functional/test_partial.py @@ -0,0 +1,120 @@ + +# Python +import pytest +from django.utils.timezone import now as tz_now +from datetime import timedelta + +# AWX +from awx.main.models import ( + Organization, + Inventory, + Group, + Project, + ProjectUpdate, + InventoryUpdate, + InventorySource, +) +from awx.main.scheduler.partial import ( + ProjectUpdateLatestDict, + InventoryUpdateDict, + InventoryUpdateLatestDict, +) + +@pytest.fixture +def org(): + return Organization.objects.create(name="org1") + +class TestProjectUpdateLatestDictDict(): + @pytest.fixture + def successful_project_update(self): + p = Project.objects.create(name="proj1") + pu = ProjectUpdate.objects.create(project=p, status='successful', finished=tz_now() - timedelta(seconds=20)) + + return (p, pu) + + # Failed project updates newer than successful ones + @pytest.fixture + def multiple_project_updates(self): + p = Project.objects.create(name="proj1") + + epoch = tz_now() + + successful_pus = [ProjectUpdate.objects.create(project=p, + status='successful', + finished=epoch - timedelta(seconds=100 + i)) for i in xrange(0, 5)] + failed_pus = [ProjectUpdate.objects.create(project=p, + status='failed', + finished=epoch - timedelta(seconds=100 - len(successful_pus) + i)) for i in xrange(0, 5)] + return (p, failed_pus, successful_pus) + + + @pytest.mark.django_db + class TestFilterPartial(): + def test_project_update_successful(self, successful_project_update): + (project, project_update) = successful_project_update + + tasks = ProjectUpdateLatestDict.filter_partial(project_ids=[project.id]) + + assert 1 == len(tasks) + assert project_update.id == tasks[0]['id'] + + def test_correct_project_update(self, multiple_project_updates): + (project, failed_pus, successful_pus) = multiple_project_updates + + tasks = ProjectUpdateLatestDict.filter_partial(project_ids=[project.id]) + + assert 1 == len(tasks) + assert failed_pus[0].id == tasks[0]['id'] + + +class TestInventoryUpdateDict(): + @pytest.fixture + def waiting_inventory_update(self, org): + i = Inventory.objects.create(name='inv1', organization=org) + g = Group.objects.create(name='group1', inventory=i) + #Inventory.groups.add(g) + inv_src = InventorySource.objects.create(group=g) + iu = InventoryUpdate.objects.create(inventory_source=inv_src, status='waiting') + return iu + + @pytest.mark.django_db + class TestFilterPartial(): + def test_simple(self, waiting_inventory_update): + tasks = InventoryUpdateDict.filter_partial(status=['waiting']) + + assert 1 == len(tasks) + assert waiting_inventory_update.id == tasks[0]['id'] + +class TestInventoryUpdateLatestDict(): + @pytest.fixture + def inventory(self, org): + i = Inventory.objects.create(name='inv1', organization=org) + return i + + @pytest.fixture + def inventory_updates(self, inventory): + g1 = Group.objects.create(name='group1', inventory=inventory) + g2 = Group.objects.create(name='group2', inventory=inventory) + g3 = Group.objects.create(name='group3', inventory=inventory) + + inv_src1 = InventorySource.objects.create(group=g1, update_on_launch=True, inventory=inventory) + inv_src2 = InventorySource.objects.create(group=g2, update_on_launch=False, inventory=inventory) + inv_src3 = InventorySource.objects.create(group=g3, update_on_launch=True, inventory=inventory) + + iu1 = InventoryUpdate.objects.create(inventory_source=inv_src1, status='successful') + iu2 = InventoryUpdate.objects.create(inventory_source=inv_src2, status='waiting') + iu3 = InventoryUpdate.objects.create(inventory_source=inv_src3, status='waiting') + return [iu1, iu2, iu3] + + @pytest.mark.django_db + def test_filter_partial(self, inventory, inventory_updates): + + tasks = InventoryUpdateLatestDict.filter_partial([inventory.id]) + + inventory_updates_expected = [inventory_updates[0], inventory_updates[2]] + + assert 2 == len(tasks) + for i, inventory_update in enumerate(inventory_updates_expected): + assert inventory_update.id == tasks[i]['id'] + + diff --git a/awx/main/tests/unit/scheduler/__init__.py b/awx/main/tests/unit/scheduler/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/awx/main/tests/unit/scheduler/conftest.py b/awx/main/tests/unit/scheduler/conftest.py new file mode 100644 index 0000000000..6b07649fd0 --- /dev/null +++ b/awx/main/tests/unit/scheduler/conftest.py @@ -0,0 +1,240 @@ + +# Python +import pytest +from datetime import timedelta + +# Django +from django.utils.timezone import now as tz_now + +# awx +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, + InventoryUpdateDict, + InventorySourceDict, +) +from awx.main.scheduler import TaskManager + + +@pytest.fixture +def epoch(): + return tz_now() + +@pytest.fixture +def scheduler_factory(mocker, epoch): + def fn(tasks=[], inventory_sources=[], latest_project_updates=[], latest_inventory_updates=[], create_project_update=None, create_inventory_update=None): + sched = TaskManager() + sched.capacity_total = 999999999 + + sched.graph.get_now = lambda: epoch + + def no_create_inventory_update(task, ignore): + raise RuntimeError("create_inventory_update should not be called") + + def no_create_project_update(task): + raise RuntimeError("create_project_update should not be called") + + mocker.patch.object(sched, 'get_tasks', return_value=tasks) + mocker.patch.object(sched, 'get_running_workflow_jobs', return_value=[]) + mocker.patch.object(sched, 'get_inventory_source_tasks', return_value=inventory_sources) + mocker.patch.object(sched, 'get_latest_project_update_tasks', return_value=latest_project_updates) + mocker.patch.object(sched, 'get_latest_inventory_update_tasks', return_value=latest_inventory_updates) + create_project_update_mock = mocker.patch.object(sched, 'create_project_update', return_value=create_project_update) + create_inventory_update_mock = mocker.patch.object(sched, 'create_inventory_update', return_value=create_inventory_update) + mocker.patch.object(sched, 'start_task') + + if not create_project_update: + create_project_update_mock.side_effect = no_create_project_update + if not create_inventory_update: + create_inventory_update_mock.side_effect = no_create_inventory_update + return sched + return fn + +@pytest.fixture +def project_update_factory(epoch): + def fn(): + return ProjectUpdateDict({ + 'id': 1, + 'created': epoch - timedelta(seconds=100), + 'project_id': 1, + 'project__scm_update_cache_timeout': 0, + 'celery_task_id': '', + 'launch_type': 'dependency', + 'project__scm_update_on_launch': True, + }) + return fn + +@pytest.fixture +def pending_project_update(project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'pending' + return project_update + +@pytest.fixture +def waiting_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'waiting' + return project_update + +@pytest.fixture +def running_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'running' + return project_update + +@pytest.fixture +def successful_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['finished'] = epoch - timedelta(seconds=90) + project_update['status'] = 'successful' + return project_update + +@pytest.fixture +def successful_project_update_cache_expired(epoch, project_update_factory): + project_update = project_update_factory() + + project_update['status'] = 'successful' + project_update['created'] = epoch - timedelta(seconds=120) + project_update['finished'] = epoch - timedelta(seconds=110) + project_update['project__scm_update_cache_timeout'] = 1 + return project_update + +@pytest.fixture +def failed_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['finished'] = epoch - timedelta(seconds=90) + project_update['status'] = 'failed' + return project_update + +@pytest.fixture +def inventory_update_factory(epoch): + def fn(): + return InventoryUpdateDict({ + 'id': 1, + 'created': epoch - timedelta(seconds=101), + 'inventory_id': 1, + 'celery_task_id': '', + 'status': 'pending', + 'launch_type': 'dependency', + 'inventory_source_id': 1, + 'inventory_source__inventory_id': 1, + }) + return fn + +@pytest.fixture +def inventory_update_latest_factory(epoch): + def fn(): + return InventoryUpdateDict({ + 'id': 1, + 'created': epoch - timedelta(seconds=101), + 'inventory_id': 1, + 'celery_task_id': '', + 'status': 'pending', + 'launch_type': 'dependency', + 'inventory_source_id': 1, + 'finished': None, + }) + return fn + +@pytest.fixture +def inventory_update_latest(inventory_update_latest_factory): + return inventory_update_latest_factory() + +@pytest.fixture +def successful_inventory_update_latest(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['status'] = 'successful' + iu['finished'] = iu['created'] + timedelta(seconds=10) + return iu + +@pytest.fixture +def failed_inventory_update_latest(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['status'] = 'failed' + return iu + +@pytest.fixture +def pending_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'pending' + return inventory_update + +@pytest.fixture +def waiting_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'waiting' + return inventory_update + +@pytest.fixture +def failed_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'failed' + return inventory_update + +@pytest.fixture +def running_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'running' + return inventory_update + +@pytest.fixture +def successful_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['finished'] = epoch - timedelta(seconds=90) + inventory_update['status'] = 'successful' + return inventory_update + +''' +Job +''' +@pytest.fixture +def job_factory(epoch): + def fn(project__scm_update_on_launch=True, inventory__inventory_sources=[]): + return JobDict({ + 'id': 1, + 'status': 'pending', + 'job_template_id': 1, + 'project_id': 1, + 'inventory_id': 1, + 'launch_type': 'manual', + 'allow_simultaneous': False, + 'created': epoch - timedelta(seconds=99), + 'celery_task_id': '', + 'project__scm_update_on_launch': project__scm_update_on_launch, + 'inventory__inventory_sources': inventory__inventory_sources, + 'forks': 5 + }) + return fn + +@pytest.fixture +def pending_job(job_factory): + job = job_factory() + job['status'] = 'pending' + return job + +@pytest.fixture +def running_job(job_factory): + job = job_factory() + job['status'] = 'running' + return job + +''' +Inventory id -> [InventorySourceDict, ...] +''' +@pytest.fixture +def inventory_source_factory(): + def fn(id=1): + return InventorySourceDict({ + 'id': id, + }) + return fn + +@pytest.fixture +def inventory_id_sources(inventory_source_factory): + return [ + (1, [ + inventory_source_factory(id=1), + inventory_source_factory(id=2), + ]), + ] + diff --git a/awx/main/tests/unit/scheduler/test_dependency_graph.py b/awx/main/tests/unit/scheduler/test_dependency_graph.py new file mode 100644 index 0000000000..081f175027 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_dependency_graph.py @@ -0,0 +1,121 @@ + +# Python +import pytest +from datetime import timedelta + +# Django +from django.utils.timezone import now as tz_now + +# AWX +from awx.main.scheduler.dependency_graph import DependencyGraph +from awx.main.scheduler.partial import ProjectUpdateDict + +@pytest.fixture +def graph(): + return DependencyGraph() + +@pytest.fixture +def job(): + return dict(project_id=1) + +@pytest.fixture +def unsuccessful_last_project(graph, job): + pu = ProjectUpdateDict(dict(id=1, + project__scm_update_cache_timeout=999999, + project_id=1, + status='failed', + created='3', + finished='3',)) + + graph.add_latest_project_update(pu) + + return graph + +@pytest.fixture +def last_dependent_project(graph): + now = tz_now() + + job = { + 'project_id': 1, + 'created': now, + } + pu = ProjectUpdateDict(dict(id=1, project_id=1, status='waiting', + project__scm_update_cache_timeout=0, + launch_type='dependency', + created=now - timedelta(seconds=1),)) + + graph.add_latest_project_update(pu) + + return (graph, job) + +@pytest.fixture +def timedout_project_update(graph, job): + now = tz_now() + + job = { + 'project_id': 1, + 'created': now, + } + pu = ProjectUpdateDict(dict(id=1, project_id=1, status='successful', + project__scm_update_cache_timeout=10, + launch_type='dependency', + created=now - timedelta(seconds=100), + finished=now - timedelta(seconds=11),)) + + graph.add_latest_project_update(pu) + + return (graph, job) + +@pytest.fixture +def not_timedout_project_update(graph, job): + now = tz_now() + + job = { + 'project_id': 1, + 'created': now, + } + pu = ProjectUpdateDict(dict(id=1, project_id=1, status='successful', + project__scm_update_cache_timeout=3600, + launch_type='dependency', + created=now - timedelta(seconds=100), + finished=now - timedelta(seconds=11),)) + + graph.add_latest_project_update(pu) + + return (graph, job) + + +class TestShouldUpdateRelatedProject(): + + def test_no_project_updates(self, graph, job): + actual = graph.should_update_related_project(job) + + assert True is actual + + def test_timedout_project_update(self, timedout_project_update): + (graph, job) = timedout_project_update + + actual = graph.should_update_related_project(job) + + assert True is actual + + def test_not_timedout_project_update(self, not_timedout_project_update): + (graph, job) = not_timedout_project_update + + actual = graph.should_update_related_project(job) + + assert False is actual + + def test_unsuccessful_last_project(self, unsuccessful_last_project, job): + graph = unsuccessful_last_project + + actual = graph.should_update_related_project(job) + + assert True is actual + + def test_last_dependent_project(self, last_dependent_project): + (graph, job) = last_dependent_project + + actual = graph.should_update_related_project(job) + assert False is actual + diff --git a/awx/main/tests/unit/scheduler/test_scheduler_inventory_update.py b/awx/main/tests/unit/scheduler/test_scheduler_inventory_update.py new file mode 100644 index 0000000000..09125df527 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_scheduler_inventory_update.py @@ -0,0 +1,85 @@ + +# Python +import pytest +from datetime import timedelta + +@pytest.fixture +def pending_job(job_factory): + return job_factory(project__scm_update_on_launch=False, inventory__inventory_sources=['1']) + +@pytest.fixture +def successful_inventory_update_latest(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['inventory_source__update_cache_timeout'] = 100 + iu['status'] = 'successful' + iu['finished'] = iu['created'] + timedelta(seconds=10) + return iu + +@pytest.fixture +def successful_inventory_update_latest_cache_expired(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['inventory_source__update_cache_timeout'] = 1 + iu['finished'] = iu['created'] + timedelta(seconds=2) + return iu + +class TestStartInventoryUpdate(): + def test_pending(self, scheduler_factory, pending_inventory_update): + scheduler = scheduler_factory(tasks=[pending_inventory_update]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_inventory_update) + +class TestInventoryUpdateBlocked(): + def test_running_inventory_update(self, epoch, scheduler_factory, running_inventory_update, pending_inventory_update): + running_inventory_update['created'] = epoch - timedelta(seconds=100) + pending_inventory_update['created'] = epoch - timedelta(seconds=90) + + scheduler = scheduler_factory(tasks=[running_inventory_update, pending_inventory_update]) + + scheduler._schedule() + + def test_waiting_inventory_update(self, epoch, scheduler_factory, waiting_inventory_update, pending_inventory_update): + waiting_inventory_update['created'] = epoch - timedelta(seconds=100) + pending_inventory_update['created'] = epoch - timedelta(seconds=90) + + scheduler = scheduler_factory(tasks=[waiting_inventory_update, pending_inventory_update]) + + scheduler._schedule() + +class TestCreateDependentInventoryUpdate(): + + def test(self, scheduler_factory, pending_job, waiting_inventory_update, inventory_id_sources): + scheduler = scheduler_factory(tasks=[pending_job], + create_inventory_update=waiting_inventory_update, + inventory_sources=inventory_id_sources) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job]) + + def test_cache_hit(self, scheduler_factory, pending_job, successful_inventory_update, successful_inventory_update_latest): + scheduler = scheduler_factory(tasks=[successful_inventory_update, pending_job], + latest_inventory_updates=[successful_inventory_update_latest]) + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_job) + + def test_cache_miss(self, scheduler_factory, pending_job, successful_inventory_update, successful_inventory_update_latest_cache_expired, waiting_inventory_update, inventory_id_sources): + scheduler = scheduler_factory(tasks=[successful_inventory_update, pending_job], + latest_inventory_updates=[successful_inventory_update_latest_cache_expired], + create_inventory_update=waiting_inventory_update, + inventory_sources=inventory_id_sources) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job]) + + def test_last_update_failed(self, scheduler_factory, pending_job, failed_inventory_update, failed_inventory_update_latest, waiting_inventory_update, inventory_id_sources): + scheduler = scheduler_factory(tasks=[failed_inventory_update, pending_job], + latest_inventory_updates=[failed_inventory_update_latest], + create_inventory_update=waiting_inventory_update, + inventory_sources=inventory_id_sources) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job]) + diff --git a/awx/main/tests/unit/scheduler/test_scheduler_job.py b/awx/main/tests/unit/scheduler/test_scheduler_job.py new file mode 100644 index 0000000000..735ce04d95 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_scheduler_job.py @@ -0,0 +1,56 @@ + +# Python +import pytest +from datetime import timedelta + +class TestJobBlocked(): + def test_inventory_update_waiting(self, scheduler_factory, waiting_inventory_update, pending_job): + scheduler = scheduler_factory(tasks=[waiting_inventory_update, pending_job]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + + def test_inventory_update_running(self, scheduler_factory, running_inventory_update, pending_job, inventory_source_factory, inventory_id_sources): + scheduler = scheduler_factory(tasks=[running_inventory_update, pending_job], + inventory_sources=inventory_id_sources) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + + def test_project_update_running(self, scheduler_factory, pending_job, running_project_update): + scheduler = scheduler_factory(tasks=[running_project_update, pending_job]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + + def test_project_update_waiting(self, scheduler_factory, pending_job, waiting_project_update): + scheduler = scheduler_factory(tasks=[waiting_project_update, pending_job], + latest_project_updates=[waiting_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + +class TestJob(): + @pytest.fixture + def successful_project_update(self, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'successful' + project_update['finished'] = project_update['created'] + timedelta(seconds=10) + project_update['project__scm_update_cache_timeout'] = 3600 + return project_update + + def test_existing_dependencies_finished(self, scheduler_factory, successful_project_update, successful_inventory_update_latest, pending_job): + scheduler = scheduler_factory(tasks=[successful_project_update, pending_job], + latest_project_updates=[successful_project_update], + latest_inventory_updates=[successful_inventory_update_latest]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_job) + diff --git a/awx/main/tests/unit/scheduler/test_scheduler_project_update.py b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py new file mode 100644 index 0000000000..8122d93c09 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py @@ -0,0 +1,75 @@ + +# TODO: wherever get_latest_rpoject_update_task() is stubbed and returns a +# ProjectUpdateDict. We should instead return a ProjectUpdateLatestDict() +# For now, this is ok since the fields on deviate that much. + +class TestStartProjectUpdate(): + def test(self, scheduler_factory, pending_project_update): + scheduler = scheduler_factory(tasks=[pending_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_project_update) + assert scheduler.create_project_update.call_count == 0 + + ''' + Explicit project update should always run. They should not use cache logic. + ''' + def test_cache_oblivious(self, scheduler_factory, successful_project_update, pending_project_update): + scheduler = scheduler_factory(tasks=[pending_project_update], + latest_project_updates=[successful_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_project_update) + assert scheduler.create_project_update.call_count == 0 + + +class TestCreateDependentProjectUpdate(): + + def test(self, scheduler_factory, pending_job, waiting_project_update): + scheduler = scheduler_factory(tasks=[pending_job], + create_project_update=waiting_project_update) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_project_update, [pending_job]) + + def test_cache_hit(self, scheduler_factory, pending_job, successful_project_update): + scheduler = scheduler_factory(tasks=[successful_project_update, pending_job], + latest_project_updates=[successful_project_update]) + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_job) + + def test_cache_miss(self, scheduler_factory, pending_job, successful_project_update_cache_expired, waiting_project_update): + scheduler = scheduler_factory(tasks=[successful_project_update_cache_expired, pending_job], + latest_project_updates=[successful_project_update_cache_expired], + create_project_update=waiting_project_update) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_project_update, [pending_job]) + + def test_last_update_failed(self, scheduler_factory, pending_job, failed_project_update, waiting_project_update): + scheduler = scheduler_factory(tasks=[failed_project_update, pending_job], + latest_project_updates=[failed_project_update], + create_project_update=waiting_project_update) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_project_update, [pending_job]) + +class TestProjectUpdateBlocked(): + def test_projct_update_running(self, scheduler_factory, running_project_update, pending_project_update): + scheduler = scheduler_factory(tasks=[running_project_update, pending_project_update]) + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + + def test_job_running(self, scheduler_factory, running_job, pending_project_update): + scheduler = scheduler_factory(tasks=[running_job, pending_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 9c6ed0950b..02cbf3fc31 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -201,6 +201,7 @@ INSTALLED_APPS = ( 'awx.ui', 'awx.fact', 'awx.sso', + 'solo', ) INTERNAL_IPS = ('127.0.0.1',) @@ -392,6 +393,14 @@ CELERYBEAT_SCHEDULE = { 'task': 'awx.main.tasks.cluster_node_heartbeat', 'schedule': timedelta(seconds=60) }, + 'task_manager': { + 'task': 'awx.main.scheduler.tasks.run_task_manager', + 'schedule': timedelta(seconds=20) + }, + 'task_fail_inconsistent_running_jobs': { + 'task': 'awx.main.scheduler.tasks.run_fail_inconsistent_running_jobs', + 'schedule': timedelta(seconds=30) + }, } # Django Caching Configuration diff --git a/docs/task_manager_system.md b/docs/task_manager_system.md new file mode 100644 index 0000000000..92b8e35880 --- /dev/null +++ b/docs/task_manager_system.md @@ -0,0 +1,57 @@ +# Task Manager Overview + +The task manager is responsible for deciding when jobs should be introduced to celery for running. When choosing a task to run the considerations are: (1) creation time, (2) job dependency, (3) capacity. + +Independent jobs are ran in order of creation time, earliest first. Jobs with dependencies are also ran in creation time order within the group of job dependencies. Capacity is the final consideration when deciding to release a job to be ran by celery. + +## Task Manager Architecture + +The task manager has a single entry point, `Scheduler().schedule()`. The method may be called in parallel, at any time, as many times as the user wants. The `schedule()` function tries to aquire a single, global, lock using the Instance table first record in the database. If the lock can not be aquired the method returns. The failure to aquire the lock indicates that there is another instance currently running `schedule()`. + +### Hybrid Scheduler: Periodic + Event +The `schedule()` function is ran (a) periodically by a celery task and (b) on job creation or completion. The task manager system would behave correctly if ran, exclusively, via (a) or (b). We chose to trigger `schedule()` via both mechanisms because of the nice properties I will now mention. (b) reduces the time from launch to running, resulting a better user experience. (a) is a fail-safe in case we miss code-paths, in the present and future, that change the 3 scheduling considerations for which we should call `schedule()` (i.e. adding new nodes to tower changes the capacity, obscure job error handling that fails a job) + Emperically, the periodic task manager has served us well in the past and we will continue to rely on it with the added event-triggered `schedule()`. + + ### Scheduler Algorithm + * Get all non-completed jobs, `all_tasks` + * Generate the hash tables from `all_tasks`: + * `` indicates a job is running + * `` indicates a project update is running + * `` indicates a job template or inventory update is running + * `` indiciates an inventory update is running + * `` indiciates a workflow job is running + * `` used to determine cache timeout + * `` used to determine cache timeout and dependencies to spawn + * `` used to determine cache timeout + * Detect finished workflow jobs + * Spawn next workflow jobs if needed + * For each pending jobs; start with oldest created job and stop when no capacity == 0 + * If job is not blocked, determined using generated hash tables, and there is capacity, then mark the as `waiting` and submit the job to celery. + +### Job Lifecycle +| Job Status | State | +|:----------:|:------------------------------------------------------------------------------------------------------------------:| +| pending | Job launched.
1. Hasn't yet been seen by the scheduler
2. Is blocked by another task
3. Not enough capacity | +| waiting | Job submitted to celery. | +| running | Job running in celery. | +| successful | Job finished with ansible-playbook return code 0. | +| failed | Job finished with ansible-playbook return code other than 0. | +| error | System failure. | + +## todo + +## Code Composition +* partials +* + +## Acceptance Tests +* assemelate with .md and trim the fat https://docs.google.com/a/redhat.com/document/d/1AOvKiTMSV0A2RHykHW66BZKBuaJ_l0SJ-VbMwvu-5Gk/edit?usp=sharing + + + + + + + + + diff --git a/requirements/requirements.txt b/requirements/requirements.txt index fb885a8842..44e4f58bee 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -24,7 +24,9 @@ django-polymorphic==0.7.2 django-radius==1.0.0 djangorestframework==3.3.2 djangorestframework-yaml==1.0.2 +django-solo==1.1.2 django-split-settings==0.1.1 +django-transaction-hooks==0.2 django-taggit==0.17.6 git+https://github.com/matburt/dm.xmlsec.binding.git@master#egg=dm.xmlsec.binding dogpile.core==0.4.1