From 555f0bb90f5228b026de8ea2245789f987d50dd3 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Thu, 13 Oct 2016 09:42:29 -0400 Subject: [PATCH 01/17] project and jobs running correctly --- awx/main/models/projects.py | 2 +- awx/main/scheduler/__init__.py | 516 +++++++++++------- awx/main/scheduler/dependency_graph.py | 108 ++++ awx/main/scheduler/partial.py | 109 ++++ awx/main/scheduler/tasks.py | 39 +- awx/main/tests/functional/test_partial.py | 65 +++ awx/main/tests/unit/scheduler/__init__.py | 0 .../unit/scheduler/test_dependency_graph.py | 121 ++++ .../test_scheduler_project_update.py | 194 +++++++ awx/settings/defaults.py | 8 + 10 files changed, 948 insertions(+), 214 deletions(-) create mode 100644 awx/main/scheduler/dependency_graph.py create mode 100644 awx/main/scheduler/partial.py create mode 100644 awx/main/tests/functional/test_partial.py create mode 100644 awx/main/tests/unit/scheduler/__init__.py create mode 100644 awx/main/tests/unit/scheduler/test_dependency_graph.py create mode 100644 awx/main/tests/unit/scheduler/test_scheduler_project_update.py diff --git a/awx/main/models/projects.py b/awx/main/models/projects.py index 1c693a6398..4c20e01e08 100644 --- a/awx/main/models/projects.py +++ b/awx/main/models/projects.py @@ -275,7 +275,7 @@ class Project(UnifiedJobTemplate, ProjectOptions, ResourceMixin): def _get_unified_job_field_names(cls): return ['name', 'description', 'local_path', 'scm_type', 'scm_url', 'scm_branch', 'scm_clean', 'scm_delete_on_update', - 'credential', 'schedule', 'timeout'] + 'credential', 'schedule', 'timeout', 'launch_type',] def save(self, *args, **kwargs): new_instance = not bool(self.pk) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 6ecdc09b37..0711528c56 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -2,77 +2,107 @@ # All Rights Reserved # Python -import datetime +from datetime import timedelta import logging +from sets import Set # Django from 
django.conf import settings from django.db import transaction +from django.db.utils import DatabaseError # AWX from awx.main.models import * # noqa -from awx.main.utils import get_system_task_capacity -from awx.main.scheduler.dag_simple import SimpleDAG +#from awx.main.scheduler.dag_simple import SimpleDAG from awx.main.scheduler.dag_workflow import WorkflowDAG +from awx.main.scheduler.dependency_graph import DependencyGraph +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, + InventoryUpdateDict, + ProjectUpdateLatestDict, +) + # Celery from celery.task.control import inspect logger = logging.getLogger('awx.main.scheduler') -def get_tasks(): - """Fetch all Tower tasks that are relevant to the task management - system. - """ - RELEVANT_JOBS = ('pending', 'waiting', 'running') - # TODO: Replace this when we can grab all objects in a sane way. - graph_jobs = [j for j in Job.objects.filter(status__in=RELEVANT_JOBS)] - graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(status__in=RELEVANT_JOBS)] - graph_inventory_updates = [iu for iu in - InventoryUpdate.objects.filter(status__in=RELEVANT_JOBS)] - graph_project_updates = [pu for pu in - ProjectUpdate.objects.filter(status__in=RELEVANT_JOBS)] - graph_system_jobs = [sj for sj in - SystemJob.objects.filter(status__in=RELEVANT_JOBS)] - graph_workflow_jobs = [wf for wf in - WorkflowJob.objects.filter(status__in=RELEVANT_JOBS)] - all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates + - graph_project_updates + graph_system_jobs + - graph_workflow_jobs, - key=lambda task: task.created) - return all_actions +class Scheduler(): + def __init__(self): + self.graph = DependencyGraph() + self.capacity_total = 200 + self.capacity_used = 0 -def get_running_workflow_jobs(): - graph_workflow_jobs = [wf for wf in - WorkflowJob.objects.filter(status='running')] - return graph_workflow_jobs + def _get_tasks_with_status(self, status_list): -def 
spawn_workflow_graph_jobs(workflow_jobs): - # TODO: Consider using transaction.atomic - for workflow_job in workflow_jobs: - dag = WorkflowDAG(workflow_job) - spawn_nodes = dag.bfs_nodes_to_run() - for spawn_node in spawn_nodes: - kv = spawn_node.get_job_kwargs() - job = spawn_node.unified_job_template.create_unified_job(**kv) - spawn_node.job = job - spawn_node.save() - can_start = job.signal_start(**kv) - if not can_start: - job.status = 'failed' - job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials" - job.save(update_fields=['status', 'job_explanation']) - job.websocket_emit_status("failed") + graph_jobs = JobDict.filter_partial(status=status_list) + ''' + graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(**kv)] + graph_inventory_updates = [iu for iu in + InventoryUpdate.objects.filter(**kv)] + ''' + graph_inventory_updates = InventoryUpdateDict.filter_partial(status=status_list) + graph_project_updates = ProjectUpdateDict.filter_partial(status=status_list) + ''' + graph_system_jobs = [sj for sj in + SystemJob.objects.filter(**kv)] + graph_workflow_jobs = [wf for wf in + WorkflowJob.objects.filter(**kv)] + all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates + + graph_project_updates + graph_system_jobs + + graph_workflow_jobs, + key=lambda task: task.created) + ''' + all_actions = sorted(graph_jobs + graph_project_updates + graph_inventory_updates, + key=lambda task: task['created']) + return all_actions - # TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ? 
- #emit_websocket_notification('/socket.io/jobs', '', dict(id=)) + def get_tasks(self): + RELEVANT_JOBS = ('pending', 'waiting', 'running') + return self._get_tasks_with_status(RELEVANT_JOBS) -# See comment in tasks.py::RunWorkflowJob::run() -def process_finished_workflow_jobs(workflow_jobs): - for workflow_job in workflow_jobs: - dag = WorkflowDAG(workflow_job) - if dag.is_workflow_done(): - with transaction.atomic(): + # TODO: Consider a database query for this logic + def get_latest_project_update_tasks(self, all_sorted_tasks): + project_ids = Set() + for task in all_sorted_tasks: + if type(task) == JobDict: + project_ids.add(task['project_id']) + + return ProjectUpdateLatestDict.filter_partial(list(project_ids)) + + def get_running_workflow_jobs(self): + graph_workflow_jobs = [wf for wf in + WorkflowJob.objects.filter(status='running')] + return graph_workflow_jobs + + def spawn_workflow_graph_jobs(self, workflow_jobs): + # TODO: Consider using transaction.atomic + for workflow_job in workflow_jobs: + dag = WorkflowDAG(workflow_job) + spawn_nodes = dag.bfs_nodes_to_run() + for spawn_node in spawn_nodes: + kv = spawn_node.get_job_kwargs() + job = spawn_node.unified_job_template.create_unified_job(**kv) + spawn_node.job = job + spawn_node.save() + can_start = job.signal_start(**kv) + if not can_start: + job.status = 'failed' + job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials" + job.save(update_fields=['status', 'job_explanation']) + job.websocket_emit_status("failed") + + # TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ? 
+ #emit_websocket_notification('/socket.io/jobs', '', dict(id=)) + + # See comment in tasks.py::RunWorkflowJob::run() + def process_finished_workflow_jobs(self, workflow_jobs): + for workflow_job in workflow_jobs: + dag = WorkflowDAG(workflow_job) + if dag.is_workflow_done(): + # TODO: detect if wfj failed if workflow_job._has_failed(): workflow_job.status = 'failed' else: @@ -80,178 +110,248 @@ def process_finished_workflow_jobs(workflow_jobs): workflow_job.save() workflow_job.websocket_emit_status(workflow_job.status) -def rebuild_graph(): - """Regenerate the task graph by refreshing known tasks from Tower, purging - orphaned running tasks, and creating dependencies for new tasks before - generating directed edge relationships between those tasks. - """ - ''' - # Sanity check: Only do this on the primary node. - if Instance.objects.my_role() == 'secondary': - return None - ''' + def get_activate_tasks(self): + inspector = inspect() + if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'): + active_task_queues = inspector.active() + else: + logger.warn("Ignoring celery task inspector") + active_task_queues = None - inspector = inspect() - if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'): - active_task_queues = inspector.active() - else: - logger.warn("Ignoring celery task inspector") - active_task_queues = None + active_tasks = [] + if active_task_queues is not None: + for queue in active_task_queues: + active_tasks += [at['id'] for at in active_task_queues[queue]] + else: + logger.error("Could not communicate with celery!") + # TODO: Something needs to be done here to signal to the system + # as a whole that celery appears to be down. 
+ if not hasattr(settings, 'CELERY_UNIT_TEST'): + return None - all_sorted_tasks = get_tasks() - if not len(all_sorted_tasks): - return None + return active_tasks - active_tasks = [] - if active_task_queues is not None: - for queue in active_task_queues: - active_tasks += [at['id'] for at in active_task_queues[queue]] - else: - logger.error("Could not communicate with celery!") - # TODO: Something needs to be done here to signal to the system - # as a whole that celery appears to be down. - if not hasattr(settings, 'CELERY_UNIT_TEST'): - return None + def start_task(self, task, dependent_tasks=[]): + from awx.main.tasks import handle_work_error, handle_work_success - running_tasks = filter(lambda t: t.status == 'running', all_sorted_tasks) - running_celery_tasks = filter(lambda t: type(t) != WorkflowJob, running_tasks) - waiting_tasks = filter(lambda t: t.status != 'running', all_sorted_tasks) - new_tasks = filter(lambda t: t.status == 'pending', all_sorted_tasks) + #print("start_task() <%s, %s> with deps %s" % (task.get_job_type_str(), task['id'], dependent_tasks)) + + # TODO: spawn inventory and project updates + task_actual = { + 'type':task.get_job_type_str(), + 'id': task['id'], + } + dependencies = [{'type': t.get_job_type_str(), 'id': t['id']} for t in dependent_tasks] + + error_handler = handle_work_error.s(subtasks=[task_actual] + dependencies) + success_handler = handle_work_success.s(task_actual=task_actual) + + job_obj = task.get_full() + job_obj.status = 'waiting' + job_obj.save() - # Check running tasks and make sure they are active in celery - logger.debug("Active celery tasks: " + str(active_tasks)) - for task in list(running_celery_tasks): - if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')): - # NOTE: Pull status again and make sure it didn't finish in - # the meantime? 
- task.status = 'failed' - task.job_explanation += ' '.join(( - 'Task was marked as running in Tower but was not present in', - 'Celery, so it has been marked as failed.', - )) - task.save() - task.websocket_emit_status("failed") - running_tasks.pop(running_tasks.index(task)) - logger.error("Task %s appears orphaned... marking as failed" % task) + #print("For real, starting job <%s, %s>" % (type(job_obj), job_obj.id)) + start_status = job_obj.start(error_callback=error_handler, success_callback=success_handler) + if not start_status: + job_obj.status = 'failed' + if job_obj.job_explanation: + job_obj.job_explanation += ' ' + job_obj.job_explanation += 'Task failed pre-start check.' + job_obj.save() + # TODO: run error handler to fail sub-tasks and send notifications + return - # Create and process dependencies for new tasks - for task in new_tasks: - logger.debug("Checking dependencies for: %s" % str(task)) - try: - task_dependencies = task.generate_dependencies(running_tasks + waiting_tasks) - except Exception, e: - logger.error("Failed processing dependencies for {}: {}".format(task, e)) - task.status = 'failed' - task.job_explanation += 'Task failed to generate dependencies: {}'.format(e) - task.save() - task.websocket_emit_status("failed") - continue - logger.debug("New dependencies: %s" % str(task_dependencies)) - for dep in task_dependencies: - # We recalculate the created time for the moment to ensure the - # dependencies are always sorted in the right order relative to - # the dependent task. 
- time_delt = len(task_dependencies) - task_dependencies.index(dep) - dep.created = task.created - datetime.timedelta(seconds=1 + time_delt) - dep.status = 'waiting' - dep.save() - waiting_tasks.insert(waiting_tasks.index(task), dep) - if not hasattr(settings, 'UNIT_TEST_IGNORE_TASK_WAIT'): - task.status = 'waiting' - task.save() + self.consume_capacity(task) - # Rebuild graph - graph = SimpleDAG() - for task in running_tasks: - graph.add_node(task) - for wait_task in waiting_tasks[:50]: - node_dependencies = [] - for node in graph: - if wait_task.is_blocked_by(node['node_object']): - node_dependencies.append(node['node_object']) - graph.add_node(wait_task) - for dependency in node_dependencies: - graph.add_edge(wait_task, dependency) - if settings.DEBUG: - graph.generate_graphviz_plot() - return graph + def process_runnable_tasks(self, runnable_tasks): + for i, task in enumerate(runnable_tasks): + # TODO: maybe batch process new tasks. + # Processing a new task individually seems to be expensive + self.graph.add_job(task) -def process_graph(graph, task_capacity): - """Given a task dependency graph, start and manage tasks given their - priority and weight. 
- """ - from awx.main.tasks import handle_work_error, handle_work_success + def create_project_update(self, task): + dep = Project.objects.get(id=task['project_id']).create_project_update(launch_type='dependency') - leaf_nodes = graph.get_leaf_nodes() - running_nodes = filter(lambda x: x['node_object'].status == 'running', leaf_nodes) - running_impact = sum([t['node_object'].task_impact for t in running_nodes]) - ready_nodes = filter(lambda x: x['node_object'].status != 'running', leaf_nodes) - remaining_volume = task_capacity - running_impact - logger.info('Running Nodes: %s; Capacity: %s; Running Impact: %s; ' - 'Remaining Capacity: %s' % - (str(running_nodes), str(task_capacity), - str(running_impact), str(remaining_volume))) - logger.info("Ready Nodes: %s" % str(ready_nodes)) - for task_node in ready_nodes: - node_obj = task_node['node_object'] - # NOTE: This could be used to pass metadata through the task system - # node_args = task_node['metadata'] - impact = node_obj.task_impact - if impact <= remaining_volume or running_impact == 0: - node_dependencies = graph.get_dependents(node_obj) - # Allow other tasks to continue if a job fails, even if they are - # other jobs. + # TODO: Consider using milliseconds or microseconds + # Project created 1 seconds behind + dep.created = task['created'] - timedelta(seconds=1) + dep.status = 'waiting' + dep.save() - node_type = graph.get_node_type(node_obj) - if node_type == 'job': - # clear dependencies because a job can block (not necessarily - # depend) on other jobs that share the same job template + project_task = ProjectUpdateDict.get_partial(dep.id) + #waiting_tasks.insert(waiting_tasks.index(task), dep) + + return project_task + + def generate_dependencies(self, task): + dependencies = [] + # TODO: What if the project is null ? 
+ if type(task) is JobDict: + if task['project__scm_update_on_launch'] is True and \ + self.graph.should_update_related_project(task): + project_task = self.create_project_update(task) + dependencies.append(project_task) + # Inventory created 2 seconds behind + return dependencies + + def process_latest_project_updates(self, latest_project_updates): + for task in latest_project_updates: + self.graph.add_latest_project_update(task) + + def process_dependencies(self, dependent_task, dependency_tasks): + for task in dependency_tasks: + # ProjectUpdate or InventoryUpdate may be blocked by another of + # the same type. + if not self.graph.is_job_blocked(task): + self.graph.add_job(task) + if not self.would_exceed_capacity(task): + #print("process_dependencies() going to run project update <%s, %s>" % (task['id'], task['project_id'])) + self.start_task(task, [dependent_task]) + else: + self.graph.add_job(task) + + def process_pending_tasks(self, pending_tasks): + for task in pending_tasks: + + if not self.graph.is_job_blocked(task): + #print("process_pending_tasks() generating deps for job <%s, %s, %s>" % (task['id'], task['project_id'], task.model)) + dependencies = self.generate_dependencies(task) + self.process_dependencies(task, dependencies) + + # Spawning deps might have blocked us + if not self.graph.is_job_blocked(task): + self.graph.add_job(task) + if not self.would_exceed_capacity(task): + #print("Starting the original task <%s, %s>" % (task.get_job_type_str(), task['id'])) + self.start_task(task) + else: + self.graph.add_job(task) + + # Stop processing tasks if we know we are out of capacity + if self.get_remaining_capacity() <= 0: + return + + def fail_inconsistent_running_jobs(self, active_tasks, all_sorted_tasks): + for i, task in enumerate(all_sorted_tasks): + if task['status'] != 'running': + continue + + if (task['celery_task_id'] not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')): + # NOTE: Pull status again and make sure it 
didn't finish in + # the meantime? + # TODO: try catch the getting of the job. The job COULD have been deleted + task_obj = task.get_full() + task_obj.status = 'failed' + task_obj.job_explanation += ' '.join(( + 'Task was marked as running in Tower but was not present in', + 'Celery, so it has been marked as failed.', + )) + task_obj.save() + task_obj.websocket_emit_status("failed") + + all_sorted_tasks.pop(i) + logger.error("Task %s appears orphaned... marking as failed" % task) + + def process_celery_tasks(self, active_tasks, all_sorted_tasks): + + ''' + Rectify tower db <-> celery inconsistent view of jobs state + ''' + # Check running tasks and make sure they are active in celery + logger.debug("Active celery tasks: " + str(active_tasks)) + all_sorted_tasks = self.fail_inconsistent_running_jobs(active_tasks, + all_sorted_tasks) + + def calculate_capacity_used(self, tasks): + self.capacity_used = 0 + for t in tasks: + self.capacity_used += t.task_impact() + + def would_exceed_capacity(self, task): + return (task.task_impact() + self.capacity_used > self.capacity_total) + + def consume_capacity(self, task): + self.capacity_used += task.task_impact() + #print("Capacity used %s vs total %s" % (self.capacity_used, self.capacity_total)) + + def get_remaining_capacity(self): + return (self.capacity_total - self.capacity_used) + + def process_tasks(self, all_sorted_tasks): + + # TODO: Process new tasks + running_tasks = filter(lambda t: t['status'] == 'running', all_sorted_tasks) + runnable_tasks = filter(lambda t: t['status'] in ['waiting', 'running'], all_sorted_tasks) + + self.calculate_capacity_used(running_tasks) + + self.process_runnable_tasks(runnable_tasks) + + pending_tasks = filter(lambda t: t['status'] == 'pending', all_sorted_tasks) + self.process_pending_tasks(pending_tasks) + + + ''' + def do_graph_things(): + # Rebuild graph + graph = SimpleDAG() + for task in running_tasks: + graph.add_node(task) + #for wait_task in waiting_tasks[:50]: + for wait_task 
in waiting_tasks: node_dependencies = [] + for node in graph: + if wait_task.is_blocked_by(node['node_object']): + node_dependencies.append(node['node_object']) + graph.add_node(wait_task) + for dependency in node_dependencies: + graph.add_edge(wait_task, dependency) + if settings.DEBUG: + graph.generate_graphviz_plot() + return graph + ''' + #return do_graph_things() - # Make the workflow_job look like it's started by setting status to - # running, but don't make a celery Task for it. - # Introduce jobs from the workflow so they are candidates to run. - # Call process_graph() again to allow choosing for run, the - # created candidate jobs. - elif node_type == 'workflow_job': - node_obj.start() - spawn_workflow_graph_jobs([node_obj]) - return process_graph(graph, task_capacity) + def _schedule(self): + all_sorted_tasks = self.get_tasks() + if len(all_sorted_tasks) > 0: + #self.process_celery_tasks(active_tasks, all_sorted_tasks) - dependent_nodes = [{'type': graph.get_node_type(node_obj), 'id': node_obj.id}] + \ - [{'type': graph.get_node_type(n['node_object']), - 'id': n['node_object'].id} for n in node_dependencies] - error_handler = handle_work_error.s(subtasks=dependent_nodes) - success_handler = handle_work_success.s(task_actual={'type': graph.get_node_type(node_obj), - 'id': node_obj.id}) - with transaction.atomic(): - start_status = node_obj.start(error_callback=error_handler, success_callback=success_handler) - if not start_status: - node_obj.status = 'failed' - if node_obj.job_explanation: - node_obj.job_explanation += ' ' - node_obj.job_explanation += 'Task failed pre-start check.' 
- node_obj.save() - continue - remaining_volume -= impact - running_impact += impact - logger.info('Started Node: %s (capacity hit: %s) ' - 'Remaining Capacity: %s' % - (str(node_obj), str(impact), str(remaining_volume))) + latest_project_updates = self.get_latest_project_update_tasks(all_sorted_tasks) + self.process_latest_project_updates(latest_project_updates) -def schedule(): - with transaction.atomic(): - # Lock - Instance.objects.select_for_update().all()[0] + self.process_tasks(all_sorted_tasks) - task_capacity = get_system_task_capacity() + #print("Finished schedule()") - workflow_jobs = get_running_workflow_jobs() - process_finished_workflow_jobs(workflow_jobs) - spawn_workflow_graph_jobs(workflow_jobs) + def schedule(self): + with transaction.atomic(): + #t1 = datetime.now() + # Lock + try: + Instance.objects.select_for_update(nowait=True).all()[0] + except DatabaseError: + return - graph = rebuild_graph() - if graph: - process_graph(graph, task_capacity) + #workflow_jobs = get_running_workflow_jobs() + #process_finished_workflow_jobs(workflow_jobs) + #spawn_workflow_graph_jobs(workflow_jobs) + + ''' + Get tasks known by celery + ''' + ''' + active_tasks = self.get_activate_tasks() + # Communication with celery failed :(, return + if active_tasks is None: + return None + ''' + self._schedule() # Unlock, due to transaction ending + #t2 = datetime.now() + #t_diff = t2 - t1 + #print("schedule() time %s" % (t_diff.total_seconds())) + + + diff --git a/awx/main/scheduler/dependency_graph.py b/awx/main/scheduler/dependency_graph.py new file mode 100644 index 0000000000..5ecea91385 --- /dev/null +++ b/awx/main/scheduler/dependency_graph.py @@ -0,0 +1,108 @@ +from datetime import timedelta +from django.utils.timezone import now as tz_now + +from awx.main.scheduler.partial import JobDict, ProjectUpdateDict, InventoryUpdateDict +class DependencyGraph(object): + PROJECT_UPDATES = 'project_updates' + INVENTORY_UPDATES = 'inventory_updates' + JOB_TEMPLATE_JOBS = 
'job_template_jobs' + LATEST_PROJECT_UPDATES = 'latest_project_updates' + + def __init__(self, *args, **kwargs): + self.data = {} + # project_id -> True / False + self.data[self.PROJECT_UPDATES] = {} + # inventory_id -> True / False + self.data[self.INVENTORY_UPDATES] = {} + # job_template_id -> True / False + self.data[self.JOB_TEMPLATE_JOBS] = {} + + # project_id -> latest ProjectUpdateDict + self.data[self.LATEST_PROJECT_UPDATES] = {} + + def add_latest_project_update(self, job): + self.data[self.LATEST_PROJECT_UPDATES][job['project_id']] = job + + def get_now(self): + return tz_now() + + ''' + JobDict + + Presume that job is related to a project that is update on launch + ''' + def should_update_related_project(self, job): + now = self.get_now() + latest_project_update = self.data[self.LATEST_PROJECT_UPDATES].get(job['project_id'], None) + if not latest_project_update: + return True + + # TODO: Other finished, failed cases? i.e. error ? + if latest_project_update['status'] == 'failed': + return True + + ''' + This is a bit of fuzzy logic. + If the latest project update has a created time == job_created_time-1 + then consider the project update found. This is so we don't enter an infinite loop + of updating the project when cache timeout is 0. 
+ ''' + if latest_project_update['project__scm_update_cache_timeout'] == 0 and \ + latest_project_update['launch_type'] == 'dependency' and \ + latest_project_update['created'] == job['created'] - timedelta(seconds=1): + return False + + ''' + Normal, expected, cache timeout logic + ''' + timeout_seconds = timedelta(seconds=latest_project_update['project__scm_update_cache_timeout']) + if (latest_project_update['finished'] + timeout_seconds) < now: + return True + + return False + + def add_project_update(self, job): + self.data[self.PROJECT_UPDATES][job['project_id']] = False + + def add_inventory_update(self, job): + self.data[self.INVENTORY_UPDATES][job['inventory_id']] = False + + def add_job_template_job(self, job): + self.data[self.JOB_TEMPLATE_JOBS][job['job_template_id']] = False + + + def can_project_update_run(self, job): + return self.data[self.PROJECT_UPDATES].get(job['project_id'], True) + + def can_inventory_update_run(self, job): + return self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True) + + def can_job_run(self, job): + if self.can_project_update_run(job) is True and \ + self.can_inventory_update_run(job) is True: + if job['allow_simultaneous'] is False: + return self.data[self.JOB_TEMPLATE_JOBS].get(job['job_template_id'], True) + else: + return True + return False + + def is_job_blocked(self, job): + if type(job) is ProjectUpdateDict: + return not self.can_project_update_run(job) + elif type(job) is InventoryUpdateDict: + return not self.can_inventory_update_run(job) + elif type(job) is JobDict: + return not self.can_job_run(job) + + def add_job(self, job): + if type(job) is ProjectUpdateDict: + self.add_project_update(job) + elif type(job) is InventoryUpdateDict: + self.add_inventory_update(job) + elif type(job) is JobDict: + self.add_job_template_job(job) + + def add_jobs(self, jobs): + for j in jobs: + self.add_job(j) + diff --git a/awx/main/scheduler/partial.py b/awx/main/scheduler/partial.py new file mode 100644 index 
0000000000..16c6597f99 --- /dev/null +++ b/awx/main/scheduler/partial.py @@ -0,0 +1,109 @@ + +# AWX +from awx.main.models import ( + Job, + ProjectUpdate, + InventoryUpdate, +) + +class PartialModelDict(object): + FIELDS = () + model = None + data = None + + def __init__(self, data): + if type(data) is not dict: + raise RuntimeError("Expected data to be of type dict not %s" % type(data)) + self.data = data + + def __getitem__(self, index): + return self.data[index] + + def __setitem__(self, key, value): + self.data[key] = value + + def get(self, key, **kwargs): + return self.data.get(key, **kwargs) + + def get_full(self): + return self.model.objects.get(id=self.data['id']) + + def refresh_partial(self): + return self.__class__(self.model.objects.filter(id=self.data['id']).values(*self.__class__.get_db_values())[0]) + + @classmethod + def get_partial(cls, id): + return cls(cls.model.objects.filter(id=id).values(*cls.get_db_values())[0]) + + @classmethod + def get_db_values(cls): + return cls.FIELDS + + @classmethod + def filter_partial(cls, status=[]): + kv = { + 'status__in': status + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + + def get_job_type_str(self): + raise RuntimeError("Inherit and implement me") + + def task_impact(self): + raise RuntimeError("Inherit and implement me") + +class JobDict(PartialModelDict): + FIELDS = ( + 'id', 'status', 'job_template_id', 'inventory_id', 'project_id', + 'launch_type', 'limit', 'allow_simultaneous', 'created', + 'job_type', 'celery_task_id', 'project__scm_update_on_launch', + 'forks', + ) + model = Job + + def get_job_type_str(self): + return 'job' + + def task_impact(self): + return (5 if self.data['forks'] == 0 else self.data['forks']) * 10 + +class ProjectUpdateDict(PartialModelDict): + FIELDS = ( + 'id', 'status', 'project_id', 'created', 'celery_task_id', 'launch_type', 'project__scm_update_cache_timeout', 'project__scm_update_on_launch', + ) + model = ProjectUpdate + + 
def get_job_type_str(self): + return 'project_update' + + def task_impact(self): + return 10 + +class ProjectUpdateLatestDict(ProjectUpdateDict): + FIELDS = ( + 'id', 'status', 'project_id', 'created', 'finished', 'project__scm_update_cache_timeout', 'launch_type', 'project__scm_update_on_launch', + ) + model = ProjectUpdate + + @classmethod + def filter_partial(cls, project_ids): + # TODO: This can shurley be made more efficient + results = [] + for project_id in project_ids: + qs = cls.model.objects.filter(project_id=project_id, status__in=['waiting', 'successful', 'failed']).order_by('-finished') + if qs.count() > 0: + results.append(cls(cls.model.objects.filter(id=qs[0].id).values(*cls.get_db_values())[0])) + return results + +class InventoryUpdateDict(PartialModelDict): + FIELDS = ( + 'id', 'status', 'created', 'celery_task_id', + ) + model = InventoryUpdate + + def get_job_type_str(self): + return 'inventory_update' + + def task_impact(self): + return 20 + diff --git a/awx/main/scheduler/tasks.py b/awx/main/scheduler/tasks.py index 343bdd1546..ef0334e316 100644 --- a/awx/main/scheduler/tasks.py +++ b/awx/main/scheduler/tasks.py @@ -1,14 +1,17 @@ # Python import logging -import time + +# Django +from django.db import transaction +from django.db.utils import DatabaseError # Celery from celery import task # AWX -from awx.main.models import UnifiedJob -from awx.main.scheduler import schedule +from awx.main.models import Instance +from awx.main.scheduler import Scheduler logger = logging.getLogger('awx.main.scheduler') @@ -18,6 +21,7 @@ logger = logging.getLogger('awx.main.scheduler') @task def run_job_launch(job_id): + ''' # Wait for job to exist. # The job is created in a transaction then the message is created, but # the transaction may not have completed. 
@@ -45,11 +49,13 @@ def run_job_launch(job_id): # TODO: while not loop should call get wrapped in a try except #job = UnifiedJob.objects.get(id=job_id) + ''' - schedule() + Scheduler().schedule() @task def run_job_complete(job_id): + ''' # TODO: use list of finished status from jobs.py or unified_jobs.py finished_status = ['successful', 'error', 'failed', 'completed'] q = UnifiedJob.objects.filter(id=job_id) @@ -74,6 +80,29 @@ def run_job_complete(job_id): logger.error("Expected job status '%s' to be one of '%s' while processing 'job_complete' message." % (job.status, finished_status)) return retry += 1 + ''' - schedule() + Scheduler().schedule() + +@task +def run_scheduler(): + Scheduler().schedule() + +@task +def run_fail_inconsistent_running_jobs(): + return + print("run_fail_inconsistent_running_jobs() running") + with transaction.atomic(): + # Lock + try: + Instance.objects.select_for_update(nowait=True).all()[0] + scheduler = Scheduler() + active_tasks = scheduler.get_activate_tasks() + if active_tasks is None: + return None + + all_sorted_tasks = scheduler.get_tasks() + scheduler.process_celery_tasks(active_tasks, all_sorted_tasks) + except DatabaseError: + return diff --git a/awx/main/tests/functional/test_partial.py b/awx/main/tests/functional/test_partial.py new file mode 100644 index 0000000000..69ad71c4df --- /dev/null +++ b/awx/main/tests/functional/test_partial.py @@ -0,0 +1,65 @@ + +# Python +import pytest +from django.utils.timezone import now as tz_now +from datetime import timedelta + +# AWX +from awx.main.models import ( + Project, + ProjectUpdate, +) +from awx.main.scheduler.partial import ( + ProjectUpdateLatestDict, +) + + +@pytest.fixture +def failed_project_update(): + p = Project.objects.create(name="proj1") + pu = ProjectUpdate.objects.create(project=p, status='failed', finished=tz_now() - timedelta(seconds=20)) + + return (p, pu) + +@pytest.fixture +def successful_project_update(): + p = Project.objects.create(name="proj1") + pu = 
ProjectUpdate.objects.create(project=p, status='successful', finished=tz_now() - timedelta(seconds=20)) + + return (p, pu) + +# Failed project updates newer than successful ones +@pytest.fixture +def multiple_project_updates(): + p = Project.objects.create(name="proj1") + + epoch = tz_now() + + successful_pus = [ProjectUpdate.objects.create(project=p, + status='successful', + finished=epoch - timedelta(seconds=100 + i)) for i in xrange(0, 5)] + failed_pus = [ProjectUpdate.objects.create(project=p, + status='failed', + finished=epoch - timedelta(seconds=100 - len(successful_pus) + i)) for i in xrange(0, 5)] + return (p, failed_pus, successful_pus) + +class TestProjectUpdateLatestDictDict(): + @pytest.mark.django_db + class TestFilterPartial(): + def test_project_update_successful(self, successful_project_update): + (project, project_update) = successful_project_update + + tasks = ProjectUpdateLatestDict.filter_partial(project_ids=[project.id]) + + assert 1 == len(tasks) + assert project_update.id == tasks[0]['id'] + + def test_correct_project_update(self, multiple_project_updates): + (project, failed_pus, successful_pus) = multiple_project_updates + + tasks = ProjectUpdateLatestDict.filter_partial(project_ids=[project.id]) + + assert 1 == len(tasks) + assert failed_pus[0].id == tasks[0]['id'] + + diff --git a/awx/main/tests/unit/scheduler/__init__.py b/awx/main/tests/unit/scheduler/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/awx/main/tests/unit/scheduler/test_dependency_graph.py b/awx/main/tests/unit/scheduler/test_dependency_graph.py new file mode 100644 index 0000000000..081f175027 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_dependency_graph.py @@ -0,0 +1,121 @@ + +# Python +import pytest +from datetime import timedelta + +# Django +from django.utils.timezone import now as tz_now + +# AWX +from awx.main.scheduler.dependency_graph import DependencyGraph +from awx.main.scheduler.partial import ProjectUpdateDict + 
+@pytest.fixture +def graph(): + return DependencyGraph() + +@pytest.fixture +def job(): + return dict(project_id=1) + +@pytest.fixture +def unsuccessful_last_project(graph, job): + pu = ProjectUpdateDict(dict(id=1, + project__scm_update_cache_timeout=999999, + project_id=1, + status='failed', + created='3', + finished='3',)) + + graph.add_latest_project_update(pu) + + return graph + +@pytest.fixture +def last_dependent_project(graph): + now = tz_now() + + job = { + 'project_id': 1, + 'created': now, + } + pu = ProjectUpdateDict(dict(id=1, project_id=1, status='waiting', + project__scm_update_cache_timeout=0, + launch_type='dependency', + created=now - timedelta(seconds=1),)) + + graph.add_latest_project_update(pu) + + return (graph, job) + +@pytest.fixture +def timedout_project_update(graph, job): + now = tz_now() + + job = { + 'project_id': 1, + 'created': now, + } + pu = ProjectUpdateDict(dict(id=1, project_id=1, status='successful', + project__scm_update_cache_timeout=10, + launch_type='dependency', + created=now - timedelta(seconds=100), + finished=now - timedelta(seconds=11),)) + + graph.add_latest_project_update(pu) + + return (graph, job) + +@pytest.fixture +def not_timedout_project_update(graph, job): + now = tz_now() + + job = { + 'project_id': 1, + 'created': now, + } + pu = ProjectUpdateDict(dict(id=1, project_id=1, status='successful', + project__scm_update_cache_timeout=3600, + launch_type='dependency', + created=now - timedelta(seconds=100), + finished=now - timedelta(seconds=11),)) + + graph.add_latest_project_update(pu) + + return (graph, job) + + +class TestShouldUpdateRelatedProject(): + + def test_no_project_updates(self, graph, job): + actual = graph.should_update_related_project(job) + + assert True is actual + + def test_timedout_project_update(self, timedout_project_update): + (graph, job) = timedout_project_update + + actual = graph.should_update_related_project(job) + + assert True is actual + + def test_not_timedout_project_update(self, 
not_timedout_project_update): + (graph, job) = not_timedout_project_update + + actual = graph.should_update_related_project(job) + + assert False is actual + + def test_unsuccessful_last_project(self, unsuccessful_last_project, job): + graph = unsuccessful_last_project + + actual = graph.should_update_related_project(job) + + assert True is actual + + def test_last_dependent_project(self, last_dependent_project): + (graph, job) = last_dependent_project + + actual = graph.should_update_related_project(job) + assert False is actual + diff --git a/awx/main/tests/unit/scheduler/test_scheduler_project_update.py b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py new file mode 100644 index 0000000000..54add63d51 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py @@ -0,0 +1,194 @@ + +# Python +import pytest +from datetime import timedelta + +# Django +from django.utils.timezone import now as tz_now + +# awx +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, +) +from awx.main.scheduler import Scheduler + +# TODO: wherever get_latest_rpoject_update_task() is stubbed and returns a +# ProjectUpdateDict. We should instead return a ProjectUpdateLatestDict() +# For now, this is ok since the fields on deviate that much. 
+ +@pytest.fixture +def epoch(): + return tz_now() + + +@pytest.fixture +def scheduler_factory(mocker, epoch): + def fn(tasks=[], latest_project_updates=[], create_project_update=None): + sched = Scheduler() + sched.capacity_total = 999999999 + + sched.graph.get_now = lambda: epoch + + mocker.patch.object(sched, 'get_tasks', return_value=tasks) + mocker.patch.object(sched, 'get_latest_project_update_tasks', return_value=latest_project_updates) + mocker.patch.object(sched, 'create_project_update', return_value=create_project_update) + mocker.patch.object(sched, 'start_task') + return sched + return fn + +@pytest.fixture +def project_update_factory(epoch): + def fn(): + return ProjectUpdateDict({ + 'id': 1, + 'created': epoch - timedelta(seconds=100), + 'project_id': 1, + 'project__scm_update_cache_timeout': 0, + 'celery_task_id': '', + 'launch_type': 'dependency', + 'project__scm_update_on_launch': True, + }) + return fn + +@pytest.fixture +def pending_project_update(project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'pending' + return project_update + +@pytest.fixture +def waiting_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'waiting' + return project_update + +@pytest.fixture +def pending_job(epoch): + return JobDict({ + 'id': 1, + 'status': 'pending', + 'job_template_id': 1, + 'project_id': 1, + 'inventory_id': 1, + 'launch_type': 'manual', + 'allow_simultaneous': False, + 'created': epoch - timedelta(seconds=99), + 'celery_task_id': '', + 'project__scm_update_on_launch': True, + 'forks': 5 + }) + +@pytest.fixture +def running_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'running' + return project_update + +@pytest.fixture +def successful_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['finished'] = epoch - 
timedelta(seconds=90) + project_update['status'] = 'successful' + return project_update + +@pytest.fixture +def successful_project_update_cache_expired(epoch, project_update_factory): + project_update = project_update_factory() + + project_update['status'] = 'successful' + project_update['created'] = epoch - timedelta(seconds=120) + project_update['finished'] = epoch - timedelta(seconds=110) + project_update['project__scm_update_cache_timeout'] = 1 + return project_update + +@pytest.fixture +def failed_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['finished'] = epoch - timedelta(seconds=90) + project_update['status'] = 'failed' + return project_update + +class TestStartProjectUpdate(): + def test(self, scheduler_factory, pending_project_update): + scheduler = scheduler_factory(tasks=[pending_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_project_update) + assert scheduler.create_project_update.call_count == 0 + + ''' + Explicit project update should always run. They should not use cache logic. 
+ ''' + def test_cache_oblivious(self, scheduler_factory, successful_project_update, pending_project_update): + scheduler = scheduler_factory(tasks=[pending_project_update], + latest_project_updates=[successful_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_project_update) + assert scheduler.create_project_update.call_count == 0 + + +class TestCreateDependentProjectUpdate(): + + def test(self, scheduler_factory, pending_job, waiting_project_update): + scheduler = scheduler_factory(tasks=[pending_job], + create_project_update=waiting_project_update) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_project_update, [pending_job]) + + def test_cache_hit(self, scheduler_factory, pending_job, successful_project_update): + scheduler = scheduler_factory(tasks=[successful_project_update, pending_job], + latest_project_updates=[successful_project_update]) + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_job) + + def test_cache_miss(self, scheduler_factory, pending_job, successful_project_update_cache_expired, waiting_project_update): + scheduler = scheduler_factory(tasks=[successful_project_update_cache_expired, pending_job], + latest_project_updates=[successful_project_update_cache_expired], + create_project_update=waiting_project_update) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_project_update, [pending_job]) + + def test_last_update_failed(self, scheduler_factory, pending_job, failed_project_update, waiting_project_update): + scheduler = scheduler_factory(tasks=[failed_project_update, pending_job], + latest_project_updates=[failed_project_update], + create_project_update=waiting_project_update) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_project_update, [pending_job]) + + +class TestJobBlockedOnProjectUpdate(): + def test(self, scheduler_factory, pending_job, waiting_project_update): + scheduler = 
scheduler_factory(tasks=[waiting_project_update, pending_job], + latest_project_updates=[waiting_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + + def test_project_running(self, scheduler_factory, pending_job, running_project_update): + scheduler = scheduler_factory(tasks=[running_project_update, pending_job]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + +class TestProjectUpdateBlocked(): + def test(self, scheduler_factory, running_project_update, pending_project_update): + scheduler = scheduler_factory(tasks=[running_project_update, pending_project_update], + latest_project_updates=[running_project_update]) + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 9c6ed0950b..a5c7975920 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -392,6 +392,14 @@ CELERYBEAT_SCHEDULE = { 'task': 'awx.main.tasks.cluster_node_heartbeat', 'schedule': timedelta(seconds=60) }, + 'task_scheduler': { + 'task': 'awx.main.scheduler.tasks.run_scheduler', + 'schedule': timedelta(seconds=10) + }, + 'task_fail_inconsistent_running_jobs': { + 'task': 'awx.main.scheduler.tasks.run_fail_inconsistent_running_jobs', + 'schedule': timedelta(seconds=30) + }, } # Django Caching Configuration From 306562cd670c38b9fb1ae07e941c417ac471c046 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Thu, 20 Oct 2016 15:05:02 -0400 Subject: [PATCH 02/17] inventory updates running correctly --- awx/main/models/inventory.py | 2 +- awx/main/models/unified_jobs.py | 10 +- awx/main/scheduler/__init__.py | 77 ++++-- awx/main/scheduler/dependency_graph.py | 82 +++++- awx/main/scheduler/partial.py | 66 ++++- awx/main/scheduler/tasks.py | 57 ----- 
awx/main/tests/functional/test_partial.py | 111 +++++--- awx/main/tests/unit/scheduler/conftest.py | 238 ++++++++++++++++++ .../test_scheduler_inventory_update.py | 85 +++++++ .../unit/scheduler/test_scheduler_job.py | 66 +++++ .../test_scheduler_project_update.py | 123 +-------- 11 files changed, 681 insertions(+), 236 deletions(-) create mode 100644 awx/main/tests/unit/scheduler/conftest.py create mode 100644 awx/main/tests/unit/scheduler/test_scheduler_inventory_update.py create mode 100644 awx/main/tests/unit/scheduler/test_scheduler_job.py diff --git a/awx/main/models/inventory.py b/awx/main/models/inventory.py index 6fb3e2f992..c77868759e 100644 --- a/awx/main/models/inventory.py +++ b/awx/main/models/inventory.py @@ -1089,7 +1089,7 @@ class InventorySource(UnifiedJobTemplate, InventorySourceOptions): def _get_unified_job_field_names(cls): return ['name', 'description', 'source', 'source_path', 'source_script', 'source_vars', 'schedule', 'credential', 'source_regions', 'instance_filters', 'group_by', 'overwrite', 'overwrite_vars', - 'timeout'] + 'timeout', 'launch_type',] def save(self, *args, **kwargs): # If update_fields has been specified, add our field names to it, diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 674bedbffe..19bc265c18 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -13,7 +13,7 @@ from StringIO import StringIO # Django from django.conf import settings -from django.db import models +from django.db import models, connection from django.core.exceptions import NON_FIELD_ERRORS from django.utils.translation import ugettext_lazy as _ from django.utils.timezone import now @@ -835,6 +835,10 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique return (True, opts) + def start_celery_task(self, opts, error_callback, success_callback): + task_class = self._get_task_class() + task_class().apply_async((self.pk,), opts, link_error=error_callback, 
link=success_callback) + def start(self, error_callback, success_callback, **kwargs): ''' Start the task running via Celery. @@ -842,7 +846,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique task_class = self._get_task_class() (res, opts) = self.pre_start(**kwargs) if res: - task_class().apply_async((self.pk,), opts, link_error=error_callback, link=success_callback) + self.start_celery_task(opts, error_callback, success_callback) return res def signal_start(self, **kwargs): @@ -871,7 +875,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique self.websocket_emit_status("pending") from awx.main.scheduler.tasks import run_job_launch - run_job_launch.delay(self.id) + connection.on_commit(lambda: run_job_launch.delay(self.id)) # Each type of unified job has a different Task class; get the # appropirate one. diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 0711528c56..e704b3ef8a 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -8,7 +8,7 @@ from sets import Set # Django from django.conf import settings -from django.db import transaction +from django.db import transaction, connection from django.db.utils import DatabaseError # AWX @@ -20,8 +20,10 @@ from awx.main.scheduler.dependency_graph import DependencyGraph from awx.main.scheduler.partial import ( JobDict, ProjectUpdateDict, - InventoryUpdateDict, ProjectUpdateLatestDict, + InventoryUpdateDict, + InventoryUpdateLatestDict, + InventorySourceDict, ) # Celery @@ -72,11 +74,34 @@ class Scheduler(): return ProjectUpdateLatestDict.filter_partial(list(project_ids)) + # TODO: Consider a database query for this logic + def get_latest_inventory_update_tasks(self, all_sorted_tasks): + inventory_ids = Set() + for task in all_sorted_tasks: + if type(task) == JobDict: + inventory_ids.add(task['inventory_id']) + + return InventoryUpdateLatestDict.filter_partial(list(inventory_ids)) + + def 
get_running_workflow_jobs(self): graph_workflow_jobs = [wf for wf in WorkflowJob.objects.filter(status='running')] return graph_workflow_jobs + # TODO: Consider a database query for this logic + def get_inventory_source_tasks(self, all_sorted_tasks): + inventory_ids = Set() + results = [] + for task in all_sorted_tasks: + if type(task) is JobDict: + inventory_ids.add(task['inventory_id']) + + for inventory_id in inventory_ids: + results.append((inventory_id, InventorySourceDict.filter_partial(inventory_id))) + + return results + def spawn_workflow_graph_jobs(self, workflow_jobs): # TODO: Consider using transaction.atomic for workflow_job in workflow_jobs: @@ -134,8 +159,6 @@ class Scheduler(): def start_task(self, task, dependent_tasks=[]): from awx.main.tasks import handle_work_error, handle_work_success - #print("start_task() <%s, %s> with deps %s" % (task.get_job_type_str(), task['id'], dependent_tasks)) - # TODO: spawn inventory and project updates task_actual = { 'type':task.get_job_type_str(), @@ -148,10 +171,8 @@ class Scheduler(): job_obj = task.get_full() job_obj.status = 'waiting' - job_obj.save() - #print("For real, starting job <%s, %s>" % (type(job_obj), job_obj.id)) - start_status = job_obj.start(error_callback=error_handler, success_callback=success_handler) + (start_status, opts) = job_obj.pre_start() if not start_status: job_obj.status = 'failed' if job_obj.job_explanation: @@ -163,6 +184,8 @@ class Scheduler(): self.consume_capacity(task) + connection.on_commit(lambda: job_obj.start_celery_task(opts, error_callback=error_handler, success_callback=success_handler)) + def process_runnable_tasks(self, runnable_tasks): for i, task in enumerate(runnable_tasks): # TODO: maybe batch process new tasks. 
@@ -179,10 +202,20 @@ class Scheduler(): dep.save() project_task = ProjectUpdateDict.get_partial(dep.id) - #waiting_tasks.insert(waiting_tasks.index(task), dep) return project_task + def create_inventory_update(self, task, inventory_source_task): + dep = InventorySource.objects.get(id=inventory_source_task['id']).create_inventory_update(launch_type='dependency') + + dep.created = task['created'] - timedelta(seconds=2) + dep.status = 'waiting' + dep.save() + + inventory_task = InventoryUpdateDict.get_partial(dep.id) + + return inventory_task + def generate_dependencies(self, task): dependencies = [] # TODO: What if the project is null ? @@ -191,12 +224,24 @@ class Scheduler(): self.graph.should_update_related_project(task): project_task = self.create_project_update(task) dependencies.append(project_task) - # Inventory created 2 seconds behind + # Inventory created 2 seconds behind job + + for inventory_source_task in self.graph.get_inventory_sources(task['inventory_id']): + if self.graph.should_update_related_inventory_source(task, inventory_source_task['id']): + inventory_task = self.create_inventory_update(task, inventory_source_task) + dependencies.append(inventory_task) return dependencies def process_latest_project_updates(self, latest_project_updates): - for task in latest_project_updates: - self.graph.add_latest_project_update(task) + map(lambda task: self.graph.add_latest_project_update(task), latest_project_updates) + + def process_latest_inventory_updates(self, latest_inventory_updates): + map(lambda task: self.graph.add_latest_inventory_update(task), latest_inventory_updates) + + def process_inventory_sources(self, inventory_id_sources): + #map(lambda inventory_id, inventory_sources: self.graph.add_inventory_sources(inventory_id, inventory_sources), inventory_id_sources) + for inventory_id, inventory_sources in inventory_id_sources: + self.graph.add_inventory_sources(inventory_id, inventory_sources) def process_dependencies(self, dependent_task, 
dependency_tasks): for task in dependency_tasks: @@ -205,7 +250,6 @@ class Scheduler(): if not self.graph.is_job_blocked(task): self.graph.add_job(task) if not self.would_exceed_capacity(task): - #print("process_dependencies() going to run project update <%s, %s>" % (task['id'], task['project_id'])) self.start_task(task, [dependent_task]) else: self.graph.add_job(task) @@ -214,7 +258,6 @@ class Scheduler(): for task in pending_tasks: if not self.graph.is_job_blocked(task): - #print("process_pending_tasks() generating deps for job <%s, %s, %s>" % (task['id'], task['project_id'], task.model)) dependencies = self.generate_dependencies(task) self.process_dependencies(task, dependencies) @@ -222,7 +265,6 @@ class Scheduler(): if not self.graph.is_job_blocked(task): self.graph.add_job(task) if not self.would_exceed_capacity(task): - #print("Starting the original task <%s, %s>" % (task.get_job_type_str(), task['id'])) self.start_task(task) else: self.graph.add_job(task) @@ -272,7 +314,6 @@ class Scheduler(): def consume_capacity(self, task): self.capacity_used += task.task_impact() - #print("Capacity used %s vs total %s" % (self.capacity_used, self.capacity_total)) def get_remaining_capacity(self): return (self.capacity_total - self.capacity_used) @@ -320,6 +361,12 @@ class Scheduler(): latest_project_updates = self.get_latest_project_update_tasks(all_sorted_tasks) self.process_latest_project_updates(latest_project_updates) + latest_inventory_updates = self.get_latest_inventory_update_tasks(all_sorted_tasks) + self.process_latest_inventory_updates(latest_inventory_updates) + + inventory_id_sources = self.get_inventory_source_tasks(all_sorted_tasks) + self.process_inventory_sources(inventory_id_sources) + self.process_tasks(all_sorted_tasks) #print("Finished schedule()") diff --git a/awx/main/scheduler/dependency_graph.py b/awx/main/scheduler/dependency_graph.py index 5ecea91385..14a77ab697 100644 --- a/awx/main/scheduler/dependency_graph.py +++ 
b/awx/main/scheduler/dependency_graph.py @@ -6,7 +6,12 @@ class DependencyGraph(object): PROJECT_UPDATES = 'project_updates' INVENTORY_UPDATES = 'inventory_updates' JOB_TEMPLATE_JOBS = 'job_template_jobs' + INVENTORY_SOURCE_UPDATES = 'inventory_source_updates' + LATEST_PROJECT_UPDATES = 'latest_project_updates' + LATEST_INVENTORY_UPDATES = 'latest_inventory_updates' + + INVENTORY_SOURCES = 'inventory_source_ids' def __init__(self, *args, **kwargs): self.data = {} @@ -16,13 +21,29 @@ class DependencyGraph(object): self.data[self.INVENTORY_UPDATES] = {} # job_template_id -> True / False self.data[self.JOB_TEMPLATE_JOBS] = {} + # inventory_source_id -> True / False + self.data[self.INVENTORY_SOURCE_UPDATES] = {} - # project_id -> latest ProjectUpdateDict + # project_id -> latest ProjectUpdateLatestDict self.data[self.LATEST_PROJECT_UPDATES] = {} + # inventory_source_id -> latest InventoryUpdateLatestDict + self.data[self.LATEST_INVENTORY_UPDATES] = {} + + # inventory_id -> [inventory_source_ids] + self.data[self.INVENTORY_SOURCES] = {} def add_latest_project_update(self, job): self.data[self.LATEST_PROJECT_UPDATES][job['project_id']] = job + def add_latest_inventory_update(self, job): + self.data[self.LATEST_INVENTORY_UPDATES][job['inventory_source_id']] = job + + def add_inventory_sources(self, inventory_id, inventory_sources): + self.data[self.INVENTORY_SOURCES][inventory_id] = inventory_sources + + def get_inventory_sources(self, inventory_id): + return self.data[self.INVENTORY_SOURCES].get(inventory_id, []) + def get_now(self): return tz_now() @@ -61,25 +82,59 @@ class DependencyGraph(object): return False - def add_project_update(self, job): + def should_update_related_inventory_source(self, job, inventory_source_id): + now = self.get_now() + latest_inventory_update = self.data[self.LATEST_INVENTORY_UPDATES].get(inventory_source_id, None) + if not latest_inventory_update: + return True + + # TODO: Other finished, failed cases? i.e. error ? 
+ if latest_inventory_update['status'] == 'failed': + return True + + ''' + This is a bit of fuzzy logic. + If the latest inventory update has a created time == job_created_time-2 + then consider the inventory update found. This is so we don't enter an infinite loop + of updating the project when cache timeout is 0. + ''' + if latest_inventory_update['inventory_source__update_cache_timeout'] == 0 and \ + latest_inventory_update['launch_type'] == 'dependency' and \ + latest_inventory_update['created'] == job['created'] - timedelta(seconds=2): + return False + + ''' + Normal, expected, cache timeout logic + ''' + timeout_seconds = timedelta(seconds=latest_inventory_update['inventory_source__update_cache_timeout']) + if (latest_inventory_update['finished'] + timeout_seconds) < now: + return True + + return False + + def mark_project_update(self, job): self.data[self.PROJECT_UPDATES][job['project_id']] = False - def add_inventory_update(self, job): + def mark_inventory_update(self, inventory_id): + self.data[self.INVENTORY_UPDATES][inventory_id] = False + + def mark_inventory_source_update(self, inventory_source_id): + self.data[self.INVENTORY_SOURCE_UPDATES][inventory_source_id] = False + + def mark_job_template_job(self, job): self.data[self.INVENTORY_UPDATES][job['inventory_id']] = False - - def add_job_template_job(self, job): + self.data[self.PROJECT_UPDATES][job['project_id']] = False self.data[self.JOB_TEMPLATE_JOBS][job['job_template_id']] = False - def can_project_update_run(self, job): return self.data[self.PROJECT_UPDATES].get(job['project_id'], True) - def can_inventory_update_run(self, job): - return self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True) + def can_inventory_update_run(self, inventory_source_id): + return self.data[self.INVENTORY_SOURCE_UPDATES].get(inventory_source_id, True) def can_job_run(self, job): if self.can_project_update_run(job) is True and \ - self.can_inventory_update_run(job) is True: + 
self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True) is True: if job['allow_simultaneous'] is False: return self.data[self.JOB_TEMPLATE_JOBS].get(job['job_template_id'], True) else: @@ -90,17 +145,18 @@ class DependencyGraph(object): if type(job) is ProjectUpdateDict: return not self.can_project_update_run(job) elif type(job) is InventoryUpdateDict: - return not self.can_inventory_update_run(job) + return not self.can_inventory_update_run(job['inventory_source_id']) elif type(job) is JobDict: return not self.can_job_run(job) def add_job(self, job): if type(job) is ProjectUpdateDict: - self.add_project_update(job) + self.mark_project_update(job) elif type(job) is InventoryUpdateDict: - self.add_inventory_update(job) + self.mark_inventory_update(job['inventory_source__inventory_id']) + self.mark_inventory_source_update(job['inventory_source_id']) elif type(job) is JobDict: - self.add_job_template_job(job) + self.mark_job_template_job(job) def add_jobs(self, jobs): for j in jobs: diff --git a/awx/main/scheduler/partial.py b/awx/main/scheduler/partial.py index 16c6597f99..e3677dab2c 100644 --- a/awx/main/scheduler/partial.py +++ b/awx/main/scheduler/partial.py @@ -4,6 +4,7 @@ from awx.main.models import ( Job, ProjectUpdate, InventoryUpdate, + InventorySource, ) class PartialModelDict(object): @@ -57,7 +58,7 @@ class JobDict(PartialModelDict): 'id', 'status', 'job_template_id', 'inventory_id', 'project_id', 'launch_type', 'limit', 'allow_simultaneous', 'created', 'job_type', 'celery_task_id', 'project__scm_update_on_launch', - 'forks', + 'forks', 'inventory__inventory_sources', ) model = Job @@ -69,7 +70,9 @@ class JobDict(PartialModelDict): class ProjectUpdateDict(PartialModelDict): FIELDS = ( - 'id', 'status', 'project_id', 'created', 'celery_task_id', 'launch_type', 'project__scm_update_cache_timeout', 'project__scm_update_on_launch', + 'id', 'status', 'project_id', 'created', 'celery_task_id', + 'launch_type', 'project__scm_update_cache_timeout', + 
'project__scm_update_on_launch', ) model = ProjectUpdate @@ -81,23 +84,29 @@ class ProjectUpdateDict(PartialModelDict): class ProjectUpdateLatestDict(ProjectUpdateDict): FIELDS = ( - 'id', 'status', 'project_id', 'created', 'finished', 'project__scm_update_cache_timeout', 'launch_type', 'project__scm_update_on_launch', + 'id', 'status', 'project_id', 'created', 'finished', + 'project__scm_update_cache_timeout', + 'launch_type', 'project__scm_update_on_launch', ) model = ProjectUpdate @classmethod def filter_partial(cls, project_ids): # TODO: This can shurley be made more efficient + # * shouldn't have to do a query per inventory_id + # * shouldn't have to call .values() on all the results, only to get the first result results = [] for project_id in project_ids: - qs = cls.model.objects.filter(project_id=project_id, status__in=['waiting', 'successful', 'failed']).order_by('-finished') + qs = cls.model.objects.filter(project_id=project_id, status__in=['waiting', 'successful', 'failed']).order_by('-finished', '-started', '-created',) if qs.count() > 0: results.append(cls(cls.model.objects.filter(id=qs[0].id).values(*cls.get_db_values())[0])) return results class InventoryUpdateDict(PartialModelDict): + #'inventory_source__update_on_launch', + #'inventory_source__update_cache_timeout', FIELDS = ( - 'id', 'status', 'created', 'celery_task_id', + 'id', 'status', 'created', 'celery_task_id', 'inventory_source_id', 'inventory_source__inventory_id', ) model = InventoryUpdate @@ -107,3 +116,50 @@ class InventoryUpdateDict(PartialModelDict): def task_impact(self): return 20 +class InventoryUpdateLatestDict(InventoryUpdateDict): + #'inventory_source__update_on_launch', + #'inventory_source__update_cache_timeout', + FIELDS = ( + 'id', 'status', 'created', 'celery_task_id', 'inventory_source_id', + 'finished', 'inventory_source__update_cache_timeout', 'launch_type', + ) + model = InventoryUpdate + + @classmethod + def filter_partial(cls, inventory_ids): + # TODO: This can 
shurley be made more efficient + # * shouldn't have to do a query per inventory_id nor per inventory_source_id + # * shouldn't have to call .values() on all the results, only to get the first result + results = [] + for inventory_id in inventory_ids: + inventory_source_ids = InventorySource.objects.filter(inventory_id=inventory_id, + update_on_launch=True).values_list('id', flat=True) + # Find the most recent inventory update for each inventory source + for inventory_source_id in inventory_source_ids: + qs = cls.model.objects.filter(inventory_source_id=inventory_source_id, + status__in=['waiting', 'successful', 'failed'], + inventory_source__update_on_launch=True).order_by('-finished', '-started', '-created') + if qs.count() > 0: + results.append(cls(cls.model.objects.filter(id=qs[0].id).values(*cls.get_db_values())[0])) + return results + +class InventorySourceDict(PartialModelDict): + FIELDS = ( + 'id', + ) + model = InventorySource + + def get_job_type_str(self): + return 'inventory_source' + + def task_impact(self): + return 20 + + @classmethod + # TODO: Optimize this to run the query once + def filter_partial(cls, inventory_id): + kv = { + 'inventory_id': inventory_id, + 'update_on_launch': True, + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] diff --git a/awx/main/scheduler/tasks.py b/awx/main/scheduler/tasks.py index ef0334e316..ba1ddaeecc 100644 --- a/awx/main/scheduler/tasks.py +++ b/awx/main/scheduler/tasks.py @@ -21,67 +21,10 @@ logger = logging.getLogger('awx.main.scheduler') @task def run_job_launch(job_id): - ''' - # Wait for job to exist. - # The job is created in a transaction then the message is created, but - # the transaction may not have completed. - - # FIXME: We could generate the message in a Django signal handler. - # OR, we could call an explicit commit in the view and then send the - # message. 
- - retries = 10 - retry = 0 - while not UnifiedJob.objects.filter(id=job_id).exists(): - time.sleep(0.3) - - if retry >= retries: - logger.error("Failed to process 'job_launch' message for job %d" % job_id) - # ack the message so we don't build up the queue. - # - # The job can still be chosen to run during tower startup or - # when another job is started or completes - return - retry += 1 - - # "Safe" to get the job now since it exists. - # Really, there is a race condition from exists to get - - # TODO: while not loop should call get wrapped in a try except - #job = UnifiedJob.objects.get(id=job_id) - ''' - Scheduler().schedule() @task def run_job_complete(job_id): - ''' - # TODO: use list of finished status from jobs.py or unified_jobs.py - finished_status = ['successful', 'error', 'failed', 'completed'] - q = UnifiedJob.objects.filter(id=job_id) - - # Ensure that the job is updated in the database before we call to - # schedule the next job. - retries = 10 - retry = 0 - while True: - # Job not found, most likely deleted. That's fine - if not q.exists(): - logger.warn("Failed to find job '%d' while processing 'job_complete' message. Presume that it was deleted." % job_id) - break - - job = q[0] - if job.status in finished_status: - break - - time.sleep(0.3) - - if retry >= retries: - logger.error("Expected job status '%s' to be one of '%s' while processing 'job_complete' message." 
% (job.status, finished_status)) - return - retry += 1 - ''' - Scheduler().schedule() @task diff --git a/awx/main/tests/functional/test_partial.py b/awx/main/tests/functional/test_partial.py index 69ad71c4df..0ab84dc901 100644 --- a/awx/main/tests/functional/test_partial.py +++ b/awx/main/tests/functional/test_partial.py @@ -6,44 +6,48 @@ from datetime import timedelta # AWX from awx.main.models import ( + Organization, + Inventory, + Group, Project, ProjectUpdate, + InventoryUpdate, + InventorySource, ) from awx.main.scheduler.partial import ( ProjectUpdateLatestDict, + InventoryUpdateDict, + InventoryUpdateLatestDict, ) - @pytest.fixture -def failed_project_update(): - p = Project.objects.create(name="proj1") - pu = ProjectUpdate.objects.create(project=p, status='failed', finished=tz_now() - timedelta(seconds=20)) - - return (p, pu) - -@pytest.fixture -def successful_project_update(): - p = Project.objects.create(name="proj1") - pu = ProjectUpdate.objects.create(project=p, status='successful', finished=tz_now() - timedelta(seconds=20)) - - return (p, pu) - -# Failed project updates newer than successful ones -@pytest.fixture -def multiple_project_updates(): - p = Project.objects.create(name="proj1") - - epoch = tz_now() - - successful_pus = [ProjectUpdate.objects.create(project=p, - status='successful', - finished=epoch - timedelta(seconds=100 + i)) for i in xrange(0, 5)] - failed_pus = [ProjectUpdate.objects.create(project=p, - status='failed', - finished=epoch - timedelta(seconds=100 - len(successful_pus) + i)) for i in xrange(0, 5)] - return (p, failed_pus, successful_pus) +def org(): + return Organization.objects.create(name="org1") class TestProjectUpdateLatestDictDict(): + @pytest.fixture + def successful_project_update(self): + p = Project.objects.create(name="proj1") + pu = ProjectUpdate.objects.create(project=p, status='successful', finished=tz_now() - timedelta(seconds=20)) + + return (p, pu) + + # Failed project updates newer than successful ones + 
@pytest.fixture + def multiple_project_updates(self): + p = Project.objects.create(name="proj1") + + epoch = tz_now() + + successful_pus = [ProjectUpdate.objects.create(project=p, + status='successful', + finished=epoch - timedelta(seconds=100 + i)) for i in xrange(0, 5)] + failed_pus = [ProjectUpdate.objects.create(project=p, + status='failed', + finished=epoch - timedelta(seconds=100 - len(successful_pus) + i)) for i in xrange(0, 5)] + return (p, failed_pus, successful_pus) + + @pytest.mark.django_db class TestFilterPartial(): def test_project_update_successful(self, successful_project_update): @@ -63,3 +67,54 @@ class TestProjectUpdateLatestDictDict(): assert failed_pus[0].id == tasks[0]['id'] +class TestInventoryUpdateDict(): + @pytest.fixture + def waiting_inventory_update(self, org): + i = Inventory.objects.create(name='inv1', organization=org) + g = Group.objects.create(name='group1', inventory=i) + #Inventory.groups.add(g) + inv_src = InventorySource.objects.create(group=g) + iu = InventoryUpdate.objects.create(inventory_source=inv_src, status='waiting') + return iu + + @pytest.mark.django_db + class TestFilterPartial(): + def test_simple(self, waiting_inventory_update): + tasks = InventoryUpdateDict.filter_partial(status=['waiting']) + + assert 1 == len(tasks) + assert waiting_inventory_update.id == tasks[0]['id'] + +class TestInventoryUpdateLatestDict(): + @pytest.fixture + def inventory(self, org): + i = Inventory.objects.create(name='inv1', organization=org) + return i + + @pytest.fixture + def inventory_updates(self, inventory): + g1 = Group.objects.create(name='group1', inventory=inventory) + g2 = Group.objects.create(name='group2', inventory=inventory) + g3 = Group.objects.create(name='group3', inventory=inventory) + + inv_src1 = InventorySource.objects.create(group=g1, update_on_launch=True, inventory=inventory) + inv_src2 = InventorySource.objects.create(group=g2, update_on_launch=False, inventory=inventory) + inv_src3 = 
InventorySource.objects.create(group=g3, update_on_launch=True, inventory=inventory) + + iu1 = InventoryUpdate.objects.create(inventory_source=inv_src1, status='successful') + iu2 = InventoryUpdate.objects.create(inventory_source=inv_src2, status='waiting') + iu3 = InventoryUpdate.objects.create(inventory_source=inv_src3, status='waiting') + return [iu1, iu2, iu3] + + @pytest.mark.django_db + def test_filter_partial(self, inventory, inventory_updates): + + tasks = InventoryUpdateLatestDict.filter_partial([inventory.id]) + + inventory_updates_expected = [inventory_updates[0], inventory_updates[2]] + + assert 2 == len(tasks) + for i, inventory_update in enumerate(inventory_updates_expected): + assert inventory_update.id == tasks[i]['id'] + + diff --git a/awx/main/tests/unit/scheduler/conftest.py b/awx/main/tests/unit/scheduler/conftest.py new file mode 100644 index 0000000000..d8a71d456e --- /dev/null +++ b/awx/main/tests/unit/scheduler/conftest.py @@ -0,0 +1,238 @@ + +# Python +import pytest +from datetime import timedelta + +# Django +from django.utils.timezone import now as tz_now + +# awx +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, + InventoryUpdateDict, + InventorySourceDict, +) +from awx.main.scheduler import Scheduler + + +@pytest.fixture +def epoch(): + return tz_now() + +@pytest.fixture +def scheduler_factory(mocker, epoch): + def fn(tasks=[], inventory_sources=[], latest_project_updates=[], latest_inventory_updates=[], create_project_update=None, create_inventory_update=None): + sched = Scheduler() + sched.capacity_total = 999999999 + + sched.graph.get_now = lambda: epoch + + def no_create_inventory_update(task, ignore): + raise RuntimeError("create_inventory_update should not be called") + def no_create_project_update(task): + raise RuntimeError("create_project_update should not be called") + + mocker.patch.object(sched, 'get_tasks', return_value=tasks) + mocker.patch.object(sched, 'get_inventory_source_tasks', 
return_value=inventory_sources) + mocker.patch.object(sched, 'get_latest_project_update_tasks', return_value=latest_project_updates) + mocker.patch.object(sched, 'get_latest_inventory_update_tasks', return_value=latest_inventory_updates) + create_project_update_mock = mocker.patch.object(sched, 'create_project_update', return_value=create_project_update) + create_inventory_update_mock = mocker.patch.object(sched, 'create_inventory_update', return_value=create_inventory_update) + mocker.patch.object(sched, 'start_task') + + if not create_project_update: + create_project_update_mock.side_effect = no_create_project_update + if not create_inventory_update: + create_inventory_update_mock.side_effect = no_create_inventory_update + return sched + return fn + +@pytest.fixture +def project_update_factory(epoch): + def fn(): + return ProjectUpdateDict({ + 'id': 1, + 'created': epoch - timedelta(seconds=100), + 'project_id': 1, + 'project__scm_update_cache_timeout': 0, + 'celery_task_id': '', + 'launch_type': 'dependency', + 'project__scm_update_on_launch': True, + }) + return fn + +@pytest.fixture +def pending_project_update(project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'pending' + return project_update + +@pytest.fixture +def waiting_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'waiting' + return project_update + +@pytest.fixture +def running_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'running' + return project_update + +@pytest.fixture +def successful_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['finished'] = epoch - timedelta(seconds=90) + project_update['status'] = 'successful' + return project_update + +@pytest.fixture +def successful_project_update_cache_expired(epoch, project_update_factory): + 
project_update = project_update_factory() + + project_update['status'] = 'successful' + project_update['created'] = epoch - timedelta(seconds=120) + project_update['finished'] = epoch - timedelta(seconds=110) + project_update['project__scm_update_cache_timeout'] = 1 + return project_update + +@pytest.fixture +def failed_project_update(epoch, project_update_factory): + project_update = project_update_factory() + project_update['finished'] = epoch - timedelta(seconds=90) + project_update['status'] = 'failed' + return project_update + +@pytest.fixture +def inventory_update_factory(epoch): + def fn(): + return InventoryUpdateDict({ + 'id': 1, + 'created': epoch - timedelta(seconds=101), + 'inventory_id': 1, + 'celery_task_id': '', + 'status': 'pending', + 'launch_type': 'dependency', + 'inventory_source_id': 1, + 'inventory_source__inventory_id': 1, + }) + return fn + +@pytest.fixture +def inventory_update_latest_factory(epoch): + def fn(): + return InventoryUpdateDict({ + 'id': 1, + 'created': epoch - timedelta(seconds=101), + 'inventory_id': 1, + 'celery_task_id': '', + 'status': 'pending', + 'launch_type': 'dependency', + 'inventory_source_id': 1, + 'finished': None, + }) + return fn + +@pytest.fixture +def inventory_update_latest(inventory_update_latest_factory): + return inventory_update_latest_factory() + +@pytest.fixture +def successful_inventory_update_latest(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['status'] = 'successful' + iu['finished'] = iu['created'] + timedelta(seconds=10) + return iu + +@pytest.fixture +def failed_inventory_update_latest(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['status'] = 'failed' + return iu + +@pytest.fixture +def pending_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'pending' + return inventory_update + +@pytest.fixture +def waiting_inventory_update(epoch, 
inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'waiting' + return inventory_update + +@pytest.fixture +def failed_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'failed' + return inventory_update + +@pytest.fixture +def running_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['status'] = 'running' + return inventory_update + +@pytest.fixture +def successful_inventory_update(epoch, inventory_update_factory): + inventory_update = inventory_update_factory() + inventory_update['finished'] = epoch - timedelta(seconds=90) + inventory_update['status'] = 'successful' + return inventory_update + +''' +Job +''' +@pytest.fixture +def job_factory(epoch): + def fn(project__scm_update_on_launch=True, inventory__inventory_sources=[]): + return JobDict({ + 'id': 1, + 'status': 'pending', + 'job_template_id': 1, + 'project_id': 1, + 'inventory_id': 1, + 'launch_type': 'manual', + 'allow_simultaneous': False, + 'created': epoch - timedelta(seconds=99), + 'celery_task_id': '', + 'project__scm_update_on_launch': project__scm_update_on_launch, + 'inventory__inventory_sources': inventory__inventory_sources, + 'forks': 5 + }) + return fn + +@pytest.fixture +def pending_job(job_factory): + job = job_factory() + job['status'] = 'pending' + return job + +@pytest.fixture +def running_job(job_factory): + job = job_factory() + job['status'] = 'running' + return job + +''' +Inventory id -> [InventorySourceDict, ...] 
+''' +@pytest.fixture +def inventory_source_factory(): + def fn(id=1): + return InventorySourceDict({ + 'id': id, + }) + return fn + +@pytest.fixture +def inventory_id_sources(inventory_source_factory): + return [ + (1, [ + inventory_source_factory(id=1), + inventory_source_factory(id=2), + ]), + ] + diff --git a/awx/main/tests/unit/scheduler/test_scheduler_inventory_update.py b/awx/main/tests/unit/scheduler/test_scheduler_inventory_update.py new file mode 100644 index 0000000000..09125df527 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_scheduler_inventory_update.py @@ -0,0 +1,85 @@ + +# Python +import pytest +from datetime import timedelta + +@pytest.fixture +def pending_job(job_factory): + return job_factory(project__scm_update_on_launch=False, inventory__inventory_sources=['1']) + +@pytest.fixture +def successful_inventory_update_latest(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['inventory_source__update_cache_timeout'] = 100 + iu['status'] = 'successful' + iu['finished'] = iu['created'] + timedelta(seconds=10) + return iu + +@pytest.fixture +def successful_inventory_update_latest_cache_expired(inventory_update_latest_factory): + iu = inventory_update_latest_factory() + iu['inventory_source__update_cache_timeout'] = 1 + iu['finished'] = iu['created'] + timedelta(seconds=2) + return iu + +class TestStartInventoryUpdate(): + def test_pending(self, scheduler_factory, pending_inventory_update): + scheduler = scheduler_factory(tasks=[pending_inventory_update]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_inventory_update) + +class TestInventoryUpdateBlocked(): + def test_running_inventory_update(self, epoch, scheduler_factory, running_inventory_update, pending_inventory_update): + running_inventory_update['created'] = epoch - timedelta(seconds=100) + pending_inventory_update['created'] = epoch - timedelta(seconds=90) + + scheduler = scheduler_factory(tasks=[running_inventory_update, 
pending_inventory_update]) + + scheduler._schedule() + + def test_waiting_inventory_update(self, epoch, scheduler_factory, waiting_inventory_update, pending_inventory_update): + waiting_inventory_update['created'] = epoch - timedelta(seconds=100) + pending_inventory_update['created'] = epoch - timedelta(seconds=90) + + scheduler = scheduler_factory(tasks=[waiting_inventory_update, pending_inventory_update]) + + scheduler._schedule() + +class TestCreateDependentInventoryUpdate(): + + def test(self, scheduler_factory, pending_job, waiting_inventory_update, inventory_id_sources): + scheduler = scheduler_factory(tasks=[pending_job], + create_inventory_update=waiting_inventory_update, + inventory_sources=inventory_id_sources) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job]) + + def test_cache_hit(self, scheduler_factory, pending_job, successful_inventory_update, successful_inventory_update_latest): + scheduler = scheduler_factory(tasks=[successful_inventory_update, pending_job], + latest_inventory_updates=[successful_inventory_update_latest]) + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_job) + + def test_cache_miss(self, scheduler_factory, pending_job, successful_inventory_update, successful_inventory_update_latest_cache_expired, waiting_inventory_update, inventory_id_sources): + scheduler = scheduler_factory(tasks=[successful_inventory_update, pending_job], + latest_inventory_updates=[successful_inventory_update_latest_cache_expired], + create_inventory_update=waiting_inventory_update, + inventory_sources=inventory_id_sources) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job]) + + def test_last_update_failed(self, scheduler_factory, pending_job, failed_inventory_update, failed_inventory_update_latest, waiting_inventory_update, inventory_id_sources): + scheduler = scheduler_factory(tasks=[failed_inventory_update, 
pending_job], + latest_inventory_updates=[failed_inventory_update_latest], + create_inventory_update=waiting_inventory_update, + inventory_sources=inventory_id_sources) + scheduler._schedule() + + scheduler.start_task.assert_called_with(waiting_inventory_update, [pending_job]) + diff --git a/awx/main/tests/unit/scheduler/test_scheduler_job.py b/awx/main/tests/unit/scheduler/test_scheduler_job.py new file mode 100644 index 0000000000..37af2ead05 --- /dev/null +++ b/awx/main/tests/unit/scheduler/test_scheduler_job.py @@ -0,0 +1,66 @@ + +# Python +import pytest +from datetime import timedelta + +# awx +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, +) + +# TODO: wherever get_latest_rpoject_update_task() is stubbed and returns a +# ProjectUpdateDict. We should instead return a ProjectUpdateLatestDict() +# For now, this is ok since the fields on deviate that much. + +class TestJobBlocked(): + def test_inventory_update_waiting(self, scheduler_factory, waiting_inventory_update, pending_job): + scheduler = scheduler_factory(tasks=[waiting_inventory_update, pending_job]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + + def test_inventory_update_running(self, scheduler_factory, running_inventory_update, pending_job, inventory_source_factory, inventory_id_sources): + scheduler = scheduler_factory(tasks=[running_inventory_update, pending_job], + inventory_sources=inventory_id_sources) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + + def test_project_update_running(self, scheduler_factory, pending_job, running_project_update): + scheduler = scheduler_factory(tasks=[running_project_update, pending_job]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + + def test_project_update_waiting(self, scheduler_factory, pending_job, waiting_project_update): + scheduler = scheduler_factory(tasks=[waiting_project_update, pending_job], 
+ latest_project_updates=[waiting_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + assert scheduler.create_project_update.call_count == 0 + +class TestJob(): + @pytest.fixture + def successful_project_update(self, project_update_factory): + project_update = project_update_factory() + project_update['status'] = 'successful' + project_update['finished'] = project_update['created'] + timedelta(seconds=10) + project_update['project__scm_update_cache_timeout'] = 3600 + return project_update + + def test_existing_dependencies_finished(self, scheduler_factory, successful_project_update, successful_inventory_update_latest, pending_job): + scheduler = scheduler_factory(tasks=[successful_project_update, pending_job], + latest_project_updates=[successful_project_update], + latest_inventory_updates=[successful_inventory_update_latest]) + + scheduler._schedule() + + scheduler.start_task.assert_called_with(pending_job) + diff --git a/awx/main/tests/unit/scheduler/test_scheduler_project_update.py b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py index 54add63d51..e0fcbc3b1e 100644 --- a/awx/main/tests/unit/scheduler/test_scheduler_project_update.py +++ b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py @@ -17,98 +17,6 @@ from awx.main.scheduler import Scheduler # ProjectUpdateDict. We should instead return a ProjectUpdateLatestDict() # For now, this is ok since the fields on deviate that much. 
-@pytest.fixture -def epoch(): - return tz_now() - - -@pytest.fixture -def scheduler_factory(mocker, epoch): - def fn(tasks=[], latest_project_updates=[], create_project_update=None): - sched = Scheduler() - sched.capacity_total = 999999999 - - sched.graph.get_now = lambda: epoch - - mocker.patch.object(sched, 'get_tasks', return_value=tasks) - mocker.patch.object(sched, 'get_latest_project_update_tasks', return_value=latest_project_updates) - mocker.patch.object(sched, 'create_project_update', return_value=create_project_update) - mocker.patch.object(sched, 'start_task') - return sched - return fn - -@pytest.fixture -def project_update_factory(epoch): - def fn(): - return ProjectUpdateDict({ - 'id': 1, - 'created': epoch - timedelta(seconds=100), - 'project_id': 1, - 'project__scm_update_cache_timeout': 0, - 'celery_task_id': '', - 'launch_type': 'dependency', - 'project__scm_update_on_launch': True, - }) - return fn - -@pytest.fixture -def pending_project_update(project_update_factory): - project_update = project_update_factory() - project_update['status'] = 'pending' - return project_update - -@pytest.fixture -def waiting_project_update(epoch, project_update_factory): - project_update = project_update_factory() - project_update['status'] = 'waiting' - return project_update - -@pytest.fixture -def pending_job(epoch): - return JobDict({ - 'id': 1, - 'status': 'pending', - 'job_template_id': 1, - 'project_id': 1, - 'inventory_id': 1, - 'launch_type': 'manual', - 'allow_simultaneous': False, - 'created': epoch - timedelta(seconds=99), - 'celery_task_id': '', - 'project__scm_update_on_launch': True, - 'forks': 5 - }) - -@pytest.fixture -def running_project_update(epoch, project_update_factory): - project_update = project_update_factory() - project_update['status'] = 'running' - return project_update - -@pytest.fixture -def successful_project_update(epoch, project_update_factory): - project_update = project_update_factory() - project_update['finished'] = epoch - 
timedelta(seconds=90) - project_update['status'] = 'successful' - return project_update - -@pytest.fixture -def successful_project_update_cache_expired(epoch, project_update_factory): - project_update = project_update_factory() - - project_update['status'] = 'successful' - project_update['created'] = epoch - timedelta(seconds=120) - project_update['finished'] = epoch - timedelta(seconds=110) - project_update['project__scm_update_cache_timeout'] = 1 - return project_update - -@pytest.fixture -def failed_project_update(epoch, project_update_factory): - project_update = project_update_factory() - project_update['finished'] = epoch - timedelta(seconds=90) - project_update['status'] = 'failed' - return project_update - class TestStartProjectUpdate(): def test(self, scheduler_factory, pending_project_update): scheduler = scheduler_factory(tasks=[pending_project_update]) @@ -164,31 +72,18 @@ class TestCreateDependentProjectUpdate(): scheduler.start_task.assert_called_with(waiting_project_update, [pending_job]) - -class TestJobBlockedOnProjectUpdate(): - def test(self, scheduler_factory, pending_job, waiting_project_update): - scheduler = scheduler_factory(tasks=[waiting_project_update, pending_job], - latest_project_updates=[waiting_project_update]) - - scheduler._schedule() - - scheduler.start_task.assert_not_called() - assert scheduler.create_project_update.call_count == 0 - - def test_project_running(self, scheduler_factory, pending_job, running_project_update): - scheduler = scheduler_factory(tasks=[running_project_update, pending_job]) - - scheduler._schedule() - - scheduler.start_task.assert_not_called() - assert scheduler.create_project_update.call_count == 0 - class TestProjectUpdateBlocked(): - def test(self, scheduler_factory, running_project_update, pending_project_update): - scheduler = scheduler_factory(tasks=[running_project_update, pending_project_update], - latest_project_updates=[running_project_update]) + def test_projct_update_running(self, 
scheduler_factory, running_project_update, pending_project_update): + scheduler = scheduler_factory(tasks=[running_project_update, pending_project_update]) scheduler._schedule() scheduler.start_task.assert_not_called() assert scheduler.create_project_update.call_count == 0 + def test_job_running(self, scheduler_factory, running_job, pending_project_update): + scheduler = scheduler_factory(tasks=[running_job, pending_project_update]) + + scheduler._schedule() + + scheduler.start_task.assert_not_called() + From 5fa5d4b34b3b4a531207e1af88b663f84e933e95 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 24 Oct 2016 14:53:34 -0400 Subject: [PATCH 03/17] support distributed project updates --- awx/main/scheduler/dependency_graph.py | 4 ++-- awx/main/scheduler/partial.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/awx/main/scheduler/dependency_graph.py b/awx/main/scheduler/dependency_graph.py index 14a77ab697..26b7518cf1 100644 --- a/awx/main/scheduler/dependency_graph.py +++ b/awx/main/scheduler/dependency_graph.py @@ -59,7 +59,7 @@ class DependencyGraph(object): return True # TODO: Other finished, failed cases? i.e. error ? - if latest_project_update['status'] == 'failed': + if latest_project_update['status'] in ['failed', 'canceled']: return True ''' @@ -89,7 +89,7 @@ class DependencyGraph(object): return True # TODO: Other finished, failed cases? i.e. error ? 
- if latest_inventory_update['status'] == 'failed': + if latest_inventory_update['status'] in ['failed', 'canceled']: return True ''' diff --git a/awx/main/scheduler/partial.py b/awx/main/scheduler/partial.py index e3677dab2c..a1870ccf4f 100644 --- a/awx/main/scheduler/partial.py +++ b/awx/main/scheduler/partial.py @@ -82,6 +82,14 @@ class ProjectUpdateDict(PartialModelDict): def task_impact(self): return 10 + @classmethod + def filter_partial(cls, status=[]): + kv = { + 'status__in': status, + 'job_type': 'check', + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + class ProjectUpdateLatestDict(ProjectUpdateDict): FIELDS = ( 'id', 'status', 'project_id', 'created', 'finished', From 46faeffbb3888c9fcbca9c7a677fe4bb7391a8ea Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 24 Oct 2016 15:32:43 -0400 Subject: [PATCH 04/17] added task manager system job support --- awx/main/scheduler/__init__.py | 23 +++++++++-------------- awx/main/scheduler/dependency_graph.py | 23 ++++++++++++++++++++--- awx/main/scheduler/partial.py | 21 +++++++++++++++++++++ 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index e704b3ef8a..67c0c81a6a 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -24,6 +24,7 @@ from awx.main.scheduler.partial import ( InventoryUpdateDict, InventoryUpdateLatestDict, InventorySourceDict, + SystemJobDict, ) # Celery @@ -37,16 +38,16 @@ class Scheduler(): self.capacity_total = 200 self.capacity_used = 0 - def _get_tasks_with_status(self, status_list): + def get_tasks(self): + status_list = ('pending', 'waiting', 'running') - graph_jobs = JobDict.filter_partial(status=status_list) + jobs = JobDict.filter_partial(status=status_list) ''' graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(**kv)] - graph_inventory_updates = [iu for iu in - InventoryUpdate.objects.filter(**kv)] ''' - 
graph_inventory_updates = InventoryUpdateDict.filter_partial(status=status_list) - graph_project_updates = ProjectUpdateDict.filter_partial(status=status_list) + inventory_updates = InventoryUpdateDict.filter_partial(status=status_list) + project_updates = ProjectUpdateDict.filter_partial(status=status_list) + system_jobs = SystemJobDict.filter_partial(status=status_list) ''' graph_system_jobs = [sj for sj in SystemJob.objects.filter(**kv)] @@ -57,14 +58,10 @@ class Scheduler(): graph_workflow_jobs, key=lambda task: task.created) ''' - all_actions = sorted(graph_jobs + graph_project_updates + graph_inventory_updates, + all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs, key=lambda task: task['created']) return all_actions - def get_tasks(self): - RELEVANT_JOBS = ('pending', 'waiting', 'running') - return self._get_tasks_with_status(RELEVANT_JOBS) - # TODO: Consider a database query for this logic def get_latest_project_update_tasks(self, all_sorted_tasks): project_ids = Set() @@ -239,9 +236,7 @@ class Scheduler(): map(lambda task: self.graph.add_latest_inventory_update(task), latest_inventory_updates) def process_inventory_sources(self, inventory_id_sources): - #map(lambda inventory_id, inventory_sources: self.graph.add_inventory_sources(inventory_id, inventory_sources), inventory_id_sources) - for inventory_id, inventory_sources in inventory_id_sources: - self.graph.add_inventory_sources(inventory_id, inventory_sources) + map(lambda inventory_id, inventory_sources: self.graph.add_inventory_sources(inventory_id, inventory_sources), inventory_id_sources) def process_dependencies(self, dependent_task, dependency_tasks): for task in dependency_tasks: diff --git a/awx/main/scheduler/dependency_graph.py b/awx/main/scheduler/dependency_graph.py index 26b7518cf1..e71cd6f0a7 100644 --- a/awx/main/scheduler/dependency_graph.py +++ b/awx/main/scheduler/dependency_graph.py @@ -1,11 +1,17 @@ from datetime import timedelta from django.utils.timezone 
import now as tz_now -from awx.main.scheduler.partial import JobDict, ProjectUpdateDict, InventoryUpdateDict +from awx.main.scheduler.partial import ( + JobDict, + ProjectUpdateDict, + InventoryUpdateDict, + SystemJobDict, +) class DependencyGraph(object): PROJECT_UPDATES = 'project_updates' INVENTORY_UPDATES = 'inventory_updates' JOB_TEMPLATE_JOBS = 'job_template_jobs' + SYSTEM_JOB = 'system_job' INVENTORY_SOURCE_UPDATES = 'inventory_source_updates' LATEST_PROJECT_UPDATES = 'latest_project_updates' @@ -23,6 +29,8 @@ class DependencyGraph(object): self.data[self.JOB_TEMPLATE_JOBS] = {} # inventory_source_id -> True / False self.data[self.INVENTORY_SOURCE_UPDATES] = {} + # True / False + self.data[self.SYSTEM_JOB] = True # project_id -> latest ProjectUpdateLatestDict self.data[self.LATEST_PROJECT_UPDATES] = {} @@ -112,6 +120,9 @@ class DependencyGraph(object): return False + def mark_system_job(self): + self.data[self.SYSTEM_JOB] = False + def mark_project_update(self, job): self.data[self.PROJECT_UPDATES][job['project_id']] = False @@ -141,6 +152,9 @@ class DependencyGraph(object): return True return False + def can_system_job_run(self): + return self.data[self.SYSTEM_JOB] + def is_job_blocked(self, job): if type(job) is ProjectUpdateDict: return not self.can_project_update_run(job) @@ -148,6 +162,8 @@ class DependencyGraph(object): return not self.can_inventory_update_run(job['inventory_source_id']) elif type(job) is JobDict: return not self.can_job_run(job) + elif type(job) is SystemJobDict: + return not self.can_system_job_run() def add_job(self, job): if type(job) is ProjectUpdateDict: @@ -157,8 +173,9 @@ class DependencyGraph(object): self.mark_inventory_source_update(job['inventory_source_id']) elif type(job) is JobDict: self.mark_job_template_job(job) + elif type(job) is SystemJobDict: + self.mark_system_job() def add_jobs(self, jobs): - for j in jobs: - self.add_job(j) + map(lambda j: self.add_job(j), jobs) diff --git a/awx/main/scheduler/partial.py 
b/awx/main/scheduler/partial.py index a1870ccf4f..dddbf763e7 100644 --- a/awx/main/scheduler/partial.py +++ b/awx/main/scheduler/partial.py @@ -5,6 +5,7 @@ from awx.main.models import ( ProjectUpdate, InventoryUpdate, InventorySource, + SystemJob, ) class PartialModelDict(object): @@ -171,3 +172,23 @@ class InventorySourceDict(PartialModelDict): 'update_on_launch': True, } return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + +class SystemJobDict(PartialModelDict): + FIELDS = ( + 'id', 'created', 'status', + ) + model = SystemJob + + def get_job_type_str(self): + return 'system_job' + + def task_impact(self): + return 20 + + @classmethod + def filter_partial(cls, status=[]): + kv = { + 'status__in': status + } + return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] + From 9802b1f379f8b3b303370467bd908988f4389e61 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 25 Oct 2016 10:11:11 -0400 Subject: [PATCH 05/17] AdHocCommand support added to task manager --- awx/main/scheduler/__init__.py | 11 ++++------- awx/main/scheduler/dependency_graph.py | 14 +++++++++++--- awx/main/scheduler/partial.py | 13 +++++++++++++ awx/main/tasks.py | 2 +- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 67c0c81a6a..ca79fb8aca 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -25,6 +25,7 @@ from awx.main.scheduler.partial import ( InventoryUpdateLatestDict, InventorySourceDict, SystemJobDict, + AdHocCommandDict, ) # Celery @@ -42,15 +43,11 @@ class Scheduler(): status_list = ('pending', 'waiting', 'running') jobs = JobDict.filter_partial(status=status_list) - ''' - graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(**kv)] - ''' inventory_updates = InventoryUpdateDict.filter_partial(status=status_list) project_updates = ProjectUpdateDict.filter_partial(status=status_list) system_jobs = 
SystemJobDict.filter_partial(status=status_list) + ad_hoc_commands = AdHocCommandDict.filter_partial(status=status_list) ''' - graph_system_jobs = [sj for sj in - SystemJob.objects.filter(**kv)] graph_workflow_jobs = [wf for wf in WorkflowJob.objects.filter(**kv)] all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates + @@ -58,7 +55,7 @@ class Scheduler(): graph_workflow_jobs, key=lambda task: task.created) ''' - all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs, + all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands, key=lambda task: task['created']) return all_actions @@ -236,7 +233,7 @@ class Scheduler(): map(lambda task: self.graph.add_latest_inventory_update(task), latest_inventory_updates) def process_inventory_sources(self, inventory_id_sources): - map(lambda inventory_id, inventory_sources: self.graph.add_inventory_sources(inventory_id, inventory_sources), inventory_id_sources) + map(lambda (inventory_id, inventory_sources): self.graph.add_inventory_sources(inventory_id, inventory_sources), inventory_id_sources) def process_dependencies(self, dependent_task, dependency_tasks): for task in dependency_tasks: diff --git a/awx/main/scheduler/dependency_graph.py b/awx/main/scheduler/dependency_graph.py index e71cd6f0a7..3142699077 100644 --- a/awx/main/scheduler/dependency_graph.py +++ b/awx/main/scheduler/dependency_graph.py @@ -6,6 +6,7 @@ from awx.main.scheduler.partial import ( ProjectUpdateDict, InventoryUpdateDict, SystemJobDict, + AdHocCommandDict, ) class DependencyGraph(object): PROJECT_UPDATES = 'project_updates' @@ -140,8 +141,8 @@ class DependencyGraph(object): def can_project_update_run(self, job): return self.data[self.PROJECT_UPDATES].get(job['project_id'], True) - def can_inventory_update_run(self, inventory_source_id): - return self.data[self.INVENTORY_SOURCE_UPDATES].get(inventory_source_id, True) + def can_inventory_update_run(self, job): + return 
self.data[self.INVENTORY_SOURCE_UPDATES].get(job['inventory_source_id'], True) def can_job_run(self, job): if self.can_project_update_run(job) is True and \ @@ -155,15 +156,20 @@ class DependencyGraph(object): def can_system_job_run(self): return self.data[self.SYSTEM_JOB] + def can_ad_hoc_command_run(self, job): + return self.data[self.INVENTORY_UPDATES].get(job['inventory_id'], True) + def is_job_blocked(self, job): if type(job) is ProjectUpdateDict: return not self.can_project_update_run(job) elif type(job) is InventoryUpdateDict: - return not self.can_inventory_update_run(job['inventory_source_id']) + return not self.can_inventory_update_run(job) elif type(job) is JobDict: return not self.can_job_run(job) elif type(job) is SystemJobDict: return not self.can_system_job_run() + elif type(job) is AdHocCommandDict: + return not self.can_ad_hoc_command_run(job) def add_job(self, job): if type(job) is ProjectUpdateDict: @@ -175,6 +181,8 @@ class DependencyGraph(object): self.mark_job_template_job(job) elif type(job) is SystemJobDict: self.mark_system_job() + elif type(job) is AdHocCommandDict: + self.mark_inventory_update(job['inventory_id']) def add_jobs(self, jobs): map(lambda j: self.add_job(j), jobs) diff --git a/awx/main/scheduler/partial.py b/awx/main/scheduler/partial.py index dddbf763e7..a92c5c7bd6 100644 --- a/awx/main/scheduler/partial.py +++ b/awx/main/scheduler/partial.py @@ -6,6 +6,7 @@ from awx.main.models import ( InventoryUpdate, InventorySource, SystemJob, + AdHocCommand, ) class PartialModelDict(object): @@ -192,3 +193,15 @@ class SystemJobDict(PartialModelDict): } return [cls(o) for o in cls.model.objects.filter(**kv).values(*cls.get_db_values())] +class AdHocCommandDict(PartialModelDict): + FIELDS = ( + 'id', 'created', 'status', 'inventory_id', + ) + model = AdHocCommand + + def get_job_type_str(self): + return 'ad_hoc_command' + + def task_impact(self): + return 20 + diff --git a/awx/main/tasks.py b/awx/main/tasks.py index b76bcb48a6..fac9dca68f 
100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -1756,7 +1756,7 @@ class RunAdHocCommand(BaseTask): ''' Hook for actions to run after ad hoc command has completed. ''' - super(RunAdHocCommand, self).post_run_hook(ad_hoc_command, **kwargs) + super(RunAdHocCommand, self).post_run_hook(ad_hoc_command, status, **kwargs) class RunSystemJob(BaseTask): From fd8c641fa558a122c4ece500ae45cb88052a03e1 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 25 Oct 2016 13:37:31 -0400 Subject: [PATCH 06/17] flake8 fixes --- awx/main/models/unified_jobs.py | 1 - awx/main/tests/unit/scheduler/conftest.py | 1 + .../tests/unit/scheduler/test_scheduler_job.py | 10 ---------- .../scheduler/test_scheduler_project_update.py | 14 -------------- 4 files changed, 1 insertion(+), 25 deletions(-) diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 19bc265c18..b8657431ab 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -843,7 +843,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique ''' Start the task running via Celery. 
''' - task_class = self._get_task_class() (res, opts) = self.pre_start(**kwargs) if res: self.start_celery_task(opts, error_callback, success_callback) diff --git a/awx/main/tests/unit/scheduler/conftest.py b/awx/main/tests/unit/scheduler/conftest.py index d8a71d456e..2fd84474f7 100644 --- a/awx/main/tests/unit/scheduler/conftest.py +++ b/awx/main/tests/unit/scheduler/conftest.py @@ -30,6 +30,7 @@ def scheduler_factory(mocker, epoch): def no_create_inventory_update(task, ignore): raise RuntimeError("create_inventory_update should not be called") + def no_create_project_update(task): raise RuntimeError("create_project_update should not be called") diff --git a/awx/main/tests/unit/scheduler/test_scheduler_job.py b/awx/main/tests/unit/scheduler/test_scheduler_job.py index 37af2ead05..735ce04d95 100644 --- a/awx/main/tests/unit/scheduler/test_scheduler_job.py +++ b/awx/main/tests/unit/scheduler/test_scheduler_job.py @@ -3,16 +3,6 @@ import pytest from datetime import timedelta -# awx -from awx.main.scheduler.partial import ( - JobDict, - ProjectUpdateDict, -) - -# TODO: wherever get_latest_rpoject_update_task() is stubbed and returns a -# ProjectUpdateDict. We should instead return a ProjectUpdateLatestDict() -# For now, this is ok since the fields on deviate that much. 
- class TestJobBlocked(): def test_inventory_update_waiting(self, scheduler_factory, waiting_inventory_update, pending_job): scheduler = scheduler_factory(tasks=[waiting_inventory_update, pending_job]) diff --git a/awx/main/tests/unit/scheduler/test_scheduler_project_update.py b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py index e0fcbc3b1e..8122d93c09 100644 --- a/awx/main/tests/unit/scheduler/test_scheduler_project_update.py +++ b/awx/main/tests/unit/scheduler/test_scheduler_project_update.py @@ -1,18 +1,4 @@ -# Python -import pytest -from datetime import timedelta - -# Django -from django.utils.timezone import now as tz_now - -# awx -from awx.main.scheduler.partial import ( - JobDict, - ProjectUpdateDict, -) -from awx.main.scheduler import Scheduler - # TODO: wherever get_latest_rpoject_update_task() is stubbed and returns a # ProjectUpdateDict. We should instead return a ProjectUpdateLatestDict() # For now, this is ok since the fields on deviate that much. From 4ef4b4709b25a43809b57073aac3caf9c11485e3 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Wed, 26 Oct 2016 14:34:13 -0400 Subject: [PATCH 07/17] workflow execution added --- awx/main/models/workflow.py | 17 ++--- awx/main/scheduler/__init__.py | 89 +++++++++-------------- awx/main/scheduler/dependency_graph.py | 14 ++++ awx/main/scheduler/partial.py | 13 ++++ awx/main/tests/unit/scheduler/conftest.py | 1 + requirements/requirements.txt | 1 + 6 files changed, 68 insertions(+), 67 deletions(-) diff --git a/awx/main/models/workflow.py b/awx/main/models/workflow.py index a4f02deef2..318d32ff48 100644 --- a/awx/main/models/workflow.py +++ b/awx/main/models/workflow.py @@ -393,11 +393,6 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow def _get_parent_field_name(cls): return 'workflow_job_template' - @classmethod - def _get_task_class(cls): - from awx.main.tasks import RunWorkflowJob - return RunWorkflowJob - def _has_failed(self): return False @@ -426,11 
+421,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow def get_notification_friendly_name(self): return "Workflow Job" - def start(self, *args, **kwargs): - (res, opts) = self.pre_start(**kwargs) - if res: - self.status = 'running' - self.save() - self.websocket_emit_status("running") - return res + ''' + A WorkflowJob is a virtual job. It doesn't result in a celery task. + ''' + def start_celery_task(self, opts, error_callback, success_callback): + return None diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index ca79fb8aca..cf5fbecddc 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -26,6 +26,7 @@ from awx.main.scheduler.partial import ( InventorySourceDict, SystemJobDict, AdHocCommandDict, + WorkflowJobDict, ) # Celery @@ -47,15 +48,9 @@ class Scheduler(): project_updates = ProjectUpdateDict.filter_partial(status=status_list) system_jobs = SystemJobDict.filter_partial(status=status_list) ad_hoc_commands = AdHocCommandDict.filter_partial(status=status_list) - ''' - graph_workflow_jobs = [wf for wf in - WorkflowJob.objects.filter(**kv)] - all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates + - graph_project_updates + graph_system_jobs + - graph_workflow_jobs, - key=lambda task: task.created) - ''' - all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands, + workflow_jobs = WorkflowJobDict.filter_partial(status=status_list) + + all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands + workflow_jobs, key=lambda task: task['created']) return all_actions @@ -111,7 +106,7 @@ class Scheduler(): job.status = 'failed' job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials" job.save(update_fields=['status', 'job_explanation']) - job.websocket_emit_status("failed") + connection.on_commit(lambda: 
job.websocket_emit_status('failed')) # TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ? #emit_websocket_notification('/socket.io/jobs', '', dict(id=)) @@ -122,12 +117,9 @@ class Scheduler(): dag = WorkflowDAG(workflow_job) if dag.is_workflow_done(): # TODO: detect if wfj failed - if workflow_job._has_failed(): - workflow_job.status = 'failed' - else: - workflow_job.status = 'successful' + workflow_job.status = 'completed' workflow_job.save() - workflow_job.websocket_emit_status(workflow_job.status) + connection.on_commit(lambda: workflow_job.websocket_emit_status(workflow_job.status)) def get_activate_tasks(self): inspector = inspect() @@ -153,6 +145,8 @@ class Scheduler(): def start_task(self, task, dependent_tasks=[]): from awx.main.tasks import handle_work_error, handle_work_success + status_changed = False + # TODO: spawn inventory and project updates task_actual = { 'type':task.get_job_type_str(), @@ -164,21 +158,36 @@ class Scheduler(): success_handler = handle_work_success.s(task_actual=task_actual) job_obj = task.get_full() - job_obj.status = 'waiting' + if job_obj.status == 'pending': + status_changed = True + job_obj.status = 'waiting' (start_status, opts) = job_obj.pre_start() if not start_status: + status_changed = True job_obj.status = 'failed' if job_obj.job_explanation: job_obj.job_explanation += ' ' job_obj.job_explanation += 'Task failed pre-start check.' 
job_obj.save() # TODO: run error handler to fail sub-tasks and send notifications - return + else: + if type(job_obj) is WorkflowJob: + job_obj.status = 'running' + status_changed = True - self.consume_capacity(task) + if status_changed is True: + job_obj.save() - connection.on_commit(lambda: job_obj.start_celery_task(opts, error_callback=error_handler, success_callback=success_handler)) + self.consume_capacity(task) + + def post_commit(): + if status_changed: + job_obj.websocket_emit_status(job_obj.status) + if job_obj.status != 'failed': + job_obj.start_celery_task(opts, error_callback=error_handler, success_callback=success_handler) + + connection.on_commit(post_commit) def process_runnable_tasks(self, runnable_tasks): for i, task in enumerate(runnable_tasks): @@ -281,7 +290,7 @@ class Scheduler(): 'Celery, so it has been marked as failed.', )) task_obj.save() - task_obj.websocket_emit_status("failed") + connection.on_commit(lambda: task_obj.websocket_emit_status('failed')) all_sorted_tasks.pop(i) logger.error("Task %s appears orphaned... 
marking as failed" % task) @@ -323,28 +332,6 @@ class Scheduler(): pending_tasks = filter(lambda t: t['status'] == 'pending', all_sorted_tasks) self.process_pending_tasks(pending_tasks) - - ''' - def do_graph_things(): - # Rebuild graph - graph = SimpleDAG() - for task in running_tasks: - graph.add_node(task) - #for wait_task in waiting_tasks[:50]: - for wait_task in waiting_tasks: - node_dependencies = [] - for node in graph: - if wait_task.is_blocked_by(node['node_object']): - node_dependencies.append(node['node_object']) - graph.add_node(wait_task) - for dependency in node_dependencies: - graph.add_edge(wait_task, dependency) - if settings.DEBUG: - graph.generate_graphviz_plot() - return graph - ''' - #return do_graph_things() - def _schedule(self): all_sorted_tasks = self.get_tasks() if len(all_sorted_tasks) > 0: @@ -359,23 +346,21 @@ class Scheduler(): inventory_id_sources = self.get_inventory_source_tasks(all_sorted_tasks) self.process_inventory_sources(inventory_id_sources) - self.process_tasks(all_sorted_tasks) + running_workflow_tasks = self.get_running_workflow_jobs() + self.process_finished_workflow_jobs(running_workflow_tasks) - #print("Finished schedule()") + self.spawn_workflow_graph_jobs(running_workflow_tasks) + + self.process_tasks(all_sorted_tasks) def schedule(self): with transaction.atomic(): - #t1 = datetime.now() # Lock try: Instance.objects.select_for_update(nowait=True).all()[0] except DatabaseError: return - #workflow_jobs = get_running_workflow_jobs() - #process_finished_workflow_jobs(workflow_jobs) - #spawn_workflow_graph_jobs(workflow_jobs) - ''' Get tasks known by celery ''' @@ -387,10 +372,4 @@ class Scheduler(): ''' self._schedule() - # Unlock, due to transaction ending - #t2 = datetime.now() - #t_diff = t2 - t1 - #print("schedule() time %s" % (t_diff.total_seconds())) - - diff --git a/awx/main/scheduler/dependency_graph.py b/awx/main/scheduler/dependency_graph.py index 3142699077..edd49c98a9 100644 --- 
a/awx/main/scheduler/dependency_graph.py +++ b/awx/main/scheduler/dependency_graph.py @@ -7,6 +7,7 @@ from awx.main.scheduler.partial import ( InventoryUpdateDict, SystemJobDict, AdHocCommandDict, + WorkflowJobDict, ) class DependencyGraph(object): PROJECT_UPDATES = 'project_updates' @@ -14,6 +15,7 @@ class DependencyGraph(object): JOB_TEMPLATE_JOBS = 'job_template_jobs' SYSTEM_JOB = 'system_job' INVENTORY_SOURCE_UPDATES = 'inventory_source_updates' + WORKFLOW_JOB_TEMPLATES_JOBS = 'workflow_job_template_jobs' LATEST_PROJECT_UPDATES = 'latest_project_updates' LATEST_INVENTORY_UPDATES = 'latest_inventory_updates' @@ -32,6 +34,8 @@ class DependencyGraph(object): self.data[self.INVENTORY_SOURCE_UPDATES] = {} # True / False self.data[self.SYSTEM_JOB] = True + # workflow_job_template_id -> True / False + self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS] = {} # project_id -> latest ProjectUpdateLatestDict self.data[self.LATEST_PROJECT_UPDATES] = {} @@ -138,6 +142,9 @@ class DependencyGraph(object): self.data[self.PROJECT_UPDATES][job['project_id']] = False self.data[self.JOB_TEMPLATE_JOBS][job['job_template_id']] = False + def mark_workflow_job(self, job): + self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS][job['workflow_job_template_id']] = False + def can_project_update_run(self, job): return self.data[self.PROJECT_UPDATES].get(job['project_id'], True) @@ -153,6 +160,9 @@ class DependencyGraph(object): return True return False + def can_workflow_job_run(self, job): + return self.data[self.WORKFLOW_JOB_TEMPLATES_JOBS].get(job['workflow_job_template_id'], True) + def can_system_job_run(self): return self.data[self.SYSTEM_JOB] @@ -170,6 +180,8 @@ class DependencyGraph(object): return not self.can_system_job_run() elif type(job) is AdHocCommandDict: return not self.can_ad_hoc_command_run(job) + elif type(job) is WorkflowJobDict: + return not self.can_workflow_job_run(job) def add_job(self, job): if type(job) is ProjectUpdateDict: @@ -179,6 +191,8 @@ class DependencyGraph(object): 
self.mark_inventory_source_update(job['inventory_source_id']) elif type(job) is JobDict: self.mark_job_template_job(job) + elif type(job) is WorkflowJobDict: + self.mark_workflow_job(job) elif type(job) is SystemJobDict: self.mark_system_job() elif type(job) is AdHocCommandDict: diff --git a/awx/main/scheduler/partial.py b/awx/main/scheduler/partial.py index a92c5c7bd6..576b66a9c3 100644 --- a/awx/main/scheduler/partial.py +++ b/awx/main/scheduler/partial.py @@ -7,6 +7,7 @@ from awx.main.models import ( InventorySource, SystemJob, AdHocCommand, + WorkflowJob, ) class PartialModelDict(object): @@ -205,3 +206,15 @@ class AdHocCommandDict(PartialModelDict): def task_impact(self): return 20 +class WorkflowJobDict(PartialModelDict): + FIELDS = ( + 'id', 'created', 'status', 'workflow_job_template_id', + ) + model = WorkflowJob + + def get_job_type_str(self): + return 'workflow_job' + + def task_impact(self): + return 10 + diff --git a/awx/main/tests/unit/scheduler/conftest.py b/awx/main/tests/unit/scheduler/conftest.py index 2fd84474f7..cec68b1ef7 100644 --- a/awx/main/tests/unit/scheduler/conftest.py +++ b/awx/main/tests/unit/scheduler/conftest.py @@ -35,6 +35,7 @@ def scheduler_factory(mocker, epoch): raise RuntimeError("create_project_update should not be called") mocker.patch.object(sched, 'get_tasks', return_value=tasks) + mocker.patch.object(sched, 'get_running_workflow_jobs', return_value=[]) mocker.patch.object(sched, 'get_inventory_source_tasks', return_value=inventory_sources) mocker.patch.object(sched, 'get_latest_project_update_tasks', return_value=latest_project_updates) mocker.patch.object(sched, 'get_latest_inventory_update_tasks', return_value=latest_inventory_updates) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index fb885a8842..5f2448d9e6 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -25,6 +25,7 @@ django-radius==1.0.0 djangorestframework==3.3.2 djangorestframework-yaml==1.0.2 
django-split-settings==0.1.1 +django-transaction-hooks==0.2 django-taggit==0.17.6 git+https://github.com/matburt/dm.xmlsec.binding.git@master#egg=dm.xmlsec.binding dogpile.core==0.4.1 From 454b3edb7c96995942716935e43005c25a20abca Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Thu, 27 Oct 2016 16:31:47 -0400 Subject: [PATCH 08/17] rectify celery<->db inconsistent running job --- awx/main/scheduler/__init__.py | 51 +++++++++++++++++----------------- awx/main/scheduler/tasks.py | 10 +++---- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index cf5fbecddc..21fe954546 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -54,6 +54,22 @@ class Scheduler(): key=lambda task: task['created']) return all_actions + ''' + Tasks that are running and SHOULD have a celery task. + ''' + def get_running_tasks(self): + status_list = ('running',) + + jobs = JobDict.filter_partial(status=status_list) + inventory_updates = InventoryUpdateDict.filter_partial(status=status_list) + project_updates = ProjectUpdateDict.filter_partial(status=status_list) + system_jobs = SystemJobDict.filter_partial(status=status_list) + ad_hoc_commands = AdHocCommandDict.filter_partial(status=status_list) + + all_actions = sorted(jobs + project_updates + inventory_updates + system_jobs + ad_hoc_commands, + key=lambda task: task['created']) + return all_actions + # TODO: Consider a database query for this logic def get_latest_project_update_tasks(self, all_sorted_tasks): project_ids = Set() @@ -121,7 +137,7 @@ class Scheduler(): workflow_job.save() connection.on_commit(lambda: workflow_job.websocket_emit_status(workflow_job.status)) - def get_activate_tasks(self): + def get_active_tasks(self): inspector = inspect() if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'): active_task_queues = inspector.active() @@ -129,10 +145,10 @@ class Scheduler(): logger.warn("Ignoring celery task inspector") 
active_task_queues = None - active_tasks = [] + active_tasks = set() if active_task_queues is not None: for queue in active_task_queues: - active_tasks += [at['id'] for at in active_task_queues[queue]] + map(lambda at: active_tasks.add(at['id']), active_task_queues[queue]) else: logger.error("Could not communicate with celery!") # TODO: Something needs to be done here to signal to the system @@ -274,10 +290,11 @@ class Scheduler(): if self.get_remaining_capacity() <= 0: return - def fail_inconsistent_running_jobs(self, active_tasks, all_sorted_tasks): - for i, task in enumerate(all_sorted_tasks): - if task['status'] != 'running': - continue + def process_celery_tasks(self, active_tasks, all_running_sorted_tasks): + ''' + Rectify tower db <-> celery inconsistent view of jobs state + ''' + for task in all_running_sorted_tasks: if (task['celery_task_id'] not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')): # NOTE: Pull status again and make sure it didn't finish in @@ -290,20 +307,11 @@ class Scheduler(): 'Celery, so it has been marked as failed.', )) task_obj.save() + print("Going to fail %s" % task_obj.id) connection.on_commit(lambda: task_obj.websocket_emit_status('failed')) - all_sorted_tasks.pop(i) logger.error("Task %s appears orphaned... 
marking as failed" % task) - def process_celery_tasks(self, active_tasks, all_sorted_tasks): - - ''' - Rectify tower db <-> celery inconsistent view of jobs state - ''' - # Check running tasks and make sure they are active in celery - logger.debug("Active celery tasks: " + str(active_tasks)) - all_sorted_tasks = self.fail_inconsistent_running_jobs(active_tasks, - all_sorted_tasks) def calculate_capacity_used(self, tasks): self.capacity_used = 0 @@ -361,15 +369,6 @@ class Scheduler(): except DatabaseError: return - ''' - Get tasks known by celery - ''' - ''' - active_tasks = self.get_activate_tasks() - # Communication with celery failed :(, return - if active_tasks is None: - return None - ''' self._schedule() diff --git a/awx/main/scheduler/tasks.py b/awx/main/scheduler/tasks.py index ba1ddaeecc..2b35b5ab64 100644 --- a/awx/main/scheduler/tasks.py +++ b/awx/main/scheduler/tasks.py @@ -33,19 +33,19 @@ def run_scheduler(): @task def run_fail_inconsistent_running_jobs(): - return - print("run_fail_inconsistent_running_jobs() running") with transaction.atomic(): # Lock try: Instance.objects.select_for_update(nowait=True).all()[0] scheduler = Scheduler() - active_tasks = scheduler.get_activate_tasks() + active_tasks = scheduler.get_active_tasks() + if active_tasks is None: + # TODO: Failed to contact celery. We should surface this. 
return None - all_sorted_tasks = scheduler.get_tasks() - scheduler.process_celery_tasks(active_tasks, all_sorted_tasks) + all_running_sorted_tasks = scheduler.get_running_tasks() + scheduler.process_celery_tasks(active_tasks, all_running_sorted_tasks) except DatabaseError: return From 03a484a6a6c52233cfe58b53ba3990d23842c1f2 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 31 Oct 2016 08:01:02 -0500 Subject: [PATCH 09/17] remove old task manager code --- awx/main/models/ad_hoc_commands.py | 38 ---------------- awx/main/models/inventory.py | 10 ----- awx/main/models/jobs.py | 61 +------------------------- awx/main/models/projects.py | 10 ----- awx/main/models/unified_jobs.py | 9 ---- awx/main/models/workflow.py | 3 -- awx/main/tests/functional/test_jobs.py | 34 -------------- 7 files changed, 1 insertion(+), 164 deletions(-) diff --git a/awx/main/models/ad_hoc_commands.py b/awx/main/models/ad_hoc_commands.py index 65f40427b0..aadd34c190 100644 --- a/awx/main/models/ad_hoc_commands.py +++ b/awx/main/models/ad_hoc_commands.py @@ -4,7 +4,6 @@ # Python import datetime import hmac -import json import logging from urlparse import urljoin @@ -24,7 +23,6 @@ from jsonfield import JSONField # AWX from awx.main.models.base import * # noqa from awx.main.models.unified_jobs import * # noqa -from awx.main.utils import decrypt_field from awx.main.models.notifications import JobNotificationMixin logger = logging.getLogger('awx.main.models.ad_hoc_commands') @@ -181,13 +179,6 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin): def get_passwords_needed_to_start(self): return self.passwords_needed_to_start - def is_blocked_by(self, obj): - from awx.main.models import InventoryUpdate - if type(obj) == InventoryUpdate: - if self.inventory == obj.inventory_source.inventory: - return True - return False - @property def task_impact(self): # NOTE: We sorta have to assume the host count matches and that forks default to 5 @@ -195,35 +186,6 @@ class AdHocCommand(UnifiedJob, 
JobNotificationMixin): count_hosts = Host.objects.filter( enabled=True, inventory__ad_hoc_commands__pk=self.pk).count() return min(count_hosts, 5 if self.forks == 0 else self.forks) * 10 - def generate_dependencies(self, active_tasks): - from awx.main.models import InventoryUpdate - if not self.inventory: - return [] - inventory_sources = self.inventory.inventory_sources.filter( update_on_launch=True) - inventory_sources_found = [] - dependencies = [] - for obj in active_tasks: - if type(obj) == InventoryUpdate: - if obj.inventory_source in inventory_sources: - inventory_sources_found.append(obj.inventory_source) - # Skip updating any inventory sources that were already updated before - # running this job (via callback inventory refresh). - try: - start_args = json.loads(decrypt_field(self, 'start_args')) - except Exception: - start_args = None - start_args = start_args or {} - inventory_sources_already_updated = start_args.get('inventory_sources_already_updated', []) - if inventory_sources_already_updated: - for source in inventory_sources.filter(pk__in=inventory_sources_already_updated): - if source not in inventory_sources_found: - inventory_sources_found.append(source) - if inventory_sources.count(): # and not has_setup_failures? 
Probably handled as an error scenario in the task runner - for source in inventory_sources: - if source not in inventory_sources_found and source.needs_update_on_launch: - dependencies.append(source.create_inventory_update(launch_type='dependency')) - return dependencies - def copy(self): data = {} for field in ('job_type', 'inventory_id', 'limit', 'credential_id', diff --git a/awx/main/models/inventory.py b/awx/main/models/inventory.py index c77868759e..662b1702f9 100644 --- a/awx/main/models/inventory.py +++ b/awx/main/models/inventory.py @@ -22,7 +22,6 @@ from awx.main.constants import CLOUD_PROVIDERS from awx.main.fields import AutoOneToOneField, ImplicitRoleField from awx.main.managers import HostManager from awx.main.models.base import * # noqa -from awx.main.models.jobs import Job from awx.main.models.unified_jobs import * # noqa from awx.main.models.mixins import ResourceMixin from awx.main.models.notifications import ( @@ -1250,15 +1249,6 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin): def get_ui_url(self): return urljoin(settings.TOWER_URL_BASE, "/#/inventory_sync/{}".format(self.pk)) - def is_blocked_by(self, obj): - if type(obj) == InventoryUpdate: - if self.inventory_source.inventory == obj.inventory_source.inventory: - return True - if type(obj) == Job: - if self.inventory_source.inventory == obj.inventory: - return True - return False - @property def task_impact(self): return 50 diff --git a/awx/main/models/jobs.py b/awx/main/models/jobs.py index 3377968eba..8ed723626b 100644 --- a/awx/main/models/jobs.py +++ b/awx/main/models/jobs.py @@ -33,7 +33,7 @@ from awx.main.models.notifications import ( NotificationTemplate, JobNotificationMixin, ) -from awx.main.utils import decrypt_field, ignore_inventory_computed_fields +from awx.main.utils import ignore_inventory_computed_fields from awx.main.redact import PlainTextCleaner from awx.main.fields import ImplicitRoleField from awx.main.models.mixins import ResourceMixin @@ 
-646,29 +646,6 @@ class Job(UnifiedJob, JobOptions, JobNotificationMixin): kwargs['job_host_summaries__job__pk'] = self.pk return Host.objects.filter(**kwargs) - def is_blocked_by(self, obj): - from awx.main.models import InventoryUpdate, ProjectUpdate - if type(obj) == Job: - if obj.job_template is not None and obj.inventory is not None: - if obj.job_template == self.job_template and \ - obj.inventory == self.inventory: - if self.allow_simultaneous: - return False - if obj.launch_type == 'callback' and self.launch_type == 'callback' and \ - obj.limit != self.limit: - return False - return True - return False - if type(obj) == InventoryUpdate: - if self.inventory == obj.inventory_source.inventory: - return True - return False - if type(obj) == ProjectUpdate: - if obj.project == self.project: - return True - return False - return False - @property def task_impact(self): # NOTE: We sorta have to assume the host count matches and that forks default to 5 @@ -707,39 +684,6 @@ class Job(UnifiedJob, JobOptions, JobNotificationMixin): def processed_hosts(self): return self._get_hosts(job_host_summaries__processed__gt=0) - def generate_dependencies(self, active_tasks): - from awx.main.models import InventoryUpdate, ProjectUpdate - inventory_sources = self.inventory.inventory_sources.filter(update_on_launch=True) - project_found = False - inventory_sources_found = [] - dependencies = [] - for obj in active_tasks: - if type(obj) == ProjectUpdate and self.project is not None: - if obj.project == self.project: - project_found = True - if type(obj) == InventoryUpdate: - if obj.inventory_source in inventory_sources: - inventory_sources_found.append(obj.inventory_source) - # Skip updating any inventory sources that were already updated before - # running this job (via callback inventory refresh). 
- try: - start_args = json.loads(decrypt_field(self, 'start_args')) - except Exception: - start_args = None - start_args = start_args or {} - inventory_sources_already_updated = start_args.get('inventory_sources_already_updated', []) - if inventory_sources_already_updated: - for source in inventory_sources.filter(pk__in=inventory_sources_already_updated): - if source not in inventory_sources_found: - inventory_sources_found.append(source) - if not project_found and self.project is not None and self.project.needs_update_on_launch: - dependencies.append(self.project.create_project_update(launch_type='dependency')) - if inventory_sources.count(): # and not has_setup_failures? Probably handled as an error scenario in the task runner - for source in inventory_sources: - if source not in inventory_sources_found and source.needs_update_on_launch: - dependencies.append(source.create_inventory_update(launch_type='dependency')) - return dependencies - def notification_data(self, block=5): data = super(Job, self).notification_data() all_hosts = {} @@ -1526,9 +1470,6 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin): def get_ui_url(self): return urljoin(settings.TOWER_URL_BASE, "/#/management_jobs/{}".format(self.pk)) - def is_blocked_by(self, obj): - return True - def handle_extra_data(self, extra_data): extra_vars = {} if isinstance(extra_data, dict): diff --git a/awx/main/models/projects.py b/awx/main/models/projects.py index 4c20e01e08..c3763ff34f 100644 --- a/awx/main/models/projects.py +++ b/awx/main/models/projects.py @@ -22,7 +22,6 @@ from django.utils.timezone import now, make_aware, get_default_timezone # AWX from awx.main.models.base import * # noqa -from awx.main.models.jobs import Job from awx.main.models.notifications import ( NotificationTemplate, JobNotificationMixin, @@ -424,15 +423,6 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin): from awx.main.tasks import RunProjectUpdate return RunProjectUpdate - def 
is_blocked_by(self, obj): - if type(obj) == ProjectUpdate: - if self.project == obj.project: - return True - if type(obj) == Job: - if self.project == obj.project: - return True - return False - def websocket_emit_data(self): return dict(project_id=self.project.id) diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index b8657431ab..fa989ad60c 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -778,10 +778,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique def task_impact(self): raise NotImplementedError # Implement in subclass. - def is_blocked_by(self, task_object): - ''' Given another task object determine if this task would be blocked by it ''' - raise NotImplementedError # Implement in subclass. - def websocket_emit_data(self): ''' Return extra data that should be included when submitting data to the browser over the websocket connection ''' return {} @@ -792,11 +788,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique status_data['group_name'] = 'jobs' emit_channel_notification('jobs-status_changed', status_data) - def generate_dependencies(self, active_tasks): - ''' Generate any tasks that the current task might be dependent on given a list of active - tasks that might preclude creating one''' - return [] - def notification_data(self): return dict(id=self.id, name=self.name, diff --git a/awx/main/models/workflow.py b/awx/main/models/workflow.py index 318d32ff48..b267343ea3 100644 --- a/awx/main/models/workflow.py +++ b/awx/main/models/workflow.py @@ -406,9 +406,6 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow #def get_ui_url(self): # return urlparse.urljoin(tower_settings.TOWER_URL_BASE, "/#/workflow_jobs/{}".format(self.pk)) - def is_blocked_by(self, obj): - return True - @property def task_impact(self): return 0 diff --git a/awx/main/tests/functional/test_jobs.py 
b/awx/main/tests/functional/test_jobs.py index 83302e7400..55b5d428c9 100644 --- a/awx/main/tests/functional/test_jobs.py +++ b/awx/main/tests/functional/test_jobs.py @@ -2,40 +2,6 @@ from awx.main.models import Job import pytest -@pytest.mark.django_db -def test_job_blocking(get, post, job_template, inventory, inventory_factory): - j1 = Job.objects.create(job_template=job_template, - inventory=inventory) - j2 = Job.objects.create(job_template=job_template, - inventory=inventory) - assert j1.is_blocked_by(j2) - j2.inventory = inventory_factory(name='test-different-inventory') - assert not j1.is_blocked_by(j2) - j_callback_1 = Job.objects.create(job_template=job_template, - inventory=inventory, - launch_type='callback', - limit='a') - j_callback_2 = Job.objects.create(job_template=job_template, - inventory=inventory, - launch_type='callback', - limit='a') - assert j_callback_1.is_blocked_by(j_callback_2) - j_callback_2.limit = 'b' - assert not j_callback_1.is_blocked_by(j_callback_2) - -@pytest.mark.django_db -def test_job_blocking_allow_simul(get, post, job_template, inventory): - job_template.allow_simultaneous = True - j1 = Job.objects.create(job_template=job_template, - inventory=inventory) - j2 = Job.objects.create(job_template=job_template, - inventory=inventory) - assert not j1.is_blocked_by(j2) - assert not j2.is_blocked_by(j1) - job_template.allow_simultaneous = False - assert j1.is_blocked_by(j2) - assert j2.is_blocked_by(j1) - @pytest.mark.django_db def test_orphan_unified_job_creation(instance, inventory): job = Job.objects.create(job_template=None, inventory=inventory, name='hi world') From 0f98e1edec2805682de25a8133e2a6159489f14c Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 31 Oct 2016 09:26:27 -0500 Subject: [PATCH 10/17] remove todo's --- awx/main/scheduler/__init__.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 21fe954546..99bc87917c 100644 --- 
a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -108,7 +108,6 @@ class Scheduler(): return results def spawn_workflow_graph_jobs(self, workflow_jobs): - # TODO: Consider using transaction.atomic for workflow_job in workflow_jobs: dag = WorkflowDAG(workflow_job) spawn_nodes = dag.bfs_nodes_to_run() @@ -150,9 +149,6 @@ class Scheduler(): for queue in active_task_queues: map(lambda at: active_tasks.add(at['id']), active_task_queues[queue]) else: - logger.error("Could not communicate with celery!") - # TODO: Something needs to be done here to signal to the system - # as a whole that celery appears to be down. if not hasattr(settings, 'CELERY_UNIT_TEST'): return None @@ -163,7 +159,6 @@ class Scheduler(): status_changed = False - # TODO: spawn inventory and project updates task_actual = { 'type':task.get_job_type_str(), 'id': task['id'], @@ -214,7 +209,6 @@ class Scheduler(): def create_project_update(self, task): dep = Project.objects.get(id=task['project_id']).create_project_update(launch_type='dependency') - # TODO: Consider using milliseconds or microseconds # Project created 1 seconds behind dep.created = task['created'] - timedelta(seconds=1) dep.status = 'waiting' @@ -329,7 +323,6 @@ class Scheduler(): def process_tasks(self, all_sorted_tasks): - # TODO: Process new tasks running_tasks = filter(lambda t: t['status'] == 'running', all_sorted_tasks) runnable_tasks = filter(lambda t: t['status'] in ['waiting', 'running'], all_sorted_tasks) From ed37e68c53f1ad7a037da94e622f6f0dee7850fe Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 31 Oct 2016 14:16:59 -0500 Subject: [PATCH 11/17] run dependencies when capacity is available --- awx/main/scheduler/__init__.py | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 99bc87917c..0f551997bf 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ 
-157,8 +157,6 @@ class Scheduler(): def start_task(self, task, dependent_tasks=[]): from awx.main.tasks import handle_work_error, handle_work_success - status_changed = False - task_actual = { 'type':task.get_job_type_str(), 'id': task['id'], @@ -169,13 +167,10 @@ class Scheduler(): success_handler = handle_work_success.s(task_actual=task_actual) job_obj = task.get_full() - if job_obj.status == 'pending': - status_changed = True - job_obj.status = 'waiting' + job_obj.status = 'waiting' (start_status, opts) = job_obj.pre_start() if not start_status: - status_changed = True job_obj.status = 'failed' if job_obj.job_explanation: job_obj.job_explanation += ' ' @@ -185,33 +180,27 @@ class Scheduler(): else: if type(job_obj) is WorkflowJob: job_obj.status = 'running' - status_changed = True - if status_changed is True: - job_obj.save() + job_obj.save() self.consume_capacity(task) def post_commit(): - if status_changed: - job_obj.websocket_emit_status(job_obj.status) + job_obj.websocket_emit_status(job_obj.status) if job_obj.status != 'failed': job_obj.start_celery_task(opts, error_callback=error_handler, success_callback=success_handler) connection.on_commit(post_commit) def process_runnable_tasks(self, runnable_tasks): - for i, task in enumerate(runnable_tasks): - # TODO: maybe batch process new tasks. 
- # Processing a new task individually seems to be expensive - self.graph.add_job(task) + map(lambda task: self.graph.add_job(task), runnable_tasks) def create_project_update(self, task): dep = Project.objects.get(id=task['project_id']).create_project_update(launch_type='dependency') # Project created 1 seconds behind dep.created = task['created'] - timedelta(seconds=1) - dep.status = 'waiting' + dep.status = 'pending' dep.save() project_task = ProjectUpdateDict.get_partial(dep.id) @@ -222,7 +211,7 @@ class Scheduler(): dep = InventorySource.objects.get(id=inventory_source_task['id']).create_inventory_update(launch_type='dependency') dep.created = task['created'] - timedelta(seconds=2) - dep.status = 'waiting' + dep.status = 'pending' dep.save() inventory_task = InventoryUpdateDict.get_partial(dep.id) @@ -267,6 +256,9 @@ class Scheduler(): def process_pending_tasks(self, pending_tasks): for task in pending_tasks: + # Stop processing tasks if we know we are out of capacity + if self.get_remaining_capacity() <= 0: + return if not self.graph.is_job_blocked(task): dependencies = self.generate_dependencies(task) @@ -280,10 +272,6 @@ class Scheduler(): else: self.graph.add_job(task) - # Stop processing tasks if we know we are out of capacity - if self.get_remaining_capacity() <= 0: - return - def process_celery_tasks(self, active_tasks, all_running_sorted_tasks): ''' Rectify tower db <-> celery inconsistent view of jobs state @@ -329,8 +317,8 @@ class Scheduler(): self.calculate_capacity_used(running_tasks) self.process_runnable_tasks(runnable_tasks) - - pending_tasks = filter(lambda t: t['status'] == 'pending', all_sorted_tasks) + + pending_tasks = filter(lambda t: t['status'] in 'pending', all_sorted_tasks) self.process_pending_tasks(pending_tasks) def _schedule(self): From c4a3b604f8fe3c1f53d912ade04adf983a215622 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Mon, 31 Oct 2016 15:07:14 -0500 Subject: [PATCH 12/17] add back in alan's workflow fail detection --- 
awx/main/scheduler/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 0f551997bf..8c9679eaec 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -131,8 +131,10 @@ class Scheduler(): for workflow_job in workflow_jobs: dag = WorkflowDAG(workflow_job) if dag.is_workflow_done(): - # TODO: detect if wfj failed - workflow_job.status = 'completed' + if workflow_job._has_failed(): + workflow_job.status = 'failed' + else: + workflow_job.status = 'successful' workflow_job.save() connection.on_commit(lambda: workflow_job.websocket_emit_status(workflow_job.status)) From 6efa468f2973eefd56048bba7a748ef8caca799a Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 1 Nov 2016 07:41:18 -0500 Subject: [PATCH 13/17] init task manager architecture docs --- docs/task_manager_system.md | 57 +++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 docs/task_manager_system.md diff --git a/docs/task_manager_system.md b/docs/task_manager_system.md new file mode 100644 index 0000000000..92b8e35880 --- /dev/null +++ b/docs/task_manager_system.md @@ -0,0 +1,57 @@ +# Task Manager Overview + +The task manager is responsible for deciding when jobs should be introduced to celery for running. When choosing a task to run the considerations are: (1) creation time, (2) job dependency, (3) capacity. + +Independent jobs are ran in order of creation time, earliest first. Jobs with dependencies are also ran in creation time order within the group of job dependencies. Capacity is the final consideration when deciding to release a job to be ran by celery. + +## Task Manager Architecture + +The task manager has a single entry point, `Scheduler().schedule()`. The method may be called in parallel, at any time, as many times as the user wants. 
The `schedule()` function tries to acquire a single, global lock using the first record of the Instance table in the database. If the lock cannot be acquired the method returns. The failure to acquire the lock indicates that there is another instance currently running `schedule()`. + +### Hybrid Scheduler: Periodic + Event +The `schedule()` function is run (a) periodically by a celery task and (b) on job creation or completion. The task manager system would behave correctly if run, exclusively, via (a) or (b). We chose to trigger `schedule()` via both mechanisms because of the nice properties I will now mention. (b) reduces the time from launch to running, resulting in a better user experience. (a) is a fail-safe in case we miss code-paths, in the present and future, that change the 3 scheduling considerations for which we should call `schedule()` (e.g. adding new nodes to tower changes the capacity, obscure job error handling that fails a job) + Empirically, the periodic task manager has served us well in the past and we will continue to rely on it with the added event-triggered `schedule()`. + + ### Scheduler Algorithm + * Get all non-completed jobs, `all_tasks` + * Generate the hash tables from `all_tasks`: + * `` indicates a job is running + * `` indicates a project update is running + * `` indicates a job template or inventory update is running + * `` indicates an inventory update is running + * `` indicates a workflow job is running + * `` used to determine cache timeout + * `` used to determine cache timeout and dependencies to spawn + * `` used to determine cache timeout + * Detect finished workflow jobs + * Spawn next workflow jobs if needed + * For each pending job; start with the oldest created job and stop when remaining capacity == 0 + * If the job is not blocked, determined using the generated hash tables, and there is capacity, then mark the job as `waiting` and submit the job to celery. 
+ +### Job Lifecycle +| Job Status | State | +|:----------:|:------------------------------------------------------------------------------------------------------------------:| +| pending | Job launched.
1. Hasn't yet been seen by the scheduler
2. Is blocked by another task
3. Not enough capacity | +| waiting | Job submitted to celery. | +| running | Job running in celery. | +| successful | Job finished with ansible-playbook return code 0. | +| failed | Job finished with ansible-playbook return code other than 0. | +| error | System failure. | + +## todo + +## Code Composition +* partials +* + +## Acceptance Tests +* assemelate with .md and trim the fat https://docs.google.com/a/redhat.com/document/d/1AOvKiTMSV0A2RHykHW66BZKBuaJ_l0SJ-VbMwvu-5Gk/edit?usp=sharing + + + + + + + + + From 87dd91e849161d57c6c190b6f61d69f74916c936 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 1 Nov 2016 09:52:54 -0500 Subject: [PATCH 14/17] rename Scheduler to TaskManager --- awx/main/scheduler/__init__.py | 2 +- awx/main/scheduler/tasks.py | 10 +++++----- awx/main/tests/unit/scheduler/conftest.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/awx/main/scheduler/__init__.py b/awx/main/scheduler/__init__.py index 8c9679eaec..c0a680c7d4 100644 --- a/awx/main/scheduler/__init__.py +++ b/awx/main/scheduler/__init__.py @@ -34,7 +34,7 @@ from celery.task.control import inspect logger = logging.getLogger('awx.main.scheduler') -class Scheduler(): +class TaskManager(): def __init__(self): self.graph = DependencyGraph() self.capacity_total = 200 diff --git a/awx/main/scheduler/tasks.py b/awx/main/scheduler/tasks.py index 2b35b5ab64..eb9c7691a7 100644 --- a/awx/main/scheduler/tasks.py +++ b/awx/main/scheduler/tasks.py @@ -11,7 +11,7 @@ from celery import task # AWX from awx.main.models import Instance -from awx.main.scheduler import Scheduler +from awx.main.scheduler import TaskManager logger = logging.getLogger('awx.main.scheduler') @@ -21,15 +21,15 @@ logger = logging.getLogger('awx.main.scheduler') @task def run_job_launch(job_id): - Scheduler().schedule() + TaskManager().schedule() @task def run_job_complete(job_id): - Scheduler().schedule() + TaskManager().schedule() @task def run_scheduler(): - Scheduler().schedule() + 
TaskManager().schedule() @task def run_fail_inconsistent_running_jobs(): @@ -37,7 +37,7 @@ def run_fail_inconsistent_running_jobs(): # Lock try: Instance.objects.select_for_update(nowait=True).all()[0] - scheduler = Scheduler() + scheduler = TaskManager() active_tasks = scheduler.get_active_tasks() if active_tasks is None: diff --git a/awx/main/tests/unit/scheduler/conftest.py b/awx/main/tests/unit/scheduler/conftest.py index cec68b1ef7..6b07649fd0 100644 --- a/awx/main/tests/unit/scheduler/conftest.py +++ b/awx/main/tests/unit/scheduler/conftest.py @@ -13,7 +13,7 @@ from awx.main.scheduler.partial import ( InventoryUpdateDict, InventorySourceDict, ) -from awx.main.scheduler import Scheduler +from awx.main.scheduler import TaskManager @pytest.fixture @@ -23,7 +23,7 @@ def epoch(): @pytest.fixture def scheduler_factory(mocker, epoch): def fn(tasks=[], inventory_sources=[], latest_project_updates=[], latest_inventory_updates=[], create_project_update=None, create_inventory_update=None): - sched = Scheduler() + sched = TaskManager() sched.capacity_total = 999999999 sched.graph.get_now = lambda: epoch From 13c89ab78c081066eecfa145fb51f4e283125bd4 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 1 Nov 2016 10:53:14 -0500 Subject: [PATCH 15/17] HAify job schedules and more task_manager renaming --- awx/main/migrations/0046_v310_tower_state.py | 24 +++++++++++++++++ awx/main/models/ha.py | 6 ++++- awx/main/scheduler/tasks.py | 2 +- awx/main/tasks.py | 28 ++++---------------- awx/settings/defaults.py | 7 ++--- requirements/requirements.txt | 1 + 6 files changed, 40 insertions(+), 28 deletions(-) create mode 100644 awx/main/migrations/0046_v310_tower_state.py diff --git a/awx/main/migrations/0046_v310_tower_state.py b/awx/main/migrations/0046_v310_tower_state.py new file mode 100644 index 0000000000..e9f785e0a6 --- /dev/null +++ b/awx/main/migrations/0046_v310_tower_state.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from 
django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0045_v310_job_event_stdout'), + ] + + operations = [ + migrations.CreateModel( + name='TowerState', + fields=[ + ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('schedule_last_run', models.DateTimeField(auto_now_add=True)), + ], + options={ + 'abstract': False, + }, + ), + ] diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 3f92aebc12..818233672b 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -5,13 +5,15 @@ from django.db import models from django.db.models.signals import post_save from django.dispatch import receiver +from solo.models import SingletonModel + from awx.main.managers import InstanceManager from awx.main.models.inventory import InventoryUpdate from awx.main.models.jobs import Job from awx.main.models.projects import ProjectUpdate from awx.main.models.unified_jobs import UnifiedJob -__all__ = ('Instance', 'JobOrigin') +__all__ = ('Instance', 'JobOrigin', 'TowerState',) class Instance(models.Model): @@ -33,6 +35,8 @@ class Instance(models.Model): # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing return "tower" +class TowerState(SingletonModel): + schedule_last_run = models.DateTimeField(auto_now_add=True) class JobOrigin(models.Model): """A model representing the relationship between a unified job and diff --git a/awx/main/scheduler/tasks.py b/awx/main/scheduler/tasks.py index eb9c7691a7..622876a44e 100644 --- a/awx/main/scheduler/tasks.py +++ b/awx/main/scheduler/tasks.py @@ -28,7 +28,7 @@ def run_job_complete(job_id): TaskManager().schedule() @task -def run_scheduler(): +def run_task_manager(): TaskManager().schedule() @task diff --git a/awx/main/tasks.py b/awx/main/tasks.py index fac9dca68f..b6448e3761 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -21,7 +21,6 @@ import traceback import urlparse import uuid from 
distutils.version import LooseVersion as Version -import dateutil.parser import yaml try: import psutil @@ -137,30 +136,12 @@ def cluster_node_heartbeat(self): @task(bind=True, queue='default') def tower_periodic_scheduler(self): - def get_last_run(): - if not os.path.exists(settings.SCHEDULE_METADATA_LOCATION): - return None - fd = open(settings.SCHEDULE_METADATA_LOCATION) - try: - last_run = dateutil.parser.parse(fd.read()) - return last_run - except Exception as exc: - logger.error("get_last_run failed: {}".format(exc)) - return None - - def write_last_run(last_run): - fd = open(settings.SCHEDULE_METADATA_LOCATION, 'w') - fd.write(last_run.isoformat()) - fd.close() - run_now = now() - last_run = get_last_run() - if not last_run: - logger.debug("First run time") - write_last_run(run_now) - return + state = TowerState.get_solo() + last_run = state.schedule_last_run logger.debug("Last run was: %s", last_run) - write_last_run(run_now) + state.schedule_last_run = run_now + state.save() old_schedules = Schedule.objects.enabled().before(last_run) for schedule in old_schedules: @@ -180,6 +161,7 @@ def tower_periodic_scheduler(self): new_unified_job.save(update_fields=['status', 'job_explanation']) new_unified_job.websocket_emit_status("failed") emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules")) + state.save() def _send_notification_templates(instance, status_str): if status_str not in ['succeeded', 'failed']: diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index a5c7975920..02cbf3fc31 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -201,6 +201,7 @@ INSTALLED_APPS = ( 'awx.ui', 'awx.fact', 'awx.sso', + 'solo', ) INTERNAL_IPS = ('127.0.0.1',) @@ -392,9 +393,9 @@ CELERYBEAT_SCHEDULE = { 'task': 'awx.main.tasks.cluster_node_heartbeat', 'schedule': timedelta(seconds=60) }, - 'task_scheduler': { - 'task': 'awx.main.scheduler.tasks.run_scheduler', - 'schedule': timedelta(seconds=10) + 
'task_manager': { + 'task': 'awx.main.scheduler.tasks.run_task_manager', + 'schedule': timedelta(seconds=20) }, 'task_fail_inconsistent_running_jobs': { 'task': 'awx.main.scheduler.tasks.run_fail_inconsistent_running_jobs', diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 5f2448d9e6..44e4f58bee 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -24,6 +24,7 @@ django-polymorphic==0.7.2 django-radius==1.0.0 djangorestframework==3.3.2 djangorestframework-yaml==1.0.2 +django-solo==1.1.2 django-split-settings==0.1.1 django-transaction-hooks==0.2 django-taggit==0.17.6 From e1a84f4c85606bd19e5edd9141947866011cce2d Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 1 Nov 2016 13:55:37 -0500 Subject: [PATCH 16/17] bump migrations --- .../{0046_v310_tower_state.py => 0047_v310_tower_state.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename awx/main/migrations/{0046_v310_tower_state.py => 0047_v310_tower_state.py} (92%) diff --git a/awx/main/migrations/0046_v310_tower_state.py b/awx/main/migrations/0047_v310_tower_state.py similarity index 92% rename from awx/main/migrations/0046_v310_tower_state.py rename to awx/main/migrations/0047_v310_tower_state.py index e9f785e0a6..f1227830a4 100644 --- a/awx/main/migrations/0046_v310_tower_state.py +++ b/awx/main/migrations/0047_v310_tower_state.py @@ -7,7 +7,7 @@ from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ - ('main', '0045_v310_job_event_stdout'), + ('main', '0046_v310_job_event_stdout'), ] operations = [ From 25b85c4a0bafb4d7d376bb523d80cfc9db461701 Mon Sep 17 00:00:00 2001 From: Chris Meyers Date: Tue, 1 Nov 2016 14:07:00 -0500 Subject: [PATCH 17/17] rename scheduler config singleton --- awx/main/migrations/0047_v310_tower_state.py | 2 +- awx/main/models/ha.py | 4 ++-- awx/main/tasks.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/awx/main/migrations/0047_v310_tower_state.py 
b/awx/main/migrations/0047_v310_tower_state.py index f1227830a4..941dfd0ba2 100644 --- a/awx/main/migrations/0047_v310_tower_state.py +++ b/awx/main/migrations/0047_v310_tower_state.py @@ -12,7 +12,7 @@ class Migration(migrations.Migration): operations = [ migrations.CreateModel( - name='TowerState', + name='TowerScheduleState', fields=[ ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), ('schedule_last_run', models.DateTimeField(auto_now_add=True)), diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 818233672b..691faf6305 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -13,7 +13,7 @@ from awx.main.models.jobs import Job from awx.main.models.projects import ProjectUpdate from awx.main.models.unified_jobs import UnifiedJob -__all__ = ('Instance', 'JobOrigin', 'TowerState',) +__all__ = ('Instance', 'JobOrigin', 'TowerScheduleState',) class Instance(models.Model): @@ -35,7 +35,7 @@ class Instance(models.Model): # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing return "tower" -class TowerState(SingletonModel): +class TowerScheduleState(SingletonModel): schedule_last_run = models.DateTimeField(auto_now_add=True) class JobOrigin(models.Model): diff --git a/awx/main/tasks.py b/awx/main/tasks.py index b6448e3761..7f187c7bce 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -137,7 +137,7 @@ def cluster_node_heartbeat(self): @task(bind=True, queue='default') def tower_periodic_scheduler(self): run_now = now() - state = TowerState.get_solo() + state = TowerScheduleState.get_solo() last_run = state.schedule_last_run logger.debug("Last run was: %s", last_run) state.schedule_last_run = run_now