Mirror of https://github.com/ansible/awx.git
Merge branch 'feature-ha_task_manager' into devel

Commit dd3c3c9f29
Makefile (13 changes)
@@ -361,7 +361,6 @@ server_noattach:
	tmux rename-window 'Tower'
	tmux select-window -t tower:0
	tmux split-window -v 'exec make celeryd'
	tmux split-window -h 'exec make taskmanager'
	tmux new-window 'exec make receiver'
	tmux select-window -t tower:1
	tmux rename-window 'Extra Services'
@@ -401,7 +400,7 @@ celeryd:
	@if [ "$(VENV_BASE)" ]; then \
	  . $(VENV_BASE)/tower/bin/activate; \
	fi; \
	$(PYTHON) manage.py celeryd -l DEBUG -B --autoscale=20,3 --schedule=$(CELERY_SCHEDULE_FILE) -Q projects,jobs,default
	$(PYTHON) manage.py celeryd -l DEBUG -B --autoscale=20,3 --schedule=$(CELERY_SCHEDULE_FILE) -Q projects,jobs,default,scheduler
	#$(PYTHON) manage.py celery multi show projects jobs default -l DEBUG -Q:projects projects -Q:jobs jobs -Q:default default -c:projects 1 -c:jobs 3 -c:default 3 -Ofair -B --schedule=$(CELERY_SCHEDULE_FILE)

# Run to start the zeromq callback receiver
@@ -411,16 +410,6 @@ receiver:
	fi; \
	$(PYTHON) manage.py run_callback_receiver

taskmanager:
	@if [ "$(VENV_BASE)" ]; then \
	  . $(VENV_BASE)/tower/bin/activate; \
	fi; \
	if [ "$(COMPOSE_HOST)" == "tower_1" ] || [ "$(COMPOSE_HOST)" == "tower" ]; then \
	  $(PYTHON) manage.py run_task_system; \
	else \
	  while true; do sleep 2; done; \
	fi

socketservice:
	@if [ "$(VENV_BASE)" ]; then \
	  . $(VENV_BASE)/tower/bin/activate; \
Procfile (3 changes)
@@ -1,7 +1,6 @@
runserver: make runserver
celeryd: make celeryd
taskmanager: make taskmanager
receiver: make receiver
socketservice: make socketservice
factcacher: make factcacher
flower: make flower
@@ -2275,6 +2275,7 @@ class JobTemplateLaunch(RetrieveAPIView, GenericAPIView):

        new_job = obj.create_unified_job(**kv)
        result = new_job.signal_start(**kv)

        if not result:
            data = dict(passwords_needed_to_start=new_job.passwords_needed_to_start)
            new_job.delete()
awx/main/management/commands/run_task_system.py (deleted)
@@ -1,483 +0,0 @@
#Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved

# Python
import os
import datetime
import logging
import signal
import time

# Django
from django.conf import settings
from django.core.management.base import NoArgsCommand

# AWX
from awx.main.models import * # noqa
from awx.main.queue import FifoQueue
from awx.main.tasks import handle_work_error, handle_work_success
from awx.main.utils import get_system_task_capacity

# Celery
from celery.task.control import inspect

logger = logging.getLogger('awx.main.commands.run_task_system')

queue = FifoQueue('tower_task_manager')

class SimpleDAG(object):
    ''' A simple implementation of a directed acyclic graph '''

    def __init__(self):
        self.nodes = []
        self.edges = []

    def __contains__(self, obj):
        for node in self.nodes:
            if node['node_object'] == obj:
                return True
        return False

    def __len__(self):
        return len(self.nodes)

    def __iter__(self):
        return self.nodes.__iter__()

    def generate_graphviz_plot(self):
        def short_string_obj(obj):
            if type(obj) == Job:
                type_str = "Job"
            if type(obj) == AdHocCommand:
                type_str = "AdHocCommand"
            elif type(obj) == InventoryUpdate:
                type_str = "Inventory"
            elif type(obj) == ProjectUpdate:
                type_str = "Project"
            elif type(obj) == WorkflowJob:
                type_str = "Workflow"
            else:
                type_str = "Unknown"
            type_str += "%s" % str(obj.id)
            return type_str

        doc = """
        digraph g {
        rankdir = LR
        """
        for n in self.nodes:
            doc += "%s [color = %s]\n" % (
                short_string_obj(n['node_object']),
                "red" if n['node_object'].status == 'running' else "black",
            )
        for from_node, to_node, label in self.edges:
            doc += "%s -> %s [ label=\"%s\" ];\n" % (
                short_string_obj(self.nodes[from_node]['node_object']),
                short_string_obj(self.nodes[to_node]['node_object']),
                label,
            )
        doc += "}\n"
        gv_file = open('/tmp/graph.gv', 'w')
        gv_file.write(doc)
        gv_file.close()

    def add_node(self, obj, metadata=None):
        if self.find_ord(obj) is None:
            self.nodes.append(dict(node_object=obj, metadata=metadata))

    def add_edge(self, from_obj, to_obj, label=None):
        from_obj_ord = self.find_ord(from_obj)
        to_obj_ord = self.find_ord(to_obj)
        if from_obj_ord is None or to_obj_ord is None:
            raise LookupError("Object not found")
        self.edges.append((from_obj_ord, to_obj_ord, label))

    def add_edges(self, edgelist):
        for edge_pair in edgelist:
            self.add_edge(edge_pair[0], edge_pair[1], edge_pair[2])

    def find_ord(self, obj):
        for idx in range(len(self.nodes)):
            if obj == self.nodes[idx]['node_object']:
                return idx
        return None

    def get_node_type(self, obj):
        if type(obj) == Job:
            return "job"
        elif type(obj) == AdHocCommand:
            return "ad_hoc_command"
        elif type(obj) == InventoryUpdate:
            return "inventory_update"
        elif type(obj) == ProjectUpdate:
            return "project_update"
        elif type(obj) == SystemJob:
            return "system_job"
        elif type(obj) == WorkflowJob:
            return "workflow_job"
        return "unknown"

    def get_dependencies(self, obj, label=None):
        antecedents = []
        this_ord = self.find_ord(obj)
        for node, dep, lbl in self.edges:
            if label:
                if node == this_ord and lbl == label:
                    antecedents.append(self.nodes[dep])
            else:
                if node == this_ord:
                    antecedents.append(self.nodes[dep])
        return antecedents

    def get_dependents(self, obj, label=None):
        decendents = []
        this_ord = self.find_ord(obj)
        for node, dep, lbl in self.edges:
            if label:
                if dep == this_ord and lbl == label:
                    decendents.append(self.nodes[node])
            else:
                if dep == this_ord:
                    decendents.append(self.nodes[node])
        return decendents

    def get_leaf_nodes(self):
        leafs = []
        for n in self.nodes:
            if len(self.get_dependencies(n['node_object'])) < 1:
                leafs.append(n)
        return leafs

    def get_root_nodes(self):
        roots = []
        for n in self.nodes:
            if len(self.get_dependents(n['node_object'])) < 1:
                roots.append(n)
        return roots

class WorkflowDAG(SimpleDAG):
    def __init__(self, workflow_job=None):
        super(WorkflowDAG, self).__init__()
        if workflow_job:
            self._init_graph(workflow_job)

    def _init_graph(self, workflow_job):
        workflow_nodes = workflow_job.workflow_job_nodes.all()
        for workflow_node in workflow_nodes:
            self.add_node(workflow_node)

        for node_type in ['success_nodes', 'failure_nodes', 'always_nodes']:
            for workflow_node in workflow_nodes:
                related_nodes = getattr(workflow_node, node_type).all()
                for related_node in related_nodes:
                    self.add_edge(workflow_node, related_node, node_type)

    def bfs_nodes_to_run(self):
        root_nodes = self.get_root_nodes()
        nodes = root_nodes
        nodes_found = []

        for index, n in enumerate(nodes):
            obj = n['node_object']
            job = obj.job

            if not job:
                nodes_found.append(n)
            # Job is about to run or is running. Hold our horses and wait for
            # the job to finish. We can't proceed down the graph path until we
            # have the job result.
            elif job.status not in ['failed', 'error', 'successful']:
                continue
            elif job.status in ['failed', 'error']:
                children_failed = self.get_dependencies(obj, 'failure_nodes')
                children_always = self.get_dependencies(obj, 'always_nodes')
                children_all = children_failed + children_always
                nodes.extend(children_all)
            elif job.status in ['successful']:
                children_success = self.get_dependencies(obj, 'success_nodes')
                nodes.extend(children_success)
            else:
                logger.warn("Incorrect graph structure")
        return [n['node_object'] for n in nodes_found]

    def is_workflow_done(self):
        root_nodes = self.get_root_nodes()
        nodes = root_nodes

        for index, n in enumerate(nodes):
            obj = n['node_object']
            job = obj.job

            if not job:
                return False
            # Job is about to run or is running. Hold our horses and wait for
            # the job to finish. We can't proceed down the graph path until we
            # have the job result.
            elif job.status not in ['failed', 'error', 'successful']:
                return False
            elif job.status in ['failed', 'error']:
                children_failed = self.get_dependencies(obj, 'failure_nodes')
                children_always = self.get_dependencies(obj, 'always_nodes')
                children_all = children_failed + children_always
                nodes.extend(children_all)
            elif job.status in ['successful']:
                children_success = self.get_dependencies(obj, 'success_nodes')
                nodes.extend(children_success)
            else:
                logger.warn("Incorrect graph structure")
        return True

def get_tasks():
    """Fetch all Tower tasks that are relevant to the task management
    system.
    """
    RELEVANT_JOBS = ('pending', 'waiting', 'running')
    # TODO: Replace this when we can grab all objects in a sane way.
    graph_jobs = [j for j in Job.objects.filter(status__in=RELEVANT_JOBS)]
    graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(status__in=RELEVANT_JOBS)]
    graph_inventory_updates = [iu for iu in
                               InventoryUpdate.objects.filter(status__in=RELEVANT_JOBS)]
    graph_project_updates = [pu for pu in
                             ProjectUpdate.objects.filter(status__in=RELEVANT_JOBS)]
    graph_system_jobs = [sj for sj in
                         SystemJob.objects.filter(status__in=RELEVANT_JOBS)]
    graph_workflow_jobs = [wf for wf in
                           WorkflowJob.objects.filter(status__in=RELEVANT_JOBS)]
    all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates +
                         graph_project_updates + graph_system_jobs +
                         graph_workflow_jobs,
                         key=lambda task: task.created)
    return all_actions

def get_running_workflow_jobs():
    graph_workflow_jobs = [wf for wf in
                           WorkflowJob.objects.filter(status='running')]
    return graph_workflow_jobs

def do_spawn_workflow_jobs():
    workflow_jobs = get_running_workflow_jobs()
    for workflow_job in workflow_jobs:
        dag = WorkflowDAG(workflow_job)
        spawn_nodes = dag.bfs_nodes_to_run()
        for spawn_node in spawn_nodes:
            # TODO: Inject job template template params as kwargs.
            # Make sure to take into account extra_vars merge logic
            kv = {}
            job = spawn_node.unified_job_template.create_unified_job(**kv)
            spawn_node.job = job
            spawn_node.save()
            can_start = job.signal_start(**kv)
            if not can_start:
                job.status = 'failed'
                job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials"
                job.save(update_fields=['status', 'job_explanation'])
                job.socketio_emit_status("failed")

            # TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ?
            #emit_websocket_notification('/socket.io/jobs', '', dict(id=))


def rebuild_graph(message):
    """Regenerate the task graph by refreshing known tasks from Tower, purging
    orphaned running tasks, and creating dependencies for new tasks before
    generating directed edge relationships between those tasks.
    """
    # Sanity check: Only do this on the primary node.
    if Instance.objects.my_role() == 'secondary':
        return None

    inspector = inspect()
    if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'):
        active_task_queues = inspector.active()
    else:
        logger.warn("Ignoring celery task inspector")
        active_task_queues = None

    do_spawn_workflow_jobs()

    all_sorted_tasks = get_tasks()
    if not len(all_sorted_tasks):
        return None

    active_tasks = []
    if active_task_queues is not None:
        for queue in active_task_queues:
            active_tasks += [at['id'] for at in active_task_queues[queue]]
    else:
        logger.error("Could not communicate with celery!")
        # TODO: Something needs to be done here to signal to the system
        # as a whole that celery appears to be down.
        if not hasattr(settings, 'CELERY_UNIT_TEST'):
            return None

    running_tasks = filter(lambda t: t.status == 'running', all_sorted_tasks)
    waiting_tasks = filter(lambda t: t.status != 'running', all_sorted_tasks)
    new_tasks = filter(lambda t: t.status == 'pending', all_sorted_tasks)

    # Check running tasks and make sure they are active in celery
    logger.debug("Active celery tasks: " + str(active_tasks))
    for task in list(running_tasks):
        if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
            # NOTE: Pull status again and make sure it didn't finish in
            # the meantime?
            task.status = 'failed'
            task.job_explanation += ' '.join((
                'Task was marked as running in Tower but was not present in',
                'Celery, so it has been marked as failed.',
            ))
            task.save()
            task.socketio_emit_status("failed")
            running_tasks.pop(running_tasks.index(task))
            logger.error("Task %s appears orphaned... marking as failed" % task)

    # Create and process dependencies for new tasks
    for task in new_tasks:
        logger.debug("Checking dependencies for: %s" % str(task))
        try:
            task_dependencies = task.generate_dependencies(running_tasks + waiting_tasks)
        except Exception, e:
            logger.error("Failed processing dependencies for {}: {}".format(task, e))
            task.status = 'failed'
            task.job_explanation += 'Task failed to generate dependencies: {}'.format(e)
            task.save()
            task.socketio_emit_status("failed")
            continue
        logger.debug("New dependencies: %s" % str(task_dependencies))
        for dep in task_dependencies:
            # We recalculate the created time for the moment to ensure the
            # dependencies are always sorted in the right order relative to
            # the dependent task.
            time_delt = len(task_dependencies) - task_dependencies.index(dep)
            dep.created = task.created - datetime.timedelta(seconds=1 + time_delt)
            dep.status = 'waiting'
            dep.save()
            waiting_tasks.insert(waiting_tasks.index(task), dep)
        if not hasattr(settings, 'UNIT_TEST_IGNORE_TASK_WAIT'):
            task.status = 'waiting'
            task.save()

    # Rebuild graph
    graph = SimpleDAG()
    for task in running_tasks:
        graph.add_node(task)
    for wait_task in waiting_tasks[:50]:
        node_dependencies = []
        for node in graph:
            if wait_task.is_blocked_by(node['node_object']):
                node_dependencies.append(node['node_object'])
        graph.add_node(wait_task)
        for dependency in node_dependencies:
            graph.add_edge(wait_task, dependency)
    if settings.DEBUG:
        graph.generate_graphviz_plot()
    return graph

def process_graph(graph, task_capacity):
    """Given a task dependency graph, start and manage tasks given their
    priority and weight.
    """
    leaf_nodes = graph.get_leaf_nodes()
    running_nodes = filter(lambda x: x['node_object'].status == 'running', leaf_nodes)
    running_impact = sum([t['node_object'].task_impact for t in running_nodes])
    ready_nodes = filter(lambda x: x['node_object'].status != 'running', leaf_nodes)
    remaining_volume = task_capacity - running_impact
    logger.info('Running Nodes: %s; Capacity: %s; Running Impact: %s; '
                'Remaining Capacity: %s' %
                (str(running_nodes), str(task_capacity),
                 str(running_impact), str(remaining_volume)))
    logger.info("Ready Nodes: %s" % str(ready_nodes))
    for task_node in ready_nodes:
        node_obj = task_node['node_object']
        # NOTE: This could be used to pass metadata through the task system
        # node_args = task_node['metadata']
        impact = node_obj.task_impact
        if impact <= remaining_volume or running_impact == 0:
            node_dependencies = graph.get_dependents(node_obj)
            # Allow other tasks to continue if a job fails, even if they are
            # other jobs.
            if graph.get_node_type(node_obj) == 'job':
                node_dependencies = []
            dependent_nodes = [{'type': graph.get_node_type(node_obj), 'id': node_obj.id}] + \
                              [{'type': graph.get_node_type(n['node_object']),
                                'id': n['node_object'].id} for n in node_dependencies]
            error_handler = handle_work_error.s(subtasks=dependent_nodes)
            success_handler = handle_work_success.s(task_actual={'type': graph.get_node_type(node_obj),
                                                                 'id': node_obj.id})
            start_status = node_obj.start(error_callback=error_handler, success_callback=success_handler)
            if not start_status:
                node_obj.status = 'failed'
                if node_obj.job_explanation:
                    node_obj.job_explanation += ' '
                node_obj.job_explanation += 'Task failed pre-start check.'
                node_obj.save()
                continue
            remaining_volume -= impact
            running_impact += impact
            logger.info('Started Node: %s (capacity hit: %s) '
                        'Remaining Capacity: %s' %
                        (str(node_obj), str(impact), str(remaining_volume)))

def run_taskmanager():
    """Receive task start and finish signals to rebuild a dependency graph
    and manage the actual running of tasks.
    """
    def shutdown_handler():
        def _handler(signum, frame):
            signal.signal(signum, signal.SIG_DFL)
            os.kill(os.getpid(), signum)
        return _handler
    signal.signal(signal.SIGINT, shutdown_handler())
    signal.signal(signal.SIGTERM, shutdown_handler())
    paused = False
    task_capacity = get_system_task_capacity()
    last_rebuild = datetime.datetime.fromtimestamp(0)

    # Attempt to pull messages off of the task system queue into perpetuity.
    #
    # A quick explanation of what is happening here:
    # The popping messages off the queue bit is something of a sham. We remove
    # the messages from the queue and then immediately throw them away. The
    # `rebuild_graph` function, while it takes the message as an argument,
    # ignores it.
    #
    # What actually happens is that we just check the database every 10 seconds
    # to see what the task dependency graph looks like, and go do that. This
    # is the job of the `rebuild_graph` function.
    #
    # There is some placeholder here: we may choose to actually use the message
    # in the future.
    while True:
        # Pop a message off the queue.
        # (If the queue is empty, None will be returned.)
        message = queue.pop()

        # Parse out the message appropriately, rebuilding our graph if
        # appropriate.
        if (datetime.datetime.now() - last_rebuild).seconds > 10:
            if message is not None and 'pause' in message:
                logger.info("Pause command received: %s" % str(message))
                paused = message['pause']
            graph = rebuild_graph(message)
            if not paused and graph is not None:
                process_graph(graph, task_capacity)
            last_rebuild = datetime.datetime.now()
        time.sleep(0.1)


class Command(NoArgsCommand):
    """Tower Task Management System
    This daemon is designed to reside between our tasks and celery and
    provide a mechanism for understanding the relationship between those tasks
    and their dependencies.

    It also actively prevents situations in which Tower can get blocked
    because it doesn't have an understanding of what is progressing through
    celery.
    """
    help = 'Launch the Tower task management system'

    def handle_noargs(self, **options):
        try:
            run_taskmanager()
        except KeyboardInterrupt:
            pass
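Note: the daemon deleted above was driven by a Redis-backed FIFO queue; notify_task_runner() in awx/main/tasks.py pushed a metadata dict and the run_taskmanager() loop popped it (and, as its own comments admit, mostly discarded it, rebuilding the graph from the database on a timer instead). A minimal sketch of that handshake, using only the push/pop calls that appear in this diff; the payload shape is an assumption:

    from awx.main.queue import FifoQueue

    queue = FifoQueue('tower_task_manager')

    # Producer side (the old notify_task_runner task): enqueue a metadata dict.
    queue.push(dict(complete=42))   # payload shape is illustrative

    # Consumer side (the old run_taskmanager loop): pop() returns None when
    # the queue is empty; a 'pause' key toggled the paused flag, everything
    # else was ignored and the graph was rebuilt from the database.
    message = queue.pop()
    if message is not None and 'pause' in message:
        paused = message['pause']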
@@ -798,34 +798,43 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
                        status=self.status,
                        traceback=self.result_traceback)

    def start(self, error_callback, success_callback, **kwargs):
        '''
        Start the task running via Celery.
        '''
        task_class = self._get_task_class()
    def pre_start(self, **kwargs):
        if not self.can_start:
            self.job_explanation = u'%s is not in a startable status: %s, expecting one of %s' % (self._meta.verbose_name, self.status, str(('new', 'waiting')))
            self.save(update_fields=['job_explanation'])
            return False
            return (False, None)

        needed = self.get_passwords_needed_to_start()
        try:
            start_args = json.loads(decrypt_field(self, 'start_args'))
        except Exception:
            start_args = None

        if start_args in (None, ''):
            start_args = kwargs

        opts = dict([(field, start_args.get(field, '')) for field in needed])

        if not all(opts.values()):
            missing_fields = ', '.join([k for k,v in opts.items() if not v])
            self.job_explanation = u'Missing needed fields: %s.' % missing_fields
            self.save(update_fields=['job_explanation'])
            return False
            #extra_data = dict([(field, kwargs[field]) for field in kwargs
            #                   if field not in needed])
            return (False, None)

        if 'extra_vars' in kwargs:
            self.handle_extra_data(kwargs['extra_vars'])
        task_class().apply_async((self.pk,), opts, link_error=error_callback, link=success_callback)
        return True

        return (True, opts)

    def start(self, error_callback, success_callback, **kwargs):
        '''
        Start the task running via Celery.
        '''
        task_class = self._get_task_class()
        (res, opts) = self.pre_start(**kwargs)
        if res:
            task_class().apply_async((self.pk,), opts, link_error=error_callback, link=success_callback)
        return res

    def signal_start(self, **kwargs):
        """Notify the task runner system to begin work on this task."""
@@ -852,6 +861,9 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
        self.update_fields(start_args=json.dumps(kwargs), status='pending')
        self.socketio_emit_status("pending")

        from awx.main.scheduler.tasks import run_job_launch
        run_job_launch.delay(self.id)

        # Each type of unified job has a different Task class; get the
        # appropirate one.
        # task_type = get_type_for_model(self)

@@ -240,3 +240,11 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, JobNotificationMixin, Workflow
    def get_notification_friendly_name(self):
        return "Workflow Job"

    def start(self, *args, **kwargs):
        (res, opts) = self.pre_start(**kwargs)
        if res:
            self.status = 'running'
            self.save()
            self.socketio_emit_status("running")
        return res

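The hunks above split the old UnifiedJob.start() into pre_start() (startability check, password collection, start_args handling) and a thin start() that dispatches to Celery only when pre_start() succeeds, which is what lets WorkflowJob.start() reuse the same checks without ever creating a Celery task. A rough sketch of how the two pieces compose, using simplified stand-in classes rather than the real models:

    class UnifiedJobSketch(object):
        # Stand-in for awx.main.models.unified_jobs.UnifiedJob.
        def pre_start(self, **kwargs):
            # Returns (ok, opts): ok is False when the job is not startable or
            # required passwords are missing; opts carries the start arguments.
            if not self.can_start:
                return (False, None)
            opts = {}  # gathered from start_args/kwargs in the real method
            return (True, opts)

        def start(self, error_callback, success_callback, **kwargs):
            # Ordinary unified jobs dispatch their Celery task on success.
            (res, opts) = self.pre_start(**kwargs)
            if res:
                self._get_task_class()().apply_async(
                    (self.pk,), opts, link_error=error_callback, link=success_callback)
            return res

    class WorkflowJobSketch(UnifiedJobSketch):
        def start(self, *args, **kwargs):
            # Workflow jobs reuse pre_start() but only flip status to running;
            # the scheduler walks the workflow DAG instead of Celery.
            (res, opts) = self.pre_start(**kwargs)
            if res:
                self.status = 'running'
            return res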
awx/main/scheduler/__init__.py (new file, 257 lines)
@@ -0,0 +1,257 @@
#Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved

# Python
import datetime
import logging

# Django
from django.conf import settings
from django.db import transaction

# AWX
from awx.main.models import * # noqa
from awx.main.utils import get_system_task_capacity
from awx.main.scheduler.dag_simple import SimpleDAG
from awx.main.scheduler.dag_workflow import WorkflowDAG

# Celery
from celery.task.control import inspect

logger = logging.getLogger('awx.main.scheduler')

def get_tasks():
    """Fetch all Tower tasks that are relevant to the task management
    system.
    """
    RELEVANT_JOBS = ('pending', 'waiting', 'running')
    # TODO: Replace this when we can grab all objects in a sane way.
    graph_jobs = [j for j in Job.objects.filter(status__in=RELEVANT_JOBS)]
    graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(status__in=RELEVANT_JOBS)]
    graph_inventory_updates = [iu for iu in
                               InventoryUpdate.objects.filter(status__in=RELEVANT_JOBS)]
    graph_project_updates = [pu for pu in
                             ProjectUpdate.objects.filter(status__in=RELEVANT_JOBS)]
    graph_system_jobs = [sj for sj in
                         SystemJob.objects.filter(status__in=RELEVANT_JOBS)]
    graph_workflow_jobs = [wf for wf in
                           WorkflowJob.objects.filter(status__in=RELEVANT_JOBS)]
    all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates +
                         graph_project_updates + graph_system_jobs +
                         graph_workflow_jobs,
                         key=lambda task: task.created)
    return all_actions

def get_running_workflow_jobs():
    graph_workflow_jobs = [wf for wf in
                           WorkflowJob.objects.filter(status='running')]
    return graph_workflow_jobs

def spawn_workflow_graph_jobs(workflow_jobs):
    # TODO: Consider using transaction.atomic
    for workflow_job in workflow_jobs:
        dag = WorkflowDAG(workflow_job)
        spawn_nodes = dag.bfs_nodes_to_run()
        for spawn_node in spawn_nodes:
            # TODO: Inject job template template params as kwargs.
            # Make sure to take into account extra_vars merge logic
            kv = {}
            job = spawn_node.unified_job_template.create_unified_job(**kv)
            spawn_node.job = job
            spawn_node.save()
            can_start = job.signal_start(**kv)
            if not can_start:
                job.status = 'failed'
                job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials"
                job.save(update_fields=['status', 'job_explanation'])
                job.socketio_emit_status("failed")

            # TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ?
            #emit_websocket_notification('/socket.io/jobs', '', dict(id=))

# See comment in tasks.py::RunWorkflowJob::run()
def process_finished_workflow_jobs(workflow_jobs):
    for workflow_job in workflow_jobs:
        dag = WorkflowDAG(workflow_job)
        if dag.is_workflow_done():
            with transaction.atomic():
                # TODO: detect if wfj failed
                workflow_job.status = 'completed'
                workflow_job.save()
                workflow_job.socketio_emit_status('completed')

def rebuild_graph():
    """Regenerate the task graph by refreshing known tasks from Tower, purging
    orphaned running tasks, and creating dependencies for new tasks before
    generating directed edge relationships between those tasks.
    """
    '''
    # Sanity check: Only do this on the primary node.
    if Instance.objects.my_role() == 'secondary':
        return None
    '''

    inspector = inspect()
    if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'):
        active_task_queues = inspector.active()
    else:
        logger.warn("Ignoring celery task inspector")
        active_task_queues = None

    all_sorted_tasks = get_tasks()
    if not len(all_sorted_tasks):
        return None

    active_tasks = []
    if active_task_queues is not None:
        for queue in active_task_queues:
            active_tasks += [at['id'] for at in active_task_queues[queue]]
    else:
        logger.error("Could not communicate with celery!")
        # TODO: Something needs to be done here to signal to the system
        # as a whole that celery appears to be down.
        if not hasattr(settings, 'CELERY_UNIT_TEST'):
            return None

    running_tasks = filter(lambda t: t.status == 'running', all_sorted_tasks)
    running_celery_tasks = filter(lambda t: type(t) != WorkflowJob, running_tasks)
    waiting_tasks = filter(lambda t: t.status != 'running', all_sorted_tasks)
    new_tasks = filter(lambda t: t.status == 'pending', all_sorted_tasks)

    # Check running tasks and make sure they are active in celery
    logger.debug("Active celery tasks: " + str(active_tasks))
    for task in list(running_celery_tasks):
        if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
            # NOTE: Pull status again and make sure it didn't finish in
            # the meantime?
            task.status = 'failed'
            task.job_explanation += ' '.join((
                'Task was marked as running in Tower but was not present in',
                'Celery, so it has been marked as failed.',
            ))
            task.save()
            task.socketio_emit_status("failed")
            running_tasks.pop(task)
            logger.error("Task %s appears orphaned... marking as failed" % task)

    # Create and process dependencies for new tasks
    for task in new_tasks:
        logger.debug("Checking dependencies for: %s" % str(task))
        try:
            task_dependencies = task.generate_dependencies(running_tasks + waiting_tasks)
        except Exception, e:
            logger.error("Failed processing dependencies for {}: {}".format(task, e))
            task.status = 'failed'
            task.job_explanation += 'Task failed to generate dependencies: {}'.format(e)
            task.save()
            task.socketio_emit_status("failed")
            continue
        logger.debug("New dependencies: %s" % str(task_dependencies))
        for dep in task_dependencies:
            # We recalculate the created time for the moment to ensure the
            # dependencies are always sorted in the right order relative to
            # the dependent task.
            time_delt = len(task_dependencies) - task_dependencies.index(dep)
            dep.created = task.created - datetime.timedelta(seconds=1 + time_delt)
            dep.status = 'waiting'
            dep.save()
            waiting_tasks.insert(waiting_tasks.index(task), dep)
        if not hasattr(settings, 'UNIT_TEST_IGNORE_TASK_WAIT'):
            task.status = 'waiting'
            task.save()

    # Rebuild graph
    graph = SimpleDAG()
    for task in running_tasks:
        graph.add_node(task)
    for wait_task in waiting_tasks[:50]:
        node_dependencies = []
        for node in graph:
            if wait_task.is_blocked_by(node['node_object']):
                node_dependencies.append(node['node_object'])
        graph.add_node(wait_task)
        for dependency in node_dependencies:
            graph.add_edge(wait_task, dependency)
    if settings.DEBUG:
        graph.generate_graphviz_plot()
    return graph

def process_graph(graph, task_capacity):
    """Given a task dependency graph, start and manage tasks given their
    priority and weight.
    """
    from awx.main.tasks import handle_work_error, handle_work_success

    leaf_nodes = graph.get_leaf_nodes()
    running_nodes = filter(lambda x: x['node_object'].status == 'running', leaf_nodes)
    running_impact = sum([t['node_object'].task_impact for t in running_nodes])
    ready_nodes = filter(lambda x: x['node_object'].status != 'running', leaf_nodes)
    remaining_volume = task_capacity - running_impact
    logger.info('Running Nodes: %s; Capacity: %s; Running Impact: %s; '
                'Remaining Capacity: %s' %
                (str(running_nodes), str(task_capacity),
                 str(running_impact), str(remaining_volume)))
    logger.info("Ready Nodes: %s" % str(ready_nodes))
    for task_node in ready_nodes:
        node_obj = task_node['node_object']
        # NOTE: This could be used to pass metadata through the task system
        # node_args = task_node['metadata']
        impact = node_obj.task_impact
        if impact <= remaining_volume or running_impact == 0:
            node_dependencies = graph.get_dependents(node_obj)
            # Allow other tasks to continue if a job fails, even if they are
            # other jobs.

            node_type = graph.get_node_type(node_obj)
            if node_type == 'job':
                # clear dependencies because a job can block (not necessarily
                # depend) on other jobs that share the same job template
                node_dependencies = []

            # Make the workflow_job look like it's started by setting status to
            # running, but don't make a celery Task for it.
            # Introduce jobs from the workflow so they are candidates to run.
            # Call process_graph() again to allow choosing for run, the
            # created candidate jobs.
            elif node_type == 'workflow_job':
                node_obj.start()
                spawn_workflow_graph_jobs([node_obj])
                return process_graph(graph, task_capacity)

            dependent_nodes = [{'type': graph.get_node_type(node_obj), 'id': node_obj.id}] + \
                              [{'type': graph.get_node_type(n['node_object']),
                                'id': n['node_object'].id} for n in node_dependencies]
            error_handler = handle_work_error.s(subtasks=dependent_nodes)
            success_handler = handle_work_success.s(task_actual={'type': graph.get_node_type(node_obj),
                                                                 'id': node_obj.id})
            with transaction.atomic():
                start_status = node_obj.start(error_callback=error_handler, success_callback=success_handler)
                if not start_status:
                    node_obj.status = 'failed'
                    if node_obj.job_explanation:
                        node_obj.job_explanation += ' '
                    node_obj.job_explanation += 'Task failed pre-start check.'
                    node_obj.save()
                    continue
            remaining_volume -= impact
            running_impact += impact
            logger.info('Started Node: %s (capacity hit: %s) '
                        'Remaining Capacity: %s' %
                        (str(node_obj), str(impact), str(remaining_volume)))

def schedule():
    with transaction.atomic():
        # Lock
        Instance.objects.select_for_update().all()[0]

        task_capacity = get_system_task_capacity()

        workflow_jobs = get_running_workflow_jobs()
        process_finished_workflow_jobs(workflow_jobs)
        spawn_workflow_graph_jobs(workflow_jobs)

        graph = rebuild_graph()
        if graph:
            process_graph(graph, task_capacity)

    # Unlock, due to transaction ending
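schedule() relies on a row-level lock to serialize concurrent schedulers: the SELECT ... FOR UPDATE on the first Instance row blocks any other transaction doing the same until the surrounding transaction commits, so only one scheduling pass runs at a time per database. The same pattern reduced to a generic helper; the model and function names here are illustrative:

    from django.db import transaction

    def run_exclusively(do_work, lock_model):
        # Take a row lock on an existing row; other callers block on the same
        # select_for_update() until this transaction ends.
        with transaction.atomic():
            lock_model.objects.select_for_update().all()[0]
            do_work()
        # The lock is released automatically when the transaction commits.

On PostgreSQL this gives simple mutual exclusion without a separate lock service, at the cost of holding a transaction open for the whole scheduling pass.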
awx/main/scheduler/dag_simple.py (new file, 140 lines)
@@ -0,0 +1,140 @@

from awx.main.models import (
    Job,
    AdHocCommand,
    InventoryUpdate,
    ProjectUpdate,
    WorkflowJob,
    SystemJob,
)

class SimpleDAG(object):
    ''' A simple implementation of a directed acyclic graph '''

    def __init__(self):
        self.nodes = []
        self.edges = []

    def __contains__(self, obj):
        for node in self.nodes:
            if node['node_object'] == obj:
                return True
        return False

    def __len__(self):
        return len(self.nodes)

    def __iter__(self):
        return self.nodes.__iter__()

    def generate_graphviz_plot(self):
        def short_string_obj(obj):
            if type(obj) == Job:
                type_str = "Job"
            if type(obj) == AdHocCommand:
                type_str = "AdHocCommand"
            elif type(obj) == InventoryUpdate:
                type_str = "Inventory"
            elif type(obj) == ProjectUpdate:
                type_str = "Project"
            elif type(obj) == WorkflowJob:
                type_str = "Workflow"
            else:
                type_str = "Unknown"
            type_str += "%s" % str(obj.id)
            return type_str

        doc = """
        digraph g {
        rankdir = LR
        """
        for n in self.nodes:
            doc += "%s [color = %s]\n" % (
                short_string_obj(n['node_object']),
                "red" if n['node_object'].status == 'running' else "black",
            )
        for from_node, to_node, label in self.edges:
            doc += "%s -> %s [ label=\"%s\" ];\n" % (
                short_string_obj(self.nodes[from_node]['node_object']),
                short_string_obj(self.nodes[to_node]['node_object']),
                label,
            )
        doc += "}\n"
        gv_file = open('/tmp/graph.gv', 'w')
        gv_file.write(doc)
        gv_file.close()

    def add_node(self, obj, metadata=None):
        if self.find_ord(obj) is None:
            self.nodes.append(dict(node_object=obj, metadata=metadata))

    def add_edge(self, from_obj, to_obj, label=None):
        from_obj_ord = self.find_ord(from_obj)
        to_obj_ord = self.find_ord(to_obj)
        if from_obj_ord is None or to_obj_ord is None:
            raise LookupError("Object not found")
        self.edges.append((from_obj_ord, to_obj_ord, label))

    def add_edges(self, edgelist):
        for edge_pair in edgelist:
            self.add_edge(edge_pair[0], edge_pair[1], edge_pair[2])

    def find_ord(self, obj):
        for idx in range(len(self.nodes)):
            if obj == self.nodes[idx]['node_object']:
                return idx
        return None

    def get_node_type(self, obj):
        if type(obj) == Job:
            return "job"
        elif type(obj) == AdHocCommand:
            return "ad_hoc_command"
        elif type(obj) == InventoryUpdate:
            return "inventory_update"
        elif type(obj) == ProjectUpdate:
            return "project_update"
        elif type(obj) == SystemJob:
            return "system_job"
        elif type(obj) == WorkflowJob:
            return "workflow_job"
        return "unknown"

    def get_dependencies(self, obj, label=None):
        antecedents = []
        this_ord = self.find_ord(obj)
        for node, dep, lbl in self.edges:
            if label:
                if node == this_ord and lbl == label:
                    antecedents.append(self.nodes[dep])
            else:
                if node == this_ord:
                    antecedents.append(self.nodes[dep])
        return antecedents

    def get_dependents(self, obj, label=None):
        decendents = []
        this_ord = self.find_ord(obj)
        for node, dep, lbl in self.edges:
            if label:
                if dep == this_ord and lbl == label:
                    decendents.append(self.nodes[node])
            else:
                if dep == this_ord:
                    decendents.append(self.nodes[node])
        return decendents

    def get_leaf_nodes(self):
        leafs = []
        for n in self.nodes:
            if len(self.get_dependencies(n['node_object'])) < 1:
                leafs.append(n)
        return leafs

    def get_root_nodes(self):
        roots = []
        for n in self.nodes:
            if len(self.get_dependents(n['node_object'])) < 1:
                roots.append(n)
        return roots
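SimpleDAG stores nodes as dicts ({'node_object': obj, 'metadata': ...}) and edges as (from_index, to_index, label) triples, so it can be exercised with any Python objects in a Django shell with AWX settings loaded. A small illustrative session with stand-in task objects; the 'blocked_by' label is arbitrary:

    from awx.main.scheduler.dag_simple import SimpleDAG

    class FakeTask(object):
        def __init__(self, id, status='pending'):
            self.id = id
            self.status = status

    a, b, c = FakeTask(1), FakeTask(2), FakeTask(3)
    dag = SimpleDAG()
    for t in (a, b, c):
        dag.add_node(t)
    dag.add_edge(b, a, 'blocked_by')   # b depends on a
    dag.add_edge(c, a, 'blocked_by')   # c depends on a

    # a has no outgoing edges, so it is a leaf and eligible to run first.
    assert [n['node_object'] for n in dag.get_leaf_nodes()] == [a]
    # b and c have no incoming edges, so they are roots.
    assert [n['node_object'] for n in dag.get_root_nodes()] == [b, c]
    # get_dependencies(b) returns the node dicts that b points at.
    assert dag.get_dependencies(b)[0]['node_object'] is a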
awx/main/scheduler/dag_workflow.py (new file, 72 lines)
@@ -0,0 +1,72 @@

# AWX
from awx.main.scheduler.dag_simple import SimpleDAG

class WorkflowDAG(SimpleDAG):
    def __init__(self, workflow_job=None):
        super(WorkflowDAG, self).__init__()
        if workflow_job:
            self._init_graph(workflow_job)

    def _init_graph(self, workflow_job):
        workflow_nodes = workflow_job.workflow_job_nodes.all()
        for workflow_node in workflow_nodes:
            self.add_node(workflow_node)

        for node_type in ['success_nodes', 'failure_nodes', 'always_nodes']:
            for workflow_node in workflow_nodes:
                related_nodes = getattr(workflow_node, node_type).all()
                for related_node in related_nodes:
                    self.add_edge(workflow_node, related_node, node_type)

    def bfs_nodes_to_run(self):
        root_nodes = self.get_root_nodes()
        nodes = root_nodes
        nodes_found = []

        for index, n in enumerate(nodes):
            obj = n['node_object']
            job = obj.job

            if not job:
                nodes_found.append(n)
            # Job is about to run or is running. Hold our horses and wait for
            # the job to finish. We can't proceed down the graph path until we
            # have the job result.
            elif job.status not in ['failed', 'error', 'successful']:
                continue
            elif job.status in ['failed', 'error']:
                children_failed = self.get_dependencies(obj, 'failure_nodes')
                children_always = self.get_dependencies(obj, 'always_nodes')
                children_all = children_failed + children_always
                nodes.extend(children_all)
            elif job.status in ['successful']:
                children_success = self.get_dependencies(obj, 'success_nodes')
                nodes.extend(children_success)
        return [n['node_object'] for n in nodes_found]

    def is_workflow_done(self):
        root_nodes = self.get_root_nodes()
        nodes = root_nodes

        for index, n in enumerate(nodes):
            obj = n['node_object']
            job = obj.job

            if not job:
                return False
            # Job is about to run or is running. Hold our horses and wait for
            # the job to finish. We can't proceed down the graph path until we
            # have the job result.
            elif job.status not in ['failed', 'error', 'successful']:
                return False
            elif job.status in ['failed', 'error']:
                children_failed = self.get_dependencies(obj, 'failure_nodes')
                children_always = self.get_dependencies(obj, 'always_nodes')
                children_all = children_failed + children_always
                nodes.extend(children_all)
            elif job.status in ['successful']:
                children_success = self.get_dependencies(obj, 'success_nodes')
                nodes.extend(children_success)
        return True
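bfs_nodes_to_run() walks outward from the workflow's root nodes and applies a small decision table at each node: no job yet means the node should be spawned, an unfinished job halts that branch, a failed or errored job expands its failure_nodes and always_nodes edges, and a successful job expands its success_nodes edges. The same table paraphrased as a standalone helper; this function is illustrative and not part of the AWX API:

    def next_step(job_status):
        # Mirror the branch logic of WorkflowDAG.bfs_nodes_to_run().
        if job_status is None:                                # no job created yet
            return 'spawn'
        if job_status not in ('failed', 'error', 'successful'):
            return 'wait'                                     # hold this branch
        if job_status in ('failed', 'error'):
            return ['failure_nodes', 'always_nodes']          # edge labels to follow
        return ['success_nodes']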
awx/main/scheduler/tasks.py (new file, 79 lines)
@@ -0,0 +1,79 @@

# Python
import logging
import time

# Celery
from celery import task

# AWX
from awx.main.models import UnifiedJob
from awx.main.scheduler import schedule

logger = logging.getLogger('awx.main.scheduler')

# TODO: move logic to UnifiedJob model and use bind=True feature of celery.
# Would we need the request loop then? I think so. Even if we get the in-memory
# updated model, the call to schedule() may get stale data.

@task
def run_job_launch(job_id):
    # Wait for job to exist.
    # The job is created in a transaction then the message is created, but
    # the transaction may not have completed.

    # FIXME: We could generate the message in a Django signal handler.
    # OR, we could call an explicit commit in the view and then send the
    # message.

    retries = 10
    retry = 0
    while not UnifiedJob.objects.filter(id=job_id).exists():
        time.sleep(0.3)

        if retry >= retries:
            logger.error("Failed to process 'job_launch' message for job %d" % job_id)
            # ack the message so we don't build up the queue.
            #
            # The job can still be chosen to run during tower startup or
            # when another job is started or completes
            return
        retry += 1

    # "Safe" to get the job now since it exists.
    # Really, there is a race condition from exists to get

    # TODO: while not loop should call get wrapped in a try except
    #job = UnifiedJob.objects.get(id=job_id)

    schedule()

@task
def run_job_complete(job_id):
    # TODO: use list of finished status from jobs.py or unified_jobs.py
    finished_status = ['successful', 'error', 'failed', 'completed']
    q = UnifiedJob.objects.filter(id=job_id)

    # Ensure that the job is updated in the database before we call to
    # schedule the next job.
    retries = 10
    retry = 0
    while True:
        # Job not found, most likely deleted. That's fine
        if not q.exists():
            logger.warn("Failed to find job '%d' while processing 'job_complete' message. Presume that it was deleted." % job_id)
            break

        job = q[0]
        if job.status in finished_status:
            break

        time.sleep(0.3)

        if retry >= retries:
            logger.error("Expected job status '%s' to be one of '%s' while processing 'job_complete' message." % (job.status, finished_status))
            return
        retry += 1

    schedule()
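Both tasks above poll the database because the row they care about is written in a transaction that may not have committed by the time the Celery message is consumed. The same wait-until-visible pattern extracted into a generic helper for clarity; this helper is illustrative and not part of the diff:

    import time

    def wait_until(predicate, retries=10, delay=0.3):
        # Poll predicate() until it returns True or the retry budget runs out;
        # return the final truth value so callers can log and bail out.
        for _ in range(retries):
            if predicate():
                return True
            time.sleep(delay)
        return bool(predicate())

    # e.g. wait for a row created in another transaction to become visible:
    # wait_until(lambda: UnifiedJob.objects.filter(id=job_id).exists())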
awx/main/tasks.py
@@ -47,7 +47,6 @@ from django.contrib.auth.models import User
from awx.main.constants import CLOUD_PROVIDERS
from awx.main.models import * # noqa
from awx.main.models import UnifiedJob
from awx.main.queue import FifoQueue
from awx.main.task_engine import TaskEnhancer
from awx.main.utils import (get_ansible_version, get_ssh_version, decrypt_field, update_scm_url,
                            emit_websocket_notification,
@@ -57,7 +56,7 @@ __all__ = ['RunJob', 'RunSystemJob', 'RunProjectUpdate', 'RunInventoryUpdate',
           'RunAdHocCommand', 'RunWorkflowJob', 'handle_work_error',
           'handle_work_success', 'update_inventory_computed_fields',
           'send_notifications', 'run_administrative_checks',
           'run_workflow_job']
           'RunJobLaunch']

HIDDEN_PASSWORD = '**********'

@@ -177,14 +176,6 @@ def tower_periodic_scheduler(self):
            new_unified_job.socketio_emit_status("failed")
        emit_websocket_notification('/socket.io/schedules', 'schedule_changed', dict(id=schedule.id))

@task(queue='default')
def notify_task_runner(metadata_dict):
    """Add the given task into the Tower task manager's queue, to be consumed
    by the task system.
    """
    queue = FifoQueue('tower_task_manager')
    queue.push(metadata_dict)

def _send_notification_templates(instance, status_str):
    if status_str not in ['succeeded', 'failed']:
        raise ValueError("status_str must be either succeeded or failed")
@@ -200,6 +191,7 @@ def _send_notification_templates(instance, status_str):
                            for n in all_notification_templates],
                           job_id=instance.id)


@task(bind=True, queue='default')
def handle_work_success(self, result, task_actual):
    instance = UnifiedJob.get_instance_by_type(task_actual['type'], task_actual['id'])
@@ -208,6 +200,9 @@ def handle_work_success(self, result, task_actual):

    _send_notification_templates(instance, 'succeeded')

    from awx.main.scheduler.tasks import run_job_complete
    run_job_complete.delay(instance.id)

@task(bind=True, queue='default')
def handle_work_error(self, task_id, subtasks=None):
    print('Executing error task id %s, subtasks: %s' %
@@ -236,6 +231,15 @@ def handle_work_error(self, task_id, subtasks=None):

    if first_instance:
        _send_notification_templates(first_instance, 'failed')

    # We only send 1 job complete message since all the job completion message
    # handling does is trigger the scheduler. If we extend the functionality of
    # what the job complete message handler does then we may want to send a
    # completion event for each job here.
    if first_instance:
        from awx.main.scheduler.tasks import run_job_complete
        run_job_complete.delay(first_instance.id)
    pass

@task(queue='default')
def update_inventory_computed_fields(inventory_id, should_update_hosts=True):
@@ -301,10 +305,6 @@ class BaseTask(Task):
            logger.error('Failed to update %s after %d retries.',
                         self.model._meta.object_name, _attempt)

    def signal_finished(self, pk):
        pass
        # notify_task_runner(dict(complete=pk))

    def get_path_to(self, *args):
        '''
        Return absolute path relative to this file.
@@ -1662,21 +1662,30 @@ class RunSystemJob(BaseTask):
    def build_cwd(self, instance, **kwargs):
        return settings.BASE_DIR

'''
class RunWorkflowJob(BaseTask):

    name = 'awx.main.tasks.run_workflow_job'
    model = WorkflowJob

    def run(self, pk, **kwargs):
        from awx.main.management.commands.run_task_system import WorkflowDAG
        '''
        Run the job/task and capture its output.
        '''
        pass
        #Run the job/task and capture its output.
        instance = self.update_model(pk, status='running', celery_task_id=self.request.id)
        instance.socketio_emit_status("running")

        # FIXME: Detect workflow run completion
        # FIXME: Currently, the workflow job busy waits until the graph run is
        # complete. Instead, the workflow job should return or never even run,
        # because all of the "launch logic" can be done schedule().

        # However, other aspects of our system depend on a 1-1 relationship
        # between a Job and a Celery Task.
        #
        # * If we let the workflow job task (RunWorkflowJob.run()) complete
        #   then how do we trigger the handle_work_error and
        #   handle_work_success subtasks?
        #
        # * How do we handle the recovery process? (i.e. there is an entry in
        #   the database but not in celery).
        while True:
            dag = WorkflowDAG(instance)
            if dag.is_workflow_done():
@@ -1686,4 +1695,4 @@ class RunWorkflowJob(BaseTask):
            time.sleep(1)
        instance.socketio_emit_status(instance.status)
        # TODO: Handle cancel

'''
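handle_work_success and handle_work_error are attached to each job's own Celery task as link / link_error callbacks (see node_obj.start() in awx/main/scheduler/__init__.py above), and both now hand control back to the scheduler by enqueueing run_job_complete. A reduced sketch of that callback wiring in plain Celery terms; the task names and bodies here are illustrative:

    from celery import Celery

    app = Celery('sketch')

    @app.task(bind=True)
    def run_thing(self, pk):
        return pk

    @app.task(bind=True)
    def on_success(self, result, task_actual):
        # In AWX this is handle_work_success: send notifications, then
        # enqueue run_job_complete so the scheduler runs again.
        pass

    @app.task(bind=True)
    def on_error(self, task_id, subtasks=None):
        # In AWX this is handle_work_error: mark dependent jobs failed, then
        # enqueue run_job_complete once for the first failed instance.
        pass

    # link/link_error attach the callbacks to the job's task at dispatch time:
    run_thing.apply_async(
        (42,),
        link=on_success.s(task_actual={'type': 'job', 'id': 42}),
        link_error=on_error.s(subtasks=[{'type': 'job', 'id': 42}]),
    )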
@@ -12,7 +12,6 @@ import sys
import tempfile
import time
import urllib
from multiprocessing import Process
import re
import mock

@@ -30,7 +29,6 @@ from django.utils.encoding import force_text

# AWX
from awx.main.models import * # noqa
from awx.main.management.commands.run_task_system import run_taskmanager
from awx.main.task_engine import TaskEnhancer
from awx.main.utils import get_ansible_version
from awx.sso.backends import LDAPSettings
@@ -644,18 +642,6 @@ class BaseTestMixin(MockCommonlySlowTestMixin):
                         u'expected no traceback, got:\n%s' %
                         job.result_traceback)


    def start_taskmanager(self, command_port):
        self.start_redis()
        self.taskmanager_process = Process(target=run_taskmanager,
                                           args=(command_port,))
        self.taskmanager_process.start()

    def terminate_taskmanager(self):
        if hasattr(self, 'taskmanager_process'):
            self.taskmanager_process.terminate()
            self.stop_redis()

class BaseTest(BaseTestMixin, django.test.TestCase):
    '''
    Base class for unit tests.
@@ -1,10 +1,12 @@
from awx.main.management.commands.run_task_system import (
    SimpleDAG,
    WorkflowDAG,
)

# Python
import pytest

# AWX
from awx.main.scheduler.dag_simple import SimpleDAG
from awx.main.scheduler.dag_workflow import WorkflowDAG
from awx.main.models import Job
from awx.main.models.workflow import WorkflowJobNode
import pytest

@pytest.fixture
def dag_root():
@@ -348,9 +348,11 @@ CELERYD_TASK_SOFT_TIME_LIMIT = None
CELERYBEAT_SCHEDULER = 'celery.beat.PersistentScheduler'
CELERYBEAT_MAX_LOOP_INTERVAL = 60
CELERY_RESULT_BACKEND = 'djcelery.backends.database:DatabaseBackend'
CELERY_IMPORTS = ('awx.main.scheduler.tasks',)
CELERY_QUEUES = (
    Queue('default', Exchange('default'), routing_key='default'),
    Queue('jobs', Exchange('jobs'), routing_key='jobs'),
    Queue('scheduler', Exchange('scheduler', type='topic'), routing_key='scheduler.job.#', durable=False),
    # Projects use a fanout queue, this isn't super well supported
    Broadcast('projects'),
)
@@ -362,8 +364,12 @@ CELERY_ROUTES = ({'awx.main.tasks.run_job': {'queue': 'jobs',
                  'awx.main.tasks.run_ad_hoc_command': {'queue': 'jobs',
                                                        'routing_key': 'jobs'},
                  'awx.main.tasks.run_system_job': {'queue': 'jobs',
                                                    'routing_key': 'jobs'}})

                                                    'routing_key': 'jobs'},
                  'awx.main.scheduler.tasks.run_job_launch': {'queue': 'scheduler',
                                                              'routing_key': 'scheduler.job.launch'},
                  'awx.main.scheduler.tasks.run_job_complete': {'queue': 'scheduler',
                                                                'routing_key': 'scheduler.job.complete'},})

CELERYBEAT_SCHEDULE = {
    'tower_scheduler': {
        'task': 'awx.main.tasks.tower_periodic_scheduler',
@@ -755,6 +761,7 @@ ACTIVITY_STREAM_ENABLED_FOR_INVENTORY_SYNC = False
INTERNAL_API_URL = 'http://127.0.0.1:%s' % DEVSERVER_DEFAULT_PORT

CALLBACK_QUEUE = "callback_tasks"
SCHEDULER_QUEUE = "scheduler"

TASK_COMMAND_PORT = 6559

@@ -917,7 +924,11 @@ LOGGING = {
        'handlers': ['console', 'file', 'socketio_service'],
        'propagate': False
    },
    'awx.main.commands.run_task_system': {
    'awx.main.tasks': {
        'handlers': ['console', 'file', 'task_system'],
        'propagate': False
    },
    'awx.main.scheduler': {
        'handlers': ['console', 'file', 'task_system'],
        'propagate': False
    },
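With these settings, the two scheduler tasks are published to the topic exchange 'scheduler' under routing keys matched by 'scheduler.job.#', so they are only consumed by workers listening on the scheduler queue (which the Makefile change above adds to the development celeryd command). For example, the call made from UnifiedJob.signal_start():

    # Routed by CELERY_ROUTES to the 'scheduler' queue via 'scheduler.job.launch'.
    from awx.main.scheduler.tasks import run_job_launch
    run_job_launch.delay(job.id)   # job.id here stands for any unified job's id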