Replace task manager with event-driven scheduler

Chris Meyers 2016-09-22 11:20:24 -04:00
parent cc90204b0f
commit cdb65ccac9
12 changed files with 344 additions and 456 deletions
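
At a high level, this commit removes the standalone run_task_system management command (the separate make taskmanager process) and instead triggers the scheduler from ordinary Celery tasks whenever a job is launched or completes. A minimal sketch of the resulting flow, using only names that appear in the diff below (run_job_launch, run_job_complete, schedule); the wrapper functions are hypothetical stand-ins for UnifiedJob.signal_start() and the work-success/work-error handlers:

    # Sketch of the event-driven flow introduced by this commit (illustrative,
    # not part of the diff itself).
    from awx.main.scheduler import schedule  # rebuild_graph() + process_graph()
    from awx.main.scheduler.tasks import run_job_launch, run_job_complete

    def on_job_start(unified_job):
        # signal_start() now queues a Celery task instead of publishing a
        # kombu message to the scheduler topic exchange directly.
        run_job_launch.delay(unified_job.id)

    def on_job_finished(unified_job):
        # handle_work_success()/handle_work_error() queue the completion task,
        # which waits for the final status to land in the database and then
        # reschedules.
        run_job_complete.delay(unified_job.id)

    # Both tasks end by calling schedule(), which rebuilds the dependency
    # graph and starts any ready nodes within the available task capacity.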


@@ -357,7 +357,6 @@ server_noattach:
tmux rename-window 'Tower'
tmux select-window -t tower:0
tmux split-window -v 'exec make celeryd'
tmux split-window -h 'exec make taskmanager'
tmux new-window 'exec make receiver'
tmux select-window -t tower:1
tmux rename-window 'Extra Services'
@@ -397,7 +396,7 @@ celeryd:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/tower/bin/activate; \
fi; \
$(PYTHON) manage.py celeryd -l DEBUG -B --autoscale=20,3 --schedule=$(CELERY_SCHEDULE_FILE) -Q projects,jobs,default
$(PYTHON) manage.py celeryd -l DEBUG -B --autoscale=20,3 --schedule=$(CELERY_SCHEDULE_FILE) -Q projects,jobs,default,scheduler
#$(PYTHON) manage.py celery multi show projects jobs default -l DEBUG -Q:projects projects -Q:jobs jobs -Q:default default -c:projects 1 -c:jobs 3 -c:default 3 -Ofair -B --schedule=$(CELERY_SCHEDULE_FILE)
# Run to start the zeromq callback receiver
@@ -407,16 +406,6 @@ receiver:
fi; \
$(PYTHON) manage.py run_callback_receiver
taskmanager:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/tower/bin/activate; \
fi; \
if [ "$(COMPOSE_HOST)" == "tower_1" ] || [ "$(COMPOSE_HOST)" == "tower" ]; then \
$(PYTHON) manage.py run_task_system; \
else \
while true; do sleep 2; done; \
fi
socketservice:
@if [ "$(VENV_BASE)" ]; then \
. $(VENV_BASE)/tower/bin/activate; \


@@ -1,7 +1,6 @@
runserver: make runserver
celeryd: make celeryd
taskmanager: make taskmanager
receiver: make receiver
socketservice: make socketservice
factcacher: make factcacher
flower: make flower


@@ -1,374 +0,0 @@
#Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved
# Python
import os
import datetime
import logging
import signal
import time
import traceback
from kombu import Connection, Exchange, Queue, Producer
from kombu.mixins import ConsumerMixin
# Django
from django.conf import settings
from django.core.management.base import NoArgsCommand
# AWX
from awx.main.models import * # noqa
from awx.main.queue import FifoQueue
from awx.main.tasks import handle_work_error, handle_work_success
from awx.main.utils import get_system_task_capacity
from awx.main.scheduler.dag_simple import SimpleDAG
from awx.main.scheduler.dag_workflow import WorkflowDAG
# Celery
from celery.task.control import inspect
logger = logging.getLogger('awx.main.commands.run_task_system')
queue = FifoQueue('tower_task_manager')
def get_tasks():
"""Fetch all Tower tasks that are relevant to the task management
system.
"""
RELEVANT_JOBS = ('pending', 'waiting', 'running')
# TODO: Replace this when we can grab all objects in a sane way.
graph_jobs = [j for j in Job.objects.filter(status__in=RELEVANT_JOBS)]
graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(status__in=RELEVANT_JOBS)]
graph_inventory_updates = [iu for iu in
InventoryUpdate.objects.filter(status__in=RELEVANT_JOBS)]
graph_project_updates = [pu for pu in
ProjectUpdate.objects.filter(status__in=RELEVANT_JOBS)]
graph_system_jobs = [sj for sj in
SystemJob.objects.filter(status__in=RELEVANT_JOBS)]
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status__in=RELEVANT_JOBS)]
all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates +
graph_project_updates + graph_system_jobs +
graph_workflow_jobs,
key=lambda task: task.created)
print("Returning all_actions %s" % len(all_actions))
return all_actions
def get_running_workflow_jobs():
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status='running')]
return graph_workflow_jobs
def do_spawn_workflow_jobs():
workflow_jobs = get_running_workflow_jobs()
for workflow_job in workflow_jobs:
dag = WorkflowDAG(workflow_job)
spawn_nodes = dag.bfs_nodes_to_run()
for spawn_node in spawn_nodes:
# TODO: Inject job template template params as kwargs.
# Make sure to take into account extra_vars merge logic
kv = {}
job = spawn_node.unified_job_template.create_unified_job(**kv)
spawn_node.job = job
spawn_node.save()
can_start = job.signal_start(**kv)
if not can_start:
job.status = 'failed'
job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials"
job.save(update_fields=['status', 'job_explanation'])
job.socketio_emit_status("failed")
# TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ?
#emit_websocket_notification('/socket.io/jobs', '', dict(id=))
def rebuild_graph():
"""Regenerate the task graph by refreshing known tasks from Tower, purging
orphaned running tasks, and creating dependencies for new tasks before
generating directed edge relationships between those tasks.
"""
'''
# Sanity check: Only do this on the primary node.
if Instance.objects.my_role() == 'secondary':
return None
'''
inspector = inspect()
if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'):
active_task_queues = inspector.active()
else:
logger.warn("Ignoring celery task inspector")
active_task_queues = None
do_spawn_workflow_jobs()
all_sorted_tasks = get_tasks()
if not len(all_sorted_tasks):
print("All sorted task len is not? <%s, %s>" % (len(all_sorted_tasks), all_sorted_tasks))
return None
active_tasks = []
if active_task_queues is not None:
for queue in active_task_queues:
active_tasks += [at['id'] for at in active_task_queues[queue]]
else:
logger.error("Could not communicate with celery!")
# TODO: Something needs to be done here to signal to the system
# as a whole that celery appears to be down.
if not hasattr(settings, 'CELERY_UNIT_TEST'):
return None
running_tasks = filter(lambda t: t.status == 'running', all_sorted_tasks)
waiting_tasks = filter(lambda t: t.status != 'running', all_sorted_tasks)
new_tasks = filter(lambda t: t.status == 'pending', all_sorted_tasks)
# Check running tasks and make sure they are active in celery
logger.debug("Active celery tasks: " + str(active_tasks))
for task in list(running_tasks):
if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
# NOTE: Pull status again and make sure it didn't finish in
# the meantime?
task.status = 'failed'
task.job_explanation += ' '.join((
'Task was marked as running in Tower but was not present in',
'Celery, so it has been marked as failed.',
))
task.save()
task.socketio_emit_status("failed")
running_tasks.pop(running_tasks.index(task))
logger.error("Task %s appears orphaned... marking as failed" % task)
# Create and process dependencies for new tasks
for task in new_tasks:
logger.debug("Checking dependencies for: %s" % str(task))
try:
task_dependencies = task.generate_dependencies(running_tasks + waiting_tasks)
except Exception as e:
logger.error("Failed processing dependencies for {}: {}".format(task, e))
task.status = 'failed'
task.job_explanation += 'Task failed to generate dependencies: {}'.format(e)
task.save()
task.socketio_emit_status("failed")
continue
logger.debug("New dependencies: %s" % str(task_dependencies))
for dep in task_dependencies:
# We recalculate the created time for the moment to ensure the
# dependencies are always sorted in the right order relative to
# the dependent task.
time_delt = len(task_dependencies) - task_dependencies.index(dep)
dep.created = task.created - datetime.timedelta(seconds=1 + time_delt)
dep.status = 'waiting'
dep.save()
waiting_tasks.insert(waiting_tasks.index(task), dep)
if not hasattr(settings, 'UNIT_TEST_IGNORE_TASK_WAIT'):
task.status = 'waiting'
task.save()
# Rebuild graph
graph = SimpleDAG()
for task in running_tasks:
graph.add_node(task)
for wait_task in waiting_tasks[:50]:
node_dependencies = []
for node in graph:
if wait_task.is_blocked_by(node['node_object']):
node_dependencies.append(node['node_object'])
graph.add_node(wait_task)
for dependency in node_dependencies:
graph.add_edge(wait_task, dependency)
if settings.DEBUG:
graph.generate_graphviz_plot()
return graph
def process_graph(graph, task_capacity):
"""Given a task dependency graph, start and manage tasks given their
priority and weight.
"""
leaf_nodes = graph.get_leaf_nodes()
running_nodes = filter(lambda x: x['node_object'].status == 'running', leaf_nodes)
running_impact = sum([t['node_object'].task_impact for t in running_nodes])
ready_nodes = filter(lambda x: x['node_object'].status != 'running', leaf_nodes)
remaining_volume = task_capacity - running_impact
logger.info('Running Nodes: %s; Capacity: %s; Running Impact: %s; '
'Remaining Capacity: %s' %
(str(running_nodes), str(task_capacity),
str(running_impact), str(remaining_volume)))
logger.info("Ready Nodes: %s" % str(ready_nodes))
for task_node in ready_nodes:
node_obj = task_node['node_object']
# NOTE: This could be used to pass metadata through the task system
# node_args = task_node['metadata']
impact = node_obj.task_impact
if impact <= remaining_volume or running_impact == 0:
node_dependencies = graph.get_dependents(node_obj)
# Allow other tasks to continue if a job fails, even if they are
# other jobs.
if graph.get_node_type(node_obj) == 'job':
node_dependencies = []
dependent_nodes = [{'type': graph.get_node_type(node_obj), 'id': node_obj.id}] + \
[{'type': graph.get_node_type(n['node_object']),
'id': n['node_object'].id} for n in node_dependencies]
error_handler = handle_work_error.s(subtasks=dependent_nodes)
success_handler = handle_work_success.s(task_actual={'type': graph.get_node_type(node_obj),
'id': node_obj.id})
start_status = node_obj.start(error_callback=error_handler, success_callback=success_handler)
if not start_status:
node_obj.status = 'failed'
if node_obj.job_explanation:
node_obj.job_explanation += ' '
node_obj.job_explanation += 'Task failed pre-start check.'
node_obj.save()
continue
remaining_volume -= impact
running_impact += impact
logger.info('Started Node: %s (capacity hit: %s) '
'Remaining Capacity: %s' %
(str(node_obj), str(impact), str(remaining_volume)))
#logger = logging.getLogger('awx.main.scheduler')
class CallbackBrokerWorker(ConsumerMixin):
def __init__(self, connection):
self.connection = connection
def get_consumers(self, Consumer, channel):
print("get_consumers() OK")
return [Consumer(queues=[Queue(settings.SCHEDULER_QUEUE,
Exchange(settings.SCHEDULER_QUEUE, type='topic'),
routing_key='scheduler.job.launch'),],
accept=['json'],
callbacks=[self.process_job_launch,]),
Consumer(queues=[Queue(settings.SCHEDULER_QUEUE,
Exchange(settings.SCHEDULER_QUEUE, type='topic'),
routing_key='scheduler.job.complete'),],
accept=['json'],
callbacks=[self.process_job_complete,]
)]
def schedule(self):
task_capacity = get_system_task_capacity()
graph = rebuild_graph()
if graph:
process_graph(graph, task_capacity)
def process_job_msg(self, body, message):
try:
if settings.DEBUG:
logger.info("Body: {}".format(body))
logger.info("Message: {}".format(message))
if "msg_type" not in body:
raise Exception("Payload does not have a msg_type")
if "job_id" not in body:
raise Exception("Payload does not have a job_id")
func = getattr(self, "process_%s" % body['msg_type'], None)
if not func:
raise AttributeError("No processor for message type %s" % body['msg_type'])
func(body)
# Raised by processors when msg isn't in the expected form.
except LookupError as e:
logger.error(e)
except AttributeError as e:
logger.error(e)
except Exception as exc:
import traceback
traceback.print_exc()
logger.error('Callback Task Processor Raised Exception: %r', exc)
finally:
message.ack()
self.schedule()
def process_job_launch(self, body, message):
print("process_job_launch()")
if "job_id" not in body:
raise KeyError("Payload does not contain job_id")
'''
Wait for job to exist.
The job is created in a transaction then the message is created, but
the transaction may not have completed.
FIXME: We could generate the message in a Django signal handler.
OR, we could call an explicit commit in the view and then send the
message.
'''
retries = 10
retry = 0
while not UnifiedJob.objects.filter(id=body['job_id']).exists():
time.sleep(0.3)
if retry >= retries:
logger.error("Failed to process 'job_launch' message for job %d" % body['job_id'])
# ack the message so we don't build up the queue.
#
# The job can still be chosen to run during tower startup or
# when another job is started or completes
message.ack()
return
retry += 1
job = UnifiedJob.objects.get(id=body['job_id'])
self.schedule()
message.ack()
def process_job_complete(self, body, message):
print("process_job_complete()")
if "job_id" not in body:
raise KeyError("Payload does not contain job_id")
# TODO: use list of finished status from jobs.py or unified_jobs.py
finished_status = ['successful', 'error', 'failed', 'completed']
q = UnifiedJob.objects.filter(id=body['job_id'])
# Ensure that the job is updated in the database before we call to
# schedule the next job.
retries = 10
retry = 0
while True:
# Job not found, most likely deleted. That's fine
if not q.exists():
logger.warn("Failed to find job '%d' while processing 'job_complete' message. Presume that it was deleted." % body['job_id'])
break
job = q[0]
if job.status in finished_status:
break
time.sleep(0.3)
if retry >= retries:
logger.error("Expected job status '%s' to be one of '%s' while processing 'job_complete' message." % (job.status, finished_status))
message.ack()
return
retry += 1
message.ack()
self.schedule()
class Command(NoArgsCommand):
"""Tower Task Management System
This daemon is designed to reside between our tasks and celery and
provide a mechanism for understanding the relationship between those tasks
and their dependencies.
It also actively prevents situations in which Tower can get blocked
because it doesn't have an understanding of what is progressing through
celery.
"""
help = 'Launch the Tower task management system'
def handle_noargs(self, **options):
with Connection(settings.BROKER_URL) as conn:
try:
worker = CallbackBrokerWorker(conn)
worker.run()
except KeyboardInterrupt:
print('Terminating Task Management System')


@@ -852,16 +852,8 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
self.update_fields(start_args=json.dumps(kwargs), status='pending')
self.socketio_emit_status("pending")
from kombu import Connection, Exchange, Producer
connection = Connection(settings.BROKER_URL)
exchange = Exchange(settings.SCHEDULER_QUEUE, type='topic')
producer = Producer(connection)
producer.publish({ 'msg_type': 'job_launch', 'job_id': self.id },
serializer='json',
compression='bzip2',
exchange=exchange,
declare=[exchange],
routing_key='scheduler.job.launch')
from awx.main.scheduler.tasks import run_job_launch
run_job_launch.delay(self.id)
# Each type of unified job has a different Task class; get the
# appropriate one.


@@ -0,0 +1,222 @@
#Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved
# Python
import datetime
import logging
# Django
from django.conf import settings
# AWX
from awx.main.models import * # noqa
from awx.main.tasks import handle_work_error, handle_work_success
from awx.main.utils import get_system_task_capacity
from awx.main.scheduler.dag_simple import SimpleDAG
from awx.main.scheduler.dag_workflow import WorkflowDAG
# Celery
from celery.task.control import inspect
logger = logging.getLogger('awx.main.scheduler')
def get_tasks():
"""Fetch all Tower tasks that are relevant to the task management
system.
"""
RELEVANT_JOBS = ('pending', 'waiting', 'running')
# TODO: Replace this when we can grab all objects in a sane way.
graph_jobs = [j for j in Job.objects.filter(status__in=RELEVANT_JOBS)]
graph_ad_hoc_commands = [ahc for ahc in AdHocCommand.objects.filter(status__in=RELEVANT_JOBS)]
graph_inventory_updates = [iu for iu in
InventoryUpdate.objects.filter(status__in=RELEVANT_JOBS)]
graph_project_updates = [pu for pu in
ProjectUpdate.objects.filter(status__in=RELEVANT_JOBS)]
graph_system_jobs = [sj for sj in
SystemJob.objects.filter(status__in=RELEVANT_JOBS)]
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status__in=RELEVANT_JOBS)]
all_actions = sorted(graph_jobs + graph_ad_hoc_commands + graph_inventory_updates +
graph_project_updates + graph_system_jobs +
graph_workflow_jobs,
key=lambda task: task.created)
return all_actions
def get_running_workflow_jobs():
graph_workflow_jobs = [wf for wf in
WorkflowJob.objects.filter(status='running')]
return graph_workflow_jobs
def do_spawn_workflow_jobs():
workflow_jobs = get_running_workflow_jobs()
for workflow_job in workflow_jobs:
dag = WorkflowDAG(workflow_job)
spawn_nodes = dag.bfs_nodes_to_run()
for spawn_node in spawn_nodes:
# TODO: Inject job template template params as kwargs.
# Make sure to take into account extra_vars merge logic
kv = {}
job = spawn_node.unified_job_template.create_unified_job(**kv)
spawn_node.job = job
spawn_node.save()
can_start = job.signal_start(**kv)
if not can_start:
job.status = 'failed'
job.job_explanation = "Workflow job could not start because it was not in the right state or required manual credentials"
job.save(update_fields=['status', 'job_explanation'])
job.socketio_emit_status("failed")
# TODO: should we emit a status on the socket here similar to tasks.py tower_periodic_scheduler() ?
#emit_websocket_notification('/socket.io/jobs', '', dict(id=))
def rebuild_graph():
"""Regenerate the task graph by refreshing known tasks from Tower, purging
orphaned running tasks, and creating dependencies for new tasks before
generating directed edge relationships between those tasks.
"""
'''
# Sanity check: Only do this on the primary node.
if Instance.objects.my_role() == 'secondary':
return None
'''
inspector = inspect()
if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'):
active_task_queues = inspector.active()
else:
logger.warn("Ignoring celery task inspector")
active_task_queues = None
do_spawn_workflow_jobs()
all_sorted_tasks = get_tasks()
if not len(all_sorted_tasks):
return None
active_tasks = []
if active_task_queues is not None:
for queue in active_task_queues:
active_tasks += [at['id'] for at in active_task_queues[queue]]
else:
logger.error("Could not communicate with celery!")
# TODO: Something needs to be done here to signal to the system
# as a whole that celery appears to be down.
if not hasattr(settings, 'CELERY_UNIT_TEST'):
return None
running_tasks = filter(lambda t: t.status == 'running', all_sorted_tasks)
waiting_tasks = filter(lambda t: t.status != 'running', all_sorted_tasks)
new_tasks = filter(lambda t: t.status == 'pending', all_sorted_tasks)
# Check running tasks and make sure they are active in celery
logger.debug("Active celery tasks: " + str(active_tasks))
for task in list(running_tasks):
if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
# NOTE: Pull status again and make sure it didn't finish in
# the meantime?
task.status = 'failed'
task.job_explanation += ' '.join((
'Task was marked as running in Tower but was not present in',
'Celery, so it has been marked as failed.',
))
task.save()
task.socketio_emit_status("failed")
running_tasks.pop(running_tasks.index(task))
logger.error("Task %s appears orphaned... marking as failed" % task)
# Create and process dependencies for new tasks
for task in new_tasks:
logger.debug("Checking dependencies for: %s" % str(task))
try:
task_dependencies = task.generate_dependencies(running_tasks + waiting_tasks)
except Exception as e:
logger.error("Failed processing dependencies for {}: {}".format(task, e))
task.status = 'failed'
task.job_explanation += 'Task failed to generate dependencies: {}'.format(e)
task.save()
task.socketio_emit_status("failed")
continue
logger.debug("New dependencies: %s" % str(task_dependencies))
for dep in task_dependencies:
# We recalculate the created time for the moment to ensure the
# dependencies are always sorted in the right order relative to
# the dependent task.
time_delt = len(task_dependencies) - task_dependencies.index(dep)
dep.created = task.created - datetime.timedelta(seconds=1 + time_delt)
dep.status = 'waiting'
dep.save()
waiting_tasks.insert(waiting_tasks.index(task), dep)
if not hasattr(settings, 'UNIT_TEST_IGNORE_TASK_WAIT'):
task.status = 'waiting'
task.save()
# Rebuild graph
graph = SimpleDAG()
for task in running_tasks:
graph.add_node(task)
for wait_task in waiting_tasks[:50]:
node_dependencies = []
for node in graph:
if wait_task.is_blocked_by(node['node_object']):
node_dependencies.append(node['node_object'])
graph.add_node(wait_task)
for dependency in node_dependencies:
graph.add_edge(wait_task, dependency)
if settings.DEBUG:
graph.generate_graphviz_plot()
return graph
def process_graph(graph, task_capacity):
"""Given a task dependency graph, start and manage tasks given their
priority and weight.
"""
leaf_nodes = graph.get_leaf_nodes()
running_nodes = filter(lambda x: x['node_object'].status == 'running', leaf_nodes)
running_impact = sum([t['node_object'].task_impact for t in running_nodes])
ready_nodes = filter(lambda x: x['node_object'].status != 'running', leaf_nodes)
remaining_volume = task_capacity - running_impact
logger.info('Running Nodes: %s; Capacity: %s; Running Impact: %s; '
'Remaining Capacity: %s' %
(str(running_nodes), str(task_capacity),
str(running_impact), str(remaining_volume)))
logger.info("Ready Nodes: %s" % str(ready_nodes))
for task_node in ready_nodes:
node_obj = task_node['node_object']
# NOTE: This could be used to pass metadata through the task system
# node_args = task_node['metadata']
impact = node_obj.task_impact
if impact <= remaining_volume or running_impact == 0:
node_dependencies = graph.get_dependents(node_obj)
# Allow other tasks to continue if a job fails, even if they are
# other jobs.
if graph.get_node_type(node_obj) == 'job':
node_dependencies = []
dependent_nodes = [{'type': graph.get_node_type(node_obj), 'id': node_obj.id}] + \
[{'type': graph.get_node_type(n['node_object']),
'id': n['node_object'].id} for n in node_dependencies]
error_handler = handle_work_error.s(subtasks=dependent_nodes)
success_handler = handle_work_success.s(task_actual={'type': graph.get_node_type(node_obj),
'id': node_obj.id})
start_status = node_obj.start(error_callback=error_handler, success_callback=success_handler)
if not start_status:
node_obj.status = 'failed'
if node_obj.job_explanation:
node_obj.job_explanation += ' '
node_obj.job_explanation += 'Task failed pre-start check.'
node_obj.save()
continue
remaining_volume -= impact
running_impact += impact
logger.info('Started Node: %s (capacity hit: %s) '
'Remaining Capacity: %s' %
(str(node_obj), str(impact), str(remaining_volume)))
def schedule():
task_capacity = get_system_task_capacity()
graph = rebuild_graph()
if graph:
process_graph(graph, task_capacity)


@@ -1,5 +1,12 @@
from awx.main.models import * # noqa
from awx.main.models import (
Job,
AdHocCommand,
InventoryUpdate,
ProjectUpdate,
WorkflowJob,
SystemJob,
)
class SimpleDAG(object):
''' A simple implementation of a directed acyclic graph '''


@@ -1,4 +1,6 @@
from dag_simple import SimpleDAG
# AWX
from awx.main.scheduler.dag_simple import SimpleDAG
class WorkflowDAG(SimpleDAG):
def __init__(self, workflow_job=None):
@@ -41,8 +43,6 @@ class WorkflowDAG(SimpleDAG):
elif job.status in ['successful']:
children_success = self.get_dependencies(obj, 'success_nodes')
nodes.extend(children_success)
else:
logger.warn("Incorrect graph structure")
return [n['node_object'] for n in nodes_found]
def is_workflow_done(self):
@@ -68,7 +68,5 @@ class WorkflowDAG(SimpleDAG):
elif job.status in ['successful']:
children_success = self.get_dependencies(obj, 'success_nodes')
nodes.extend(children_success)
else:
logger.warn("Incorrect graph structure")
return True


@@ -0,0 +1,79 @@
# Python
import logging
import time
# Celery
from celery import task
# AWX
from awx.main.models import UnifiedJob
from awx.main.scheduler import schedule
logger = logging.getLogger('awx.main.scheduler')
# TODO: move logic to UnifiedJob model and use bind=True feature of celery.
# Would we need the request loop then? I think so. Even if we get the in-memory
# updated model, the call to schedule() may get stale data.
@task
def run_job_launch(job_id):
# Wait for job to exist.
# The job is created in a transaction then the message is created, but
# the transaction may not have completed.
# FIXME: We could generate the message in a Django signal handler.
# OR, we could call an explicit commit in the view and then send the
# message.
retries = 10
retry = 0
while not UnifiedJob.objects.filter(id=job_id).exists():
time.sleep(0.3)
if retry >= retries:
logger.error("Failed to process 'job_launch' message for job %d" % job_id)
# ack the message so we don't build up the queue.
#
# The job can still be chosen to run during tower startup or
# when another job is started or completes
return
retry += 1
# "Safe" to get the job now since it exists.
# Really, there is a race condition from exists to get
# TODO: while not loop should call get wrapped in a try except
#job = UnifiedJob.objects.get(id=job_id)
schedule()
@task
def run_job_complete(job_id):
# TODO: use list of finished status from jobs.py or unified_jobs.py
finished_status = ['successful', 'error', 'failed', 'completed']
q = UnifiedJob.objects.filter(id=job_id)
# Ensure that the job is updated in the database before we call to
# schedule the next job.
retries = 10
retry = 0
while True:
# Job not found, most likely deleted. That's fine
if not q.exists():
logger.warn("Failed to find job '%d' while processing 'job_complete' message. Presume that it was deleted." % job_id)
break
job = q[0]
if job.status in finished_status:
break
time.sleep(0.3)
if retry >= retries:
logger.error("Expected job status '%s' to be one of '%s' while processing 'job_complete' message." % (job.status, finished_status))
return
retry += 1
schedule()
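
The retry loop in run_job_launch above works around the race the comment describes: the job row may not be committed yet when the task fires. A minimal sketch of one way to realize the FIXME's idea of sending the message only after the data is committed, assuming a Django version that provides transaction.on_commit (1.9+); this commit keeps the poll-and-retry approach instead:

    # Hypothetical alternative to the retry loop: only queue the scheduler
    # task once the transaction that created the job has committed.
    # Assumes django.db.transaction.on_commit is available (Django 1.9+).
    from django.db import transaction

    from awx.main.scheduler.tasks import run_job_launch

    def queue_job_launch_after_commit(unified_job):
        transaction.on_commit(lambda: run_job_launch.delay(unified_job.id))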


@@ -31,9 +31,6 @@ except:
# Pexpect
import pexpect
# Kombu
from kombu import Connection, Exchange, Queue, Producer
# Celery
from celery import Task, task
from celery.signals import celeryd_init
@@ -50,18 +47,18 @@ from django.contrib.auth.models import User
from awx.main.constants import CLOUD_PROVIDERS
from awx.main.models import * # noqa
from awx.main.models import UnifiedJob
from awx.main.queue import FifoQueue
from awx.main.conf import tower_settings
from awx.main.task_engine import TaskSerializer, TASK_TIMEOUT_INTERVAL
from awx.main.utils import (get_ansible_version, get_ssh_version, decrypt_field, update_scm_url,
emit_websocket_notification,
check_proot_installed, build_proot_temp_dir, wrap_args_with_proot)
from awx.main.scheduler.dag_workflow import WorkflowDAG
__all__ = ['RunJob', 'RunSystemJob', 'RunProjectUpdate', 'RunInventoryUpdate',
'RunAdHocCommand', 'RunWorkflowJob', 'handle_work_error',
'handle_work_success', 'update_inventory_computed_fields',
'send_notifications', 'run_administrative_checks',
'run_workflow_job']
'RunJobLaunch']
HIDDEN_PASSWORD = '**********'
@@ -182,14 +179,6 @@ def tower_periodic_scheduler(self):
new_unified_job.socketio_emit_status("failed")
emit_websocket_notification('/socket.io/schedules', 'schedule_changed', dict(id=schedule.id))
@task(queue='default')
def notify_task_runner(metadata_dict):
"""Add the given task into the Tower task manager's queue, to be consumed
by the task system.
"""
queue = FifoQueue('tower_task_manager')
queue.push(metadata_dict)
def _send_notification_templates(instance, status_str):
if status_str not in ['succeeded', 'failed']:
raise ValueError("status_str must be either succeeded or failed")
@@ -206,17 +195,6 @@ def _send_notification_templates(instance, status_str):
job_id=instance.id)
def _send_job_complete_msg(instance):
connection = Connection(settings.BROKER_URL)
exchange = Exchange(settings.SCHEDULER_QUEUE, type='topic')
producer = Producer(connection)
producer.publish({ 'job_id': instance.id, 'msg_type': 'job_complete' },
serializer='json',
compression='bzip2',
exchange=exchange,
declare=[exchange],
routing_key='scheduler.job.complete')
@task(bind=True, queue='default')
def handle_work_success(self, result, task_actual):
instance = UnifiedJob.get_instance_by_type(task_actual['type'], task_actual['id'])
@@ -225,7 +203,8 @@ def handle_work_success(self, result, task_actual):
_send_notification_templates(instance, 'succeeded')
_send_job_complete_msg(instance)
from awx.main.scheduler.tasks import run_job_complete
run_job_complete.delay(instance.id)
@task(bind=True, queue='default')
def handle_work_error(self, task_id, subtasks=None):
@@ -256,8 +235,14 @@ def handle_work_error(self, task_id, subtasks=None):
if first_instance:
_send_notification_templates(first_instance, 'failed')
# We only send 1 job complete message since all the job completion message
# handling does is trigger the scheduler. If we extend the functionality of
# what the job complete message handler does then we may want to send a
# completion event for each job here.
if first_instance:
_send_job_complete_msg(first_instance)
from awx.main.scheduler.tasks import run_job_complete
run_job_complete.delay(first_instance.id)
pass
@task(queue='default')
def update_inventory_computed_fields(inventory_id, should_update_hosts=True):
@@ -323,10 +308,6 @@ class BaseTask(Task):
logger.error('Failed to update %s after %d retries.',
self.model._meta.object_name, _attempt)
def signal_finished(self, pk):
pass
# notify_task_runner(dict(complete=pk))
def get_path_to(self, *args):
'''
Return absolute path relative to this file.
@@ -1690,7 +1671,7 @@ class RunWorkflowJob(BaseTask):
model = WorkflowJob
def run(self, pk, **kwargs):
from awx.main.management.commands.run_task_system import WorkflowDAG
print("I'm a running a workflow job")
'''
Run the job/task and capture its output.
'''


@@ -30,7 +30,7 @@ from django.utils.encoding import force_text
# AWX
from awx.main.models import * # noqa
from awx.main.management.commands.run_task_system import run_taskmanager
from awx.main.management.commands.run_callback_receiver import CallbackReceiver
from awx.main.utils import get_ansible_version
from awx.main.task_engine import TaskEngager as LicenseWriter
from awx.sso.backends import LDAPSettings
@@ -654,18 +654,6 @@ class BaseTestMixin(MockCommonlySlowTestMixin):
u'expected no traceback, got:\n%s' %
job.result_traceback)
def start_taskmanager(self, command_port):
self.start_redis()
self.taskmanager_process = Process(target=run_taskmanager,
args=(command_port,))
self.taskmanager_process.start()
def terminate_taskmanager(self):
if hasattr(self, 'taskmanager_process'):
self.taskmanager_process.terminate()
self.stop_redis()
class BaseTest(BaseTestMixin, django.test.TestCase):
'''
Base class for unit tests.


@@ -1,10 +1,12 @@
from awx.main.management.commands.run_task_system import (
SimpleDAG,
WorkflowDAG,
)
# Python
import pytest
# AWX
from awx.main.scheduler.dag_simple import SimpleDAG
from awx.main.scheduler.dag_workflow import WorkflowDAG
from awx.main.models import Job
from awx.main.models.workflow import WorkflowJobNode
import pytest
@pytest.fixture
def dag_root():


@@ -339,10 +339,11 @@ CELERYD_TASK_SOFT_TIME_LIMIT = None
CELERYBEAT_SCHEDULER = 'celery.beat.PersistentScheduler'
CELERYBEAT_MAX_LOOP_INTERVAL = 60
CELERY_RESULT_BACKEND = 'djcelery.backends.database:DatabaseBackend'
CELERY_IMPORTS = ('awx.main.scheduler.tasks',)
CELERY_QUEUES = (
Queue('default', Exchange('default'), routing_key='default'),
Queue('jobs', Exchange('jobs'), routing_key='jobs'),
#Queue('scheduler', Exchange('scheduler'), routing_key='scheduler.job.#'),
Queue('scheduler', Exchange('scheduler', type='topic'), routing_key='scheduler.job.#'),
# Projects use a fanout queue, this isn't super well supported
Broadcast('projects'),
)
@@ -354,7 +355,11 @@ CELERY_ROUTES = ({'awx.main.tasks.run_job': {'queue': 'jobs',
'awx.main.tasks.run_ad_hoc_command': {'queue': 'jobs',
'routing_key': 'jobs'},
'awx.main.tasks.run_system_job': {'queue': 'jobs',
'routing_key': 'jobs'}})
'routing_key': 'jobs'},
'awx.main.scheduler.tasks.run_job_launch': {'queue': 'scheduler',
'routing_key': 'scheduler.job.launch'},
'awx.main.scheduler.tasks.run_job_complete': {'queue': 'scheduler',
'routing_key': 'scheduler.job.complete'},})
CELERYBEAT_SCHEDULE = {
'tower_scheduler': {
@@ -1040,7 +1045,7 @@ LOGGING = {
'handlers': ['console', 'file', 'socketio_service'],
'propagate': False
},
'awx.main.commands.run_task_system': {
'awx.main.tasks': {
'handlers': ['console', 'file', 'task_system'],
'propagate': False
},
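
For context on the routing added above: the new 'scheduler' Celery queue is declared against a topic exchange with binding key scheduler.job.#, which matches both scheduler.job.launch and scheduler.job.complete, and the celeryd Makefile target now consumes that queue. A sketch of the equivalent explicit dispatch, for illustration only (the commit relies on CELERY_ROUTES rather than per-call options, and the job id below is a placeholder):

    # Illustration only: explicit per-call routing that lands on the same
    # 'scheduler' queue the CELERY_ROUTES entries select automatically.
    from awx.main.scheduler.tasks import run_job_launch, run_job_complete

    run_job_launch.apply_async(args=[42], queue='scheduler',   # 42: placeholder job id
                               routing_key='scheduler.job.launch')
    run_job_complete.apply_async(args=[42], queue='scheduler',
                                 routing_key='scheduler.job.complete')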