Mirror of https://github.com/ansible/awx.git

replace celery task decorators with a kombu-based publisher

This commit implements the bulk of `awx-manage run_dispatcher`, a new command that binds to RabbitMQ via kombu and balances messages across a pool of workers similar in spirit to celeryd workers. Specifically, this includes:

- a new decorator, `awx.main.dispatch.task`, which can be used to decorate functions or classes so that they can be designated as "Tasks"
- support for fanout/broadcast tasks (at this point in time, only `conf.Setting` memcached flushes use this functionality)
- support for job reaping
- support for success/failure hooks for job runs (i.e., `handle_work_success` and `handle_work_error`)
- support for an auto-scaling worker pool that scales processes up and down on demand
- minimal support for RPC, such as status checks and pool recycle/reload
@@ -12,14 +12,6 @@ __version__ = get_distribution('awx').version

 __all__ = ['__version__']

-# Isolated nodes do not have celery installed
-try:
-    from .celery import app as celery_app  # noqa
-    __all__.append('celery_app')
-except ImportError:
-    pass

 # Check for the presence/absence of "devonly" module to determine if running
 # from a source code checkout or release package.
 try:
@@ -3318,7 +3318,7 @@ class JobTemplateCallback(GenericAPIView):
         with transaction.atomic():
             job = job_template.create_job(**kv)

-        # Send a signal to celery that the job should be started.
+        # Send a signal to signify that the job should be started.
         result = job.signal_start(inventory_sources_already_updated=inventory_sources_already_updated)
         if not result:
             data = dict(msg=_('Error starting job!'))
@@ -101,7 +101,9 @@ class UnifiedJobDeletionMixin(object):

 class InstanceGroupMembershipMixin(object):
     '''
-    Manages signaling celery to reload its queue configuration on Instance Group membership changes
+    This mixin overloads attach/detach so that it calls InstanceGroup.save(),
+    triggering a background recalculation of policy-based instance group
+    membership.
     '''
     def attach(self, request, *args, **kwargs):
         response = super(InstanceGroupMembershipMixin, self).attach(request, *args, **kwargs)
(deleted file)
@@ -1,25 +0,0 @@
# Copyright (c) 2017 Ansible, Inc.
# All Rights Reserved.

from __future__ import absolute_import, unicode_literals

import os
from celery import Celery
from django.conf import settings  # noqa


try:
    import awx.devonly  # noqa
    MODE = 'development'
except ImportError:  # pragma: no cover
    MODE = 'production'

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'awx.settings.%s' % MODE)

app = Celery('awx')
app.config_from_object('django.conf:settings')
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)

if __name__ == '__main__':
    app.start()

(new file)
@@ -0,0 +1,5 @@
from django.conf import settings


def get_local_queuename():
    return settings.CLUSTER_HOST_ID.encode('utf-8')
awx/main/dispatch/control.py (new file, 60 lines)
@@ -0,0 +1,60 @@
import logging
import socket

from django.conf import settings

from awx.main.dispatch import get_local_queuename
from kombu import Connection, Queue, Exchange, Producer, Consumer

logger = logging.getLogger('awx.main.dispatch')


class Control(object):

    services = ('dispatcher', 'callback_receiver')
    result = None

    def __init__(self, service):
        if service not in self.services:
            raise RuntimeError('{} must be in {}'.format(service, self.services))
        self.service = service
        queuename = get_local_queuename()
        self.queue = Queue(queuename, Exchange(queuename), routing_key=queuename)

    def publish(self, msg, conn, host, **kwargs):
        producer = Producer(
            exchange=self.queue.exchange,
            channel=conn,
            routing_key=get_local_queuename()
        )
        producer.publish(msg, expiration=5, **kwargs)

    def status(self, *args, **kwargs):
        return self.control_with_reply('status', *args, **kwargs)

    def running(self, *args, **kwargs):
        return self.control_with_reply('running', *args, **kwargs)

    def control_with_reply(self, command, host=None, timeout=5):
        host = host or settings.CLUSTER_HOST_ID
        logger.warn('checking {} {} for {}'.format(self.service, command, host))
        reply_queue = Queue(name="amq.rabbitmq.reply-to")
        self.result = None
        with Connection(settings.BROKER_URL) as conn:
            with Consumer(conn, reply_queue, callbacks=[self.process_message], no_ack=True):
                self.publish({'control': command}, conn, host, reply_to='amq.rabbitmq.reply-to')
                try:
                    conn.drain_events(timeout=timeout)
                except socket.timeout:
                    logger.error('{} did not reply within {}s'.format(self.service, timeout))
                    raise
        return self.result

    def control(self, msg, host=None, **kwargs):
        host = host or settings.CLUSTER_HOST_ID
        with Connection(settings.BROKER_URL) as conn:
            self.publish(msg, conn, host)

    def process_message(self, body, message):
        self.result = body
        message.ack()
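The status/running/reload RPC mentioned in the commit message rides on this class. A minimal usage sketch (assumes a reachable RabbitMQ broker at settings.BROKER_URL and a dispatcher consuming the local node's queue; `awx-manage run_dispatcher --status` below does exactly this):

    from awx.main.dispatch.control import Control

    ctl = Control('dispatcher')
    print(ctl.status())                  # request/reply via amq.rabbitmq.reply-to
    print(ctl.running())                 # UUIDs of tasks managed by the local pool
    ctl.control({'control': 'reload'})   # fire-and-forget: recycle worker processes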
awx/main/dispatch/pool.py
@@ -1,81 +1,260 @@
import errno
import logging
import os
import signal
import random
import traceback
from uuid import uuid4

import collections
from multiprocessing import Process
from multiprocessing import Queue as MPQueue
-from Queue import Full as QueueFull
+from Queue import Full as QueueFull, Empty as QueueEmpty

from django.conf import settings
from django.db import connection as django_connection
from django.core.cache import cache as django_cache
from jinja2 import Template
import psutil

from awx.main.models import UnifiedJob
from awx.main.dispatch import reaper

logger = logging.getLogger('awx.main.dispatch')


-def signame(sig):
-    return dict(
-        (k, v) for v, k in signal.__dict__.items()
-        if v.startswith('SIG') and not v.startswith('SIG_')
-    )[sig]


class PoolWorker(object):
    '''
    Used to track a worker child process and its pending and finished messages.

    This class makes use of two distinct multiprocessing.Queues to track state:

    - self.queue: this is a queue which represents pending messages that should
                  be handled by this worker process; as new AMQP messages come
                  in, a pool will put() them into this queue; the child
                  process that is forked will get() from this queue and handle
                  received messages in an endless loop
    - self.finished: this is a queue which the worker process uses to signal
                     that it has finished processing a message

    When a message is put() onto this worker, it is tracked in
    self.managed_tasks.

    Periodically, the worker will call .calculate_managed_tasks(), which will
    cause messages in self.finished to be removed from self.managed_tasks.

    In this way, self.managed_tasks represents a view of the messages assigned
    to a specific process. The message at [0] is the least-recently inserted
    message, and it represents what the worker is running _right now_
    (self.current_task).

    A worker is "busy" when it has at least one message in self.managed_tasks.
    It is "idle" when self.managed_tasks is empty.
    '''

    def __init__(self, queue_size, target, args):
        self.messages_sent = 0
        self.messages_finished = 0
        self.managed_tasks = collections.OrderedDict()
        self.finished = MPQueue(queue_size)
        self.queue = MPQueue(queue_size)
        self.process = Process(target=target, args=(self.queue, self.finished) + args)
        self.process.daemon = True

    def start(self):
        self.process.start()

    def put(self, body):
        uuid = '?'
        if isinstance(body, dict):
            if not body.get('uuid'):
                body['uuid'] = str(uuid4())
            uuid = body['uuid']
        logger.debug('delivered {} to worker[{}] qsize {}'.format(
            uuid, self.pid, self.qsize
        ))
        self.managed_tasks[uuid] = body
        self.queue.put(body, block=True, timeout=5)
        self.messages_sent += 1
        self.calculate_managed_tasks()

    def quit(self):
        '''
        Send a special control message to the worker that tells it to exit
        gracefully.
        '''
        self.queue.put('QUIT')

    @property
    def pid(self):
        return self.process.pid

    @property
    def qsize(self):
        return self.queue.qsize()

    @property
    def alive(self):
        return self.process.is_alive()

    @property
    def mb(self):
        if self.alive:
            return '{:0.3f}'.format(
                psutil.Process(self.pid).memory_info().rss / 1024.0 / 1024.0
            )
        return '0'

    @property
    def exitcode(self):
        return str(self.process.exitcode)

    def calculate_managed_tasks(self):
        # look to see if any tasks were finished
        finished = []
        for _ in range(self.finished.qsize()):
            try:
                finished.append(self.finished.get(block=False))
            except QueueEmpty:
                break  # qsize is not always _totally_ up to date

        # if any tasks were finished, remove them from the managed tasks for
        # this worker
        for uuid in finished:
            self.messages_finished += 1
            del self.managed_tasks[uuid]

    @property
    def current_task(self):
        self.calculate_managed_tasks()
        # the task at [0] is the one that's running right now (or is about to
        # be running)
        if len(self.managed_tasks):
            return self.managed_tasks[self.managed_tasks.keys()[0]]

        return None

    @property
    def orphaned_tasks(self):
        orphaned = []
        if not self.alive:
            # if this process had a running task that never finished,
            # requeue its error callbacks
            current_task = self.current_task
            if isinstance(current_task, dict):
                orphaned.extend(current_task.get('errbacks', []))

            # if this process has any pending messages, requeue them
            for _ in range(self.qsize):
                try:
                    orphaned.append(self.queue.get(block=False))
                except QueueEmpty:
                    break  # qsize is not always _totally_ up to date
            if len(orphaned):
                logger.error(
                    'requeuing {} messages from gone worker pid:{}'.format(
                        len(orphaned), self.pid
                    )
                )
        return orphaned

    @property
    def busy(self):
        self.calculate_managed_tasks()
        return len(self.managed_tasks) > 0

    @property
    def idle(self):
        return not self.busy


class WorkerPool(object):
    '''
    Creates a pool of forked PoolWorkers.

    As WorkerPool.write(...) is called (generally, by a kombu consumer
    implementation when it receives an AMQP message), messages are passed to
    one of the multiprocessing Queues where some work can be done on them.

    class MessagePrinter(awx.main.dispatch.worker.BaseWorker):

        def perform_work(self, body):
            print body

    pool = WorkerPool(min_workers=4)  # spawn four worker processes
    pool.init_workers(MessagePrinter().work_loop)
    pool.write(
        0,  # preferred worker 0
        'Hello, World!'
    )
    '''

    debug_meta = ''

    def __init__(self, min_workers=None, queue_size=None):
        self.name = settings.CLUSTER_HOST_ID
        self.pid = os.getpid()
        self.min_workers = min_workers or settings.JOB_EVENT_WORKERS
        self.queue_size = queue_size or settings.JOB_EVENT_MAX_QUEUE_SIZE

-       # self.workers tracks the state of running worker processes:
-       # [
-       #     (total_messages_consumed, multiprocessing.Queue, multiprocessing.Process),
-       #     (total_messages_consumed, multiprocessing.Queue, multiprocessing.Process),
-       #     (total_messages_consumed, multiprocessing.Queue, multiprocessing.Process),
-       #     (total_messages_consumed, multiprocessing.Queue, multiprocessing.Process)
-       # ]
        self.workers = []

    def __len__(self):
        return len(self.workers)

    def init_workers(self, target, *target_args):
-       def shutdown_handler(active_workers):
-           def _handler(signum, frame):
-               logger.debug('received shutdown {}'.format(signame(signum)))
-               try:
-                   for active_worker in active_workers:
-                       logger.debug('terminating worker')
-                   signal.signal(signum, signal.SIG_DFL)
-                   os.kill(os.getpid(), signum)  # Rethrow signal, this time without catching it
-               except Exception:
-                   logger.exception('error in shutdown_handler')
-           return _handler
+       self.target = target
+       self.target_args = target_args
+       for idx in range(self.min_workers):
+           self.up()

+   def up(self):
+       idx = len(self.workers)
        # It's important to close these because we're _about_ to fork, and we
        # don't want the forked processes to inherit the open sockets
        # for the DB and memcached connections (that way lies race conditions)
        django_connection.close()
        django_cache.close()
-       for idx in range(self.min_workers):
-           queue_actual = MPQueue(self.queue_size)
-           w = Process(target=target, args=(queue_actual, idx,) + target_args)
-           w.start()
-           logger.debug('started {}[{}]'.format(target.im_self.__class__.__name__, idx))
-           self.workers.append([0, queue_actual, w])
+       worker = PoolWorker(self.queue_size, self.target, (idx,) + self.target_args)
+       self.workers.append(worker)
+       try:
+           worker.start()
+       except Exception:
+           logger.exception('could not fork')
+       else:
+           logger.warn('scaling up worker pid:{}'.format(worker.pid))
+       return idx, worker

-       signal.signal(signal.SIGINT, shutdown_handler([p[2] for p in self.workers]))
-       signal.signal(signal.SIGTERM, shutdown_handler([p[2] for p in self.workers]))

    def debug(self, *args, **kwargs):
        self.cleanup()
        tmpl = Template(
            '{{ pool.name }}[pid:{{ pool.pid }}] workers total={{ workers|length }} {{ meta }} \n'
            '{% for w in workers %}'
            '. worker[pid:{{ w.pid }}]{% if not w.alive %} GONE exit={{ w.exitcode }}{% endif %}'
            ' sent={{ w.messages_sent }}'
            ' finished={{ w.messages_finished }}'
            ' qsize={{ w.managed_tasks|length }}'
            ' rss={{ w.mb }}MB'
            '{% for task in w.managed_tasks.values() %}'
            '\n - {% if loop.index0 == 0 %}running {% else %}queued {% endif %}'
            '{{ task["uuid"] }} '
            '{% if "task" in task %}'
            '{{ task["task"].rsplit(".", 1)[-1] }}'
            # don't print kwargs, they often contain launch-time secrets
            '(*{{ task.get("args", []) }})'
            '{% endif %}'
            '{% endfor %}'
            '{% if not w.managed_tasks|length %}'
            ' [IDLE]'
            '{% endif %}'
            '\n'
            '{% endfor %}'
        )
        return tmpl.render(pool=self, workers=self.workers, meta=self.debug_meta)

    def write(self, preferred_queue, body):
-       queue_order = sorted(range(self.min_workers), cmp=lambda x, y: -1 if x==preferred_queue else 0)
+       queue_order = sorted(range(len(self.workers)), cmp=lambda x, y: -1 if x==preferred_queue else 0)
        write_attempt_order = []
        for queue_actual in queue_order:
            try:
-               worker_actual = self.workers[queue_actual]
-               worker_actual[1].put(body, block=True, timeout=5)
-               logger.debug('delivered to Worker[{}] qsize {}'.format(
-                   queue_actual, worker_actual[1].qsize()
-               ))
-               worker_actual[0] += 1
+               self.workers[queue_actual].put(body)
                return queue_actual
            except QueueFull:
                pass
@@ -87,11 +266,113 @@ class WorkerPool(object):
        logger.warn("could not write payload to any queue, attempted order: {}".format(write_attempt_order))
        return None

-   def stop(self):
-       for worker in self.workers:
-           messages, queue, process = worker
-           try:
-               os.kill(process.pid, signal.SIGTERM)
-           except OSError as e:
-               if e.errno != errno.ESRCH:
-                   raise
+   def stop(self, signum):
+       try:
+           for worker in self.workers:
+               os.kill(worker.pid, signum)
+       except Exception:
+           logger.exception('could not kill {}'.format(worker.pid))


class AutoscalePool(WorkerPool):
    '''
    An extended pool implementation that automatically scales workers up and
    down based on demand.
    '''

    def __init__(self, *args, **kwargs):
        self.max_workers = kwargs.pop('max_workers', None)
        super(AutoscalePool, self).__init__(*args, **kwargs)

        if self.max_workers is None:
            settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
            if settings_absmem is not None:
                total_memory_gb = int(settings_absmem)
            else:
                total_memory_gb = (psutil.virtual_memory().total >> 30) + 1  # noqa: round up
            # 5 workers per GB of total memory
            self.max_workers = (total_memory_gb * 5)

    @property
    def should_grow(self):
        if len(self.workers) < self.min_workers:
            # If we don't have at least min_workers, add more
            return True
        # If every worker is busy doing something, add more
        return all([w.busy for w in self.workers])

    @property
    def full(self):
        return len(self.workers) == self.max_workers

    @property
    def debug_meta(self):
        return 'min={} max={}'.format(self.min_workers, self.max_workers)

    def cleanup(self):
        """
        Perform some internal accounting and cleanup. This is run on
        every cluster node heartbeat:

        1.  Discover worker processes that exited, and recover messages they
            were handling.
        2.  Clean up unnecessary, idle workers.
        """
        orphaned = []
        for w in self.workers[::]:
            if not w.alive:
                # the worker process has exited
                # 1. take the task it was running and enqueue the error
                #    callbacks
                # 2. take any pending tasks delivered to its queue and
                #    send them to another worker
                logger.error('worker pid:{} is gone (exit={})'.format(w.pid, w.exitcode))
                if w.current_task:
                    try:
                        for j in UnifiedJob.objects.filter(celery_task_id=w.current_task['uuid']):
                            reaper.reap_job(j, 'failed')
                    except Exception:
                        logger.exception('failed to reap job UUID {}'.format(w.current_task['uuid']))
                orphaned.extend(w.orphaned_tasks)
                self.workers.remove(w)
            elif w.idle and len(self.workers) > self.min_workers:
                # the process has an empty queue (it's idle) and we have
                # more processes in the pool than we need (> min)
                # send this process a message so it will exit gracefully
                # at the next opportunity
                logger.warn('scaling down worker pid:{}'.format(w.pid))
                w.quit()
                self.workers.remove(w)

        for m in orphaned:
            # if all the workers are dead, spawn at least one
            if not len(self.workers):
                self.up()
            idx = random.choice(range(len(self.workers)))
            self.write(idx, m)

    def up(self):
        if self.full:
            # if we can't spawn more workers, just toss this message into a
            # random worker's backlog
            idx = random.choice(range(len(self.workers)))
            return idx, self.workers[idx]
        else:
            return super(AutoscalePool, self).up()

    def write(self, preferred_queue, body):
        # when the cluster heartbeat occurs, clean up internally
        if isinstance(body, dict) and 'cluster_node_heartbeat' in body['task']:
            self.cleanup()
        if self.should_grow:
            self.up()
        # we don't care about "preferred queue" round robin distribution, just
        # find the first non-busy worker and claim it
        workers = self.workers[:]
        random.shuffle(workers)
        for w in workers:
            if not w.busy:
                w.put(body)
                break
        else:
            return super(AutoscalePool, self).write(preferred_queue, body)
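The max_workers heuristic above is easiest to see with concrete numbers; this is just the arithmetic from AutoscalePool.__init__, assuming psutil reports 4 GB of RAM and no SYSTEM_TASK_ABS_MEM override:

    total_bytes = 4 * 1024 ** 3                # what psutil.virtual_memory().total might report
    total_memory_gb = (total_bytes >> 30) + 1  # integer GB, plus one to "round up" -> 5
    max_workers = total_memory_gb * 5          # 5 workers per GB of total memory -> 25
    assert max_workers == 25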
awx/main/dispatch/publish.py (new file, 128 lines)
@@ -0,0 +1,128 @@
import inspect
import logging
import sys
from uuid import uuid4

from django.conf import settings
from kombu import Connection, Exchange, Producer

logger = logging.getLogger('awx.main.dispatch')


def serialize_task(f):
    return '.'.join([f.__module__, f.__name__])


class task:
    """
    Used to decorate a function or class so that it can be run asynchronously
    via the task dispatcher. Tasks can be simple functions:

    @task()
    def add(a, b):
        return a + b

    ...or classes that define a `run` method:

    @task()
    class Adder:
        def run(self, a, b):
            return a + b

    # Tasks can be run synchronously...
    assert add(1, 1) == 2
    assert Adder().run(1, 1) == 2

    # ...or published to a queue:
    add.apply_async([1, 1])
    Adder.apply_async([1, 1])

    # Tasks can also define a specific target queue or exchange type:

    @task(queue='slow-tasks')
    def snooze():
        time.sleep(10)

    @task(queue='tower_broadcast', exchange_type='fanout')
    def announce():
        print "Run this everywhere!"
    """

    def __init__(self, queue=None, exchange_type=None):
        self.queue = queue
        self.exchange_type = exchange_type

    def __call__(self, fn=None):
        queue = self.queue
        exchange_type = self.exchange_type

        class PublisherMixin(object):

            queue = None

            @classmethod
            def delay(cls, *args, **kwargs):
                return cls.apply_async(args, kwargs)

            @classmethod
            def apply_async(cls, args=None, kwargs=None, queue=None, uuid=None, **kw):
                task_id = uuid or str(uuid4())
                args = args or []
                kwargs = kwargs or {}
                queue = (
                    queue or
                    getattr(cls.queue, 'im_func', cls.queue) or
                    settings.CELERY_DEFAULT_QUEUE
                )
                obj = {
                    'uuid': task_id,
                    'args': args,
                    'kwargs': kwargs,
                    'task': cls.name
                }
                obj.update(**kw)
                if callable(queue):
                    queue = queue()
                if not settings.IS_TESTING(sys.argv):
                    with Connection(settings.BROKER_URL) as conn:
                        exchange = Exchange(queue, type=exchange_type or 'direct')
                        producer = Producer(conn)
                        logger.debug('publish {}({}, queue={})'.format(
                            cls.name,
                            task_id,
                            queue
                        ))
                        producer.publish(obj,
                                         serializer='json',
                                         compression='bzip2',
                                         exchange=exchange,
                                         declare=[exchange],
                                         delivery_mode="persistent",
                                         routing_key=queue)
                return (obj, queue)

        # If the object we're wrapping *is* a class (e.g., RunJob), return
        # a *new* class that inherits from the wrapped class *and* PublisherMixin.
        # In this way, the new class returned by our decorator is the class
        # being decorated *plus* PublisherMixin, so cls.apply_async() and
        # cls.delay() work
        bases = []
        ns = {'name': serialize_task(fn), 'queue': queue}
        if inspect.isclass(fn):
            bases = list(fn.__bases__)
            ns.update(fn.__dict__)
        cls = type(
            fn.__name__,
            tuple(bases + [PublisherMixin]),
            ns
        )
        if inspect.isclass(fn):
            return cls

        # if the object being decorated is *not* a class (it's a Python
        # function), make fn.apply_async and fn.delay proxy through to the
        # PublisherMixin we dynamically created above
        setattr(fn, 'name', cls.name)
        setattr(fn, 'apply_async', cls.apply_async)
        setattr(fn, 'delay', cls.delay)
        return fn
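Putting the decorator and the publisher together, a hedged sketch (the queue name 'example' and the module path in the comment are illustrative; it assumes a broker at settings.BROKER_URL and a dispatcher consuming that queue):

    from awx.main.dispatch.publish import task

    @task(queue='example')
    def add(a, b):
        return a + b

    add(1, 1)                          # runs inline, returns 2
    body, queue = add.apply_async([1, 1])
    # body is the JSON-serializable message a TaskWorker will receive, roughly:
    # {'uuid': '...', 'args': [1, 1], 'kwargs': {}, 'task': 'mymodule.add'}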
awx/main/dispatch/reaper.py (new file, 46 lines)
@@ -0,0 +1,46 @@
from datetime import timedelta
import logging

from django.db.models import Q
from django.utils.timezone import now as tz_now
from django.contrib.contenttypes.models import ContentType

from awx.main.models import Instance, UnifiedJob, WorkflowJob

logger = logging.getLogger('awx.main.dispatch')


def reap_job(j, status):
    j.status = status
    j.start_args = ''  # blank field to remove encrypted passwords
    j.job_explanation += ' '.join((
        'Task was marked as running in Tower but was not present in',
        'the job queue, so it has been marked as failed.',
    ))
    j.save(update_fields=['status', 'start_args', 'job_explanation'])
    if hasattr(j, 'send_notification_templates'):
        j.send_notification_templates('failed')
    j.websocket_emit_status(status)
    logger.error(
        '{} is no longer running; reaping'.format(j.log_format)
    )


def reap(instance=None, status='failed'):
    '''
    Reap all jobs in waiting|running for this instance.
    '''
    me = instance or Instance.objects.me()
    now = tz_now()
    workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
    jobs = UnifiedJob.objects.filter(
        (
            Q(status='running') |
            Q(status='waiting', modified__lte=now - timedelta(seconds=60))
        ) & (
            Q(execution_node=me.hostname) |
            Q(controller_node=me.hostname)
        ) & ~Q(polymorphic_ctype_id=workflow_ctype_id)
    )
    for j in jobs:
        reap_job(j, status)
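The dispatcher calls this on startup (see run_dispatcher below): anything the database claims this node is running cannot actually be running at that moment, so it is failed out. A minimal sketch of that call:

    from awx.main.dispatch import reaper

    # fail running jobs (and 'waiting' jobs stale for 60+ seconds) on this node
    reaper.reap()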
awx/main/dispatch/worker/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .base import AWXConsumer, BaseWorker  # noqa
from .callback import CallbackBrokerWorker  # noqa
from .task import TaskWorker  # noqa
awx/main/dispatch/worker/base.py (new file, 146 lines)
@@ -0,0 +1,146 @@
# Copyright (c) 2018 Ansible by Red Hat
# All Rights Reserved.

import os
import logging
import signal
from uuid import UUID
from Queue import Empty as QueueEmpty

from kombu import Producer
from kombu.mixins import ConsumerMixin

from awx.main.dispatch.pool import WorkerPool

logger = logging.getLogger('awx.main.dispatch')


def signame(sig):
    return dict(
        (k, v) for v, k in signal.__dict__.items()
        if v.startswith('SIG') and not v.startswith('SIG_')
    )[sig]


class WorkerSignalHandler:

    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)

    def exit_gracefully(self, *args, **kwargs):
        self.kill_now = True


class AWXConsumer(ConsumerMixin):

    def __init__(self, name, connection, worker, queues=[], pool=None):
        self.connection = connection
        self.total_messages = 0
        self.queues = queues
        self.worker = worker
        self.pool = pool
        if pool is None:
            self.pool = WorkerPool()
        self.pool.init_workers(self.worker.work_loop)

    def get_consumers(self, Consumer, channel):
        logger.debug(self.listening_on)
        return [Consumer(queues=self.queues, accept=['json'],
                         callbacks=[self.process_task])]

    @property
    def listening_on(self):
        return 'listening on {}'.format([
            '{} [{}]'.format(q.name, q.exchange.type) for q in self.queues
        ])

    def control(self, body, message):
        logger.warn(body)
        control = body.get('control')
        if control in ('status', 'running'):
            producer = Producer(
                channel=self.connection,
                routing_key=message.properties['reply_to']
            )
            if control == 'status':
                msg = '\n'.join([self.listening_on, self.pool.debug()])
            elif control == 'running':
                msg = []
                for worker in self.pool.workers:
                    worker.calculate_managed_tasks()
                    msg.extend(worker.managed_tasks.keys())
            producer.publish(msg)
        elif control == 'reload':
            for worker in self.pool.workers:
                worker.quit()
        else:
            logger.error('unrecognized control message: {}'.format(control))
        message.ack()

    def process_task(self, body, message):
        if 'control' in body:
            return self.control(body, message)
        if len(self.pool):
            if "uuid" in body and body['uuid']:
                try:
                    queue = UUID(body['uuid']).int % len(self.pool)
                except Exception:
                    queue = self.total_messages % len(self.pool)
            else:
                queue = self.total_messages % len(self.pool)
        else:
            queue = 0
        self.pool.write(queue, body)
        self.total_messages += 1
        message.ack()

    def run(self, *args, **kwargs):
        signal.signal(signal.SIGINT, self.stop)
        signal.signal(signal.SIGTERM, self.stop)
        self.worker.on_start()
        super(AWXConsumer, self).run(*args, **kwargs)

    def stop(self, signum, frame):
        self.should_stop = True  # this makes the kombu mixin stop consuming
        logger.debug('received {}, stopping'.format(signame(signum)))
        self.worker.on_stop()
        raise SystemExit()


class BaseWorker(object):

    def work_loop(self, queue, finished, idx, *args):
        ppid = os.getppid()
        signal_handler = WorkerSignalHandler()
        while not signal_handler.kill_now:
            # if the parent PID changes, this process has been orphaned
            # via e.g., segfault or sigkill, we should exit too
            if os.getppid() != ppid:
                break
            try:
                body = queue.get(block=True, timeout=1)
                if body == 'QUIT':
                    break
            except QueueEmpty:
                continue
            except Exception as e:
                logger.error("Exception on worker, restarting: " + str(e))
                continue
            try:
                self.perform_work(body, *args)
            finally:
                if 'uuid' in body:
                    uuid = body['uuid']
                    logger.debug('task {} is finished'.format(uuid))
                    finished.put(uuid)
        logger.warn('worker exiting gracefully pid:{}'.format(os.getpid()))

    def perform_work(self, body):
        raise NotImplementedError()

    def on_start(self):
        pass

    def on_stop(self):
        pass
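A BaseWorker subclass only needs perform_work; here is a sketch of wiring one into AWXConsumer by hand, mirroring the MessagePrinter example from the pool docstring (the broker URL and queue name are illustrative):

    from kombu import Connection, Exchange, Queue
    from awx.main.dispatch.worker import AWXConsumer, BaseWorker

    class MessagePrinter(BaseWorker):

        def perform_work(self, body):
            print body  # Python 2 era codebase

    with Connection('amqp://guest:guest@localhost//') as conn:
        queue = Queue('example', Exchange('example'), routing_key='example')
        AWXConsumer('example', conn, MessagePrinter(), [queue]).run()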
awx/main/dispatch/worker/callback.py
@@ -1,83 +1,30 @@
 # Copyright (c) 2018 Ansible by Red Hat
 # All Rights Reserved.

 import logging
-import time
 import os
 import signal
+import time
 import traceback
-from uuid import UUID
-from Queue import Empty as QueueEmpty

-from kombu.mixins import ConsumerMixin
 from django.conf import settings
 from django.db import DatabaseError, OperationalError, connection as django_connection
 from django.db.utils import InterfaceError, InternalError

-from awx.main.consumers import emit_channel_notification
 from awx.main.models import (JobEvent, AdHocCommandEvent, ProjectUpdateEvent,
                              InventoryUpdateEvent, SystemJobEvent, UnifiedJob)
+from awx.main.consumers import emit_channel_notification
-from awx.main.dispatch.pool import WorkerPool

+from .base import BaseWorker

 logger = logging.getLogger('awx.main.dispatch')


-class WorkerSignalHandler:
-
-    def __init__(self):
-        self.kill_now = False
-        signal.signal(signal.SIGINT, self.exit_gracefully)
-        signal.signal(signal.SIGTERM, self.exit_gracefully)
-
-    def exit_gracefully(self, *args, **kwargs):
-        self.kill_now = True
-
-
-class AWXConsumer(ConsumerMixin):
-
-    def __init__(self, connection, worker, queues=[]):
-        self.connection = connection
-        self.total_messages = 0
-        self.queues = queues
-        self.pool = WorkerPool()
-        self.pool.init_workers(worker.work_loop)
-
-    def get_consumers(self, Consumer, channel):
-        return [Consumer(queues=self.queues, accept=['json'],
-                         callbacks=[self.process_task])]
-
-    def process_task(self, body, message):
-        if "uuid" in body and body['uuid']:
-            try:
-                queue = UUID(body['uuid']).int % len(self.pool)
-            except Exception:
-                queue = self.total_messages % len(self.pool)
-        else:
-            queue = self.total_messages % len(self.pool)
-        self.pool.write(queue, body)
-        self.total_messages += 1
-        message.ack()
-
-
-class BaseWorker(object):
-
-    def work_loop(self, queue, idx, *args):
-        signal_handler = WorkerSignalHandler()
-        while not signal_handler.kill_now:
-            try:
-                body = queue.get(block=True, timeout=1)
-            except QueueEmpty:
-                continue
-            except Exception as e:
-                logger.error("Exception on worker, restarting: " + str(e))
-                continue
-            self.perform_work(body, *args)
-
-    def perform_work(self, body):
-        raise NotImplemented()


 class CallbackBrokerWorker(BaseWorker):
     '''
     A worker implementation that deserializes callback event data and persists
     it into the database.

     The code that *builds* these types of messages is found in the AWX display
     callback (`awx.lib.awx_display_callback`).
     '''

     MAX_RETRIES = 2

@@ -151,7 +98,7 @@ class CallbackBrokerWorker(BaseWorker):
             try:
                 _save_event_data()
                 break
-            except (OperationalError, InterfaceError, InternalError) as e:
+            except (OperationalError, InterfaceError, InternalError):
                 if retries >= self.MAX_RETRIES:
                     logger.exception('Worker could not re-establish database connectivity, shutting down gracefully: Job {}'.format(job_identifier))
                     os.kill(os.getppid(), signal.SIGINT)

@@ -164,7 +111,7 @@ class CallbackBrokerWorker(BaseWorker):
                 django_connection.close()
                 time.sleep(delay)
                 retries += 1
-            except DatabaseError as e:
+            except DatabaseError:
                 logger.exception('Database Error Saving Job Event for Job {}'.format(job_identifier))
                 break
             except Exception as exc:
awx/main/dispatch/worker/task.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import inspect
import logging
import importlib
import sys
import traceback

import six

from awx.main.tasks import dispatch_startup, inform_cluster_of_shutdown

from .base import BaseWorker

logger = logging.getLogger('awx.main.dispatch')


class TaskWorker(BaseWorker):
    '''
    A worker implementation that deserializes task messages and runs native
    Python code.

    The code that *builds* these types of messages is found in
    `awx.main.dispatch.publish`.
    '''

    @classmethod
    def resolve_callable(cls, task):
        '''
        Transform a dotted notation task into an imported, callable function, e.g.,

        awx.main.tasks.delete_inventory
        awx.main.tasks.RunProjectUpdate
        '''
        module, target = task.rsplit('.', 1)
        module = importlib.import_module(module)
        _call = None
        if hasattr(module, target):
            _call = getattr(module, target, None)
        return _call

    def run_callable(self, body):
        '''
        Given some AMQP message, import the correct Python code and run it.
        '''
        task = body['task']
        uuid = body.get('uuid', '<unknown>')
        args = body.get('args', [])
        kwargs = body.get('kwargs', {})
        _call = TaskWorker.resolve_callable(task)
        if inspect.isclass(_call):
            # the callable is a class, e.g., RunJob; instantiate and
            # return its `run()` method
            _call = _call().run
        # don't print kwargs, they often contain launch-time secrets
        logger.debug('task {} starting {}(*{})'.format(uuid, task, args))
        return _call(*args, **kwargs)

    def perform_work(self, body):
        '''
        Import and run code for a task, e.g.,

        body = {
            'args': [8],
            'callbacks': [{
                'args': [],
                'kwargs': {},
                'task': u'awx.main.tasks.handle_work_success'
            }],
            'errbacks': [{
                'args': [],
                'kwargs': {},
                'task': 'awx.main.tasks.handle_work_error'
            }],
            'kwargs': {},
            'task': u'awx.main.tasks.RunProjectUpdate'
        }
        '''
        result = None
        try:
            result = self.run_callable(body)
        except Exception as exc:

            try:
                if getattr(exc, 'is_awx_task_error', False):
                    # Error caused by user / tracked in job output
                    logger.warning(six.text_type("{}").format(exc))
                else:
                    task = body['task']
                    args = body.get('args', [])
                    kwargs = body.get('kwargs', {})
                    logger.exception('Worker failed to run task {}(*{}, **{})'.format(
                        task, args, kwargs
                    ))
            except Exception:
                # It's fairly critical that this code _not_ raise exceptions on logging.
                # If you configure external logging in a way that _it_ fails, there's
                # not a lot we can do here; sys.stderr.write is a final hail mary
                _, _, tb = sys.exc_info()
                traceback.print_tb(tb)

            for callback in body.get('errbacks', []) or []:
                callback['uuid'] = body['uuid']
                self.perform_work(callback)

        for callback in body.get('callbacks', []) or []:
            callback['uuid'] = body['uuid']
            self.perform_work(callback)
        return result

    def on_start(self):
        dispatch_startup()

    def on_stop(self):
        inform_cluster_of_shutdown()
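The perform_work contract above in one concrete round trip (the task name, args, and uuid are purely illustrative; this is the same message shape the publish.task decorator emits):

    worker = TaskWorker()
    result = worker.perform_work({
        'uuid': 'abc123',
        'task': 'awx.main.tasks.delete_inventory',  # resolved via importlib
        'args': [42],
        'kwargs': {},
        # each callback dict is itself fed back through perform_work afterward:
        'callbacks': [{'task': 'awx.main.tasks.handle_work_success',
                       'args': [], 'kwargs': {}}],
    })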
@@ -4,11 +4,6 @@
 import six


-# Celery does not respect exception type when using a serializer different than pickle;
-# and awx uses the json serializer
-# https://github.com/celery/celery/issues/3586


 class _AwxTaskError():
     def build_exception(self, task, message=None):
         if message is None:

@@ -36,5 +31,3 @@ class _AwxTaskError():

 AwxTaskError = _AwxTaskError()
@@ -938,7 +938,7 @@ class Command(BaseCommand):
         self.exclude_empty_groups = bool(options.get('exclude_empty_groups', False))
         self.instance_id_var = options.get('instance_id_var', None)

-        self.celery_invoked = False if os.getenv('INVENTORY_SOURCE_ID', None) is None else True
+        self.invoked_from_dispatcher = False if os.getenv('INVENTORY_SOURCE_ID', None) is None else True

         # Load inventory and related objects from database.
         if self.inventory_name and self.inventory_id:

@@ -1062,7 +1062,7 @@ class Command(BaseCommand):
             exc = e
             transaction.rollback()

-        if self.celery_invoked is False:
+        if self.invoked_from_dispatcher is False:
             with ignore_inventory_computed_fields():
                 self.inventory_update = InventoryUpdate.objects.get(pk=self.inventory_update.pk)
                 self.inventory_update.result_traceback = tb
awx/main/management/commands/run_dispatcher.py (new file, 124 lines)
@@ -0,0 +1,124 @@
# Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved.
import os
import logging
from multiprocessing import Process

from django.conf import settings
from django.core.cache import cache as django_cache
from django.core.management.base import BaseCommand
from django.db import connection as django_connection
from kombu import Connection, Exchange, Queue

from awx.main.dispatch import get_local_queuename, reaper
from awx.main.dispatch.control import Control
from awx.main.dispatch.pool import AutoscalePool
from awx.main.dispatch.worker import AWXConsumer, TaskWorker

logger = logging.getLogger('awx.main.dispatch')


def construct_bcast_queue_name(common_name):
    return common_name.encode('utf8') + '_' + settings.CLUSTER_HOST_ID


class Command(BaseCommand):
    help = 'Launch the task dispatcher'

    def add_arguments(self, parser):
        parser.add_argument('--status', dest='status', action='store_true',
                            help='print the internal state of any running dispatchers')
        parser.add_argument('--running', dest='running', action='store_true',
                            help='print the UUIDs of any tasks managed by this dispatcher')
        parser.add_argument('--reload', dest='reload', action='store_true',
                            help=('cause the dispatcher to recycle all of its worker processes; '
                                  'running jobs will run to completion first'))

    def beat(self):
        from celery import app
        from celery.beat import PersistentScheduler
        from celery.apps import beat

        class AWXScheduler(PersistentScheduler):

            def __init__(self, *args, **kwargs):
                self.ppid = os.getppid()
                super(AWXScheduler, self).__init__(*args, **kwargs)

            def setup_schedule(self):
                super(AWXScheduler, self).setup_schedule()
                self.update_from_dict(settings.CELERYBEAT_SCHEDULE)

            def tick(self, *args, **kwargs):
                if os.getppid() != self.ppid:
                    # if the parent PID changes, this process has been orphaned
                    # via e.g., segfault or sigkill, we should exit too
                    raise SystemExit()
                return super(AWXScheduler, self).tick(*args, **kwargs)

            def apply_async(self, entry, publisher, **kwargs):
                task = TaskWorker.resolve_callable(entry.task)
                result, queue = task.apply_async()

                class TaskResult(object):
                    id = result['uuid']

                return TaskResult()

        app = app.App()
        app.conf.BROKER_URL = settings.BROKER_URL
        app.conf.CELERY_TASK_RESULT_EXPIRES = False
        beat.Beat(
            30,
            app,
            schedule='/var/lib/awx/beat.db', scheduler_cls=AWXScheduler
        ).run()

    def handle(self, *arg, **options):
        if options.get('status'):
            print Control('dispatcher').status()
            return
        if options.get('running'):
            print Control('dispatcher').running()
            return
        if options.get('reload'):
            return Control('dispatcher').control({'control': 'reload'})

        # It's important to close these because we're _about_ to fork, and we
        # don't want the forked processes to inherit the open sockets
        # for the DB and memcached connections (that way lies race conditions)
        django_connection.close()
        django_cache.close()
        beat = Process(target=self.beat)
        beat.daemon = True
        beat.start()

        reaper.reap()
        consumer = None
        with Connection(settings.BROKER_URL) as conn:
            try:
                bcast = 'tower_broadcast_all'
                queues = [
                    Queue(q, Exchange(q), routing_key=q)
                    for q in (settings.AWX_CELERY_QUEUES_STATIC + [get_local_queuename()])
                ]
                queues.append(
                    Queue(
                        construct_bcast_queue_name(bcast),
                        exchange=Exchange(bcast, type='fanout'),
                        routing_key=bcast,
                        reply=True
                    )
                )
                consumer = AWXConsumer(
                    'dispatcher',
                    conn,
                    TaskWorker(),
                    queues,
                    AutoscalePool(min_workers=4)
                )
                consumer.run()
            except KeyboardInterrupt:
                logger.debug('Terminating Task Dispatcher')
                if consumer:
                    consumer.stop()
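Worth spelling out how the fanout/broadcast support from the commit message works here: every node binds its own uniquely named queue to the shared fanout exchange, so each node receives its own copy of a broadcast message. Illustratively, with CLUSTER_HOST_ID = 'awx-node-1':

    construct_bcast_queue_name('tower_broadcast_all')
    # -> 'tower_broadcast_all_awx-node-1', bound to the fanout exchange
    #    'tower_broadcast_all' alongside every other node's private copy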
(deleted file)
@@ -1,66 +0,0 @@
import datetime
import os
import signal
import subprocess
import sys
import time

from celery import Celery
from django.core.management.base import BaseCommand
from django.conf import settings


class Command(BaseCommand):
    """Watch local celery workers"""
    help = ("Sends a periodic ping to the local celery process over AMQP to ensure "
            "it's responsive; this command is only intended to run in an environment "
            "where celeryd is running")

    #
    # Just because celery is _running_ doesn't mean it's _working_; it's
    # imperative that celery workers are _actually_ handling AMQP messages on
    # their appropriate queues for awx to function. Unfortunately, we've been
    # plagued by a variety of bugs in celery that cause it to hang and become
    # an unresponsive zombie, such as:
    #
    # https://github.com/celery/celery/issues/4185
    # https://github.com/celery/celery/issues/4457
    #
    # The goal of this code is to periodically send a broadcast AMQP message to
    # the celery process on the local host via celery.app.control.ping;
    # if that _fails_, we attempt to determine the pid of the celery process
    # and send SIGHUP (which tends to resolve these sorts of issues for us).
    #

    INTERVAL = 60

    def _log(self, msg):
        sys.stderr.write(datetime.datetime.utcnow().isoformat())
        sys.stderr.write(' ')
        sys.stderr.write(msg)
        sys.stderr.write('\n')

    def handle(self, **options):
        app = Celery('awx')
        app.config_from_object('django.conf:settings')
        while True:
            try:
                pongs = app.control.ping(['celery@{}'.format(settings.CLUSTER_HOST_ID)], timeout=30)
            except Exception:
                pongs = []
            if not pongs:
                self._log('celery is not responsive to ping over local AMQP')
                pid = self.getpid()
                if pid:
                    self._log('sending SIGHUP to {}'.format(pid))
                    os.kill(pid, signal.SIGHUP)
            time.sleep(self.INTERVAL)

    def getpid(self):
        cmd = 'supervisorctl pid tower-processes:awx-celeryd'
        if os.path.exists('/supervisor_task.conf'):
            cmd = 'supervisorctl -c /supervisor_task.conf pid tower-processes:celery'
        try:
            return int(subprocess.check_output(cmd, shell=True))
        except Exception:
            self._log('could not detect celery pid')
@@ -32,7 +32,7 @@ __all__ = ('Instance', 'InstanceGroup', 'JobOrigin', 'TowerScheduleState',)


 def validate_queuename(v):
-    # celery and kombu don't play nice with unicode in queue names
+    # kombu doesn't play nice with unicode in queue names
     if v:
         try:
             '{}'.format(v.decode('utf-8'))
@@ -27,9 +27,6 @@ from rest_framework.exceptions import ParseError
 # Django-Polymorphic
 from polymorphic.models import PolymorphicModel

-# Django-Celery
-from djcelery.models import TaskMeta

 # AWX
 from awx.main.models.base import *  # noqa
 from awx.main.models.mixins import ResourceMixin, TaskManagerUnifiedJobMixin

@@ -1112,14 +1109,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
                 pass
         return None

-    @property
-    def celery_task(self):
-        try:
-            if self.celery_task_id:
-                return TaskMeta.objects.get(task_id=self.celery_task_id)
-        except TaskMeta.DoesNotExist:
-            pass

     def get_passwords_needed_to_start(self):
         return []

@@ -1224,29 +1213,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique

         return (True, opts)

-    def start_celery_task(self, opts, error_callback, success_callback, queue):
-        kwargs = {
-            'link_error': error_callback,
-            'link': success_callback,
-            'queue': None,
-            'task_id': None,
-        }
-        if not self.celery_task_id:
-            raise RuntimeError("Expected celery_task_id to be set on model.")
-        kwargs['task_id'] = self.celery_task_id
-        task_class = self._get_task_class()
-        kwargs['queue'] = queue
-        task_class().apply_async([self.pk], opts, **kwargs)
-
-    def start(self, error_callback, success_callback, **kwargs):
-        '''
-        Start the task running via Celery.
-        '''
-        (res, opts) = self.pre_start(**kwargs)
-        if res:
-            self.start_celery_task(opts, error_callback, success_callback)
-        return res

     def signal_start(self, **kwargs):
         """Notify the task runner system to begin work on this task."""

@@ -1286,42 +1252,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
     def can_cancel(self):
         return bool(self.status in CAN_CANCEL)

-    def _force_cancel(self):
-        # Update the status to 'canceled' if we can detect that the job
-        # really isn't running (i.e. celery has crashed or forcefully
-        # killed the worker).
-        task_statuses = ('STARTED', 'SUCCESS', 'FAILED', 'RETRY', 'REVOKED')
-        try:
-            taskmeta = self.celery_task
-            if not taskmeta or taskmeta.status not in task_statuses:
-                return
-            from celery import current_app
-            i = current_app.control.inspect()
-            for v in (i.active() or {}).values():
-                if taskmeta.task_id in [x['id'] for x in v]:
-                    return
-            for v in (i.reserved() or {}).values():
-                if taskmeta.task_id in [x['id'] for x in v]:
-                    return
-            for v in (i.revoked() or {}).values():
-                if taskmeta.task_id in [x['id'] for x in v]:
-                    return
-            for v in (i.scheduled() or {}).values():
-                if taskmeta.task_id in [x['id'] for x in v]:
-                    return
-            instance = self.__class__.objects.get(pk=self.pk)
-            if instance.can_cancel:
-                instance.status = 'canceled'
-                update_fields = ['status']
-                if not instance.job_explanation:
-                    instance.job_explanation = 'Forced cancel'
-                    update_fields.append('job_explanation')
-                instance.save(update_fields=update_fields)
-                self.websocket_emit_status("canceled")
-        except Exception:  # FIXME: Log this exception!
-            if settings.DEBUG:
-                raise

     def _build_job_explanation(self):
         if not self.job_explanation:
             return 'Previous Task Canceled: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % \

@@ -1345,8 +1275,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
             cancel_fields.append('job_explanation')
         self.save(update_fields=cancel_fields)
         self.websocket_emit_status("canceled")
-        if settings.BROKER_URL.startswith('amqp://'):
-            self._force_cancel()
         return self.cancel_flag

     @property

@@ -1402,7 +1330,7 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
             r['{}_user_last_name'.format(name)] = created_by.last_name
         return r

-    def get_celery_queue_name(self):
+    def get_queue_name(self):
         return self.controller_node or self.execution_node or settings.CELERY_DEFAULT_QUEUE

     def is_isolated(self):
@@ -481,9 +481,3 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
     @property
     def preferred_instance_groups(self):
         return []

-    '''
-    A WorkflowJob is a virtual job. It doesn't result in a celery task.
-    '''
-    def start_celery_task(self, opts, error_callback, success_callback, queue):
-        return None
@@ -2,7 +2,7 @@
 # All Rights Reserved

 # Python
-from datetime import datetime, timedelta
+from datetime import timedelta
 import logging
 import uuid
 import json

@@ -11,18 +11,13 @@ import random
 from sets import Set

 # Django
 from django.conf import settings
-from django.core.cache import cache
-from django.db import transaction, connection, DatabaseError
+from django.db import transaction, connection
 from django.utils.translation import ugettext_lazy as _
-from django.utils.timezone import now as tz_now, utc
-from django.db.models import Q
-from django.contrib.contenttypes.models import ContentType
+from django.utils.timezone import now as tz_now

 # AWX
 from awx.main.models import (
     AdHocCommand,
     Instance,
     InstanceGroup,
     InventorySource,
     InventoryUpdate,

@@ -30,21 +25,15 @@ from awx.main.models import (
     Project,
     ProjectUpdate,
     SystemJob,
     UnifiedJob,
     WorkflowJob,
 )
 from awx.main.scheduler.dag_workflow import WorkflowDAG
 from awx.main.utils.pglock import advisory_lock
 from awx.main.utils import get_type_for_model
 from awx.main.signals import disable_activity_stream

 from awx.main.scheduler.dependency_graph import DependencyGraph
 from awx.main.utils import decrypt_field

-# Celery
-from celery import Celery
-from celery.app.control import Inspect


 logger = logging.getLogger('awx.main.scheduler')

@@ -85,79 +74,6 @@ class TaskManager():
                           key=lambda task: task.created)
         return all_tasks

-    '''
-    Tasks that are running and SHOULD have a celery task.
-    {
-        'execution_node': [j1, j2,...],
-        'execution_node': [j3],
-        ...
-    }
-    '''
-    def get_running_tasks(self):
-        execution_nodes = {}
-        waiting_jobs = []
-        now = tz_now()
-        workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
-        jobs = UnifiedJob.objects.filter((Q(status='running') |
-                                          Q(status='waiting', modified__lte=now - timedelta(seconds=60))) &
-                                         ~Q(polymorphic_ctype_id=workflow_ctype_id))
-        for j in jobs:
-            if j.execution_node:
-                execution_nodes.setdefault(j.execution_node, []).append(j)
-            else:
-                waiting_jobs.append(j)
-        return (execution_nodes, waiting_jobs)
-
-    '''
-    Tasks that are currently running in celery
-
-    Transform:
-    {
-        "celery@ec2-54-204-222-62.compute-1.amazonaws.com": [],
-        "celery@ec2-54-163-144-168.compute-1.amazonaws.com": [{
-            ...
-            "id": "5238466a-f8c7-43b3-9180-5b78e9da8304",
-            ...
-        }, {
-            ...,
-        }, ...]
-    }
-
-    to:
-    {
-        "ec2-54-204-222-62.compute-1.amazonaws.com": [
-            "5238466a-f8c7-43b3-9180-5b78e9da8304",
-            "5238466a-f8c7-43b3-9180-5b78e9da8306",
-            ...
-        ]
-    }
-    '''
-    def get_active_tasks(self):
-        if not hasattr(settings, 'IGNORE_CELERY_INSPECTOR'):
-            app = Celery('awx')
-            app.config_from_object('django.conf:settings')
-            inspector = Inspect(app=app)
-            active_task_queues = inspector.active()
-        else:
-            logger.warn("Ignoring celery task inspector")
-            active_task_queues = None
-
-        queues = None
-
-        if active_task_queues is not None:
-            queues = {}
-            for queue in active_task_queues:
-                active_tasks = set()
-                map(lambda at: active_tasks.add(at['id']), active_task_queues[queue])
-
-                # celery worker name is of the form celery@myhost.com
-                queue_name = queue.split('@')
-                queue_name = queue_name[1 if len(queue_name) > 1 else 0]
-                queues[queue_name] = active_tasks
-        else:
-            return (None, None)
-
-        return (active_task_queues, queues)

     def get_latest_project_update_tasks(self, all_sorted_tasks):
         project_ids = Set()

@@ -256,9 +172,6 @@ class TaskManager():
                               rampart_group.name, task.log_format))
             return

-        error_handler = handle_work_error.s(subtasks=[task_actual] + dependencies)
-        success_handler = handle_work_success.s(task_actual=task_actual)

         task.status = 'waiting'

         (start_status, opts) = task.pre_start()

@@ -300,11 +213,23 @@ class TaskManager():

         def post_commit():
             task.websocket_emit_status(task.status)
-            if task.status != 'failed':
-                task.start_celery_task(opts,
-                                       error_callback=error_handler,
-                                       success_callback=success_handler,
-                                       queue=task.get_celery_queue_name())
+            if task.status != 'failed' and type(task) is not WorkflowJob:
+                task_cls = task._get_task_class()
+                task_cls.apply_async(
+                    [task.pk],
+                    opts,
+                    queue=task.get_queue_name(),
+                    uuid=task.celery_task_id,
+                    callbacks=[{
+                        'task': handle_work_success.name,
+                        'kwargs': {'task_actual': task_actual}
+                    }],
+                    errbacks=[{
+                        'task': handle_work_error.name,
+                        'args': [task.celery_task_id],
+                        'kwargs': {'subtasks': [task_actual] + dependencies}
+                    }],
+                )

         connection.on_commit(post_commit)
@@ -529,105 +454,6 @@ class TaskManager():
|
||||
if not found_acceptable_queue:
|
||||
logger.debug(six.text_type("{} couldn't be scheduled on graph, waiting for next cycle").format(task.log_format))
|
||||
|
||||
def fail_jobs_if_not_in_celery(self, node_jobs, active_tasks, celery_task_start_time,
|
||||
isolated=False):
|
||||
for task in node_jobs:
|
||||
if (task.celery_task_id not in active_tasks and not hasattr(settings, 'IGNORE_CELERY_INSPECTOR')):
|
||||
if isinstance(task, WorkflowJob):
|
||||
continue
|
||||
if task.modified > celery_task_start_time:
|
||||
continue
|
||||
new_status = 'failed'
|
||||
if isolated:
|
||||
new_status = 'error'
|
||||
task.status = new_status
|
||||
task.start_args = '' # blank field to remove encrypted passwords
|
||||
if isolated:
|
||||
# TODO: cancel and reap artifacts of lost jobs from heartbeat
|
||||
task.job_explanation += ' '.join((
|
||||
'Task was marked as running in Tower but its ',
|
||||
'controller management daemon was not present in',
|
||||
'the job queue, so it has been marked as failed.',
|
||||
'Task may still be running, but contactability is unknown.'
|
||||
))
|
||||
else:
|
||||
task.job_explanation += ' '.join((
|
||||
'Task was marked as running in Tower but was not present in',
|
||||
'the job queue, so it has been marked as failed.',
|
||||
))
|
||||
try:
|
||||
task.save(update_fields=['status', 'start_args', 'job_explanation'])
|
||||
except DatabaseError:
|
||||
logger.error("Task {} DB error in marking failed. Job possibly deleted.".format(task.log_format))
|
||||
continue
|
||||
if hasattr(task, 'send_notification_templates'):
|
||||
task.send_notification_templates('failed')
|
||||
task.websocket_emit_status(new_status)
|
||||
logger.error("{}Task {} has no record in celery. Marking as failed".format(
|
||||
'Isolated ' if isolated else '', task.log_format))
|
||||
|
||||
def cleanup_inconsistent_celery_tasks(self):
|
||||
'''
|
||||
Rectify tower db <-> celery inconsistent view of jobs state
|
||||
'''
|
||||
last_cleanup = cache.get('last_celery_task_cleanup') or datetime.min.replace(tzinfo=utc)
|
||||
if (tz_now() - last_cleanup).seconds < settings.AWX_INCONSISTENT_TASK_INTERVAL:
|
||||
return
|
||||
|
||||
logger.debug("Failing inconsistent running jobs.")
|
||||
celery_task_start_time = tz_now()
|
||||
active_task_queues, active_queues = self.get_active_tasks()
|
||||
cache.set('last_celery_task_cleanup', tz_now())
|
||||
|
||||
if active_queues is None:
|
||||
logger.error('Failed to retrieve active tasks from celery')
|
||||
return None
|
||||
|
||||
'''
|
||||
Only consider failing tasks on instances for which we obtained a task
|
||||
list from celery for.
|
||||
'''
|
||||
running_tasks, waiting_tasks = self.get_running_tasks()
|
||||
all_celery_task_ids = []
|
||||
for node, node_jobs in active_queues.iteritems():
|
||||
all_celery_task_ids.extend(node_jobs)
|
||||
|
||||
self.fail_jobs_if_not_in_celery(waiting_tasks, all_celery_task_ids, celery_task_start_time)
|
||||
|
||||
for node, node_jobs in running_tasks.iteritems():
|
||||
isolated = False
|
||||
if node in active_queues:
|
||||
active_tasks = active_queues[node]
|
||||
else:
|
||||
'''
|
||||
Node task list not found in celery. We may branch into cases:
|
||||
- instance is unknown to tower, system is improperly configured
|
||||
- instance is reported as down, then fail all jobs on the node
|
||||
- instance is an isolated node, then check running tasks
|
||||
among all allowed controller nodes for management process
|
||||
- valid healthy instance not included in celery task list
|
||||
probably a netsplit case, leave it alone
|
||||
'''
|
||||
instance = Instance.objects.filter(hostname=node).first()
|
||||
|
||||
if instance is None:
|
||||
logger.error("Execution node Instance {} not found in database. "
|
||||
"The node is currently executing jobs {}".format(
|
||||
node, [j.log_format for j in node_jobs]))
|
||||
active_tasks = []
|
||||
elif instance.capacity == 0:
|
||||
active_tasks = []
|
||||
elif instance.rampart_groups.filter(controller__isnull=False).exists():
|
||||
active_tasks = all_celery_task_ids
|
||||
isolated = True
|
||||
else:
|
||||
continue
|
||||
|
||||
self.fail_jobs_if_not_in_celery(
|
||||
node_jobs, active_tasks, celery_task_start_time,
|
||||
isolated=isolated
|
||||
)
|
||||
|
||||
def calculate_capacity_consumed(self, tasks):
|
||||
self.graph = InstanceGroup.objects.capacity_values(tasks=tasks, graph=self.graph)
|
||||
|
||||
@@ -687,7 +513,6 @@ class TaskManager():
|
||||
return
|
||||
logger.debug("Starting Scheduler")
|
||||
|
||||
self.cleanup_inconsistent_celery_tasks()
|
||||
finished_wfjs = self._schedule()
|
||||
|
||||
# Operations whose queries rely on modifications made during the atomic scheduling session
|
||||
|
@@ -2,30 +2,24 @@
# Python
import logging

# Celery
from celery import shared_task

# AWX
from awx.main.scheduler import TaskManager
from awx.main.dispatch.publish import task

logger = logging.getLogger('awx.main.scheduler')

# TODO: move logic to UnifiedJob model and use bind=True feature of celery.
# Would we need the request loop then? I think so. Even if we get the in-memory
# updated model, the call to schedule() may get stale data.


@shared_task()
@task()
def run_job_launch(job_id):
    TaskManager().schedule()


@shared_task()
@task()
def run_job_complete(job_id):
    TaskManager().schedule()


@shared_task()
@task()
def run_task_manager():
    logger.debug("Running Tower task manager.")
    TaskManager().schedule()
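Each `@shared_task()` above is swapped one-for-one for the new `@task()` decorator from `awx.main.dispatch.publish`. A minimal sketch of how a function registered this way might be published, assuming only the interfaces that appear elsewhere in this diff (`apply_async`, `delay`) and a hypothetical queue name:

    from awx.main.dispatch.publish import task

    @task(queue='example_queue')   # 'example_queue' is a hypothetical name
    def noop(job_id):
        pass

    # Per the functional tests later in this diff, apply_async() returns
    # the serialized message and the queue it was routed to.
    message, queue = noop.apply_async([1])
    noop.delay(1)   # shorthand form, as used by run_job_complete callers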
@@ -13,12 +13,11 @@ import logging
import os
import re
import shutil
import six
import stat
import sys
import tempfile
import time
import traceback
import six
import urlparse
from distutils.version import LooseVersion as Version
import yaml
@@ -28,12 +27,6 @@ try:
except Exception:
    psutil = None

# Celery
from kombu import Queue, Exchange
from kombu.common import Broadcast
from celery import Task, shared_task
from celery.signals import celeryd_init, worker_shutdown

# Django
from django.conf import settings
from django.db import transaction, DatabaseError, IntegrityError
@@ -58,10 +51,12 @@ from awx.main.constants import ACTIVE_STATES
from awx.main.exceptions import AwxTaskError
from awx.main.queue import CallbackQueueDispatcher
from awx.main.expect import run, isolated_manager
from awx.main.dispatch.publish import task
from awx.main.dispatch import get_local_queuename, reaper
from awx.main.utils import (get_ansible_version, get_ssh_version, decrypt_field, update_scm_url,
                            check_proot_installed, build_proot_temp_dir, get_licenser,
                            wrap_args_with_proot, OutputEventFilter, OutputVerboseFilter, ignore_inventory_computed_fields,
                            ignore_inventory_group_removal, get_type_for_model, extract_ansible_vars)
                            ignore_inventory_group_removal, extract_ansible_vars)
from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja
from awx.main.utils.reload import stop_local_services
from awx.main.utils.pglock import advisory_lock
@@ -87,52 +82,7 @@ Try upgrading OpenSSH or providing your private key in a different format. \
logger = logging.getLogger('awx.main.tasks')
def log_celery_failure(self, exc, task_id, args, kwargs, einfo):
    try:
        if getattr(exc, 'is_awx_task_error', False):
            # Error caused by user / tracked in job output
            logger.warning(six.text_type("{}").format(exc))
        elif isinstance(self, BaseTask):
            logger.exception(six.text_type(
                '{!s} {!s} execution encountered exception.')
                .format(get_type_for_model(self.model), args[0]))
        else:
            logger.exception(six.text_type('Task {} encountered exception.').format(self.name), exc_info=exc)
    except Exception:
        # It's fairly critical that this code _not_ raise exceptions on logging
        # If you configure external logging in a way that _it_ fails, there's
        # not a lot we can do here; sys.stderr.write is a final hail mary
        _, _, tb = sys.exc_info()
        traceback.print_tb(tb)


@celeryd_init.connect
def celery_startup(conf=None, **kwargs):
    #
    # When celeryd starts, if the instance cannot be found in the database,
    # automatically register it. This is mostly useful for openshift-based
    # deployments where:
    #
    # 2 Instances come online
    # Instance B encounters a network blip, Instance A notices, and
    # deprovisions it
    # Instance B's connectivity is restored, celeryd starts, and it
    # re-registers itself
    #
    # In traditional container-less deployments, instances don't get
    # deprovisioned when they miss their heartbeat, so this code is mostly a
    # no-op.
    #
    if kwargs['instance'].hostname != 'celery@{}'.format(settings.CLUSTER_HOST_ID):
        error = six.text_type('celery -n {} does not match settings.CLUSTER_HOST_ID={}').format(
            instance.hostname, settings.CLUSTER_HOST_ID
        )
        logger.error(error)
        raise RuntimeError(error)
    (changed, tower_instance) = Instance.objects.get_or_register()
    if changed:
        logger.info(six.text_type("Registered tower node '{}'").format(tower_instance.hostname))

def dispatch_startup():
    startup_logger = logging.getLogger('awx.main.tasks')
    startup_logger.info("Syncing Schedules")
    for sch in Schedule.objects.all():
@@ -144,34 +94,44 @@ def celery_startup(conf=None, **kwargs):
        except Exception:
            logger.exception(six.text_type("Failed to rebuild schedule {}.").format(sch))

    # set the queues we want to bind to dynamically at startup
    queues = []
    me = Instance.objects.me()
    for q in [me.hostname] + settings.AWX_CELERY_QUEUES_STATIC:
        q = q.encode('utf-8')
        queues.append(Queue(q, Exchange(q), routing_key=q))
    for q in settings.AWX_CELERY_BCAST_QUEUES_STATIC:
        queues.append(Broadcast(q.encode('utf-8')))
    conf.CELERY_QUEUES = list(set(queues))

    # Expedite the first heartbeat run so a node comes online quickly.
    cluster_node_heartbeat.apply([])
    #
    # When the dispatcher starts, if the instance cannot be found in the database,
    # automatically register it. This is mostly useful for openshift-based
    # deployments where:
    #
    # 2 Instances come online
    # Instance B encounters a network blip, Instance A notices, and
    # deprovisions it
    # Instance B's connectivity is restored, the dispatcher starts, and it
    # re-registers itself
    #
    # In traditional container-less deployments, instances don't get
    # deprovisioned when they miss their heartbeat, so this code is mostly a
    # no-op.
    #
    apply_cluster_membership_policies()
    cluster_node_heartbeat()
    if Instance.objects.me().is_controller():
        awx_isolated_heartbeat()


@worker_shutdown.connect
def inform_cluster_of_shutdown(*args, **kwargs):
def inform_cluster_of_shutdown():
    try:
        this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
        this_inst.capacity = 0  # No thank you to new jobs while shut down
        this_inst.save(update_fields=['capacity', 'modified'])
        try:
            reaper.reap(this_inst)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(this_inst.hostname))
        logger.warning(six.text_type('Normal shutdown signal for instance {}, '
                                     'removed self from capacity pool.').format(this_inst.hostname))
    except Exception:
        logger.exception('Encountered problem with normal shutdown signal.')


@shared_task(bind=True, queue=settings.CELERY_DEFAULT_QUEUE)
def apply_cluster_membership_policies(self):
@task()
def apply_cluster_membership_policies():
    started_waiting = time.time()
    with advisory_lock('cluster_policy_lock', wait=True):
        lock_time = time.time() - started_waiting
@@ -280,20 +240,18 @@ def apply_cluster_membership_policies(self):
    logger.info('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute))


@shared_task(exchange='tower_broadcast_all', bind=True)
def handle_setting_changes(self, setting_keys):
@task(queue='tower_broadcast_all', exchange_type='fanout')
def handle_setting_changes(setting_keys):
    orig_len = len(setting_keys)
    for i in range(orig_len):
        for dependent_key in settings_registry.get_dependent_settings(setting_keys[i]):
            setting_keys.append(dependent_key)
    logger.warn('Processing cache changes, task args: {0.args!r} kwargs: {0.kwargs!r}'.format(
        self.request))
    cache_keys = set(setting_keys)
    logger.debug('cache delete_many(%r)', cache_keys)
    cache.delete_many(cache_keys)
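Per the commit message, fanout/broadcast delivery is currently used only for `conf.Setting` memcached flushes, and `handle_setting_changes` above is that task. A sketch of declaring and publishing a fanout task, assuming the decorator arguments behave as shown here (every dispatcher bound to the broadcast queue receives a copy of the message):

    from awx.main.dispatch.publish import task

    @task(queue='tower_broadcast_all', exchange_type='fanout')
    def expire_cache(keys):                        # hypothetical example task
        from django.core.cache import cache
        cache.delete_many(set(keys))

    expire_cache.delay(['AUTH_LDAP_SERVER_URI'])   # runs on every node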
@shared_task(queue=settings.CELERY_DEFAULT_QUEUE)
@task()
def send_notifications(notification_list, job_id=None):
    if not isinstance(notification_list, list):
        raise TypeError("notification_list should be of type list")
@@ -322,8 +280,8 @@ def send_notifications(notification_list, job_id=None):
        logger.exception(six.text_type('Error saving notification {} result.').format(notification.id))


@shared_task(bind=True, queue=settings.CELERY_DEFAULT_QUEUE)
def run_administrative_checks(self):
@task()
def run_administrative_checks():
    logger.warn("Running administrative checks.")
    if not settings.TOWER_ADMIN_ALERTS:
        return
@@ -344,8 +302,8 @@ def run_administrative_checks(self):
        fail_silently=True)


@shared_task(bind=True)
def purge_old_stdout_files(self):
@task(queue=get_local_queuename)
def purge_old_stdout_files():
    nowtime = time.time()
    for f in os.listdir(settings.JOBOUTPUT_ROOT):
        if os.path.getctime(os.path.join(settings.JOBOUTPUT_ROOT,f)) < nowtime - settings.LOCAL_STDOUT_EXPIRE_TIME:
@@ -353,8 +311,8 @@ def purge_old_stdout_files(self):
            logger.info(six.text_type("Removing {}").format(os.path.join(settings.JOBOUTPUT_ROOT,f)))


@shared_task(bind=True)
def cluster_node_heartbeat(self):
@task(queue=get_local_queuename)
def cluster_node_heartbeat():
    logger.debug("Cluster node heartbeat task.")
    nowtime = now()
    instance_list = list(Instance.objects.all_non_isolated())
@@ -397,9 +355,13 @@ def cluster_node_heartbeat(self):
            this_inst.version))
        # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
        # The heartbeat task will reset the capacity to the system capacity after upgrade.
        stop_local_services(['uwsgi', 'celery', 'beat', 'callback'], communicate=False)
        stop_local_services(communicate=False)
        raise RuntimeError("Shutting down.")
    for other_inst in lost_instances:
        try:
            reaper.reap(other_inst)
        except Exception:
            logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
        try:
            # Capacity could already be 0 because:
            # * It's a new node and it never had a heartbeat
@@ -424,8 +386,8 @@ def cluster_node_heartbeat(self):
            logger.exception(six.text_type('Error marking {} as lost').format(other_inst.hostname))
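Note that the `queue` argument to `@task()` above is the callable `get_local_queuename` rather than a string; `test_apply_with_callable_queuename` later in this diff shows callables being resolved when a message is published, so each node routes these heartbeat-style tasks to its own queue. A sketch under that assumption:

    from awx.main.dispatch import get_local_queuename
    from awx.main.dispatch.publish import task

    @task(queue=get_local_queuename)
    def local_housekeeping():                  # hypothetical example task
        pass

    message, queue = local_housekeeping.apply_async([])
    # queue resolves to this node's own queue (settings.CLUSTER_HOST_ID)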
@shared_task(bind=True)
def awx_isolated_heartbeat(self):
@task(queue=get_local_queuename)
def awx_isolated_heartbeat():
    local_hostname = settings.CLUSTER_HOST_ID
    logger.debug("Controlling node checking for any isolated management tasks.")
    poll_interval = settings.AWX_ISOLATED_PERIODIC_CHECK
@@ -452,8 +414,8 @@ def awx_isolated_heartbeat(self):
    isolated_manager.IsolatedManager.health_check(isolated_instance_qs, awx_application_version)


@shared_task(bind=True, queue=settings.CELERY_DEFAULT_QUEUE)
def awx_periodic_scheduler(self):
@task()
def awx_periodic_scheduler():
    run_now = now()
    state = TowerScheduleState.get_solo()
    last_run = state.schedule_last_run
@@ -503,8 +465,8 @@ def awx_periodic_scheduler(self):
    state.save()


@shared_task(bind=True, queue=settings.CELERY_DEFAULT_QUEUE)
def handle_work_success(self, result, task_actual):
@task()
def handle_work_success(task_actual):
    try:
        instance = UnifiedJob.get_instance_by_type(task_actual['type'], task_actual['id'])
    except ObjectDoesNotExist:
@@ -517,7 +479,7 @@ def handle_work_success(self, result, task_actual):
    run_job_complete.delay(instance.id)


@shared_task(queue=settings.CELERY_DEFAULT_QUEUE)
@task()
def handle_work_error(task_id, *args, **kwargs):
    subtasks = kwargs.get('subtasks', None)
    logger.debug('Executing error task id %s, subtasks: %s' % (task_id, str(subtasks)))
@@ -558,7 +520,7 @@ def handle_work_error(task_id, *args, **kwargs):
        pass
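The success/error hooks lose their celery-specific arguments here: `handle_work_success` now takes only `task_actual`, matching the `callbacks`/`errbacks` message format the task manager publishes earlier in this diff. A sketch of the shape of those messages, with hypothetical values:

    # On job completion the dispatcher presumably invokes each callback
    # entry with its stored kwargs; values below are assumed for illustration.
    callback = {
        'task': 'awx.main.tasks.handle_work_success',   # handle_work_success.name
        'kwargs': {'task_actual': {'type': 'job', 'id': 42}},
    }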
@shared_task(queue=settings.CELERY_DEFAULT_QUEUE)
@task()
def update_inventory_computed_fields(inventory_id, should_update_hosts=True):
    '''
    Signal handler and wrapper around inventory.update_computed_fields to
@@ -578,7 +540,7 @@ def update_inventory_computed_fields(inventory_id, should_update_hosts=True):
        raise


@shared_task(queue=settings.CELERY_DEFAULT_QUEUE)
@task()
def update_host_smart_inventory_memberships():
    try:
        with transaction.atomic():
@@ -603,8 +565,8 @@ def update_host_smart_inventory_memberships():
            smart_inventory.update_computed_fields(update_groups=False, update_hosts=False)


@shared_task(bind=True, queue=settings.CELERY_DEFAULT_QUEUE, max_retries=5)
def delete_inventory(self, inventory_id, user_id):
@task()
def delete_inventory(inventory_id, user_id):
    # Delete inventory as user
    if user_id is None:
        user = None
@@ -629,7 +591,7 @@ def delete_inventory(self, inventory_id, user_id):
        return
    except DatabaseError:
        logger.exception('Database error deleting inventory {}, but will retry.'.format(inventory_id))
        self.retry(countdown=10)
        # TODO: self.retry(countdown=10)


def with_path_cleanup(f):
@@ -650,8 +612,7 @@ def with_path_cleanup(f):
    return _wrapped
class BaseTask(Task):
    name = None
class BaseTask(object):
    model = None
    event_model = None
    abstract = True
@@ -945,14 +906,11 @@ class BaseTask(Task):
        if instance.cancel_flag:
            instance = self.update_model(instance.pk, status='canceled')
        if instance.status != 'running':
            if hasattr(settings, 'CELERY_UNIT_TEST'):
                return
            else:
                # Stop the task chain and prevent starting the job if it has
                # already been canceled.
                instance = self.update_model(pk)
                status = instance.status
                raise RuntimeError('not starting %s task' % instance.status)
            # Stop the task chain and prevent starting the job if it has
            # already been canceled.
            instance = self.update_model(pk)
            status = instance.status
            raise RuntimeError('not starting %s task' % instance.status)

        if not os.path.exists(settings.AWX_PROOT_BASE_PATH):
            raise RuntimeError('AWX_PROOT_BASE_PATH=%s does not exist' % settings.AWX_PROOT_BASE_PATH)
@@ -1085,8 +1043,6 @@ class BaseTask(Task):
            logger.exception(six.text_type('{} Final run hook errored.').format(instance.log_format))
        instance.websocket_emit_status(status)
        if status != 'successful':
            # Raising an exception will mark the job as 'failed' in celery
            # and will stop a task chain from continuing to execute
            if status == 'canceled':
                raise AwxTaskError.TaskCancel(instance, rc)
            else:
@@ -1109,12 +1065,12 @@ class BaseTask(Task):
        return ''


@task()
class RunJob(BaseTask):
    '''
    Celery task to run a job using ansible-playbook.
    Run a job using ansible-playbook.
    '''

    name = 'awx.main.tasks.run_job'
    model = Job
    event_model = JobEvent
    event_data_key = 'job_id'
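`@task()` is applied to the `Run*` classes directly; per the commit message the decorator accepts classes as well as functions, and the functional tests later in this diff dispatch a decorated class by its dotted name and call `run()` on a fresh instance. A sketch of the pattern, with hypothetical names:

    from awx.main.dispatch.publish import task

    @task()
    class RunExample(object):                  # hypothetical task class
        name = 'awx.main.tasks.run_example'

        def run(self, pk):
            print('running %s' % pk)

    # Publishing by class: a dispatcher worker imports the class by name,
    # instantiates it, and invokes run(pk).
    RunExample.apply_async([42])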
@@ -1404,7 +1360,6 @@ class RunJob(BaseTask):
            self.update_model(job.pk, status='failed', job_explanation=error)
            raise RuntimeError(error)
        if job.project and job.project.scm_type:
            job_request_id = '' if self.request.id is None else self.request.id
            pu_ig = job.instance_group
            pu_en = job.execution_node
            if job.is_isolated() is True:
@@ -1417,16 +1372,14 @@ class RunJob(BaseTask):
                    status='running',
                    instance_group = pu_ig,
                    execution_node=pu_en,
                    celery_task_id=job_request_id))
                    celery_task_id=job.celery_task_id))
            # save the associated job before calling run() so that a
            # cancel() call on the job can cancel the project update
            job = self.update_model(job.pk, project_update=local_project_sync)

            project_update_task = local_project_sync._get_task_class()
            try:
                task_instance = project_update_task()
                task_instance.request.id = job_request_id
                task_instance.run(local_project_sync.id)
                project_update_task().run(local_project_sync.id)
                job = self.update_model(job.pk, scm_revision=job.project.scm_revision)
            except Exception:
                local_project_sync.refresh_from_db()
@@ -1436,7 +1389,6 @@ class RunJob(BaseTask):
                                           ('project_update', local_project_sync.name, local_project_sync.id)))
                raise

    def final_run_hook(self, job, status, **kwargs):
        super(RunJob, self).final_run_hook(job, status, **kwargs)
        if job.use_fact_cache:
@@ -1467,9 +1419,9 @@ class RunJob(BaseTask):
            update_inventory_computed_fields.delay(inventory.id, True)


@task()
class RunProjectUpdate(BaseTask):

    name = 'awx.main.tasks.run_project_update'
    model = ProjectUpdate
    event_model = ProjectUpdateEvent
    event_data_key = 'project_update_id'
@@ -1670,7 +1622,6 @@ class RunProjectUpdate(BaseTask):
        return getattr(settings, 'PROJECT_UPDATE_IDLE_TIMEOUT', None)

    def _update_dependent_inventories(self, project_update, dependent_inventory_sources):
        project_request_id = '' if self.request.id is None else self.request.id
        scm_revision = project_update.project.scm_revision
        inv_update_class = InventoryUpdate._get_task_class()
        for inv_src in dependent_inventory_sources:
@@ -1693,13 +1644,10 @@ class RunProjectUpdate(BaseTask):
                    status='running',
                    instance_group=project_update.instance_group,
                    execution_node=project_update.execution_node,
                    celery_task_id=str(project_request_id),
                    source_project_update=project_update))
                    source_project_update=project_update,
                    celery_task_id=project_update.celery_task_id))
            try:
                task_instance = inv_update_class()
                # Runs in the same Celery task as project update
                task_instance.request.id = project_request_id
                task_instance.run(local_inv_update.id)
                inv_update_class().run(local_inv_update.id)
            except Exception:
                logger.exception(six.text_type('{} Unhandled exception updating dependent SCM inventory sources.')
                                 .format(project_update.log_format))
@@ -1804,9 +1752,9 @@ class RunProjectUpdate(BaseTask):
        return getattr(settings, 'AWX_PROOT_ENABLED', False)


@task()
class RunInventoryUpdate(BaseTask):

    name = 'awx.main.tasks.run_inventory_update'
    model = InventoryUpdate
    event_model = InventoryUpdateEvent
    event_data_key = 'inventory_update_id'
@@ -2024,8 +1972,7 @@ class RunInventoryUpdate(BaseTask):
        This dictionary is used by `build_env`, below.
        """
        # Run the superclass implementation.
        super_ = super(RunInventoryUpdate, self).build_passwords
        passwords = super_(inventory_update, **kwargs)
        passwords = super(RunInventoryUpdate, self).build_passwords(inventory_update, **kwargs)

        # Take key fields from the credential in use and add them to the
        # passwords dictionary.
@@ -2188,7 +2135,6 @@ class RunInventoryUpdate(BaseTask):
        if inventory_update.inventory_source:
            source_project = inventory_update.inventory_source.source_project
            if (inventory_update.source=='scm' and inventory_update.launch_type!='scm' and source_project):
                request_id = '' if self.request.id is None else self.request.id
                local_project_sync = source_project.create_project_update(
                    _eager_fields=dict(
                        launch_type="sync",
@@ -2196,16 +2142,14 @@ class RunInventoryUpdate(BaseTask):
                        status='running',
                        execution_node=inventory_update.execution_node,
                        instance_group = inventory_update.instance_group,
                        celery_task_id=request_id))
                        celery_task_id=inventory_update.celery_task_id))
                # associate the inventory update before calling run() so that a
                # cancel() call on the inventory update can cancel the project update
                local_project_sync.scm_inventory_updates.add(inventory_update)

                project_update_task = local_project_sync._get_task_class()
                try:
                    task_instance = project_update_task()
                    task_instance.request.id = request_id
                    task_instance.run(local_project_sync.id)
                    project_update_task().run(local_project_sync.id)
                    inventory_update.inventory_source.scm_last_revision = local_project_sync.project.scm_revision
                    inventory_update.inventory_source.save(update_fields=['scm_last_revision'])
                except Exception:
@@ -2216,12 +2160,12 @@ class RunInventoryUpdate(BaseTask):
                    raise


@task()
class RunAdHocCommand(BaseTask):
    '''
    Celery task to run an ad hoc command using ansible.
    Run an ad hoc command using ansible.
    '''

    name = 'awx.main.tasks.run_ad_hoc_command'
    model = AdHocCommand
    event_model = AdHocCommandEvent
    event_data_key = 'ad_hoc_command_id'
@@ -2382,9 +2326,9 @@ class RunAdHocCommand(BaseTask):
        return getattr(settings, 'AWX_PROOT_ENABLED', False)


@task()
class RunSystemJob(BaseTask):

    name = 'awx.main.tasks.run_system_job'
    model = SystemJob
    event_model = SystemJobEvent
    event_data_key = 'system_job_id'
@@ -2439,9 +2383,9 @@ def _reconstruct_relationships(copy_mapping):
    new_obj.save()


@shared_task(bind=True, queue=settings.CELERY_DEFAULT_QUEUE)
@task()
def deep_copy_model_obj(
    self, model_module, model_name, obj_pk, new_obj_pk,
    model_module, model_name, obj_pk, new_obj_pk,
    user_pk, sub_obj_list, permission_check_func=None
):
    logger.info(six.text_type('Deep copy {} from {} to {}.').format(model_name, obj_pk, new_obj_pk))
@@ -14,7 +14,6 @@ from django.core.urlresolvers import resolve
from django.utils.six.moves.urllib.parse import urlparse
from django.utils import timezone
from django.contrib.auth.models import User
from django.conf import settings
from django.core.serializers.json import DjangoJSONEncoder
from django.db.backends.sqlite3.base import SQLiteCursorWrapper
from jsonbfield.fields import JSONField
@@ -66,17 +65,6 @@ def swagger_autogen(requests=__SWAGGER_REQUESTS__):
    return requests


@pytest.fixture(scope="session", autouse=True)
def celery_memory_broker():
    '''
    FIXME: Not sure how "far" just setting the BROKER_URL will get us.
    We may need to influence CELERY's configuration like we do in the old unit tests (see base.py)

    Allows django signal code to execute without the need for redis
    '''
    settings.BROKER_URL='memory://localhost/'


@pytest.fixture
def user():
    def u(name, is_superuser=False):

@@ -1,13 +1,11 @@
import itertools
import pytest
import mock

# Django
from django.contrib.contenttypes.models import ContentType

# AWX
from awx.main.models import UnifiedJobTemplate, Job, JobTemplate, WorkflowJobTemplate, Project, WorkflowJob, Schedule
from awx.main.models.ha import InstanceGroup


@pytest.mark.django_db
@@ -66,48 +64,6 @@ class TestCreateUnifiedJob:
        assert net_credential in second_job.credentials.all()


@pytest.mark.django_db
class TestIsolatedRuns:

    def test_low_capacity_isolated_instance_selected(self):
        ig = InstanceGroup.objects.create(name='tower')
        iso_ig = InstanceGroup.objects.create(name='thepentagon', controller=ig)
        iso_ig.instances.create(hostname='iso1', capacity=50)
        i2 = iso_ig.instances.create(hostname='iso2', capacity=200)
        job = Job.objects.create(
            instance_group=iso_ig,
            celery_task_id='something',
        )

        mock_async = mock.MagicMock()
        success_callback = mock.MagicMock()
        error_callback = mock.MagicMock()

        class MockTaskClass:
            apply_async = mock_async

        with mock.patch.object(job, '_get_task_class') as task_class:
            task_class.return_value = MockTaskClass
            job.start_celery_task([], error_callback, success_callback, 'thepentagon')
            mock_async.assert_called_with([job.id], [],
                                          link_error=error_callback,
                                          link=success_callback,
                                          queue='thepentagon',
                                          task_id='something')

        i2.capacity = 20
        i2.save()

        with mock.patch.object(job, '_get_task_class') as task_class:
            task_class.return_value = MockTaskClass
            job.start_celery_task([], error_callback, success_callback, 'thepentagon')
            mock_async.assert_called_with([job.id], [],
                                          link_error=error_callback,
                                          link=success_callback,
                                          queue='thepentagon',
                                          task_id='something')


@pytest.mark.django_db
class TestMetaVars:
    '''
@@ -1,19 +1,10 @@
import pytest
import mock
import json
from datetime import timedelta, datetime

from django.core.cache import cache
from django.utils.timezone import now as tz_now
from datetime import timedelta

from awx.main.scheduler import TaskManager
from awx.main.utils import encrypt_field
from awx.main.models import (
    Job,
    Instance,
    WorkflowJob,
)
from awx.main.models.notifications import JobNotificationMixin


@pytest.mark.django_db
@@ -245,140 +236,3 @@ def test_shared_dependencies_launch(default_instance_group, job_template_factory
    iu = [x for x in ii.inventory_updates.all()]
    assert len(pu) == 1
    assert len(iu) == 1


@pytest.mark.django_db
def test_cleanup_interval(mock_cache):
    with mock.patch.multiple('awx.main.scheduler.task_manager.cache', get=mock_cache.get, set=mock_cache.set):
        assert mock_cache.get('last_celery_task_cleanup') is None

        TaskManager().cleanup_inconsistent_celery_tasks()
        last_cleanup = mock_cache.get('last_celery_task_cleanup')
        assert isinstance(last_cleanup, datetime)

        TaskManager().cleanup_inconsistent_celery_tasks()
        assert cache.get('last_celery_task_cleanup') == last_cleanup


class TestReaper():
    @pytest.fixture
    def all_jobs(self, mocker):
        now = tz_now()

        Instance.objects.create(hostname='host1', capacity=100)
        Instance.objects.create(hostname='host2', capacity=100)
        Instance.objects.create(hostname='host3_split', capacity=100)
        Instance.objects.create(hostname='host4_offline', capacity=0)

        j1 = Job.objects.create(status='pending', execution_node='host1')
        j2 = Job.objects.create(status='waiting', celery_task_id='considered_j2')
        j3 = Job.objects.create(status='waiting', celery_task_id='considered_j3')
        j3.modified = now - timedelta(seconds=60)
        j3.save(update_fields=['modified'])
        j4 = Job.objects.create(status='running', celery_task_id='considered_j4', execution_node='host1')
        j5 = Job.objects.create(status='waiting', celery_task_id='reapable_j5')
        j5.modified = now - timedelta(seconds=60)
        j5.save(update_fields=['modified'])
        j6 = Job.objects.create(status='waiting', celery_task_id='considered_j6')
        j6.modified = now - timedelta(seconds=60)
        j6.save(update_fields=['modified'])
        j7 = Job.objects.create(status='running', celery_task_id='considered_j7', execution_node='host2')
        j8 = Job.objects.create(status='running', celery_task_id='reapable_j7', execution_node='host2')
        j9 = Job.objects.create(status='waiting', celery_task_id='reapable_j8')
        j9.modified = now - timedelta(seconds=60)
        j9.save(update_fields=['modified'])
        j10 = Job.objects.create(status='running', celery_task_id='host3_j10', execution_node='host3_split')

        j11 = Job.objects.create(status='running', celery_task_id='host4_j11', execution_node='host4_offline')

        j12 = WorkflowJob.objects.create(status='running', celery_task_id='workflow_job', execution_node='host1')

        js = [j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12]
        for j in js:
            j.save = mocker.Mock(wraps=j.save)
            j.websocket_emit_status = mocker.Mock()
        return js

    @pytest.fixture
    def considered_jobs(self, all_jobs):
        return all_jobs[2:7] + [all_jobs[10]]

    @pytest.fixture
    def running_tasks(self, all_jobs):
        return {
            'host1': [all_jobs[3]],
            'host2': [all_jobs[7], all_jobs[8]],
            'host3_split': [all_jobs[9]],
            'host4_offline': [all_jobs[10]],
        }

    @pytest.fixture
    def waiting_tasks(self, all_jobs):
        return [all_jobs[2], all_jobs[4], all_jobs[5], all_jobs[8]]

    @pytest.fixture
    def reapable_jobs(self, all_jobs):
        return [all_jobs[4], all_jobs[7], all_jobs[10]]

    @pytest.fixture
    def unconsidered_jobs(self, all_jobs):
        return all_jobs[0:1] + all_jobs[5:7]

    @pytest.fixture
    def active_tasks(self):
        return ([], {
            'host1': ['considered_j2', 'considered_j3', 'considered_j4',],
            'host2': ['considered_j6', 'considered_j7'],
        })

    @pytest.mark.django_db
    @mock.patch.object(JobNotificationMixin, 'send_notification_templates')
    @mock.patch.object(TaskManager, 'get_active_tasks', lambda self: ([], []))
    def test_cleanup_inconsistent_task(self, notify, active_tasks, considered_jobs, reapable_jobs, running_tasks, waiting_tasks, mocker, settings):
        settings.AWX_INCONSISTENT_TASK_INTERVAL = 0
        tm = TaskManager()

        tm.get_running_tasks = mocker.Mock(return_value=(running_tasks, waiting_tasks))
        tm.get_active_tasks = mocker.Mock(return_value=active_tasks)

        tm.cleanup_inconsistent_celery_tasks()

        for j in considered_jobs:
            if j not in reapable_jobs:
                j.save.assert_not_called()

        assert notify.call_count == 4
        notify.assert_has_calls([mock.call('failed') for j in reapable_jobs], any_order=True)

        for j in reapable_jobs:
            j.websocket_emit_status.assert_called_once_with('failed')
            assert j.status == 'failed'
            assert j.job_explanation == (
                'Task was marked as running in Tower but was not present in the job queue, so it has been marked as failed.'
            )

    @pytest.mark.django_db
    def test_get_running_tasks(self, all_jobs):
        tm = TaskManager()

        # Ensure the query grabs the expected jobs
        execution_nodes_jobs, waiting_jobs = tm.get_running_tasks()
        assert 'host1' in execution_nodes_jobs
        assert 'host2' in execution_nodes_jobs
        assert 'host3_split' in execution_nodes_jobs

        assert all_jobs[3] in execution_nodes_jobs['host1']

        assert all_jobs[6] in execution_nodes_jobs['host2']
        assert all_jobs[7] in execution_nodes_jobs['host2']

        assert all_jobs[9] in execution_nodes_jobs['host3_split']

        assert all_jobs[10] in execution_nodes_jobs['host4_offline']

        assert all_jobs[11] not in execution_nodes_jobs['host1']

        assert all_jobs[2] in waiting_jobs
        assert all_jobs[4] in waiting_jobs
        assert all_jobs[5] in waiting_jobs
        assert all_jobs[8] in waiting_jobs
@@ -1,12 +1,39 @@
import datetime
import multiprocessing
import random
import sys
from uuid import uuid4
import signal
import time

from django.utils.timezone import now as tz_now
import pytest

from awx.main.dispatch.worker import BaseWorker
from awx.main.dispatch.pool import WorkerPool
from awx.main.models import Job, WorkflowJob, Instance
from awx.main.dispatch import reaper
from awx.main.dispatch.pool import PoolWorker, WorkerPool, AutoscalePool
from awx.main.dispatch.publish import task
from awx.main.dispatch.worker import BaseWorker, TaskWorker


@task()
def add(a, b):
    return a + b


class BaseTask(object):

    def add(self, a, b):
        return add(a, b)


@task()
class Adder(BaseTask):
    def run(self, a, b):
        return super(Adder, self).add(a, b)


@task(queue='hard-math')
def multiply(a, b):
    return a * b


class SimpleWorker(BaseWorker):
@@ -21,6 +48,61 @@ class ResultWriter(BaseWorker):
        result_queue.put(body + '!!!')


class SlowResultWriter(BaseWorker):

    def perform_work(self, body, result_queue):
        time.sleep(3)
        super(SlowResultWriter, self).perform_work(body, result_queue)


class TestPoolWorker:

    def setup_method(self, test_method):
        self.worker = PoolWorker(1000, self.tick, tuple())

    def tick(self):
        self.worker.finished.put(self.worker.queue.get()['uuid'])
        time.sleep(.5)

    def test_qsize(self):
        assert self.worker.qsize == 0
        for i in range(3):
            self.worker.put({'task': 'abc123'})
        assert self.worker.qsize == 3

    def test_put(self):
        assert len(self.worker.managed_tasks) == 0
        assert self.worker.messages_finished == 0
        self.worker.put({'task': 'abc123'})

        assert len(self.worker.managed_tasks) == 1
        assert self.worker.messages_sent == 1

    def test_managed_tasks(self):
        self.worker.put({'task': 'abc123'})
        self.worker.calculate_managed_tasks()
        assert len(self.worker.managed_tasks) == 1

        self.tick()
        self.worker.calculate_managed_tasks()
        assert len(self.worker.managed_tasks) == 0

    def test_current_task(self):
        self.worker.put({'task': 'abc123'})
        assert self.worker.current_task['task'] == 'abc123'

    def test_quit(self):
        self.worker.quit()
        assert self.worker.queue.get() == 'QUIT'

    def test_idle_busy(self):
        assert self.worker.idle is True
        assert self.worker.busy is False
        self.worker.put({'task': 'abc123'})
        assert self.worker.busy is True
        assert self.worker.idle is False


@pytest.mark.django_db
class TestWorkerPool:
@@ -28,37 +110,35 @@ class TestWorkerPool:
        self.pool = WorkerPool(min_workers=3)

    def teardown_method(self, test_method):
        self.pool.stop()
        self.pool.stop(signal.SIGTERM)

    def test_worker(self):
        self.pool.init_workers(SimpleWorker().work_loop)
        assert len(self.pool) == 3
        for worker in self.pool.workers:
            total, _, process = worker
            assert total == 0
            assert process.is_alive() is True
            assert worker.messages_sent == 0
            assert worker.alive is True

    def test_single_task(self):
        self.pool.init_workers(SimpleWorker().work_loop)
        self.pool.write(0, 'xyz')
        assert self.pool.workers[0][0] == 1  # worker at index 0 handled one task
        assert self.pool.workers[1][0] == 0
        assert self.pool.workers[2][0] == 0
        assert self.pool.workers[0].messages_sent == 1  # worker at index 0 handled one task
        assert self.pool.workers[1].messages_sent == 0
        assert self.pool.workers[2].messages_sent == 0

    def test_queue_preference(self):
        self.pool.init_workers(SimpleWorker().work_loop)
        self.pool.write(2, 'xyz')
        assert self.pool.workers[0][0] == 0
        assert self.pool.workers[1][0] == 0
        assert self.pool.workers[2][0] == 1  # worker at index 2 handled one task
        assert self.pool.workers[0].messages_sent == 0
        assert self.pool.workers[1].messages_sent == 0
        assert self.pool.workers[2].messages_sent == 1  # worker at index 2 handled one task

    def test_worker_processing(self):
        result_queue = multiprocessing.Queue()
        self.pool.init_workers(ResultWriter().work_loop, result_queue)
        uuids = []
        for i in range(10):
            self.pool.write(
                random.choice(self.pool.workers)[0],
                random.choice(range(len(self.pool))),
                'Hello, Worker {}'.format(i)
            )
        all_messages = [result_queue.get(timeout=1) for i in range(10)]
@@ -68,5 +148,212 @@ class TestWorkerPool:
            for i in range(10)
        ]

        total_handled = sum([worker[0] for worker in self.pool.workers])
        total_handled = sum([worker.messages_sent for worker in self.pool.workers])
        assert total_handled == 10


@pytest.mark.django_db
class TestAutoScaling:

    def setup_method(self, test_method):
        self.pool = AutoscalePool(min_workers=2, max_workers=10)

    def teardown_method(self, test_method):
        self.pool.stop(signal.SIGTERM)

    def test_scale_up(self):
        result_queue = multiprocessing.Queue()
        self.pool.init_workers(SlowResultWriter().work_loop, result_queue)

        # start with two workers, write an event to each worker and make it busy
        assert len(self.pool) == 2
        for i, w in enumerate(self.pool.workers):
            w.put('Hello, Worker {}'.format(0))
        assert len(self.pool) == 2

        # wait for the subprocesses to start working on their tasks and be marked busy
        time.sleep(1)
        assert self.pool.should_grow

        # write a third message, expect a new worker to spawn because all
        # workers are busy
        self.pool.write(0, 'Hello, Worker {}'.format(2))
        assert len(self.pool) == 3

    def test_scale_down(self):
        self.pool.init_workers(ResultWriter().work_loop, multiprocessing.Queue())

        # start with two workers, and scale up to 10 workers
        assert len(self.pool) == 2
        for i in range(8):
            self.pool.up()
        assert len(self.pool) == 10

        # cleanup should scale back down to the two-worker minimum
        self.pool.cleanup()
        assert len(self.pool) == 2

    def test_max_scale_up(self):
        self.pool.init_workers(ResultWriter().work_loop, multiprocessing.Queue())

        assert len(self.pool) == 2
        for i in range(25):
            self.pool.up()
        assert self.pool.max_workers == 10
        assert self.pool.full is True
        assert len(self.pool) == 10

    def test_equal_worker_distribution(self):
        # if all workers are busy, spawn new workers *before* adding messages
        # to an existing queue
        self.pool.init_workers(SlowResultWriter().work_loop, multiprocessing.Queue)

        # start with two workers, write an event to each worker and make it busy
        assert len(self.pool) == 2
        for i in range(10):
            self.pool.write(0, 'Hello, World!')
        assert len(self.pool) == 10
        for w in self.pool.workers:
            assert w.busy
            assert len(w.managed_tasks) == 1

        # the queue is full at 10, the _next_ write should put the message into
        # a worker's backlog
        assert len(self.pool) == 10
        for w in self.pool.workers:
            assert w.messages_sent == 1
        self.pool.write(0, 'Hello, World!')
        assert len(self.pool) == 10
        assert self.pool.workers[0].messages_sent == 2

    def test_lost_worker_autoscale(self):
        # if a worker exits, it should be replaced automatically up to min_workers
        self.pool.init_workers(ResultWriter().work_loop, multiprocessing.Queue())

        # start with two workers, kill one of them
        assert len(self.pool) == 2
        assert not self.pool.should_grow
        alive_pid = self.pool.workers[1].pid
        self.pool.workers[0].process.terminate()
        time.sleep(1)  # wait a moment for sigterm

        # clean up the dead worker
        self.pool.cleanup()
        assert len(self.pool) == 1
        assert self.pool.workers[0].pid == alive_pid

        # the next queue write should replace the lost worker
        self.pool.write(0, 'Hello, Worker')
        assert len(self.pool) == 2
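The pool behavior exercised above — grow when every worker is busy, shrink back to `min_workers` on `cleanup()`, replace lost workers on the next write — suggests the following standalone usage. This is a sketch assuming only the constructor and methods these tests call:

    import multiprocessing

    from awx.main.dispatch.pool import AutoscalePool
    from awx.main.dispatch.worker import BaseWorker

    class EchoWorker(BaseWorker):              # hypothetical worker
        def perform_work(self, body, result_queue):
            result_queue.put(body)

    queue = multiprocessing.Queue()
    pool = AutoscalePool(min_workers=2, max_workers=10)
    pool.init_workers(EchoWorker().work_loop, queue)
    pool.write(0, 'hello')   # busy pools grow; idle ones shrink on cleanup()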
class TestTaskDispatcher:

    @property
    def tm(self):
        return TaskWorker()

    def test_function_dispatch(self):
        result = self.tm.perform_work({
            'task': 'awx.main.tests.functional.test_dispatch.add',
            'args': [2, 2]
        })
        assert result == 4

    def test_method_dispatch(self):
        result = self.tm.perform_work({
            'task': 'awx.main.tests.functional.test_dispatch.Adder',
            'args': [2, 2]
        })
        assert result == 4


class TestTaskPublisher:

    def test_function_callable(self):
        assert add(2, 2) == 4

    def test_method_callable(self):
        assert Adder().run(2, 2) == 4

    def test_function_apply_async(self):
        message, queue = add.apply_async([2, 2])
        assert message['args'] == [2, 2]
        assert message['kwargs'] == {}
        assert message['task'] == 'awx.main.tests.functional.test_dispatch.add'
        assert queue == 'awx_private_queue'

    def test_method_apply_async(self):
        message, queue = Adder.apply_async([2, 2])
        assert message['args'] == [2, 2]
        assert message['kwargs'] == {}
        assert message['task'] == 'awx.main.tests.functional.test_dispatch.Adder'
        assert queue == 'awx_private_queue'

    def test_apply_with_queue(self):
        message, queue = add.apply_async([2, 2], queue='abc123')
        assert queue == 'abc123'

    def test_queue_defined_in_task_decorator(self):
        message, queue = multiply.apply_async([2, 2])
        assert queue == 'hard-math'

    def test_queue_overridden_from_task_decorator(self):
        message, queue = multiply.apply_async([2, 2], queue='not-so-hard')
        assert queue == 'not-so-hard'

    def test_apply_with_callable_queuename(self):
        message, queue = add.apply_async([2, 2], queue=lambda: 'called')
        assert queue == 'called'


yesterday = tz_now() - datetime.timedelta(days=1)


@pytest.mark.django_db
class TestJobReaper(object):

    @pytest.mark.parametrize('status, execution_node, controller_node, modified, fail', [
        ('running', '', '', None, False),         # running, not assigned to the instance
        ('running', 'awx', '', None, True),       # running, has the instance as its execution_node
        ('running', '', 'awx', None, True),       # running, has the instance as its controller_node
        ('waiting', '', '', None, False),         # waiting, not assigned to the instance
        ('waiting', 'awx', '', None, False),      # waiting, was edited less than a minute ago
        ('waiting', '', 'awx', None, False),      # waiting, was edited less than a minute ago
        ('waiting', 'awx', '', yesterday, True),  # waiting, assigned to the execution_node, stale
        ('waiting', '', 'awx', yesterday, True),  # waiting, assigned to the controller_node, stale
    ])
    def test_should_reap(self, status, fail, execution_node, controller_node, modified):
        i = Instance(hostname='awx')
        i.save()
        j = Job(
            status=status,
            execution_node=execution_node,
            controller_node=controller_node,
            start_args='SENSITIVE',
        )
        j.save()
        if modified:
            # we have to edit the modification time _without_ calling save()
            # (because .save() overwrites it to _now_)
            Job.objects.filter(id=j.id).update(modified=modified)
        reaper.reap(i)
        job = Job.objects.first()
        if fail:
            assert job.status == 'failed'
            assert 'marked as failed' in job.job_explanation
            assert job.start_args == ''
        else:
            assert job.status == status

    def test_workflow_does_not_reap(self):
        i = Instance(hostname='awx')
        i.save()
        j = WorkflowJob(
            status='running',
            execution_node='awx'
        )
        j.save()
        reaper.reap(i)

        assert WorkflowJob.objects.first().status == 'running'
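These tests summarize the reaping rules: `reaper.reap(instance)` fails running jobs tied to an instance and stale waiting jobs, scrubs `start_args`, and leaves workflow jobs alone. A sketch of the call itself, assuming the signature used above:

    from awx.main.dispatch import reaper
    from awx.main.models import Instance

    this_inst = Instance.objects.get(hostname='awx')  # hostname assumed
    reaper.reap(this_inst)  # orphaned jobs -> 'failed', start_args blanked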
@@ -1,69 +0,0 @@
# Copyright (c) 2017 Ansible by Red Hat
# All Rights Reserved.

import mock
import pytest

from django.utils.timezone import now as tz_now
from django.db import DatabaseError

from awx.main.scheduler import TaskManager
from awx.main.models import (
    Job,
    Instance,
    InstanceGroup,
)
from django.core.cache import cache


class TestCleanupInconsistentCeleryTasks():
    @mock.patch.object(cache, 'get', return_value=None)
    @mock.patch.object(TaskManager, 'get_active_tasks', return_value=([], {}))
    @mock.patch.object(TaskManager, 'get_running_tasks', return_value=({'host1': [Job(id=2), Job(id=3),]}, []))
    @mock.patch.object(InstanceGroup.objects, 'prefetch_related', return_value=[])
    @mock.patch.object(Instance.objects, 'filter', return_value=mock.MagicMock(first=lambda: None))
    @mock.patch('awx.main.scheduler.task_manager.logger')
    def test_instance_does_not_exist(self, logger_mock, *args):
        logger_mock.error = mock.MagicMock(side_effect=RuntimeError("mocked"))
        tm = TaskManager()
        with pytest.raises(RuntimeError) as excinfo:
            tm.cleanup_inconsistent_celery_tasks()

        assert "mocked" in str(excinfo.value)
        logger_mock.error.assert_called_once_with("Execution node Instance host1 not found in database. "
                                                  "The node is currently executing jobs ['job 2 (new)', "
                                                  "'job 3 (new)']")

    @mock.patch.object(cache, 'get', return_value=None)
    @mock.patch.object(TaskManager, 'get_active_tasks', return_value=([], {'host1': []}))
    @mock.patch.object(InstanceGroup.objects, 'prefetch_related', return_value=[])
    @mock.patch.object(TaskManager, 'get_running_tasks')
    @mock.patch('awx.main.scheduler.task_manager.logger')
    def test_save_failed(self, logger_mock, get_running_tasks, *args):
        logger_mock.error = mock.MagicMock()
        job = Job(id=2, modified=tz_now(), status='running', celery_task_id='blah', execution_node='host1')
        job.websocket_emit_status = mock.MagicMock()
        get_running_tasks.return_value = ({'host1': [job]}, [])
        tm = TaskManager()

        with mock.patch.object(job, 'save', side_effect=DatabaseError):
            tm.cleanup_inconsistent_celery_tasks()
        job.save.assert_called_once()
        logger_mock.error.assert_called_once_with("Task job 2 (failed) DB error in marking failed. Job possibly deleted.")

    @mock.patch.object(InstanceGroup.objects, 'prefetch_related', return_value=[])
    @mock.patch('awx.main.scheduler.task_manager.Inspect')
    def test_multiple_active_instances_sanity_check(self, inspect_mock, *args):
        class MockInspector:
            pass

        mock_inspector = MockInspector()
        mock_inspector.active = lambda: {
            'celery@host1': [],
            'celery@host2': []
        }
        inspect_mock.return_value = mock_inspector
        tm = TaskManager()
        active_task_queues, queues = tm.get_active_tasks()
        assert 'host1' in queues
        assert 'host2' in queues
@@ -67,7 +67,7 @@ def test_work_success_callback_missing_job():
    task_data = {'type': 'project_update', 'id': 9999}
    with mock.patch('django.db.models.query.QuerySet.get') as get_mock:
        get_mock.side_effect = ProjectUpdate.DoesNotExist()
        assert tasks.handle_work_success(None, task_data) is None
        assert tasks.handle_work_success(task_data) is None


def test_send_notifications_list(mocker):

@@ -8,8 +8,8 @@ def test_produce_supervisor_command(mocker):
    mock_process.communicate = communicate_mock
    Popen_mock = mocker.MagicMock(return_value=mock_process)
    with mocker.patch.object(reload.subprocess, 'Popen', Popen_mock):
        reload._supervisor_service_command(['beat', 'callback', 'fact'], "restart")
        reload._supervisor_service_command("restart")
        reload.subprocess.Popen.assert_called_once_with(
            ['supervisorctl', 'restart', 'tower-processes:receiver',],
            ['supervisorctl', 'restart', 'tower-processes:*',],
            stderr=-1, stdin=-1, stdout=-1)


@@ -1,27 +0,0 @@
from celery.utils.log import get_logger
from celery.worker.autoscale import Autoscaler, AUTOSCALE_KEEPALIVE
from django.conf import settings
import psutil

logger = get_logger('awx.main.tasks')


class DynamicAutoScaler(Autoscaler):

    def __init__(self, pool, max_concurrency, min_concurrency=0, worker=None,
                 keepalive=AUTOSCALE_KEEPALIVE, mutex=None):
        super(DynamicAutoScaler, self).__init__(pool, max_concurrency,
                                                min_concurrency, worker,
                                                keepalive, mutex)
        settings_absmem = getattr(settings, 'SYSTEM_TASK_ABS_MEM', None)
        if settings_absmem is not None:
            total_memory_gb = int(settings_absmem)
        else:
            total_memory_gb = (psutil.virtual_memory().total >> 30) + 1  # noqa: round up

        # 5 workers per GB of total memory
        self.max_concurrency = min(max_concurrency, (total_memory_gb * 5))
        logger.warn('celery worker dynamic --autoscale={},{}'.format(
            self.max_concurrency,
            self.min_concurrency
        ))
||||
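The deleted DynamicAutoScaler capped celery's worker count at five processes per GB of memory. If the same heuristic is wanted for sizing a replacement worker pool, a standalone sketch could look like this (the function name and hard_cap default are illustrative assumptions):

    import psutil

    def pool_ceiling(settings_absmem=None, hard_cap=50):
        # Prefer an explicitly configured memory size (SYSTEM_TASK_ABS_MEM);
        # otherwise round total physical memory up to the nearest GB.
        if settings_absmem is not None:
            total_memory_gb = int(settings_absmem)
        else:
            total_memory_gb = (psutil.virtual_memory().total >> 30) + 1
        # 5 workers per GB of total memory, bounded above.
        return min(hard_cap, total_memory_gb * 5)
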
@@ -1,17 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright (c) 2017 Ansible Tower by Red Hat
-# All Rights Reserved.
-
-from django.conf import settings
-
-
-class AWXCeleryRouter(object):
-    def route_for_task(self, task, args=None, kwargs=None):
-        tasks = [
-            'awx.main.tasks.cluster_node_heartbeat',
-            'awx.main.tasks.purge_old_stdout_files',
-            'awx.main.tasks.awx_isolated_heartbeat',
-        ]
-        if task in tasks:
-            return {'queue': settings.CLUSTER_HOST_ID, 'routing_key': settings.CLUSTER_HOST_ID}
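The deleted router pinned a handful of housekeeping tasks (heartbeats, stdout cleanup) to the queue named after this node's CLUSTER_HOST_ID. Without it, a publisher has to target that queue explicitly; a hedged kombu sketch follows (the function and payload shape are assumptions, not the new publisher's actual API):

    from django.conf import settings
    from kombu import Connection, Exchange, Producer, Queue

    def publish_to_this_node(body):
        # Direct exchange and queue keyed on the node's hostname, mirroring
        # the routing the removed AWXCeleryRouter performed.
        name = settings.CLUSTER_HOST_ID
        queue = Queue(name, Exchange(name, type='direct'), routing_key=name)
        with Connection(settings.BROKER_URL) as conn:
            producer = Producer(conn)
            producer.publish(body, serializer='json', exchange=queue.exchange,
                             routing_key=queue.routing_key, declare=[queue])
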
@@ -11,11 +8,8 @@ from django.conf import settings
 logger = logging.getLogger('awx.main.utils.reload')


-def _supervisor_service_command(service_internal_names, command, communicate=True):
+def _supervisor_service_command(command, communicate=True):
     '''
-    Service internal name options:
-     - beat - celery - callback - channels - uwsgi - daphne
-     - fact - nginx
     example use pattern of supervisorctl:
     # supervisorctl restart tower-processes:receiver tower-processes:factcacher
     '''
@@ -25,13 +22,7 @@ def _supervisor_service_command(service_internal_names, command, communicate=True):
     args = ['supervisorctl']
     if settings.DEBUG:
         args.extend(['-c', '/supervisor.conf'])
-    programs = []
-    name_translation_dict = settings.SERVICE_NAME_DICT
-    for n in service_internal_names:
-        if n in name_translation_dict:
-            programs.append('{}:{}'.format(group_name, name_translation_dict[n]))
-    args.extend([command])
-    args.extend(programs)
+    args.extend([command, '{}:*'.format(group_name)])
     logger.debug('Issuing command to {} services, args={}'.format(command, args))
     supervisor_process = subprocess.Popen(args, stdin=subprocess.PIPE,
                                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -48,6 +39,6 @@ def _supervisor_service_command(service_internal_names, command, communicate=True):
     logger.info('Submitted supervisorctl {} command, not waiting for result'.format(command))


-def stop_local_services(service_internal_names, communicate=True):
-    logger.warn('Stopping services {} on this node in response to user action'.format(service_internal_names))
-    _supervisor_service_command(service_internal_names, command='stop', communicate=communicate)
+def stop_local_services(communicate=True):
+    logger.warn('Stopping services on this node in response to user action')
+    _supervisor_service_command(command='stop', communicate=communicate)

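With the per-service name translation gone, restarts and stops now address the whole supervisor process group with a wildcard. The equivalent shell invocation, sketched as a one-off helper (illustrative only, not the module itself):

    import subprocess

    def restart_group(group_name='tower-processes'):
        # Equivalent to: supervisorctl restart tower-processes:*
        args = ['supervisorctl', 'restart', '{}:*'.format(group_name)]
        return subprocess.Popen(args, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
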
@@ -4,7 +4,6 @@
 import os
 import re  # noqa
 import sys
-import djcelery
 import six
 from datetime import timedelta

@@ -26,6 +25,8 @@ BASE_DIR = os.path.dirname(os.path.dirname(__file__))
 def is_testing(argv=None):
     import sys
     '''Return True if running django or py.test unit tests.'''
+    if 'PYTEST_CURRENT_TEST' in os.environ.keys():
+        return True
     argv = sys.argv if argv is None else argv
     if len(argv) >= 1 and ('py.test' in argv[0] or 'py/test.py' in argv[0]):
         return True
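pytest exports PYTEST_CURRENT_TEST into the environment while each test runs, so the added check short-circuits before any argv inspection; roughly (assuming is_testing is in scope, and with an illustrative test id):

    import os

    os.environ['PYTEST_CURRENT_TEST'] = 'awx/main/tests/unit/test_x.py::test_y (call)'
    assert is_testing([]) is True
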
@@ -60,7 +61,7 @@ DATABASES = {
         'NAME': os.path.join(BASE_DIR, 'awx.sqlite3'),
         'ATOMIC_REQUESTS': True,
         'TEST': {
-            # Test database cannot be :memory: for celery/inventory tests.
+            # Test database cannot be :memory: for inventory tests.
             'NAME': os.path.join(BASE_DIR, 'awx_test.sqlite3'),
         },
     }
@@ -280,7 +281,6 @@ INSTALLED_APPS = (
     'oauth2_provider',
     'rest_framework',
     'django_extensions',
-    'djcelery',
     'channels',
     'polymorphic',
     'taggit',
@@ -459,40 +459,9 @@ DEVSERVER_DEFAULT_PORT = '8013'
 # Set default ports for live server tests.
 os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199')

-djcelery.setup_loader()
-
-BROKER_POOL_LIMIT = None
 BROKER_URL = 'amqp://guest:guest@localhost:5672//'
 CELERY_EVENT_QUEUE_TTL = 5
-CELERY_DEFAULT_QUEUE = 'awx_private_queue'
-CELERY_DEFAULT_EXCHANGE = 'awx_private_queue'
-CELERY_DEFAULT_ROUTING_KEY = 'awx_private_queue'
-CELERY_DEFAULT_EXCHANGE_TYPE = 'direct'
-CELERY_TASK_SERIALIZER = 'json'
-CELERY_RESULT_SERIALIZER = 'json'
-CELERY_ACCEPT_CONTENT = ['json']
-CELERY_TRACK_STARTED = True
-CELERYD_TASK_TIME_LIMIT = None
-CELERYD_TASK_SOFT_TIME_LIMIT = None
-CELERYD_POOL_RESTARTS = True
-CELERYD_AUTOSCALER = 'awx.main.utils.autoscale:DynamicAutoScaler'
-CELERY_RESULT_BACKEND = 'djcelery.backends.database:DatabaseBackend'
-CELERY_IMPORTS = ('awx.main.scheduler.tasks',)
-CELERY_QUEUES = ()
-CELERY_ROUTES = ('awx.main.utils.ha.AWXCeleryRouter',)
-
-
-def log_celery_failure(*args):
-    # Import annotations lazily to avoid polluting the `awx.settings` namespace
-    # and causing circular imports
-    from awx.main.tasks import log_celery_failure
-    return log_celery_failure(*args)
-
-
-CELERY_ANNOTATIONS = {'*': {'on_failure': log_celery_failure}}
-
-CELERYBEAT_SCHEDULER = 'celery.beat.PersistentScheduler'
-CELERYBEAT_MAX_LOOP_INTERVAL = 60
 CELERYBEAT_SCHEDULE = {
     'tower_scheduler': {
         'task': 'awx.main.tasks.awx_periodic_scheduler',
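Of the block above, only BROKER_URL and CELERY_EVENT_QUEUE_TTL survive; every celery knob, the failure annotation, and the beat tuning go away. For context, a minimal kombu consumer bound to that broker might look like this (queue name and callback are hypothetical, not the new worker's implementation):

    from kombu import Connection, Queue
    from kombu.mixins import ConsumerMixin

    class EchoWorker(ConsumerMixin):
        def __init__(self, connection, queue_name):
            self.connection = connection
            self.queue = Queue(queue_name)

        def get_consumers(self, Consumer, channel):
            return [Consumer(queues=[self.queue], callbacks=[self.on_message])]

        def on_message(self, body, message):
            # Acknowledge after handling so the broker can redeliver on crash.
            print('received: {}'.format(body))
            message.ack()

    # EchoWorker(Connection('amqp://guest:guest@localhost:5672//'), 'example_queue').run()
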
@@ -525,9 +494,6 @@ CELERYBEAT_SCHEDULE = {
 }
 AWX_INCONSISTENT_TASK_INTERVAL = 60 * 3

-# Celery queues that will always be listened to by celery workers
-# Note: Broadcast queues have unique, auto-generated names, with the alias
-# property value of the original queue name.
-AWX_CELERY_QUEUES_STATIC = [
-    six.text_type(CELERY_DEFAULT_QUEUE),
-]
@@ -626,8 +592,8 @@ SOCIAL_AUTH_SAML_ENABLED_IDPS = {}
 SOCIAL_AUTH_SAML_ORGANIZATION_ATTR = {}
 SOCIAL_AUTH_SAML_TEAM_ATTR = {}

-# Any ANSIBLE_* settings will be passed to the subprocess environment by the
-# celery task.
+# Any ANSIBLE_* settings will be passed to the task runner subprocess
+# environment

 # Do not want AWX to ask interactive questions and want it to be friendly with
 # reprovisioning
@@ -641,8 +607,7 @@ ANSIBLE_PARAMIKO_RECORD_HOST_KEYS = False
 # output
 ANSIBLE_FORCE_COLOR = True

-# Additional environment variables to be passed to the subprocess started by
-# the celery task.
+# Additional environment variables to be passed to the ansible subprocesses
 AWX_TASK_ENV = {}

 # Flag to enable/disable updating hosts M2M when saving job events.
@@ -1071,6 +1036,15 @@ LOGGING = {
            'backupCount': 5,
            'formatter':'simple',
        },
        'callback_receiver': {
            'level': 'WARNING',
            'class':'logging.handlers.RotatingFileHandler',
            'filters': ['require_debug_false'],
            'filename': os.path.join(LOG_ROOT, 'callback_receiver.log'),
            'maxBytes': 1024 * 1024 * 5, # 5 MB
            'backupCount': 5,
            'formatter':'simple',
        },
        'dispatcher': {
            'level': 'WARNING',
            'class':'logging.handlers.RotatingFileHandler',
@@ -1080,6 +1054,10 @@ LOGGING = {
            'backupCount': 5,
            'formatter':'dispatcher',
        },
        'celery.beat': {
            'class':'logging.StreamHandler',
            'level': 'ERROR'
        },  # don't log every celerybeat wakeup
        'inventory_import': {
            'level': 'DEBUG',
            'class':'logging.StreamHandler',
@@ -1162,6 +1140,9 @@ LOGGING = {
         'awx.main': {
             'handlers': ['null']
         },
         'awx.main.commands.run_callback_receiver': {
             'handlers': ['callback_receiver'],
         },
+        'awx.main.dispatch': {
+            'handlers': ['dispatcher'],
+        },

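With the new handler and logger registered, dispatcher code can log through the standard hierarchy, e.g. (illustrative):

    import logging

    logger = logging.getLogger('awx.main.dispatch')
    logger.warning('worker pool scaled up')  # routed to dispatcher.log by the handler above
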
@@ -68,13 +68,6 @@ template['OPTIONS']['loaders'] = (
     'django.template.loaders.app_directories.Loader',
 )

-# Disable capturing all SQL queries when running celeryd in development.
-if 'celery' in sys.argv:
-    SQL_DEBUG = False
-
-CELERYD_HIJACK_ROOT_LOGGER = False
-CELERYD_LOG_COLOR = True
-
 CALLBACK_QUEUE = "callback_tasks"

 # Enable dynamically pulling roles from a requirement.yml file
@@ -149,15 +142,6 @@ except ImportError:

 CLUSTER_HOST_ID = socket.gethostname()

-# Supervisor service name dictionary used for programatic restart
-SERVICE_NAME_DICT = {
-    "celery": "celery",
-    "callback": "receiver",
-    "runworker": "channels",
-    "uwsgi": "uwsgi",
-    "daphne": "daphne",
-    "nginx": "nginx"}
-
 try:
     socket.gethostbyname('docker.for.mac.host.internal')
     os.environ['SDB_NOTIFY_HOST'] = 'docker.for.mac.host.internal'

@@ -73,13 +73,13 @@ if "pytest" in sys.modules:
         'ENGINE': 'django.db.backends.sqlite3',
         'NAME': os.path.join(BASE_DIR, 'awx.sqlite3'),
         'TEST': {
-            # Test database cannot be :memory: for celery/inventory tests.
+            # Test database cannot be :memory: for inventory tests.
             'NAME': os.path.join(BASE_DIR, 'awx_test.sqlite3'),
         },
     }
 }

-# Celery AMQP configuration.
+# AMQP configuration.
 BROKER_URL = "amqp://{}:{}@{}/{}".format(os.environ.get("RABBITMQ_USER"),
                                          os.environ.get("RABBITMQ_PASS"),
                                          os.environ.get("RABBITMQ_HOST"),
@@ -138,8 +138,7 @@ REMOTE_HOST_HEADERS = ['REMOTE_ADDR', 'REMOTE_HOST']
 # REMOTE_HOST_HEADERS will be trusted unconditionally')
 PROXY_IP_WHITELIST = []

-# Define additional environment variables to be passed to subprocess started by
-# the celery task.
+# Define additional environment variables to be passed to ansible subprocesses
 #AWX_TASK_ENV['FOO'] = 'BAR'

 # If set, use -vvv for project updates instead of -v for more output.

@@ -39,13 +39,13 @@ if is_testing(sys.argv):
         'ENGINE': 'django.db.backends.sqlite3',
         'NAME': os.path.join(BASE_DIR, 'awx.sqlite3'),
         'TEST': {
-            # Test database cannot be :memory: for celery/inventory tests.
+            # Test database cannot be :memory: for tests.
             'NAME': os.path.join(BASE_DIR, 'awx_test.sqlite3'),
         },
     }
 }

-# Celery AMQP configuration.
+# AMQP configuration.
 BROKER_URL = 'amqp://guest:guest@localhost:5672'

 # Set True to enable additional logging from the job_event_callback plugin
@@ -94,8 +94,7 @@ REMOTE_HOST_HEADERS = ['REMOTE_ADDR', 'REMOTE_HOST']
 # REMOTE_HOST_HEADERS will be trusted unconditionally')
 PROXY_IP_WHITELIST = []

-# Define additional environment variables to be passed to subprocess started by
-# the celery task.
+# Define additional environment variables to be passed to ansible subprocesses
 #AWX_TASK_ENV['FOO'] = 'BAR'

 # If set, use -vvv for project updates instead of -v for more output.

@@ -54,21 +54,13 @@ AWX_ISOLATED_USERNAME = 'awx'

 LOGGING['handlers']['tower_warnings']['filename'] = '/var/log/tower/tower.log'
 LOGGING['handlers']['callback_receiver']['filename'] = '/var/log/tower/callback_receiver.log'
+LOGGING['handlers']['dispatcher']['filename'] = '/var/log/tower/dispatcher.log'
 LOGGING['handlers']['task_system']['filename'] = '/var/log/tower/task_system.log'
 LOGGING['handlers']['fact_receiver']['filename'] = '/var/log/tower/fact_receiver.log'
 LOGGING['handlers']['management_playbooks']['filename'] = '/var/log/tower/management_playbooks.log'
 LOGGING['handlers']['system_tracking_migrations']['filename'] = '/var/log/tower/tower_system_tracking_migrations.log'
 LOGGING['handlers']['rbac_migrations']['filename'] = '/var/log/tower/tower_rbac_migrations.log'

-# Supervisor service name dictionary used for programatic restart
-SERVICE_NAME_DICT = {
-    "beat": "awx-celery-beat",
-    "celery": "awx-celery",
-    "callback": "awx-callback-receiver",
-    "channels": "awx-channels-worker",
-    "uwsgi": "awx-uwsgi",
-    "daphne": "awx-daphne"}
-
 # Store a snapshot of default settings at this point before loading any
 # customizable config files.
 DEFAULTS_SNAPSHOT = {}