fix a bug that breaks job cancel on single node jobs

1.  Install awx w/ a single node.
2.  Start a long-running job.
3.  Forcibly kill the `awx-manage run_dispatcher` process (e.g.,
    SIGKILL) and do not start it again.
4.  The job remains in running - without a second cluster to discover
    the job, it is never reaped.
5.  This PR allows you to cancel the job from the UI+API.
This commit is contained in:
Ryan Petrello 2018-10-18 16:05:12 -04:00
parent 785c6fe846
commit 3be9113d6b
No known key found for this signature in database
GPG Key ID: F2AA5F2122351777
3 changed files with 47 additions and 12 deletions

View File

@ -14,18 +14,18 @@ class Control(object):
services = ('dispatcher', 'callback_receiver')
result = None
def __init__(self, service):
def __init__(self, service, host=None):
if service not in self.services:
raise RuntimeError('{} must be in {}'.format(service, self.services))
self.service = service
queuename = get_local_queuename()
self.queue = Queue(queuename, Exchange(queuename), routing_key=queuename)
self.queuename = host or get_local_queuename()
self.queue = Queue(self.queuename, Exchange(self.queuename), routing_key=self.queuename)
def publish(self, msg, conn, host, **kwargs):
def publish(self, msg, conn, **kwargs):
producer = Producer(
exchange=self.queue.exchange,
channel=conn,
routing_key=get_local_queuename()
routing_key=self.queuename
)
producer.publish(msg, expiration=5, **kwargs)
@ -35,14 +35,13 @@ class Control(object):
def running(self, *args, **kwargs):
return self.control_with_reply('running', *args, **kwargs)
def control_with_reply(self, command, host=None, timeout=5):
host = host or settings.CLUSTER_HOST_ID
logger.warn('checking {} {} for {}'.format(self.service, command, host))
def control_with_reply(self, command, timeout=5):
logger.warn('checking {} {} for {}'.format(self.service, command, self.queuename))
reply_queue = Queue(name="amq.rabbitmq.reply-to")
self.result = None
with Connection(settings.BROKER_URL) as conn:
with Consumer(conn, reply_queue, callbacks=[self.process_message], no_ack=True):
self.publish({'control': command}, conn, host, reply_to='amq.rabbitmq.reply-to')
self.publish({'control': command}, conn, reply_to='amq.rabbitmq.reply-to')
try:
conn.drain_events(timeout=timeout)
except socket.timeout:
@ -50,10 +49,9 @@ class Control(object):
raise
return self.result
def control(self, msg, host=None, **kwargs):
host = host or settings.CLUSTER_HOST_ID
def control(self, msg, **kwargs):
with Connection(settings.BROKER_URL) as conn:
self.publish(msg, conn, host)
self.publish(msg, conn)
def process_message(self, body, message):
self.result = body

View File

@ -7,9 +7,11 @@ import json
import logging
import os
import re
import socket
import subprocess
import tempfile
from collections import OrderedDict
import six
# Django
from django.conf import settings
@ -29,6 +31,7 @@ from polymorphic.models import PolymorphicModel
# AWX
from awx.main.models.base import * # noqa
from awx.main.dispatch.control import Control as ControlDispatcher
from awx.main.models.mixins import ResourceMixin, TaskManagerUnifiedJobMixin
from awx.main.utils import (
encrypt_dict, decrypt_field, _inventory_updates,
@ -1248,6 +1251,31 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
# Done!
return True
@property
def actually_running(self):
# returns True if the job is running in the appropriate dispatcher process
running = False
if all([
self.status == 'running',
self.celery_task_id,
self.execution_node
]):
# If the job is marked as running, but the dispatcher
# doesn't know about it (or the dispatcher doesn't reply),
# then cancel the job
timeout = 5
try:
running = self.celery_task_id in ControlDispatcher(
'dispatcher', self.execution_node
).running(timeout=timeout)
except socket.timeout:
logger.error(six.text_type(
'could not reach dispatcher on {} within {}s'
).format(self.execution_node, timeout))
running = False
return running
@property
def can_cancel(self):
return bool(self.status in CAN_CANCEL)
@ -1270,6 +1298,9 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
if self.status in ('pending', 'waiting', 'new'):
self.status = 'canceled'
cancel_fields.append('status')
if self.status == 'running' and not self.actually_running:
self.status = 'canceled'
cancel_fields.append('status')
if job_explanation is not None:
self.job_explanation = job_explanation
cancel_fields.append('job_explanation')

View File

@ -481,3 +481,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
@property
def preferred_instance_groups(self):
return []
@property
def actually_running(self):
# WorkflowJobs don't _actually_ run anything in the dispatcher, so
# there's no point in asking the dispatcher if it knows about this task
return self.status == 'running'