fix a bug that breaks job cancel on single node jobs

1.  Install awx w/ a single node.
2.  Start a long-running job.
3.  Forcibly kill the `awx-manage run_dispatcher` process (e.g.,
    SIGKILL) and do not start it again.
4.  The job remains in running - without a second cluster to discover
    the job, it is never reaped.
5.  This PR allows you to cancel the job from the UI+API.
This commit is contained in:
Ryan Petrello
2018-10-18 16:05:12 -04:00
parent 785c6fe846
commit 3be9113d6b
3 changed files with 47 additions and 12 deletions

View File

@@ -7,9 +7,11 @@ import json
import logging
import os
import re
import socket
import subprocess
import tempfile
from collections import OrderedDict
import six
# Django
from django.conf import settings
@@ -29,6 +31,7 @@ from polymorphic.models import PolymorphicModel
# AWX
from awx.main.models.base import * # noqa
from awx.main.dispatch.control import Control as ControlDispatcher
from awx.main.models.mixins import ResourceMixin, TaskManagerUnifiedJobMixin
from awx.main.utils import (
encrypt_dict, decrypt_field, _inventory_updates,
@@ -1248,6 +1251,31 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
# Done!
return True
@property
def actually_running(self):
# returns True if the job is running in the appropriate dispatcher process
running = False
if all([
self.status == 'running',
self.celery_task_id,
self.execution_node
]):
# If the job is marked as running, but the dispatcher
# doesn't know about it (or the dispatcher doesn't reply),
# then cancel the job
timeout = 5
try:
running = self.celery_task_id in ControlDispatcher(
'dispatcher', self.execution_node
).running(timeout=timeout)
except socket.timeout:
logger.error(six.text_type(
'could not reach dispatcher on {} within {}s'
).format(self.execution_node, timeout))
running = False
return running
@property
def can_cancel(self):
return bool(self.status in CAN_CANCEL)
@@ -1270,6 +1298,9 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
if self.status in ('pending', 'waiting', 'new'):
self.status = 'canceled'
cancel_fields.append('status')
if self.status == 'running' and not self.actually_running:
self.status = 'canceled'
cancel_fields.append('status')
if job_explanation is not None:
self.job_explanation = job_explanation
cancel_fields.append('job_explanation')

View File

@@ -481,3 +481,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
@property
def preferred_instance_groups(self):
return []
@property
def actually_running(self):
# WorkflowJobs don't _actually_ run anything in the dispatcher, so
# there's no point in asking the dispatcher if it knows about this task
return self.status == 'running'