mirror of
https://github.com/ansible/awx.git
synced 2026-02-25 15:06:02 -03:30
fix a bug that breaks job cancel on single node jobs
1. Install awx w/ a single node.
2. Start a long-running job.
3. Forcibly kill the `awx-manage run_dispatcher` process (e.g.,
SIGKILL) and do not start it again.
4. The job remains in running - without a second cluster to discover
the job, it is never reaped.
5. This PR allows you to cancel the job from the UI+API.
This commit is contained in:
@@ -7,9 +7,11 @@ import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
import tempfile
|
||||
from collections import OrderedDict
|
||||
import six
|
||||
|
||||
# Django
|
||||
from django.conf import settings
|
||||
@@ -29,6 +31,7 @@ from polymorphic.models import PolymorphicModel
|
||||
|
||||
# AWX
|
||||
from awx.main.models.base import * # noqa
|
||||
from awx.main.dispatch.control import Control as ControlDispatcher
|
||||
from awx.main.models.mixins import ResourceMixin, TaskManagerUnifiedJobMixin
|
||||
from awx.main.utils import (
|
||||
encrypt_dict, decrypt_field, _inventory_updates,
|
||||
@@ -1248,6 +1251,31 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
|
||||
# Done!
|
||||
return True
|
||||
|
||||
|
||||
@property
|
||||
def actually_running(self):
|
||||
# returns True if the job is running in the appropriate dispatcher process
|
||||
running = False
|
||||
if all([
|
||||
self.status == 'running',
|
||||
self.celery_task_id,
|
||||
self.execution_node
|
||||
]):
|
||||
# If the job is marked as running, but the dispatcher
|
||||
# doesn't know about it (or the dispatcher doesn't reply),
|
||||
# then cancel the job
|
||||
timeout = 5
|
||||
try:
|
||||
running = self.celery_task_id in ControlDispatcher(
|
||||
'dispatcher', self.execution_node
|
||||
).running(timeout=timeout)
|
||||
except socket.timeout:
|
||||
logger.error(six.text_type(
|
||||
'could not reach dispatcher on {} within {}s'
|
||||
).format(self.execution_node, timeout))
|
||||
running = False
|
||||
return running
|
||||
|
||||
@property
|
||||
def can_cancel(self):
|
||||
return bool(self.status in CAN_CANCEL)
|
||||
@@ -1270,6 +1298,9 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
|
||||
if self.status in ('pending', 'waiting', 'new'):
|
||||
self.status = 'canceled'
|
||||
cancel_fields.append('status')
|
||||
if self.status == 'running' and not self.actually_running:
|
||||
self.status = 'canceled'
|
||||
cancel_fields.append('status')
|
||||
if job_explanation is not None:
|
||||
self.job_explanation = job_explanation
|
||||
cancel_fields.append('job_explanation')
|
||||
|
||||
@@ -481,3 +481,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
|
||||
@property
|
||||
def preferred_instance_groups(self):
|
||||
return []
|
||||
|
||||
@property
|
||||
def actually_running(self):
|
||||
# WorkflowJobs don't _actually_ run anything in the dispatcher, so
|
||||
# there's no point in asking the dispatcher if it knows about this task
|
||||
return self.status == 'running'
|
||||
|
||||
Reference in New Issue
Block a user