Fix a bug that breaks job cancellation on single-node installs

To reproduce: 1. Install AWX with a single node. 2. Start a long-running job. 3. Forcibly kill the `awx-manage run_dispatcher` process (e.g., with SIGKILL) and do not start it again. 4. The job remains in the `running` state; without a second cluster node to discover the job, it is never reaped. 5. With this PR, you can cancel such a job from the UI and the API.
2026-02-25 15:06:02 -03:30 · 2018-10-18 16:05:12 -04:00
parent 785c6fe846
commit 3be9113d6b
3 changed files with 47 additions and 12 deletions
--- a/awx/main/models/unified_jobs.py
+++ b/awx/main/models/unified_jobs.py
@@ -7,9 +7,11 @@ import json
 import logging
 import os
 import re
+import socket
 import subprocess
 import tempfile
 from collections import OrderedDict
+import six

 # Django
 from django.conf import settings
@@ -29,6 +31,7 @@ from polymorphic.models import PolymorphicModel

 # AWX
 from awx.main.models.base import * # noqa
+from awx.main.dispatch.control import Control as ControlDispatcher
 from awx.main.models.mixins import ResourceMixin, TaskManagerUnifiedJobMixin
 from awx.main.utils import (
    encrypt_dict, decrypt_field, _inventory_updates,
@@ -1248,6 +1251,31 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
        # Done!
        return True

+
+    @property
+    def actually_running(self):
+        # returns True if the job is running in the appropriate dispatcher process
+        running = False
+        if all([
+            self.status == 'running',
+            self.celery_task_id,
+            self.execution_node
+        ]):
+            # If the job is marked as running, but the dispatcher
+            # doesn't know about it (or the dispatcher doesn't reply),
+            # report it as not running so the caller can mark it canceled
+            timeout = 5
+            try:
+                running = self.celery_task_id in ControlDispatcher(
+                    'dispatcher', self.execution_node
+                ).running(timeout=timeout)
+            except socket.timeout:
+                logger.error(six.text_type(
+                    'could not reach dispatcher on {} within {}s'
+                ).format(self.execution_node, timeout))
+                running = False
+        return running
+
    @property
    def can_cancel(self):
        return bool(self.status in CAN_CANCEL)
@@ -1270,6 +1298,9 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique
                if self.status in ('pending', 'waiting', 'new'):
                    self.status = 'canceled'
                    cancel_fields.append('status')
+                if self.status == 'running' and not self.actually_running:
+                    self.status = 'canceled'
+                    cancel_fields.append('status')
                if job_explanation is not None:
                    self.job_explanation = job_explanation
                    cancel_fields.append('job_explanation')
--- a/awx/main/models/workflow.py
+++ b/awx/main/models/workflow.py
@@ -481,3 +481,9 @@ class WorkflowJob(UnifiedJob, WorkflowJobOptions, SurveyJobMixin, JobNotificatio
    @property
    def preferred_instance_groups(self):
        return []
+
+    @property
+    def actually_running(self):
+        # WorkflowJobs don't _actually_ run anything in the dispatcher, so
+        # there's no point in asking the dispatcher if it knows about this task
+        return self.status == 'running'