From a4a3ba65d736045733cb49430d7076b73aec23bb Mon Sep 17 00:00:00 2001 From: Amol Gautam Date: Wed, 12 Jan 2022 15:40:32 -0500 Subject: [PATCH] Refactored tasks.py to a package --- Added 3 new sub-package : awx.main.tasks.system , awx.main.tasks.jobs , awx.main.tasks.receptor --- Modified the functional tests and unit tests accordingly --- awx/api/generics.py | 2 +- awx/api/views/__init__.py | 6 +- awx/conf/views.py | 2 +- awx/main/dispatch/worker/callback.py | 2 +- awx/main/dispatch/worker/task.py | 12 +- awx/main/management/commands/profile_sql.py | 2 +- awx/main/models/ad_hoc_commands.py | 2 +- awx/main/models/events.py | 2 +- awx/main/models/ha.py | 2 +- awx/main/models/inventory.py | 10 +- awx/main/models/jobs.py | 4 +- awx/main/models/notifications.py | 2 +- awx/main/models/projects.py | 4 +- awx/main/models/unified_jobs.py | 2 +- awx/main/models/workflow.py | 2 +- awx/main/scheduler/task_manager.py | 2 +- awx/main/signals.py | 2 +- awx/main/tasks/__init__.py | 0 awx/main/{tasks.py => tasks/jobs.py} | 1217 +---------------- awx/main/tasks/receptor.py | 534 ++++++++ awx/main/tasks/system.py | 897 ++++++++++++ .../tests/functional/api/test_instance.py | 2 +- .../task_management/test_container_groups.py | 2 +- .../task_management/test_rampart_groups.py | 2 +- awx/main/tests/functional/test_copy.py | 2 +- awx/main/tests/functional/test_instances.py | 2 +- .../test_inventory_source_injectors.py | 4 +- awx/main/tests/functional/test_jobs.py | 4 +- awx/main/tests/functional/test_tasks.py | 7 +- awx/main/tests/unit/settings/test_defaults.py | 2 +- awx/main/tests/unit/test_tasks.py | 184 +-- awx/main/tests/unit/utils/test_receptor.py | 2 +- awx/main/utils/receptor.py | 230 ---- awx/settings/defaults.py | 12 +- docs/debugging/debugging_misc.md | 2 +- docs/tasks.md | 4 +- 36 files changed, 1607 insertions(+), 1562 deletions(-) create mode 100644 awx/main/tasks/__init__.py rename awx/main/{tasks.py => tasks/jobs.py} (65%) create mode 100644 awx/main/tasks/receptor.py create mode 100644 awx/main/tasks/system.py delete mode 100644 awx/main/utils/receptor.py diff --git a/awx/api/generics.py b/awx/api/generics.py index f8f1a6f85c..7556bbbc9f 100644 --- a/awx/api/generics.py +++ b/awx/api/generics.py @@ -817,7 +817,7 @@ class ResourceAccessList(ParentMixin, ListAPIView): def trigger_delayed_deep_copy(*args, **kwargs): - from awx.main.tasks import deep_copy_model_obj + from awx.main.tasks.system import deep_copy_model_obj connection.on_commit(lambda: deep_copy_model_obj.delay(*args, **kwargs)) diff --git a/awx/api/views/__init__.py b/awx/api/views/__init__.py index 9907a91092..30eee980e2 100644 --- a/awx/api/views/__init__.py +++ b/awx/api/views/__init__.py @@ -62,7 +62,7 @@ import pytz from wsgiref.util import FileWrapper # AWX -from awx.main.tasks import send_notifications, update_inventory_computed_fields +from awx.main.tasks.system import send_notifications, update_inventory_computed_fields from awx.main.access import get_user_queryset, HostAccess from awx.api.generics import ( APIView, @@ -431,7 +431,7 @@ class InstanceHealthCheck(GenericAPIView): obj = self.get_object() if obj.node_type == 'execution': - from awx.main.tasks import execution_node_health_check + from awx.main.tasks.system import execution_node_health_check runner_data = execution_node_health_check(obj.hostname) obj.refresh_from_db() @@ -441,7 +441,7 @@ class InstanceHealthCheck(GenericAPIView): if extra_field in runner_data: data[extra_field] = runner_data[extra_field] else: - from awx.main.tasks import cluster_node_health_check + 
from awx.main.tasks.system import cluster_node_health_check if settings.CLUSTER_HOST_ID == obj.hostname: cluster_node_health_check(obj.hostname) diff --git a/awx/conf/views.py b/awx/conf/views.py index f0ff1607b7..a9eae07409 100644 --- a/awx/conf/views.py +++ b/awx/conf/views.py @@ -26,7 +26,7 @@ from awx.api.generics import APIView, GenericAPIView, ListAPIView, RetrieveUpdat from awx.api.permissions import IsSystemAdminOrAuditor from awx.api.versioning import reverse from awx.main.utils import camelcase_to_underscore -from awx.main.tasks import handle_setting_changes +from awx.main.tasks.system import handle_setting_changes from awx.conf.models import Setting from awx.conf.serializers import SettingCategorySerializer, SettingSingletonSerializer from awx.conf import settings_registry diff --git a/awx/main/dispatch/worker/callback.py b/awx/main/dispatch/worker/callback.py index 279db49bfb..ccf3b063d1 100644 --- a/awx/main/dispatch/worker/callback.py +++ b/awx/main/dispatch/worker/callback.py @@ -17,7 +17,7 @@ import redis from awx.main.consumers import emit_channel_notification from awx.main.models import JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent, UnifiedJob, Job -from awx.main.tasks import handle_success_and_failure_notifications +from awx.main.tasks.system import handle_success_and_failure_notifications from awx.main.models.events import emit_event_detail from awx.main.utils.profiling import AWXProfiler import awx.main.analytics.subsystem_metrics as s_metrics diff --git a/awx/main/dispatch/worker/task.py b/awx/main/dispatch/worker/task.py index e55cfbdde2..91ce7f47b4 100644 --- a/awx/main/dispatch/worker/task.py +++ b/awx/main/dispatch/worker/task.py @@ -9,7 +9,7 @@ from kubernetes.config import kube_config from django.conf import settings from django_guid.middleware import GuidMiddleware -from awx.main.tasks import dispatch_startup, inform_cluster_of_shutdown +from awx.main.tasks.system import dispatch_startup, inform_cluster_of_shutdown from .base import BaseWorker @@ -30,8 +30,8 @@ class TaskWorker(BaseWorker): """ Transform a dotted notation task into an imported, callable function, e.g., - awx.main.tasks.delete_inventory - awx.main.tasks.RunProjectUpdate + awx.main.tasks.system.delete_inventory + awx.main.tasks.jobs.RunProjectUpdate """ if not task.startswith('awx.'): raise ValueError('{} is not a valid awx task'.format(task)) @@ -73,15 +73,15 @@ class TaskWorker(BaseWorker): 'callbacks': [{ 'args': [], 'kwargs': {} - 'task': u'awx.main.tasks.handle_work_success' + 'task': u'awx.main.tasks.system.handle_work_success' }], 'errbacks': [{ 'args': [], 'kwargs': {}, - 'task': 'awx.main.tasks.handle_work_error' + 'task': 'awx.main.tasks.system.handle_work_error' }], 'kwargs': {}, - 'task': u'awx.main.tasks.RunProjectUpdate' + 'task': u'awx.main.tasks.jobs.RunProjectUpdate' } """ settings.__clean_on_fork__() diff --git a/awx/main/management/commands/profile_sql.py b/awx/main/management/commands/profile_sql.py index 2853b072ff..48701d26f3 100644 --- a/awx/main/management/commands/profile_sql.py +++ b/awx/main/management/commands/profile_sql.py @@ -1,6 +1,6 @@ from django.core.management.base import BaseCommand -from awx.main.tasks import profile_sql +from awx.main.tasks.system import profile_sql class Command(BaseCommand): diff --git a/awx/main/models/ad_hoc_commands.py b/awx/main/models/ad_hoc_commands.py index 9873888981..48273c983b 100644 --- a/awx/main/models/ad_hoc_commands.py +++ b/awx/main/models/ad_hoc_commands.py @@ -144,7 +144,7 @@ class 
AdHocCommand(UnifiedJob, JobNotificationMixin): @classmethod def _get_task_class(cls): - from awx.main.tasks import RunAdHocCommand + from awx.main.tasks.jobs import RunAdHocCommand return RunAdHocCommand diff --git a/awx/main/models/events.py b/awx/main/models/events.py index 0a3cef78d1..0e516aa669 100644 --- a/awx/main/models/events.py +++ b/awx/main/models/events.py @@ -388,7 +388,7 @@ class BasePlaybookEvent(CreatedModifiedModel): job.get_event_queryset().filter(uuid__in=failed).update(failed=True) # send success/failure notifications when we've finished handling the playbook_on_stats event - from awx.main.tasks import handle_success_and_failure_notifications # circular import + from awx.main.tasks.system import handle_success_and_failure_notifications # circular import def _send_notifications(): handle_success_and_failure_notifications.apply_async([job.id]) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 78f07861d2..653b4f1814 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -376,7 +376,7 @@ class TowerScheduleState(SingletonModel): def schedule_policy_task(): - from awx.main.tasks import apply_cluster_membership_policies + from awx.main.tasks.system import apply_cluster_membership_policies connection.on_commit(lambda: apply_cluster_membership_policies.apply_async()) diff --git a/awx/main/models/inventory.py b/awx/main/models/inventory.py index 2d5508d4d2..50d50cc005 100644 --- a/awx/main/models/inventory.py +++ b/awx/main/models/inventory.py @@ -366,7 +366,7 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin): @transaction.atomic def schedule_deletion(self, user_id=None): - from awx.main.tasks import delete_inventory + from awx.main.tasks.system import delete_inventory from awx.main.signals import activity_stream_delete if self.pending_deletion is True: @@ -382,7 +382,7 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin): if self.kind == 'smart' and settings.AWX_REBUILD_SMART_MEMBERSHIP: def on_commit(): - from awx.main.tasks import update_host_smart_inventory_memberships + from awx.main.tasks.system import update_host_smart_inventory_memberships update_host_smart_inventory_memberships.delay() @@ -551,7 +551,7 @@ class Host(CommonModelNameNotUnique, RelatedJobsMixin): if settings.AWX_REBUILD_SMART_MEMBERSHIP: def on_commit(): - from awx.main.tasks import update_host_smart_inventory_memberships + from awx.main.tasks.system import update_host_smart_inventory_memberships update_host_smart_inventory_memberships.delay() @@ -631,7 +631,7 @@ class Group(CommonModelNameNotUnique, RelatedJobsMixin): @transaction.atomic def delete_recursive(self): from awx.main.utils import ignore_inventory_computed_fields - from awx.main.tasks import update_inventory_computed_fields + from awx.main.tasks.system import update_inventory_computed_fields from awx.main.signals import disable_activity_stream, activity_stream_delete def mark_actual(): @@ -1219,7 +1219,7 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin, @classmethod def _get_task_class(cls): - from awx.main.tasks import RunInventoryUpdate + from awx.main.tasks.jobs import RunInventoryUpdate return RunInventoryUpdate diff --git a/awx/main/models/jobs.py b/awx/main/models/jobs.py index 2d2f3ade16..e405c98596 100644 --- a/awx/main/models/jobs.py +++ b/awx/main/models/jobs.py @@ -583,7 +583,7 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana @classmethod def _get_task_class(cls): - from awx.main.tasks 
import RunJob + from awx.main.tasks.jobs import RunJob return RunJob @@ -1213,7 +1213,7 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin): @classmethod def _get_task_class(cls): - from awx.main.tasks import RunSystemJob + from awx.main.tasks.jobs import RunSystemJob return RunSystemJob diff --git a/awx/main/models/notifications.py b/awx/main/models/notifications.py index afb55f7682..860e591e2c 100644 --- a/awx/main/models/notifications.py +++ b/awx/main/models/notifications.py @@ -508,7 +508,7 @@ class JobNotificationMixin(object): return (msg, body) def send_notification_templates(self, status): - from awx.main.tasks import send_notifications # avoid circular import + from awx.main.tasks.system import send_notifications # avoid circular import if status not in ['running', 'succeeded', 'failed']: raise ValueError(_("status must be either running, succeeded or failed")) diff --git a/awx/main/models/projects.py b/awx/main/models/projects.py index c85cfd7000..3fdd98b2b7 100644 --- a/awx/main/models/projects.py +++ b/awx/main/models/projects.py @@ -471,7 +471,7 @@ class Project(UnifiedJobTemplate, ProjectOptions, ResourceMixin, CustomVirtualEn r = super(Project, self).delete(*args, **kwargs) for path_to_delete in paths_to_delete: if self.scm_type and path_to_delete: # non-manual, concrete path - from awx.main.tasks import delete_project_files + from awx.main.tasks.system import delete_project_files delete_project_files.delay(path_to_delete) return r @@ -532,7 +532,7 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin, TaskManage @classmethod def _get_task_class(cls): - from awx.main.tasks import RunProjectUpdate + from awx.main.tasks.jobs import RunProjectUpdate return RunProjectUpdate diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 671daf104d..489cba9799 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -1046,7 +1046,7 @@ class UnifiedJob( fd = tempfile.NamedTemporaryFile( mode='w', prefix='{}-{}-'.format(self.model_to_str(), self.pk), suffix='.out', dir=settings.JOBOUTPUT_ROOT, encoding='utf-8' ) - from awx.main.tasks import purge_old_stdout_files # circular import + from awx.main.tasks.system import purge_old_stdout_files # circular import purge_old_stdout_files.apply_async() diff --git a/awx/main/models/workflow.py b/awx/main/models/workflow.py index b254ee41c8..684e25b967 100644 --- a/awx/main/models/workflow.py +++ b/awx/main/models/workflow.py @@ -813,7 +813,7 @@ class WorkflowApproval(UnifiedJob, JobNotificationMixin): return True def send_approval_notification(self, approval_status): - from awx.main.tasks import send_notifications # avoid circular import + from awx.main.tasks.system import send_notifications # avoid circular import if self.workflow_job_template is None: return diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 2544a062db..3be2d4cfc7 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -257,7 +257,7 @@ class TaskManager: if self.start_task_limit == 0: # schedule another run immediately after this task manager schedule_task_manager() - from awx.main.tasks import handle_work_error, handle_work_success + from awx.main.tasks.system import handle_work_error, handle_work_success dependent_tasks = dependent_tasks or [] diff --git a/awx/main/signals.py b/awx/main/signals.py index 8dde65342d..cf4f030c67 100644 --- a/awx/main/signals.py +++ b/awx/main/signals.py @@ -57,7 +57,7 @@ from 
awx.main.models import ( from awx.main.constants import CENSOR_VALUE from awx.main.utils import model_instance_diff, model_to_dict, camelcase_to_underscore, get_current_apps from awx.main.utils import ignore_inventory_computed_fields, ignore_inventory_group_removal, _inventory_updates -from awx.main.tasks import update_inventory_computed_fields, handle_removed_image +from awx.main.tasks.system import update_inventory_computed_fields, handle_removed_image from awx.main.fields import ( is_implicit_parent, update_role_parentage_for_instance, diff --git a/awx/main/tasks/__init__.py b/awx/main/tasks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/awx/main/tasks.py b/awx/main/tasks/jobs.py similarity index 65% rename from awx/main/tasks.py rename to awx/main/tasks/jobs.py index 7743e4001d..deff0c0cfc 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks/jobs.py @@ -1,77 +1,50 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2015 Ansible, Inc. -# All Rights Reserved. - # Python -from collections import OrderedDict, namedtuple, deque +from asyncio.log import logger +from collections import deque, OrderedDict +from distutils.dir_util import copy_tree import errno import functools -import importlib +import fcntl import json import logging import os -from io import StringIO -from contextlib import redirect_stdout +from pathlib import Path import shutil import stat -import tempfile -import time -import traceback -from distutils.dir_util import copy_tree -from distutils.version import LooseVersion as Version import yaml -import fcntl -from pathlib import Path -from uuid import uuid4 +import tempfile +import traceback +import time import urllib.parse as urlparse -import socket -import threading -import concurrent.futures -from base64 import b64encode -import sys +from uuid import uuid4 + # Django -from django.conf import settings -from django.db import transaction, DatabaseError, IntegrityError -from django.db.models.fields.related import ForeignKey -from django.utils.timezone import now -from django.utils.encoding import smart_str -from django.contrib.auth.models import User -from django.utils.translation import ugettext_lazy as _, gettext_noop -from django.core.cache import cache -from django.core.exceptions import ObjectDoesNotExist from django_guid.middleware import GuidMiddleware +from django.conf import settings +from django.db import transaction, DatabaseError +from django.utils.timezone import now -# Django-CRUM -from crum import impersonate + +# Runner +import ansible_runner # GitPython import git from gitdb.exc import BadName as BadGitName -# Runner -import ansible_runner -import ansible_runner.cleanup - -# dateutil -from dateutil.parser import parse as parse_date # AWX -from awx import __version__ as awx_application_version +from awx.main.constants import ACTIVE_STATES +from awx.main.dispatch.publish import task +from awx.main.dispatch import get_local_queuename from awx.main.constants import PRIVILEGE_ESCALATION_METHODS, STANDARD_INVENTORY_UPDATE_ENV, MINIMAL_EVENTS, JOB_FOLDER_PREFIX -from awx.main.access import access_registry +from awx.main.utils.safe_yaml import safe_dump from awx.main.redact import UriCleaner from awx.main.models import ( - Schedule, - TowerScheduleState, Instance, - InstanceGroup, - UnifiedJob, - Notification, Inventory, InventorySource, - SmartInventoryMembership, Job, AdHocCommand, ProjectUpdate, @@ -84,808 +57,25 @@ from awx.main.models import ( SystemJobEvent, build_safe_env, ) -from awx.main.constants import ACTIVE_STATES -from 
awx.main.exceptions import AwxTaskError, PostRunError, ReceptorNodeNotFound from awx.main.queue import CallbackQueueDispatcher -from awx.main.dispatch.publish import task -from awx.main.dispatch import get_local_queuename, reaper +from awx.main.tasks.receptor import AWXReceptorJob +from awx.main.exceptions import AwxTaskError, PostRunError, ReceptorNodeNotFound +from awx.main.utils.ansible import read_ansible_config +from awx.main.utils.execution_environments import CONTAINER_ROOT, to_container_path +from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja from awx.main.utils.common import ( update_scm_url, - ignore_inventory_computed_fields, - ignore_inventory_group_removal, extract_ansible_vars, - schedule_task_manager, get_awx_version, - deepmerge, - parse_yaml_or_json, - cleanup_new_process, create_partition, ) -from awx.main.utils.execution_environments import get_default_pod_spec, CONTAINER_ROOT, to_container_path -from awx.main.utils.ansible import read_ansible_config -from awx.main.utils.external_logging import reconfigure_rsyslog -from awx.main.utils.safe_yaml import safe_dump, sanitize_jinja -from awx.main.utils.reload import stop_local_services -from awx.main.utils.pglock import advisory_lock -from awx.main.utils.handlers import SpecialInventoryHandler -from awx.main.utils.receptor import get_receptor_ctl, worker_info, get_conn_type, get_tls_client, worker_cleanup, administrative_workunit_reaper -from awx.main.consumers import emit_channel_notification -from awx.main import analytics -from awx.conf import settings_registry from awx.conf.license import get_license -from awx.main.analytics.subsystem_metrics import Metrics - +from awx.main.utils.handlers import SpecialInventoryHandler +from awx.main.tasks.system import handle_success_and_failure_notifications, update_smart_memberships_for_inventory, update_inventory_computed_fields from rest_framework.exceptions import PermissionDenied +from django.utils.translation import ugettext_lazy as _ - -__all__ = [ - 'RunJob', - 'RunSystemJob', - 'RunProjectUpdate', - 'RunInventoryUpdate', - 'RunAdHocCommand', - 'handle_work_error', - 'handle_work_success', - 'apply_cluster_membership_policies', - 'update_inventory_computed_fields', - 'update_host_smart_inventory_memberships', - 'send_notifications', - 'purge_old_stdout_files', -] - -HIDDEN_PASSWORD = '**********' - -OPENSSH_KEY_ERROR = u'''\ -It looks like you're trying to use a private key in OpenSSH format, which \ -isn't supported by the installed version of OpenSSH on this instance. \ -Try upgrading OpenSSH or providing your private key in an different format. \ -''' - -logger = logging.getLogger('awx.main.tasks') - - -def dispatch_startup(): - startup_logger = logging.getLogger('awx.main.tasks') - startup_logger.debug("Syncing Schedules") - for sch in Schedule.objects.all(): - try: - sch.update_computed_fields() - except Exception: - logger.exception("Failed to rebuild schedule {}.".format(sch)) - - # - # When the dispatcher starts, if the instance cannot be found in the database, - # automatically register it. This is mostly useful for openshift-based - # deployments where: - # - # 2 Instances come online - # Instance B encounters a network blip, Instance A notices, and - # deprovisions it - # Instance B's connectivity is restored, the dispatcher starts, and it - # re-registers itself - # - # In traditional container-less deployments, instances don't get - # deprovisioned when they miss their heartbeat, so this code is mostly a - # no-op. 
- # - apply_cluster_membership_policies() - cluster_node_heartbeat() - Metrics().clear_values() - - # Update Tower's rsyslog.conf file based on loggins settings in the db - reconfigure_rsyslog() - - -def inform_cluster_of_shutdown(): - try: - this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID) - this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal')) - try: - reaper.reap(this_inst) - except Exception: - logger.exception('failed to reap jobs for {}'.format(this_inst.hostname)) - logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname)) - except Exception: - logger.exception('Encountered problem with normal shutdown signal.') - - -@task(queue=get_local_queuename) -def apply_cluster_membership_policies(): - from awx.main.signals import disable_activity_stream - - started_waiting = time.time() - with advisory_lock('cluster_policy_lock', wait=True): - lock_time = time.time() - started_waiting - if lock_time > 1.0: - to_log = logger.info - else: - to_log = logger.debug - to_log('Waited {} seconds to obtain lock name: cluster_policy_lock'.format(lock_time)) - started_compute = time.time() - # Hop nodes should never get assigned to an InstanceGroup. - all_instances = list(Instance.objects.exclude(node_type='hop').order_by('id')) - all_groups = list(InstanceGroup.objects.prefetch_related('instances')) - - total_instances = len(all_instances) - actual_groups = [] - actual_instances = [] - Group = namedtuple('Group', ['obj', 'instances', 'prior_instances']) - Node = namedtuple('Instance', ['obj', 'groups']) - - # Process policy instance list first, these will represent manually managed memberships - instance_hostnames_map = {inst.hostname: inst for inst in all_instances} - for ig in all_groups: - group_actual = Group(obj=ig, instances=[], prior_instances=[instance.pk for instance in ig.instances.all()]) # obtained in prefetch - for hostname in ig.policy_instance_list: - if hostname not in instance_hostnames_map: - logger.info("Unknown instance {} in {} policy list".format(hostname, ig.name)) - continue - inst = instance_hostnames_map[hostname] - group_actual.instances.append(inst.id) - # NOTE: arguable behavior: policy-list-group is not added to - # instance's group count for consideration in minimum-policy rules - if group_actual.instances: - logger.debug("Policy List, adding Instances {} to Group {}".format(group_actual.instances, ig.name)) - - actual_groups.append(group_actual) - - # Process Instance minimum policies next, since it represents a concrete lower bound to the - # number of instances to make available to instance groups - actual_instances = [Node(obj=i, groups=[]) for i in all_instances if i.managed_by_policy] - logger.debug("Total instances: {}, available for policy: {}".format(total_instances, len(actual_instances))) - for g in sorted(actual_groups, key=lambda x: len(x.instances)): - exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control' - policy_min_added = [] - for i in sorted(actual_instances, key=lambda x: len(x.groups)): - if i.obj.node_type == exclude_type: - continue # never place execution instances in controlplane group or control instances in other groups - if len(g.instances) >= g.obj.policy_instance_minimum: - break - if i.obj.id in g.instances: - # If the instance is already _in_ the group, it was - # applied earlier via the policy list - continue - g.instances.append(i.obj.id) - i.groups.append(g.obj.id) 
- policy_min_added.append(i.obj.id) - if policy_min_added: - logger.debug("Policy minimum, adding Instances {} to Group {}".format(policy_min_added, g.obj.name)) - - # Finally, process instance policy percentages - for g in sorted(actual_groups, key=lambda x: len(x.instances)): - exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control' - candidate_pool_ct = sum(1 for i in actual_instances if i.obj.node_type != exclude_type) - if not candidate_pool_ct: - continue - policy_per_added = [] - for i in sorted(actual_instances, key=lambda x: len(x.groups)): - if i.obj.node_type == exclude_type: - continue - if i.obj.id in g.instances: - # If the instance is already _in_ the group, it was - # applied earlier via a minimum policy or policy list - continue - if 100 * float(len(g.instances)) / candidate_pool_ct >= g.obj.policy_instance_percentage: - break - g.instances.append(i.obj.id) - i.groups.append(g.obj.id) - policy_per_added.append(i.obj.id) - if policy_per_added: - logger.debug("Policy percentage, adding Instances {} to Group {}".format(policy_per_added, g.obj.name)) - - # Determine if any changes need to be made - needs_change = False - for g in actual_groups: - if set(g.instances) != set(g.prior_instances): - needs_change = True - break - if not needs_change: - logger.debug('Cluster policy no-op finished in {} seconds'.format(time.time() - started_compute)) - return - - # On a differential basis, apply instances to groups - with transaction.atomic(): - with disable_activity_stream(): - for g in actual_groups: - if g.obj.is_container_group: - logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name)) - continue - instances_to_add = set(g.instances) - set(g.prior_instances) - instances_to_remove = set(g.prior_instances) - set(g.instances) - if instances_to_add: - logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name)) - g.obj.instances.add(*instances_to_add) - if instances_to_remove: - logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name)) - g.obj.instances.remove(*instances_to_remove) - logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute)) - - -@task(queue='tower_broadcast_all') -def handle_setting_changes(setting_keys): - orig_len = len(setting_keys) - for i in range(orig_len): - for dependent_key in settings_registry.get_dependent_settings(setting_keys[i]): - setting_keys.append(dependent_key) - cache_keys = set(setting_keys) - logger.debug('cache delete_many(%r)', cache_keys) - cache.delete_many(cache_keys) - - if any([setting.startswith('LOG_AGGREGATOR') for setting in setting_keys]): - reconfigure_rsyslog() - - -@task(queue='tower_broadcast_all') -def delete_project_files(project_path): - # TODO: possibly implement some retry logic - lock_file = project_path + '.lock' - if os.path.exists(project_path): - try: - shutil.rmtree(project_path) - logger.debug('Success removing project files {}'.format(project_path)) - except Exception: - logger.exception('Could not remove project directory {}'.format(project_path)) - if os.path.exists(lock_file): - try: - os.remove(lock_file) - logger.debug('Success removing {}'.format(lock_file)) - except Exception: - logger.exception('Could not remove lock file {}'.format(lock_file)) - - -@task(queue='tower_broadcast_all') -def profile_sql(threshold=1, minutes=1): - if threshold <= 0: - cache.delete('awx-profile-sql-threshold') - logger.error('SQL PROFILING 
DISABLED') - else: - cache.set('awx-profile-sql-threshold', threshold, timeout=minutes * 60) - logger.error('SQL QUERIES >={}s ENABLED FOR {} MINUTE(S)'.format(threshold, minutes)) - - -@task(queue=get_local_queuename) -def send_notifications(notification_list, job_id=None): - if not isinstance(notification_list, list): - raise TypeError("notification_list should be of type list") - if job_id is not None: - job_actual = UnifiedJob.objects.get(id=job_id) - - notifications = Notification.objects.filter(id__in=notification_list) - if job_id is not None: - job_actual.notifications.add(*notifications) - - for notification in notifications: - update_fields = ['status', 'notifications_sent'] - try: - sent = notification.notification_template.send(notification.subject, notification.body) - notification.status = "successful" - notification.notifications_sent = sent - if job_id is not None: - job_actual.log_lifecycle("notifications_sent") - except Exception as e: - logger.exception("Send Notification Failed {}".format(e)) - notification.status = "failed" - notification.error = smart_str(e) - update_fields.append('error') - finally: - try: - notification.save(update_fields=update_fields) - except Exception: - logger.exception('Error saving notification {} result.'.format(notification.id)) - - -@task(queue=get_local_queuename) -def gather_analytics(): - from awx.conf.models import Setting - from rest_framework.fields import DateTimeField - - last_gather = Setting.objects.filter(key='AUTOMATION_ANALYTICS_LAST_GATHER').first() - last_time = DateTimeField().to_internal_value(last_gather.value) if last_gather and last_gather.value else None - gather_time = now() - - if not last_time or ((gather_time - last_time).total_seconds() > settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL): - analytics.gather() - - -@task(queue=get_local_queuename) -def purge_old_stdout_files(): - nowtime = time.time() - for f in os.listdir(settings.JOBOUTPUT_ROOT): - if os.path.getctime(os.path.join(settings.JOBOUTPUT_ROOT, f)) < nowtime - settings.LOCAL_STDOUT_EXPIRE_TIME: - os.unlink(os.path.join(settings.JOBOUTPUT_ROOT, f)) - logger.debug("Removing {}".format(os.path.join(settings.JOBOUTPUT_ROOT, f))) - - -def _cleanup_images_and_files(**kwargs): - if settings.IS_K8S: - return - this_inst = Instance.objects.me() - runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs) - if runner_cleanup_kwargs: - stdout = '' - with StringIO() as buffer: - with redirect_stdout(buffer): - ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs) - stdout = buffer.getvalue() - if '(changed: True)' in stdout: - logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}') - - # if we are the first instance alphabetically, then run cleanup on execution nodes - checker_instance = Instance.objects.filter(node_type__in=['hybrid', 'control'], enabled=True, capacity__gt=0).order_by('-hostname').first() - if checker_instance and this_inst.hostname == checker_instance.hostname: - for inst in Instance.objects.filter(node_type='execution', enabled=True, capacity__gt=0): - runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs) - if not runner_cleanup_kwargs: - continue - try: - stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs) - if '(changed: True)' in stdout: - logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}') - except RuntimeError: - logger.exception(f'Error running cleanup on execution node {inst.hostname}') - - -@task(queue='tower_broadcast_all') -def 
handle_removed_image(remove_images=None): - """Special broadcast invocation of this method to handle case of deleted EE""" - _cleanup_images_and_files(remove_images=remove_images, file_pattern='') - - -@task(queue=get_local_queuename) -def cleanup_images_and_files(): - _cleanup_images_and_files() - - -@task(queue=get_local_queuename) -def cluster_node_health_check(node): - """ - Used for the health check endpoint, refreshes the status of the instance, but must be ran on target node - """ - if node == '': - logger.warn('Local health check incorrectly called with blank string') - return - elif node != settings.CLUSTER_HOST_ID: - logger.warn(f'Local health check for {node} incorrectly sent to {settings.CLUSTER_HOST_ID}') - return - try: - this_inst = Instance.objects.me() - except Instance.DoesNotExist: - logger.warn(f'Instance record for {node} missing, could not check capacity.') - return - this_inst.local_health_check() - - -@task(queue=get_local_queuename) -def execution_node_health_check(node): - if node == '': - logger.warn('Remote health check incorrectly called with blank string') - return - try: - instance = Instance.objects.get(hostname=node) - except Instance.DoesNotExist: - logger.warn(f'Instance record for {node} missing, could not check capacity.') - return - - if instance.node_type != 'execution': - raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}') - - data = worker_info(node) - - prior_capacity = instance.capacity - - instance.save_health_data( - version='ansible-runner-' + data.get('runner_version', '???'), - cpu=data.get('cpu_count', 0), - memory=data.get('mem_in_bytes', 0), - uuid=data.get('uuid'), - errors='\n'.join(data.get('errors', [])), - ) - - if data['errors']: - formatted_error = "\n".join(data["errors"]) - if prior_capacity: - logger.warn(f'Health check marking execution node {node} as lost, errors:\n{formatted_error}') - else: - logger.info(f'Failed to find capacity of new or lost execution node {node}, errors:\n{formatted_error}') - else: - logger.info('Set capacity of execution node {} to {}, worker info data:\n{}'.format(node, instance.capacity, json.dumps(data, indent=2))) - - return data - - -def inspect_execution_nodes(instance_list): - with advisory_lock('inspect_execution_nodes_lock', wait=False): - node_lookup = {inst.hostname: inst for inst in instance_list} - - ctl = get_receptor_ctl() - mesh_status = ctl.simple_command('status') - - nowtime = now() - workers = mesh_status['Advertisements'] - for ad in workers: - hostname = ad['NodeID'] - if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []): - continue - - changed = False - if hostname in node_lookup: - instance = node_lookup[hostname] - else: - logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}") - continue - - was_lost = instance.is_lost(ref_time=nowtime) - last_seen = parse_date(ad['Time']) - - if instance.last_seen and instance.last_seen >= last_seen: - continue - instance.last_seen = last_seen - instance.save(update_fields=['last_seen']) - - if changed: - execution_node_health_check.apply_async([hostname]) - elif was_lost: - # if the instance *was* lost, but has appeared again, - # attempt to re-establish the initial capacity and version - # check - logger.warn(f'Execution node attempting to rejoin as instance {hostname}.') - execution_node_health_check.apply_async([hostname]) - elif instance.capacity == 0 and instance.enabled: - # nodes with proven connection but need 
remediation run health checks are reduced frequency - if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS: - # Periodically re-run the health check of errored nodes, in case someone fixed it - # TODO: perhaps decrease the frequency of these checks - logger.debug(f'Restarting health check for execution node {hostname} with known errors.') - execution_node_health_check.apply_async([hostname]) - - -@task(queue=get_local_queuename) -def cluster_node_heartbeat(): - logger.debug("Cluster node heartbeat task.") - nowtime = now() - instance_list = list(Instance.objects.all()) - this_inst = None - lost_instances = [] - - for inst in instance_list: - if inst.hostname == settings.CLUSTER_HOST_ID: - this_inst = inst - instance_list.remove(inst) - break - else: - (changed, this_inst) = Instance.objects.get_or_register() - if changed: - logger.info("Registered tower control node '{}'".format(this_inst.hostname)) - - inspect_execution_nodes(instance_list) - - for inst in list(instance_list): - if inst.is_lost(ref_time=nowtime): - lost_instances.append(inst) - instance_list.remove(inst) - - if this_inst: - startup_event = this_inst.is_lost(ref_time=nowtime) - this_inst.local_health_check() - if startup_event and this_inst.capacity != 0: - logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname)) - return - else: - raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) - # IFF any node has a greater version than we do, then we'll shutdown services - for other_inst in instance_list: - if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution': - continue - if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG: - logger.error( - "Host {} reports version {}, but this node {} is at {}, shutting down".format( - other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version - ) - ) - # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance. - # The heartbeat task will reset the capacity to the system capacity after upgrade. - stop_local_services(communicate=False) - raise RuntimeError("Shutting down.") - - for other_inst in lost_instances: - try: - reaper.reap(other_inst) - except Exception: - logger.exception('failed to reap jobs for {}'.format(other_inst.hostname)) - try: - # Capacity could already be 0 because: - # * It's a new node and it never had a heartbeat - # * It was set to 0 by another tower node running this method - # * It was set to 0 by this node, but auto deprovisioning is off - # - # If auto deprovisining is on, don't bother setting the capacity to 0 - # since we will delete the node anyway. 
- if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES: - other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive')) - logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen)) - elif settings.AWX_AUTO_DEPROVISION_INSTANCES: - deprovision_hostname = other_inst.hostname - other_inst.delete() - logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname)) - except DatabaseError as e: - if 'did not affect any rows' in str(e): - logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname)) - else: - logger.exception('Error marking {} as lost'.format(other_inst.hostname)) - - -@task(queue=get_local_queuename) -def awx_receptor_workunit_reaper(): - """ - When an AWX job is launched via receptor, files such as status, stdin, and stdout are created - in a specific receptor directory. This directory on disk is a random 8 character string, e.g. qLL2JFNT - This is also called the work Unit ID in receptor, and is used in various receptor commands, - e.g. "work results qLL2JFNT" - After an AWX job executes, the receptor work unit directory is cleaned up by - issuing the work release command. In some cases the release process might fail, or - if AWX crashes during a job's execution, the work release command is never issued to begin with. - As such, this periodic task will obtain a list of all receptor work units, and find which ones - belong to AWX jobs that are in a completed state (status is canceled, error, or succeeded). - This task will call "work release" on each of these work units to clean up the files on disk. - - Note that when we call "work release" on a work unit that actually represents remote work - both the local and remote work units are cleaned up. - - Since we are cleaning up jobs that controller considers to be inactive, we take the added - precaution of calling "work cancel" in case the work unit is still active. 
- """ - if not settings.RECEPTOR_RELEASE_WORK: - return - logger.debug("Checking for unreleased receptor work units") - receptor_ctl = get_receptor_ctl() - receptor_work_list = receptor_ctl.simple_command("work list") - - unit_ids = [id for id in receptor_work_list] - jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES) - for job in jobs_with_unreleased_receptor_units: - logger.debug(f"{job.log_format} is not active, reaping receptor work unit {job.work_unit_id}") - receptor_ctl.simple_command(f"work cancel {job.work_unit_id}") - receptor_ctl.simple_command(f"work release {job.work_unit_id}") - - administrative_workunit_reaper(receptor_work_list) - - -@task(queue=get_local_queuename) -def awx_k8s_reaper(): - if not settings.RECEPTOR_RELEASE_WORK: - return - - from awx.main.scheduler.kubernetes import PodManager # prevent circular import - - for group in InstanceGroup.objects.filter(is_container_group=True).iterator(): - logger.debug("Checking for orphaned k8s pods for {}.".format(group)) - pods = PodManager.list_active_jobs(group) - for job in UnifiedJob.objects.filter(pk__in=pods.keys()).exclude(status__in=ACTIVE_STATES): - logger.debug('{} is no longer active, reaping orphaned k8s pod'.format(job.log_format)) - try: - pm = PodManager(job) - pm.kube_api.delete_namespaced_pod(name=pods[job.id], namespace=pm.namespace, _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT) - except Exception: - logger.exception("Failed to delete orphaned pod {} from {}".format(job.log_format, group)) - - -@task(queue=get_local_queuename) -def awx_periodic_scheduler(): - with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired: - if acquired is False: - logger.debug("Not running periodic scheduler, another task holds lock") - return - logger.debug("Starting periodic scheduler") - - run_now = now() - state = TowerScheduleState.get_solo() - last_run = state.schedule_last_run - logger.debug("Last scheduler run was: %s", last_run) - state.schedule_last_run = run_now - state.save() - - old_schedules = Schedule.objects.enabled().before(last_run) - for schedule in old_schedules: - schedule.update_computed_fields() - schedules = Schedule.objects.enabled().between(last_run, run_now) - - invalid_license = False - try: - access_registry[Job](None).check_license(quiet=True) - except PermissionDenied as e: - invalid_license = e - - for schedule in schedules: - template = schedule.unified_job_template - schedule.update_computed_fields() # To update next_run timestamp. 
- if template.cache_timeout_blocked: - logger.warn("Cache timeout is in the future, bypassing schedule for template %s" % str(template.id)) - continue - try: - job_kwargs = schedule.get_job_kwargs() - new_unified_job = schedule.unified_job_template.create_unified_job(**job_kwargs) - logger.debug('Spawned {} from schedule {}-{}.'.format(new_unified_job.log_format, schedule.name, schedule.pk)) - - if invalid_license: - new_unified_job.status = 'failed' - new_unified_job.job_explanation = str(invalid_license) - new_unified_job.save(update_fields=['status', 'job_explanation']) - new_unified_job.websocket_emit_status("failed") - raise invalid_license - can_start = new_unified_job.signal_start() - except Exception: - logger.exception('Error spawning scheduled job.') - continue - if not can_start: - new_unified_job.status = 'failed' - new_unified_job.job_explanation = gettext_noop( - "Scheduled job could not start because it \ - was not in the right state or required manual credentials" - ) - new_unified_job.save(update_fields=['status', 'job_explanation']) - new_unified_job.websocket_emit_status("failed") - emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules")) - state.save() - - -@task(queue=get_local_queuename) -def handle_work_success(task_actual): - try: - instance = UnifiedJob.get_instance_by_type(task_actual['type'], task_actual['id']) - except ObjectDoesNotExist: - logger.warning('Missing {} `{}` in success callback.'.format(task_actual['type'], task_actual['id'])) - return - if not instance: - return - - schedule_task_manager() - - -@task(queue=get_local_queuename) -def handle_work_error(task_id, *args, **kwargs): - subtasks = kwargs.get('subtasks', None) - logger.debug('Executing error task id %s, subtasks: %s' % (task_id, str(subtasks))) - first_instance = None - first_instance_type = '' - if subtasks is not None: - for each_task in subtasks: - try: - instance = UnifiedJob.get_instance_by_type(each_task['type'], each_task['id']) - if not instance: - # Unknown task type - logger.warn("Unknown task type: {}".format(each_task['type'])) - continue - except ObjectDoesNotExist: - logger.warning('Missing {} `{}` in error callback.'.format(each_task['type'], each_task['id'])) - continue - - if first_instance is None: - first_instance = instance - first_instance_type = each_task['type'] - - if instance.celery_task_id != task_id and not instance.cancel_flag and not instance.status == 'successful': - instance.status = 'failed' - instance.failed = True - if not instance.job_explanation: - instance.job_explanation = 'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % ( - first_instance_type, - first_instance.name, - first_instance.id, - ) - instance.save() - instance.websocket_emit_status("failed") - - # We only send 1 job complete message since all the job completion message - # handling does is trigger the scheduler. If we extend the functionality of - # what the job complete message handler does then we may want to send a - # completion event for each job here. 
- if first_instance: - schedule_task_manager() - pass - - -@task(queue=get_local_queuename) -def handle_success_and_failure_notifications(job_id): - uj = UnifiedJob.objects.get(pk=job_id) - retries = 0 - while retries < 5: - if uj.finished: - uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed') - return - else: - # wait a few seconds to avoid a race where the - # events are persisted _before_ the UJ.status - # changes from running -> successful - retries += 1 - time.sleep(1) - uj = UnifiedJob.objects.get(pk=job_id) - - logger.warn(f"Failed to even try to send notifications for job '{uj}' due to job not being in finished state.") - - -@task(queue=get_local_queuename) -def update_inventory_computed_fields(inventory_id): - """ - Signal handler and wrapper around inventory.update_computed_fields to - prevent unnecessary recursive calls. - """ - i = Inventory.objects.filter(id=inventory_id) - if not i.exists(): - logger.error("Update Inventory Computed Fields failed due to missing inventory: " + str(inventory_id)) - return - i = i[0] - try: - i.update_computed_fields() - except DatabaseError as e: - if 'did not affect any rows' in str(e): - logger.debug('Exiting duplicate update_inventory_computed_fields task.') - return - raise - - -def update_smart_memberships_for_inventory(smart_inventory): - current = set(SmartInventoryMembership.objects.filter(inventory=smart_inventory).values_list('host_id', flat=True)) - new = set(smart_inventory.hosts.values_list('id', flat=True)) - additions = new - current - removals = current - new - if additions or removals: - with transaction.atomic(): - if removals: - SmartInventoryMembership.objects.filter(inventory=smart_inventory, host_id__in=removals).delete() - if additions: - add_for_inventory = [SmartInventoryMembership(inventory_id=smart_inventory.id, host_id=host_id) for host_id in additions] - SmartInventoryMembership.objects.bulk_create(add_for_inventory, ignore_conflicts=True) - logger.debug( - 'Smart host membership cached for {}, {} additions, {} removals, {} total count.'.format( - smart_inventory.pk, len(additions), len(removals), len(new) - ) - ) - return True # changed - return False - - -@task(queue=get_local_queuename) -def update_host_smart_inventory_memberships(): - smart_inventories = Inventory.objects.filter(kind='smart', host_filter__isnull=False, pending_deletion=False) - changed_inventories = set([]) - for smart_inventory in smart_inventories: - try: - changed = update_smart_memberships_for_inventory(smart_inventory) - if changed: - changed_inventories.add(smart_inventory) - except IntegrityError: - logger.exception('Failed to update smart inventory memberships for {}'.format(smart_inventory.pk)) - # Update computed fields for changed inventories outside atomic action - for smart_inventory in changed_inventories: - smart_inventory.update_computed_fields() - - -@task(queue=get_local_queuename) -def delete_inventory(inventory_id, user_id, retries=5): - # Delete inventory as user - if user_id is None: - user = None - else: - try: - user = User.objects.get(id=user_id) - except Exception: - user = None - with ignore_inventory_computed_fields(), ignore_inventory_group_removal(), impersonate(user): - try: - i = Inventory.objects.get(id=inventory_id) - for host in i.hosts.iterator(): - host.job_events_as_primary_host.update(host=None) - i.delete() - emit_channel_notification('inventories-status_changed', {'group_name': 'inventories', 'inventory_id': inventory_id, 'status': 'deleted'}) - logger.debug('Deleted 
inventory {} as user {}.'.format(inventory_id, user_id)) - except Inventory.DoesNotExist: - logger.exception("Delete Inventory failed due to missing inventory: " + str(inventory_id)) - return - except DatabaseError: - logger.exception('Database error deleting inventory {}, but will retry.'.format(inventory_id)) - if retries > 0: - time.sleep(10) - delete_inventory(inventory_id, user_id, retries=retries - 1) +logger = logging.getLogger('awx.main.tasks.jobs') def with_path_cleanup(f): @@ -2327,7 +1517,7 @@ class RunProjectUpdate(BaseTask): # the project update playbook is not in a git repo, but uses a vendoring directory # to be consistent with the ansible-runner model, # that is moved into the runner project folder here - awx_playbooks = self.get_path_to('..', 'playbooks') + awx_playbooks = self.get_path_to('../../', 'playbooks') copy_tree(awx_playbooks, os.path.join(private_data_dir, 'project')) @staticmethod @@ -2986,350 +2176,3 @@ class RunSystemJob(BaseTask): def build_inventory(self, instance, private_data_dir): return None - - -def _reconstruct_relationships(copy_mapping): - for old_obj, new_obj in copy_mapping.items(): - model = type(old_obj) - for field_name in getattr(model, 'FIELDS_TO_PRESERVE_AT_COPY', []): - field = model._meta.get_field(field_name) - if isinstance(field, ForeignKey): - if getattr(new_obj, field_name, None): - continue - related_obj = getattr(old_obj, field_name) - related_obj = copy_mapping.get(related_obj, related_obj) - setattr(new_obj, field_name, related_obj) - elif field.many_to_many: - for related_obj in getattr(old_obj, field_name).all(): - logger.debug('Deep copy: Adding {} to {}({}).{} relationship'.format(related_obj, new_obj, model, field_name)) - getattr(new_obj, field_name).add(copy_mapping.get(related_obj, related_obj)) - new_obj.save() - - -@task(queue=get_local_queuename) -def deep_copy_model_obj(model_module, model_name, obj_pk, new_obj_pk, user_pk, uuid, permission_check_func=None): - sub_obj_list = cache.get(uuid) - if sub_obj_list is None: - logger.error('Deep copy {} from {} to {} failed unexpectedly.'.format(model_name, obj_pk, new_obj_pk)) - return - - logger.debug('Deep copy {} from {} to {}.'.format(model_name, obj_pk, new_obj_pk)) - from awx.api.generics import CopyAPIView - from awx.main.signals import disable_activity_stream - - model = getattr(importlib.import_module(model_module), model_name, None) - if model is None: - return - try: - obj = model.objects.get(pk=obj_pk) - new_obj = model.objects.get(pk=new_obj_pk) - creater = User.objects.get(pk=user_pk) - except ObjectDoesNotExist: - logger.warning("Object or user no longer exists.") - return - with transaction.atomic(), ignore_inventory_computed_fields(), disable_activity_stream(): - copy_mapping = {} - for sub_obj_setup in sub_obj_list: - sub_model = getattr(importlib.import_module(sub_obj_setup[0]), sub_obj_setup[1], None) - if sub_model is None: - continue - try: - sub_obj = sub_model.objects.get(pk=sub_obj_setup[2]) - except ObjectDoesNotExist: - continue - copy_mapping.update(CopyAPIView.copy_model_obj(obj, new_obj, sub_model, sub_obj, creater)) - _reconstruct_relationships(copy_mapping) - if permission_check_func: - permission_check_func = getattr(getattr(importlib.import_module(permission_check_func[0]), permission_check_func[1]), permission_check_func[2]) - permission_check_func(creater, copy_mapping.values()) - if isinstance(new_obj, Inventory): - update_inventory_computed_fields.delay(new_obj.id) - - -class TransmitterThread(threading.Thread): - def run(self): - 
self.exc = None - - try: - super().run() - except Exception: - self.exc = sys.exc_info() - - -class AWXReceptorJob: - def __init__(self, task, runner_params=None): - self.task = task - self.runner_params = runner_params - self.unit_id = None - - if self.task and not self.task.instance.is_container_group_task: - execution_environment_params = self.task.build_execution_environment_params(self.task.instance, runner_params['private_data_dir']) - self.runner_params.update(execution_environment_params) - - if not settings.IS_K8S and self.work_type == 'local' and 'only_transmit_kwargs' not in self.runner_params: - self.runner_params['only_transmit_kwargs'] = True - - def run(self): - # We establish a connection to the Receptor socket - receptor_ctl = get_receptor_ctl() - - res = None - try: - res = self._run_internal(receptor_ctl) - return res - finally: - # Make sure to always release the work unit if we established it - if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: - try: - receptor_ctl.simple_command(f"work release {self.unit_id}") - except Exception: - logger.exception(f"Error releasing work unit {self.unit_id}.") - - @property - def sign_work(self): - return False if settings.IS_K8S else True - - def _run_internal(self, receptor_ctl): - # Create a socketpair. Where the left side will be used for writing our payload - # (private data dir, kwargs). The right side will be passed to Receptor for - # reading. - sockin, sockout = socket.socketpair() - - transmitter_thread = TransmitterThread(target=self.transmit, args=[sockin]) - transmitter_thread.start() - - # submit our work, passing - # in the right side of our socketpair for reading. - _kw = {} - if self.work_type == 'ansible-runner': - _kw['node'] = self.task.instance.execution_node - use_stream_tls = get_conn_type(_kw['node'], receptor_ctl).name == "STREAMTLS" - _kw['tlsclient'] = get_tls_client(use_stream_tls) - result = receptor_ctl.submit_work(worktype=self.work_type, payload=sockout.makefile('rb'), params=self.receptor_params, signwork=self.sign_work, **_kw) - self.unit_id = result['unitid'] - # Update the job with the work unit in-memory so that the log_lifecycle - # will print out the work unit that is to be associated with the job in the database - # via the update_model() call. - # We want to log the work_unit_id as early as possible. A failure can happen in between - # when we start the job in receptor and when we associate the job <-> work_unit_id. - # In that case, there will be work running in receptor and Controller will not know - # which Job it is associated with. - # We do not programatically handle this case. Ideally, we would handle this with a reaper case. - # The two distinct job lifecycle log events below allow for us to at least detect when this - # edge case occurs. If the lifecycle event work_unit_id_received occurs without the - # work_unit_id_assigned event then this case may have occured. 
- self.task.instance.work_unit_id = result['unitid'] # Set work_unit_id in-memory only - self.task.instance.log_lifecycle("work_unit_id_received") - self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid']) - self.task.instance.log_lifecycle("work_unit_id_assigned") - - sockin.close() - sockout.close() - - if transmitter_thread.exc: - raise transmitter_thread.exc[1].with_traceback(transmitter_thread.exc[2]) - - transmitter_thread.join() - - resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True) - # Both "processor" and "cancel_watcher" are spawned in separate threads. - # We wait for the first one to return. If cancel_watcher returns first, - # we yank the socket out from underneath the processor, which will cause it - # to exit. A reference to the processor_future is passed into the cancel_watcher_future, - # Which exits if the job has finished normally. The context manager ensures we do not - # leave any threads laying around. - with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: - processor_future = executor.submit(self.processor, resultfile) - cancel_watcher_future = executor.submit(self.cancel_watcher, processor_future) - futures = [processor_future, cancel_watcher_future] - first_future = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED) - - res = list(first_future.done)[0].result() - if res.status == 'canceled': - receptor_ctl.simple_command(f"work cancel {self.unit_id}") - resultsock.shutdown(socket.SHUT_RDWR) - resultfile.close() - elif res.status == 'error': - try: - unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}') - detail = unit_status.get('Detail', None) - state_name = unit_status.get('StateName', None) - except Exception: - detail = '' - state_name = '' - logger.exception(f'An error was encountered while getting status for work unit {self.unit_id}') - - if 'exceeded quota' in detail: - logger.warn(detail) - log_name = self.task.instance.log_format - logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.") - self.task.update_model(self.task.instance.pk, status='pending') - return - # If ansible-runner ran, but an error occured at runtime, the traceback information - # is saved via the status_handler passed in to the processor. - if state_name == 'Succeeded': - return res - - if not self.task.instance.result_traceback: - try: - resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True) - lines = resultsock.readlines() - receptor_output = b"".join(lines).decode() - if receptor_output: - self.task.instance.result_traceback = receptor_output - self.task.instance.save(update_fields=['result_traceback']) - elif detail: - self.task.instance.result_traceback = detail - self.task.instance.save(update_fields=['result_traceback']) - else: - logger.warn(f'No result details or output from {self.task.instance.log_format}, status:\n{unit_status}') - except Exception: - raise RuntimeError(detail) - - return res - - # Spawned in a thread so Receptor can start reading before we finish writing, we - # write our payload to the left side of our socketpair. - @cleanup_new_process - def transmit(self, _socket): - try: - ansible_runner.interface.run(streamer='transmit', _output=_socket.makefile('wb'), **self.runner_params) - finally: - # Socket must be shutdown here, or the reader will hang forever. 
- _socket.shutdown(socket.SHUT_WR) - - @cleanup_new_process - def processor(self, resultfile): - return ansible_runner.interface.run( - streamer='process', - quiet=True, - _input=resultfile, - event_handler=self.task.event_handler, - finished_callback=self.task.finished_callback, - status_handler=self.task.status_handler, - **self.runner_params, - ) - - @property - def receptor_params(self): - if self.task.instance.is_container_group_task: - spec_yaml = yaml.dump(self.pod_definition, explicit_start=True) - - receptor_params = { - "secret_kube_pod": spec_yaml, - "pod_pending_timeout": getattr(settings, 'AWX_CONTAINER_GROUP_POD_PENDING_TIMEOUT', "5m"), - } - - if self.credential: - kubeconfig_yaml = yaml.dump(self.kube_config, explicit_start=True) - receptor_params["secret_kube_config"] = kubeconfig_yaml - else: - private_data_dir = self.runner_params['private_data_dir'] - if self.work_type == 'ansible-runner' and settings.AWX_CLEANUP_PATHS: - # on execution nodes, we rely on the private data dir being deleted - cli_params = f"--private-data-dir={private_data_dir} --delete" - else: - # on hybrid nodes, we rely on the private data dir NOT being deleted - cli_params = f"--private-data-dir={private_data_dir}" - receptor_params = {"params": cli_params} - - return receptor_params - - @property - def work_type(self): - if self.task.instance.is_container_group_task: - if self.credential: - return 'kubernetes-runtime-auth' - return 'kubernetes-incluster-auth' - if self.task.instance.execution_node == settings.CLUSTER_HOST_ID or self.task.instance.execution_node == self.task.instance.controller_node: - return 'local' - return 'ansible-runner' - - @cleanup_new_process - def cancel_watcher(self, processor_future): - while True: - if processor_future.done(): - return processor_future.result() - - if self.task.cancel_callback(): - result = namedtuple('result', ['status', 'rc']) - return result('canceled', 1) - - time.sleep(1) - - @property - def pod_definition(self): - ee = self.task.instance.execution_environment - - default_pod_spec = get_default_pod_spec() - - pod_spec_override = {} - if self.task and self.task.instance.instance_group.pod_spec_override: - pod_spec_override = parse_yaml_or_json(self.task.instance.instance_group.pod_spec_override) - # According to the deepmerge docstring, the second dictionary will override when - # they share keys, which is the desired behavior. 
- # This allows user to only provide elements they want to override, and for us to still provide any - # defaults they don't want to change - pod_spec = deepmerge(default_pod_spec, pod_spec_override) - - pod_spec['spec']['containers'][0]['image'] = ee.image - pod_spec['spec']['containers'][0]['args'] = ['ansible-runner', 'worker', '--private-data-dir=/runner'] - - # Enforce EE Pull Policy - pull_options = {"always": "Always", "missing": "IfNotPresent", "never": "Never"} - if self.task and self.task.instance.execution_environment: - if self.task.instance.execution_environment.pull: - pod_spec['spec']['containers'][0]['imagePullPolicy'] = pull_options[self.task.instance.execution_environment.pull] - - if self.task and self.task.instance.is_container_group_task: - # If EE credential is passed, create an imagePullSecret - if self.task.instance.execution_environment and self.task.instance.execution_environment.credential: - # Create pull secret in k8s cluster based on ee cred - from awx.main.scheduler.kubernetes import PodManager # prevent circular import - - pm = PodManager(self.task.instance) - secret_name = pm.create_secret(job=self.task.instance) - - # Inject secret name into podspec - pod_spec['spec']['imagePullSecrets'] = [{"name": secret_name}] - - if self.task: - pod_spec['metadata'] = deepmerge( - pod_spec.get('metadata', {}), - dict(name=self.pod_name, labels={'ansible-awx': settings.INSTALL_UUID, 'ansible-awx-job-id': str(self.task.instance.id)}), - ) - - return pod_spec - - @property - def pod_name(self): - return f"automation-job-{self.task.instance.id}" - - @property - def credential(self): - return self.task.instance.instance_group.credential - - @property - def namespace(self): - return self.pod_definition['metadata']['namespace'] - - @property - def kube_config(self): - host_input = self.credential.get_input('host') - config = { - "apiVersion": "v1", - "kind": "Config", - "preferences": {}, - "clusters": [{"name": host_input, "cluster": {"server": host_input}}], - "users": [{"name": host_input, "user": {"token": self.credential.get_input('bearer_token')}}], - "contexts": [{"name": host_input, "context": {"cluster": host_input, "user": host_input, "namespace": self.namespace}}], - "current-context": host_input, - } - - if self.credential.get_input('verify_ssl') and 'ssl_ca_cert' in self.credential.inputs: - config["clusters"][0]["cluster"]["certificate-authority-data"] = b64encode( - self.credential.get_input('ssl_ca_cert').encode() # encode to bytes - ).decode() # decode the base64 data into a str - else: - config["clusters"][0]["cluster"]["insecure-skip-tls-verify"] = True - return config diff --git a/awx/main/tasks/receptor.py b/awx/main/tasks/receptor.py new file mode 100644 index 0000000000..0a68800a4d --- /dev/null +++ b/awx/main/tasks/receptor.py @@ -0,0 +1,534 @@ +# Python +from base64 import b64encode +from collections import namedtuple +import concurrent.futures +from enum import Enum +import logging +import socket +import sys +import threading +import time +import yaml + +# Django +from django.conf import settings + +# Runner +import ansible_runner + +# AWX +from awx.main.utils.execution_environments import get_default_pod_spec +from awx.main.exceptions import ReceptorNodeNotFound +from awx.main.utils.common import ( + deepmerge, + parse_yaml_or_json, + cleanup_new_process, +) + +# Receptorctl +from receptorctl.socket_interface import ReceptorControl + +logger = logging.getLogger('awx.main.tasks.receptor') +__RECEPTOR_CONF = '/etc/receptor/receptor.conf' 
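+# The config-parsing helpers below assume receptor.conf is a YAML list of single-key sections.
+# A hypothetical config of the expected shape (paths and names are placeholders):
+#
+#   - control-service:
+#       filename: /var/run/receptor/receptor.sock
+#   - tls-client:
+#       name: tlsclient
+#
+# get_receptor_sockfile() returns the control-service "filename" value and
+# get_tls_client() returns the tls-client "name" value.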
+RECEPTOR_ACTIVE_STATES = ('Pending', 'Running') + + +class ReceptorConnectionType(Enum): + DATAGRAM = 0 + STREAM = 1 + STREAMTLS = 2 + + +def get_receptor_sockfile(): + with open(__RECEPTOR_CONF, 'r') as f: + data = yaml.safe_load(f) + for section in data: + for entry_name, entry_data in section.items(): + if entry_name == 'control-service': + if 'filename' in entry_data: + return entry_data['filename'] + else: + raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} control-service entry does not have a filename parameter') + else: + raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} does not have control-service entry needed to get sockfile') + + +def get_tls_client(use_stream_tls=None): + if not use_stream_tls: + return None + + with open(__RECEPTOR_CONF, 'r') as f: + data = yaml.safe_load(f) + for section in data: + for entry_name, entry_data in section.items(): + if entry_name == 'tls-client': + if 'name' in entry_data: + return entry_data['name'] + return None + + +def get_receptor_ctl(): + receptor_sockfile = get_receptor_sockfile() + try: + return ReceptorControl(receptor_sockfile, config=__RECEPTOR_CONF, tlsclient=get_tls_client(True)) + except RuntimeError: + return ReceptorControl(receptor_sockfile) + + +def get_conn_type(node_name, receptor_ctl): + all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None) + for node in all_nodes: + if node.get('NodeID') == node_name: + return ReceptorConnectionType(node.get('ConnType')) + raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh') + + +def administrative_workunit_reaper(work_list=None): + """ + This releases completed work units that were spawned by actions inside of this module + specifically, this should catch any completed work unit left by + - worker_info + - worker_cleanup + These should ordinarily be released when the method finishes, but this is a + cleanup of last-resort, in case something went awry + """ + receptor_ctl = get_receptor_ctl() + if work_list is None: + work_list = receptor_ctl.simple_command("work list") + + for unit_id, work_data in work_list.items(): + extra_data = work_data.get('ExtraData') + if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'): + continue # if this is not ansible-runner work, we do not want to touch it + params = extra_data.get('RemoteParams', {}).get('params') + if not params: + continue + if not (params == '--worker-info' or params.startswith('cleanup')): + continue # if this is not a cleanup or health check, we do not want to touch it + if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES: + continue # do not want to touch active work units + logger.info(f'Reaping orphaned work unit {unit_id} with params {params}') + receptor_ctl.simple_command(f"work release {unit_id}") + + +class RemoteJobError(RuntimeError): + pass + + +def run_until_complete(node, timing_data=None, **kwargs): + """ + Runs an ansible-runner work_type on remote node, waits until it completes, then returns stdout. 
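+
+    If a timing_data dict is supplied, its 'transmit_timing' and 'run_timing' keys are populated
+    with the elapsed seconds spent submitting the work and polling for its completion, respectively.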
+ """ + receptor_ctl = get_receptor_ctl() + + use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS" + kwargs.setdefault('tlsclient', get_tls_client(use_stream_tls)) + kwargs.setdefault('ttl', '20s') + kwargs.setdefault('payload', '') + + transmit_start = time.time() + sign_work = False if settings.IS_K8S else True + result = receptor_ctl.submit_work(worktype='ansible-runner', node=node, signwork=sign_work, **kwargs) + + unit_id = result['unitid'] + run_start = time.time() + if timing_data: + timing_data['transmit_timing'] = run_start - transmit_start + run_timing = 0.0 + stdout = '' + + try: + + resultfile = receptor_ctl.get_work_results(unit_id) + + while run_timing < 20.0: + status = receptor_ctl.simple_command(f'work status {unit_id}') + state_name = status.get('StateName') + if state_name not in RECEPTOR_ACTIVE_STATES: + break + run_timing = time.time() - run_start + time.sleep(0.5) + else: + raise RemoteJobError(f'Receptor job timeout on {node} after {run_timing} seconds, state remains in {state_name}') + + if timing_data: + timing_data['run_timing'] = run_timing + + stdout = resultfile.read() + stdout = str(stdout, encoding='utf-8') + + finally: + + if settings.RECEPTOR_RELEASE_WORK: + res = receptor_ctl.simple_command(f"work release {unit_id}") + if res != {'released': unit_id}: + logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}') + + receptor_ctl.close() + + if state_name.lower() == 'failed': + work_detail = status.get('Detail', '') + if work_detail: + raise RemoteJobError(f'Receptor error from {node}, detail:\n{work_detail}') + else: + raise RemoteJobError(f'Unknown ansible-runner error on node {node}, stdout:\n{stdout}') + + return stdout + + +def worker_info(node_name, work_type='ansible-runner'): + error_list = [] + data = {'errors': error_list, 'transmit_timing': 0.0} + + try: + stdout = run_until_complete(node=node_name, timing_data=data, params={"params": "--worker-info"}) + + yaml_stdout = stdout.strip() + remote_data = {} + try: + remote_data = yaml.safe_load(yaml_stdout) + except Exception as json_e: + error_list.append(f'Failed to parse node {node_name} --worker-info output as YAML, error: {json_e}, data:\n{yaml_stdout}') + + if not isinstance(remote_data, dict): + error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output:{stdout}') + else: + error_list.extend(remote_data.pop('errors', [])) # merge both error lists + data.update(remote_data) + + except RemoteJobError as exc: + details = exc.args[0] + if 'unrecognized arguments: --worker-info' in details: + error_list.append(f'Old version (2.0.1 or earlier) of ansible-runner on node {node_name} without --worker-info') + else: + error_list.append(details) + + except (ReceptorNodeNotFound, RuntimeError) as exc: + error_list.append(str(exc)) + + # If we have a connection error, missing keys would be trivial consequence of that + if not data['errors']: + # see tasks.py usage of keys + missing_keys = set(('runner_version', 'mem_in_bytes', 'cpu_count')) - set(data.keys()) + if missing_keys: + data['errors'].append('Worker failed to return keys {}'.format(' '.join(missing_keys))) + + return data + + +def _convert_args_to_cli(vargs): + """ + For the ansible-runner worker cleanup command + converts the dictionary (parsed argparse variables) used for python interface + into a string of CLI options, which has to be used on execution nodes. 
+ """ + args = ['cleanup'] + for option in ('exclude_strings', 'remove_images'): + if vargs.get(option): + args.append('--{}={}'.format(option.replace('_', '-'), ' '.join(vargs.get(option)))) + for option in ('file_pattern', 'image_prune', 'process_isolation_executable', 'grace_period'): + if vargs.get(option) is True: + args.append('--{}'.format(option.replace('_', '-'))) + elif vargs.get(option) not in (None, ''): + args.append('--{}={}'.format(option.replace('_', '-'), vargs.get(option))) + return args + + +def worker_cleanup(node_name, vargs, timeout=300.0): + args = _convert_args_to_cli(vargs) + + remote_command = ' '.join(args) + logger.debug(f'Running command over receptor mesh on {node_name}: ansible-runner worker {remote_command}') + + stdout = run_until_complete(node=node_name, params={"params": remote_command}) + + return stdout + + +class TransmitterThread(threading.Thread): + def run(self): + self.exc = None + + try: + super().run() + except Exception: + self.exc = sys.exc_info() + + +class AWXReceptorJob: + def __init__(self, task, runner_params=None): + self.task = task + self.runner_params = runner_params + self.unit_id = None + + if self.task and not self.task.instance.is_container_group_task: + execution_environment_params = self.task.build_execution_environment_params(self.task.instance, runner_params['private_data_dir']) + self.runner_params.update(execution_environment_params) + + if not settings.IS_K8S and self.work_type == 'local' and 'only_transmit_kwargs' not in self.runner_params: + self.runner_params['only_transmit_kwargs'] = True + + def run(self): + # We establish a connection to the Receptor socket + receptor_ctl = get_receptor_ctl() + + res = None + try: + res = self._run_internal(receptor_ctl) + return res + finally: + # Make sure to always release the work unit if we established it + if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: + try: + receptor_ctl.simple_command(f"work release {self.unit_id}") + except Exception: + logger.exception(f"Error releasing work unit {self.unit_id}.") + + @property + def sign_work(self): + return False if settings.IS_K8S else True + + def _run_internal(self, receptor_ctl): + # Create a socketpair. Where the left side will be used for writing our payload + # (private data dir, kwargs). The right side will be passed to Receptor for + # reading. + sockin, sockout = socket.socketpair() + + transmitter_thread = TransmitterThread(target=self.transmit, args=[sockin]) + transmitter_thread.start() + + # submit our work, passing + # in the right side of our socketpair for reading. + _kw = {} + if self.work_type == 'ansible-runner': + _kw['node'] = self.task.instance.execution_node + use_stream_tls = get_conn_type(_kw['node'], receptor_ctl).name == "STREAMTLS" + _kw['tlsclient'] = get_tls_client(use_stream_tls) + result = receptor_ctl.submit_work(worktype=self.work_type, payload=sockout.makefile('rb'), params=self.receptor_params, signwork=self.sign_work, **_kw) + self.unit_id = result['unitid'] + # Update the job with the work unit in-memory so that the log_lifecycle + # will print out the work unit that is to be associated with the job in the database + # via the update_model() call. + # We want to log the work_unit_id as early as possible. A failure can happen in between + # when we start the job in receptor and when we associate the job <-> work_unit_id. + # In that case, there will be work running in receptor and Controller will not know + # which Job it is associated with. 
+ # We do not programatically handle this case. Ideally, we would handle this with a reaper case. + # The two distinct job lifecycle log events below allow for us to at least detect when this + # edge case occurs. If the lifecycle event work_unit_id_received occurs without the + # work_unit_id_assigned event then this case may have occured. + self.task.instance.work_unit_id = result['unitid'] # Set work_unit_id in-memory only + self.task.instance.log_lifecycle("work_unit_id_received") + self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid']) + self.task.instance.log_lifecycle("work_unit_id_assigned") + + sockin.close() + sockout.close() + + if transmitter_thread.exc: + raise transmitter_thread.exc[1].with_traceback(transmitter_thread.exc[2]) + + transmitter_thread.join() + + resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True) + # Both "processor" and "cancel_watcher" are spawned in separate threads. + # We wait for the first one to return. If cancel_watcher returns first, + # we yank the socket out from underneath the processor, which will cause it + # to exit. A reference to the processor_future is passed into the cancel_watcher_future, + # Which exits if the job has finished normally. The context manager ensures we do not + # leave any threads laying around. + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + processor_future = executor.submit(self.processor, resultfile) + cancel_watcher_future = executor.submit(self.cancel_watcher, processor_future) + futures = [processor_future, cancel_watcher_future] + first_future = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED) + + res = list(first_future.done)[0].result() + if res.status == 'canceled': + receptor_ctl.simple_command(f"work cancel {self.unit_id}") + resultsock.shutdown(socket.SHUT_RDWR) + resultfile.close() + elif res.status == 'error': + try: + unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}') + detail = unit_status.get('Detail', None) + state_name = unit_status.get('StateName', None) + except Exception: + detail = '' + state_name = '' + logger.exception(f'An error was encountered while getting status for work unit {self.unit_id}') + + if 'exceeded quota' in detail: + logger.warn(detail) + log_name = self.task.instance.log_format + logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.") + self.task.update_model(self.task.instance.pk, status='pending') + return + # If ansible-runner ran, but an error occured at runtime, the traceback information + # is saved via the status_handler passed in to the processor. + if state_name == 'Succeeded': + return res + + if not self.task.instance.result_traceback: + try: + resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True) + lines = resultsock.readlines() + receptor_output = b"".join(lines).decode() + if receptor_output: + self.task.instance.result_traceback = receptor_output + self.task.instance.save(update_fields=['result_traceback']) + elif detail: + self.task.instance.result_traceback = detail + self.task.instance.save(update_fields=['result_traceback']) + else: + logger.warn(f'No result details or output from {self.task.instance.log_format}, status:\n{unit_status}') + except Exception: + raise RuntimeError(detail) + + return res + + # Spawned in a thread so Receptor can start reading before we finish writing, we + # write our payload to the left side of our socketpair. 
+ @cleanup_new_process + def transmit(self, _socket): + try: + ansible_runner.interface.run(streamer='transmit', _output=_socket.makefile('wb'), **self.runner_params) + finally: + # Socket must be shutdown here, or the reader will hang forever. + _socket.shutdown(socket.SHUT_WR) + + @cleanup_new_process + def processor(self, resultfile): + return ansible_runner.interface.run( + streamer='process', + quiet=True, + _input=resultfile, + event_handler=self.task.event_handler, + finished_callback=self.task.finished_callback, + status_handler=self.task.status_handler, + **self.runner_params, + ) + + @property + def receptor_params(self): + if self.task.instance.is_container_group_task: + spec_yaml = yaml.dump(self.pod_definition, explicit_start=True) + + receptor_params = { + "secret_kube_pod": spec_yaml, + "pod_pending_timeout": getattr(settings, 'AWX_CONTAINER_GROUP_POD_PENDING_TIMEOUT', "5m"), + } + + if self.credential: + kubeconfig_yaml = yaml.dump(self.kube_config, explicit_start=True) + receptor_params["secret_kube_config"] = kubeconfig_yaml + else: + private_data_dir = self.runner_params['private_data_dir'] + if self.work_type == 'ansible-runner' and settings.AWX_CLEANUP_PATHS: + # on execution nodes, we rely on the private data dir being deleted + cli_params = f"--private-data-dir={private_data_dir} --delete" + else: + # on hybrid nodes, we rely on the private data dir NOT being deleted + cli_params = f"--private-data-dir={private_data_dir}" + receptor_params = {"params": cli_params} + + return receptor_params + + @property + def work_type(self): + if self.task.instance.is_container_group_task: + if self.credential: + return 'kubernetes-runtime-auth' + return 'kubernetes-incluster-auth' + if self.task.instance.execution_node == settings.CLUSTER_HOST_ID or self.task.instance.execution_node == self.task.instance.controller_node: + return 'local' + return 'ansible-runner' + + @cleanup_new_process + def cancel_watcher(self, processor_future): + while True: + if processor_future.done(): + return processor_future.result() + + if self.task.cancel_callback(): + result = namedtuple('result', ['status', 'rc']) + return result('canceled', 1) + + time.sleep(1) + + @property + def pod_definition(self): + ee = self.task.instance.execution_environment + + default_pod_spec = get_default_pod_spec() + + pod_spec_override = {} + if self.task and self.task.instance.instance_group.pod_spec_override: + pod_spec_override = parse_yaml_or_json(self.task.instance.instance_group.pod_spec_override) + # According to the deepmerge docstring, the second dictionary will override when + # they share keys, which is the desired behavior. 
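+        # For instance (hypothetical values), merging a default of
+        #   {'spec': {'automountServiceAccountToken': False}}
+        # with an override of
+        #   {'spec': {'serviceAccountName': 'custom-sa'}}
+        # keeps both keys, and the override value wins wherever the two dictionaries collide.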
+ # This allows user to only provide elements they want to override, and for us to still provide any + # defaults they don't want to change + pod_spec = deepmerge(default_pod_spec, pod_spec_override) + + pod_spec['spec']['containers'][0]['image'] = ee.image + pod_spec['spec']['containers'][0]['args'] = ['ansible-runner', 'worker', '--private-data-dir=/runner'] + + # Enforce EE Pull Policy + pull_options = {"always": "Always", "missing": "IfNotPresent", "never": "Never"} + if self.task and self.task.instance.execution_environment: + if self.task.instance.execution_environment.pull: + pod_spec['spec']['containers'][0]['imagePullPolicy'] = pull_options[self.task.instance.execution_environment.pull] + + if self.task and self.task.instance.is_container_group_task: + # If EE credential is passed, create an imagePullSecret + if self.task.instance.execution_environment and self.task.instance.execution_environment.credential: + # Create pull secret in k8s cluster based on ee cred + from awx.main.scheduler.kubernetes import PodManager # prevent circular import + + pm = PodManager(self.task.instance) + secret_name = pm.create_secret(job=self.task.instance) + + # Inject secret name into podspec + pod_spec['spec']['imagePullSecrets'] = [{"name": secret_name}] + + if self.task: + pod_spec['metadata'] = deepmerge( + pod_spec.get('metadata', {}), + dict(name=self.pod_name, labels={'ansible-awx': settings.INSTALL_UUID, 'ansible-awx-job-id': str(self.task.instance.id)}), + ) + + return pod_spec + + @property + def pod_name(self): + return f"automation-job-{self.task.instance.id}" + + @property + def credential(self): + return self.task.instance.instance_group.credential + + @property + def namespace(self): + return self.pod_definition['metadata']['namespace'] + + @property + def kube_config(self): + host_input = self.credential.get_input('host') + config = { + "apiVersion": "v1", + "kind": "Config", + "preferences": {}, + "clusters": [{"name": host_input, "cluster": {"server": host_input}}], + "users": [{"name": host_input, "user": {"token": self.credential.get_input('bearer_token')}}], + "contexts": [{"name": host_input, "context": {"cluster": host_input, "user": host_input, "namespace": self.namespace}}], + "current-context": host_input, + } + + if self.credential.get_input('verify_ssl') and 'ssl_ca_cert' in self.credential.inputs: + config["clusters"][0]["cluster"]["certificate-authority-data"] = b64encode( + self.credential.get_input('ssl_ca_cert').encode() # encode to bytes + ).decode() # decode the base64 data into a str + else: + config["clusters"][0]["cluster"]["insecure-skip-tls-verify"] = True + return config diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py new file mode 100644 index 0000000000..e596668f89 --- /dev/null +++ b/awx/main/tasks/system.py @@ -0,0 +1,897 @@ +# Python +from collections import namedtuple +import functools +import importlib +import json +import logging +import os +from io import StringIO +from contextlib import redirect_stdout +import shutil +import time +from distutils.version import LooseVersion as Version + +# Django +from django.conf import settings +from django.db import transaction, DatabaseError, IntegrityError +from django.db.models.fields.related import ForeignKey +from django.utils.timezone import now +from django.utils.encoding import smart_str +from django.contrib.auth.models import User +from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import gettext_noop +from django.core.cache import cache +from 
django.core.exceptions import ObjectDoesNotExist + +# Django-CRUM +from crum import impersonate + + +# Runner +import ansible_runner.cleanup + +# dateutil +from dateutil.parser import parse as parse_date + +# AWX +from awx import __version__ as awx_application_version +from awx.main.access import access_registry +from awx.main.models import ( + Schedule, + TowerScheduleState, + Instance, + InstanceGroup, + UnifiedJob, + Notification, + Inventory, + SmartInventoryMembership, + Job, +) +from awx.main.constants import ACTIVE_STATES +from awx.main.dispatch.publish import task +from awx.main.dispatch import get_local_queuename, reaper +from awx.main.utils.common import ( + ignore_inventory_computed_fields, + ignore_inventory_group_removal, + schedule_task_manager, +) + +from awx.main.utils.external_logging import reconfigure_rsyslog +from awx.main.utils.reload import stop_local_services +from awx.main.utils.pglock import advisory_lock +from awx.main.tasks.receptor import get_receptor_ctl, worker_info, worker_cleanup, administrative_workunit_reaper +from awx.main.consumers import emit_channel_notification +from awx.main import analytics +from awx.conf import settings_registry +from awx.main.analytics.subsystem_metrics import Metrics + +from rest_framework.exceptions import PermissionDenied + +logger = logging.getLogger('awx.main.tasks.system') + +OPENSSH_KEY_ERROR = u'''\ +It looks like you're trying to use a private key in OpenSSH format, which \ +isn't supported by the installed version of OpenSSH on this instance. \ +Try upgrading OpenSSH or providing your private key in an different format. \ +''' + + +def dispatch_startup(): + startup_logger = logging.getLogger('awx.main.tasks') + startup_logger.debug("Syncing Schedules") + for sch in Schedule.objects.all(): + try: + sch.update_computed_fields() + except Exception: + logger.exception("Failed to rebuild schedule {}.".format(sch)) + + # + # When the dispatcher starts, if the instance cannot be found in the database, + # automatically register it. This is mostly useful for openshift-based + # deployments where: + # + # 2 Instances come online + # Instance B encounters a network blip, Instance A notices, and + # deprovisions it + # Instance B's connectivity is restored, the dispatcher starts, and it + # re-registers itself + # + # In traditional container-less deployments, instances don't get + # deprovisioned when they miss their heartbeat, so this code is mostly a + # no-op. 
+ # + apply_cluster_membership_policies() + cluster_node_heartbeat() + Metrics().clear_values() + + # Update Tower's rsyslog.conf file based on loggins settings in the db + reconfigure_rsyslog() + + +def inform_cluster_of_shutdown(): + try: + this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID) + this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal')) + try: + reaper.reap(this_inst) + except Exception: + logger.exception('failed to reap jobs for {}'.format(this_inst.hostname)) + logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname)) + except Exception: + logger.exception('Encountered problem with normal shutdown signal.') + + +@task(queue=get_local_queuename) +def apply_cluster_membership_policies(): + from awx.main.signals import disable_activity_stream + + started_waiting = time.time() + with advisory_lock('cluster_policy_lock', wait=True): + lock_time = time.time() - started_waiting + if lock_time > 1.0: + to_log = logger.info + else: + to_log = logger.debug + to_log('Waited {} seconds to obtain lock name: cluster_policy_lock'.format(lock_time)) + started_compute = time.time() + # Hop nodes should never get assigned to an InstanceGroup. + all_instances = list(Instance.objects.exclude(node_type='hop').order_by('id')) + all_groups = list(InstanceGroup.objects.prefetch_related('instances')) + + total_instances = len(all_instances) + actual_groups = [] + actual_instances = [] + Group = namedtuple('Group', ['obj', 'instances', 'prior_instances']) + Node = namedtuple('Instance', ['obj', 'groups']) + + # Process policy instance list first, these will represent manually managed memberships + instance_hostnames_map = {inst.hostname: inst for inst in all_instances} + for ig in all_groups: + group_actual = Group(obj=ig, instances=[], prior_instances=[instance.pk for instance in ig.instances.all()]) # obtained in prefetch + for hostname in ig.policy_instance_list: + if hostname not in instance_hostnames_map: + logger.info("Unknown instance {} in {} policy list".format(hostname, ig.name)) + continue + inst = instance_hostnames_map[hostname] + group_actual.instances.append(inst.id) + # NOTE: arguable behavior: policy-list-group is not added to + # instance's group count for consideration in minimum-policy rules + if group_actual.instances: + logger.debug("Policy List, adding Instances {} to Group {}".format(group_actual.instances, ig.name)) + + actual_groups.append(group_actual) + + # Process Instance minimum policies next, since it represents a concrete lower bound to the + # number of instances to make available to instance groups + actual_instances = [Node(obj=i, groups=[]) for i in all_instances if i.managed_by_policy] + logger.debug("Total instances: {}, available for policy: {}".format(total_instances, len(actual_instances))) + for g in sorted(actual_groups, key=lambda x: len(x.instances)): + exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control' + policy_min_added = [] + for i in sorted(actual_instances, key=lambda x: len(x.groups)): + if i.obj.node_type == exclude_type: + continue # never place execution instances in controlplane group or control instances in other groups + if len(g.instances) >= g.obj.policy_instance_minimum: + break + if i.obj.id in g.instances: + # If the instance is already _in_ the group, it was + # applied earlier via the policy list + continue + g.instances.append(i.obj.id) + i.groups.append(g.obj.id) 
+ policy_min_added.append(i.obj.id) + if policy_min_added: + logger.debug("Policy minimum, adding Instances {} to Group {}".format(policy_min_added, g.obj.name)) + + # Finally, process instance policy percentages + for g in sorted(actual_groups, key=lambda x: len(x.instances)): + exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control' + candidate_pool_ct = sum(1 for i in actual_instances if i.obj.node_type != exclude_type) + if not candidate_pool_ct: + continue + policy_per_added = [] + for i in sorted(actual_instances, key=lambda x: len(x.groups)): + if i.obj.node_type == exclude_type: + continue + if i.obj.id in g.instances: + # If the instance is already _in_ the group, it was + # applied earlier via a minimum policy or policy list + continue + if 100 * float(len(g.instances)) / candidate_pool_ct >= g.obj.policy_instance_percentage: + break + g.instances.append(i.obj.id) + i.groups.append(g.obj.id) + policy_per_added.append(i.obj.id) + if policy_per_added: + logger.debug("Policy percentage, adding Instances {} to Group {}".format(policy_per_added, g.obj.name)) + + # Determine if any changes need to be made + needs_change = False + for g in actual_groups: + if set(g.instances) != set(g.prior_instances): + needs_change = True + break + if not needs_change: + logger.debug('Cluster policy no-op finished in {} seconds'.format(time.time() - started_compute)) + return + + # On a differential basis, apply instances to groups + with transaction.atomic(): + with disable_activity_stream(): + for g in actual_groups: + if g.obj.is_container_group: + logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name)) + continue + instances_to_add = set(g.instances) - set(g.prior_instances) + instances_to_remove = set(g.prior_instances) - set(g.instances) + if instances_to_add: + logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name)) + g.obj.instances.add(*instances_to_add) + if instances_to_remove: + logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name)) + g.obj.instances.remove(*instances_to_remove) + logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute)) + + +@task(queue='tower_broadcast_all') +def handle_setting_changes(setting_keys): + orig_len = len(setting_keys) + for i in range(orig_len): + for dependent_key in settings_registry.get_dependent_settings(setting_keys[i]): + setting_keys.append(dependent_key) + cache_keys = set(setting_keys) + logger.debug('cache delete_many(%r)', cache_keys) + cache.delete_many(cache_keys) + + if any([setting.startswith('LOG_AGGREGATOR') for setting in setting_keys]): + reconfigure_rsyslog() + + +@task(queue='tower_broadcast_all') +def delete_project_files(project_path): + # TODO: possibly implement some retry logic + lock_file = project_path + '.lock' + if os.path.exists(project_path): + try: + shutil.rmtree(project_path) + logger.debug('Success removing project files {}'.format(project_path)) + except Exception: + logger.exception('Could not remove project directory {}'.format(project_path)) + if os.path.exists(lock_file): + try: + os.remove(lock_file) + logger.debug('Success removing {}'.format(lock_file)) + except Exception: + logger.exception('Could not remove lock file {}'.format(lock_file)) + + +@task(queue='tower_broadcast_all') +def profile_sql(threshold=1, minutes=1): + if threshold <= 0: + cache.delete('awx-profile-sql-threshold') + logger.error('SQL PROFILING 
DISABLED') + else: + cache.set('awx-profile-sql-threshold', threshold, timeout=minutes * 60) + logger.error('SQL QUERIES >={}s ENABLED FOR {} MINUTE(S)'.format(threshold, minutes)) + + +@task(queue=get_local_queuename) +def send_notifications(notification_list, job_id=None): + if not isinstance(notification_list, list): + raise TypeError("notification_list should be of type list") + if job_id is not None: + job_actual = UnifiedJob.objects.get(id=job_id) + + notifications = Notification.objects.filter(id__in=notification_list) + if job_id is not None: + job_actual.notifications.add(*notifications) + + for notification in notifications: + update_fields = ['status', 'notifications_sent'] + try: + sent = notification.notification_template.send(notification.subject, notification.body) + notification.status = "successful" + notification.notifications_sent = sent + if job_id is not None: + job_actual.log_lifecycle("notifications_sent") + except Exception as e: + logger.exception("Send Notification Failed {}".format(e)) + notification.status = "failed" + notification.error = smart_str(e) + update_fields.append('error') + finally: + try: + notification.save(update_fields=update_fields) + except Exception: + logger.exception('Error saving notification {} result.'.format(notification.id)) + + +@task(queue=get_local_queuename) +def gather_analytics(): + from awx.conf.models import Setting + from rest_framework.fields import DateTimeField + + last_gather = Setting.objects.filter(key='AUTOMATION_ANALYTICS_LAST_GATHER').first() + last_time = DateTimeField().to_internal_value(last_gather.value) if last_gather and last_gather.value else None + gather_time = now() + + if not last_time or ((gather_time - last_time).total_seconds() > settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL): + analytics.gather() + + +@task(queue=get_local_queuename) +def purge_old_stdout_files(): + nowtime = time.time() + for f in os.listdir(settings.JOBOUTPUT_ROOT): + if os.path.getctime(os.path.join(settings.JOBOUTPUT_ROOT, f)) < nowtime - settings.LOCAL_STDOUT_EXPIRE_TIME: + os.unlink(os.path.join(settings.JOBOUTPUT_ROOT, f)) + logger.debug("Removing {}".format(os.path.join(settings.JOBOUTPUT_ROOT, f))) + + +def _cleanup_images_and_files(**kwargs): + if settings.IS_K8S: + return + this_inst = Instance.objects.me() + runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs) + if runner_cleanup_kwargs: + stdout = '' + with StringIO() as buffer: + with redirect_stdout(buffer): + ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs) + stdout = buffer.getvalue() + if '(changed: True)' in stdout: + logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}') + + # if we are the first instance alphabetically, then run cleanup on execution nodes + checker_instance = Instance.objects.filter(node_type__in=['hybrid', 'control'], enabled=True, capacity__gt=0).order_by('-hostname').first() + if checker_instance and this_inst.hostname == checker_instance.hostname: + for inst in Instance.objects.filter(node_type='execution', enabled=True, capacity__gt=0): + runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs) + if not runner_cleanup_kwargs: + continue + try: + stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs) + if '(changed: True)' in stdout: + logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}') + except RuntimeError: + logger.exception(f'Error running cleanup on execution node {inst.hostname}') + + +@task(queue='tower_broadcast_all') +def 
handle_removed_image(remove_images=None): + """Special broadcast invocation of this method to handle case of deleted EE""" + _cleanup_images_and_files(remove_images=remove_images, file_pattern='') + + +@task(queue=get_local_queuename) +def cleanup_images_and_files(): + _cleanup_images_and_files() + + +@task(queue=get_local_queuename) +def cluster_node_health_check(node): + """ + Used for the health check endpoint, refreshes the status of the instance, but must be ran on target node + """ + if node == '': + logger.warn('Local health check incorrectly called with blank string') + return + elif node != settings.CLUSTER_HOST_ID: + logger.warn(f'Local health check for {node} incorrectly sent to {settings.CLUSTER_HOST_ID}') + return + try: + this_inst = Instance.objects.me() + except Instance.DoesNotExist: + logger.warn(f'Instance record for {node} missing, could not check capacity.') + return + this_inst.local_health_check() + + +@task(queue=get_local_queuename) +def execution_node_health_check(node): + if node == '': + logger.warn('Remote health check incorrectly called with blank string') + return + try: + instance = Instance.objects.get(hostname=node) + except Instance.DoesNotExist: + logger.warn(f'Instance record for {node} missing, could not check capacity.') + return + + if instance.node_type != 'execution': + raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}') + + data = worker_info(node) + + prior_capacity = instance.capacity + + instance.save_health_data( + version='ansible-runner-' + data.get('runner_version', '???'), + cpu=data.get('cpu_count', 0), + memory=data.get('mem_in_bytes', 0), + uuid=data.get('uuid'), + errors='\n'.join(data.get('errors', [])), + ) + + if data['errors']: + formatted_error = "\n".join(data["errors"]) + if prior_capacity: + logger.warn(f'Health check marking execution node {node} as lost, errors:\n{formatted_error}') + else: + logger.info(f'Failed to find capacity of new or lost execution node {node}, errors:\n{formatted_error}') + else: + logger.info('Set capacity of execution node {} to {}, worker info data:\n{}'.format(node, instance.capacity, json.dumps(data, indent=2))) + + return data + + +def inspect_execution_nodes(instance_list): + with advisory_lock('inspect_execution_nodes_lock', wait=False): + node_lookup = {inst.hostname: inst for inst in instance_list} + + ctl = get_receptor_ctl() + mesh_status = ctl.simple_command('status') + + nowtime = now() + workers = mesh_status['Advertisements'] + for ad in workers: + hostname = ad['NodeID'] + if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []): + continue + + changed = False + if hostname in node_lookup: + instance = node_lookup[hostname] + else: + logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}") + continue + + was_lost = instance.is_lost(ref_time=nowtime) + last_seen = parse_date(ad['Time']) + + if instance.last_seen and instance.last_seen >= last_seen: + continue + instance.last_seen = last_seen + instance.save(update_fields=['last_seen']) + + if changed: + execution_node_health_check.apply_async([hostname]) + elif was_lost: + # if the instance *was* lost, but has appeared again, + # attempt to re-establish the initial capacity and version + # check + logger.warn(f'Execution node attempting to rejoin as instance {hostname}.') + execution_node_health_check.apply_async([hostname]) + elif instance.capacity == 0 and instance.enabled: + # nodes with proven connection but need 
remediation run health checks are reduced frequency + if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS: + # Periodically re-run the health check of errored nodes, in case someone fixed it + # TODO: perhaps decrease the frequency of these checks + logger.debug(f'Restarting health check for execution node {hostname} with known errors.') + execution_node_health_check.apply_async([hostname]) + + +@task(queue=get_local_queuename) +def cluster_node_heartbeat(): + logger.debug("Cluster node heartbeat task.") + nowtime = now() + instance_list = list(Instance.objects.all()) + this_inst = None + lost_instances = [] + + for inst in instance_list: + if inst.hostname == settings.CLUSTER_HOST_ID: + this_inst = inst + instance_list.remove(inst) + break + else: + (changed, this_inst) = Instance.objects.get_or_register() + if changed: + logger.info("Registered tower control node '{}'".format(this_inst.hostname)) + + inspect_execution_nodes(instance_list) + + for inst in list(instance_list): + if inst.is_lost(ref_time=nowtime): + lost_instances.append(inst) + instance_list.remove(inst) + + if this_inst: + startup_event = this_inst.is_lost(ref_time=nowtime) + this_inst.local_health_check() + if startup_event and this_inst.capacity != 0: + logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname)) + return + else: + raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) + # IFF any node has a greater version than we do, then we'll shutdown services + for other_inst in instance_list: + if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution': + continue + if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG: + logger.error( + "Host {} reports version {}, but this node {} is at {}, shutting down".format( + other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version + ) + ) + # Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance. + # The heartbeat task will reset the capacity to the system capacity after upgrade. + stop_local_services(communicate=False) + raise RuntimeError("Shutting down.") + + for other_inst in lost_instances: + try: + reaper.reap(other_inst) + except Exception: + logger.exception('failed to reap jobs for {}'.format(other_inst.hostname)) + try: + # Capacity could already be 0 because: + # * It's a new node and it never had a heartbeat + # * It was set to 0 by another tower node running this method + # * It was set to 0 by this node, but auto deprovisioning is off + # + # If auto deprovisining is on, don't bother setting the capacity to 0 + # since we will delete the node anyway. 
+ if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES: + other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive')) + logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen)) + elif settings.AWX_AUTO_DEPROVISION_INSTANCES: + deprovision_hostname = other_inst.hostname + other_inst.delete() + logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname)) + except DatabaseError as e: + if 'did not affect any rows' in str(e): + logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname)) + else: + logger.exception('Error marking {} as lost'.format(other_inst.hostname)) + + +@task(queue=get_local_queuename) +def awx_receptor_workunit_reaper(): + """ + When an AWX job is launched via receptor, files such as status, stdin, and stdout are created + in a specific receptor directory. This directory on disk is a random 8 character string, e.g. qLL2JFNT + This is also called the work Unit ID in receptor, and is used in various receptor commands, + e.g. "work results qLL2JFNT" + After an AWX job executes, the receptor work unit directory is cleaned up by + issuing the work release command. In some cases the release process might fail, or + if AWX crashes during a job's execution, the work release command is never issued to begin with. + As such, this periodic task will obtain a list of all receptor work units, and find which ones + belong to AWX jobs that are in a completed state (status is canceled, error, or succeeded). + This task will call "work release" on each of these work units to clean up the files on disk. + + Note that when we call "work release" on a work unit that actually represents remote work + both the local and remote work units are cleaned up. + + Since we are cleaning up jobs that controller considers to be inactive, we take the added + precaution of calling "work cancel" in case the work unit is still active. 
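+
+    The "work list" reply is treated here as a dict keyed by work unit ID; a hypothetical entry
+    looks roughly like {"qLL2JFNT": {"StateName": "Succeeded", "ExtraData": {"RemoteWorkType": "ansible-runner"}}},
+    which is why this task simply iterates over the keys to collect candidate unit IDs.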
+ """ + if not settings.RECEPTOR_RELEASE_WORK: + return + logger.debug("Checking for unreleased receptor work units") + receptor_ctl = get_receptor_ctl() + receptor_work_list = receptor_ctl.simple_command("work list") + + unit_ids = [id for id in receptor_work_list] + jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES) + for job in jobs_with_unreleased_receptor_units: + logger.debug(f"{job.log_format} is not active, reaping receptor work unit {job.work_unit_id}") + receptor_ctl.simple_command(f"work cancel {job.work_unit_id}") + receptor_ctl.simple_command(f"work release {job.work_unit_id}") + + administrative_workunit_reaper(receptor_work_list) + + +@task(queue=get_local_queuename) +def awx_k8s_reaper(): + if not settings.RECEPTOR_RELEASE_WORK: + return + + from awx.main.scheduler.kubernetes import PodManager # prevent circular import + + for group in InstanceGroup.objects.filter(is_container_group=True).iterator(): + logger.debug("Checking for orphaned k8s pods for {}.".format(group)) + pods = PodManager.list_active_jobs(group) + for job in UnifiedJob.objects.filter(pk__in=pods.keys()).exclude(status__in=ACTIVE_STATES): + logger.debug('{} is no longer active, reaping orphaned k8s pod'.format(job.log_format)) + try: + pm = PodManager(job) + pm.kube_api.delete_namespaced_pod(name=pods[job.id], namespace=pm.namespace, _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT) + except Exception: + logger.exception("Failed to delete orphaned pod {} from {}".format(job.log_format, group)) + + +@task(queue=get_local_queuename) +def awx_periodic_scheduler(): + with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired: + if acquired is False: + logger.debug("Not running periodic scheduler, another task holds lock") + return + logger.debug("Starting periodic scheduler") + + run_now = now() + state = TowerScheduleState.get_solo() + last_run = state.schedule_last_run + logger.debug("Last scheduler run was: %s", last_run) + state.schedule_last_run = run_now + state.save() + + old_schedules = Schedule.objects.enabled().before(last_run) + for schedule in old_schedules: + schedule.update_computed_fields() + schedules = Schedule.objects.enabled().between(last_run, run_now) + + invalid_license = False + try: + access_registry[Job](None).check_license(quiet=True) + except PermissionDenied as e: + invalid_license = e + + for schedule in schedules: + template = schedule.unified_job_template + schedule.update_computed_fields() # To update next_run timestamp. 
+ if template.cache_timeout_blocked: + logger.warn("Cache timeout is in the future, bypassing schedule for template %s" % str(template.id)) + continue + try: + job_kwargs = schedule.get_job_kwargs() + new_unified_job = schedule.unified_job_template.create_unified_job(**job_kwargs) + logger.debug('Spawned {} from schedule {}-{}.'.format(new_unified_job.log_format, schedule.name, schedule.pk)) + + if invalid_license: + new_unified_job.status = 'failed' + new_unified_job.job_explanation = str(invalid_license) + new_unified_job.save(update_fields=['status', 'job_explanation']) + new_unified_job.websocket_emit_status("failed") + raise invalid_license + can_start = new_unified_job.signal_start() + except Exception: + logger.exception('Error spawning scheduled job.') + continue + if not can_start: + new_unified_job.status = 'failed' + new_unified_job.job_explanation = gettext_noop( + "Scheduled job could not start because it \ + was not in the right state or required manual credentials" + ) + new_unified_job.save(update_fields=['status', 'job_explanation']) + new_unified_job.websocket_emit_status("failed") + emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules")) + state.save() + + +@task(queue=get_local_queuename) +def handle_work_success(task_actual): + try: + instance = UnifiedJob.get_instance_by_type(task_actual['type'], task_actual['id']) + except ObjectDoesNotExist: + logger.warning('Missing {} `{}` in success callback.'.format(task_actual['type'], task_actual['id'])) + return + if not instance: + return + + schedule_task_manager() + + +@task(queue=get_local_queuename) +def handle_work_error(task_id, *args, **kwargs): + subtasks = kwargs.get('subtasks', None) + logger.debug('Executing error task id %s, subtasks: %s' % (task_id, str(subtasks))) + first_instance = None + first_instance_type = '' + if subtasks is not None: + for each_task in subtasks: + try: + instance = UnifiedJob.get_instance_by_type(each_task['type'], each_task['id']) + if not instance: + # Unknown task type + logger.warn("Unknown task type: {}".format(each_task['type'])) + continue + except ObjectDoesNotExist: + logger.warning('Missing {} `{}` in error callback.'.format(each_task['type'], each_task['id'])) + continue + + if first_instance is None: + first_instance = instance + first_instance_type = each_task['type'] + + if instance.celery_task_id != task_id and not instance.cancel_flag and not instance.status == 'successful': + instance.status = 'failed' + instance.failed = True + if not instance.job_explanation: + instance.job_explanation = 'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % ( + first_instance_type, + first_instance.name, + first_instance.id, + ) + instance.save() + instance.websocket_emit_status("failed") + + # We only send 1 job complete message since all the job completion message + # handling does is trigger the scheduler. If we extend the functionality of + # what the job complete message handler does then we may want to send a + # completion event for each job here. 
+ if first_instance: + schedule_task_manager() + pass + + +@task(queue=get_local_queuename) +def handle_success_and_failure_notifications(job_id): + uj = UnifiedJob.objects.get(pk=job_id) + retries = 0 + while retries < 5: + if uj.finished: + uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed') + return + else: + # wait a few seconds to avoid a race where the + # events are persisted _before_ the UJ.status + # changes from running -> successful + retries += 1 + time.sleep(1) + uj = UnifiedJob.objects.get(pk=job_id) + + logger.warn(f"Failed to even try to send notifications for job '{uj}' due to job not being in finished state.") + + +@task(queue=get_local_queuename) +def update_inventory_computed_fields(inventory_id): + """ + Signal handler and wrapper around inventory.update_computed_fields to + prevent unnecessary recursive calls. + """ + i = Inventory.objects.filter(id=inventory_id) + if not i.exists(): + logger.error("Update Inventory Computed Fields failed due to missing inventory: " + str(inventory_id)) + return + i = i[0] + try: + i.update_computed_fields() + except DatabaseError as e: + if 'did not affect any rows' in str(e): + logger.debug('Exiting duplicate update_inventory_computed_fields task.') + return + raise + + +def update_smart_memberships_for_inventory(smart_inventory): + current = set(SmartInventoryMembership.objects.filter(inventory=smart_inventory).values_list('host_id', flat=True)) + new = set(smart_inventory.hosts.values_list('id', flat=True)) + additions = new - current + removals = current - new + if additions or removals: + with transaction.atomic(): + if removals: + SmartInventoryMembership.objects.filter(inventory=smart_inventory, host_id__in=removals).delete() + if additions: + add_for_inventory = [SmartInventoryMembership(inventory_id=smart_inventory.id, host_id=host_id) for host_id in additions] + SmartInventoryMembership.objects.bulk_create(add_for_inventory, ignore_conflicts=True) + logger.debug( + 'Smart host membership cached for {}, {} additions, {} removals, {} total count.'.format( + smart_inventory.pk, len(additions), len(removals), len(new) + ) + ) + return True # changed + return False + + +@task(queue=get_local_queuename) +def update_host_smart_inventory_memberships(): + smart_inventories = Inventory.objects.filter(kind='smart', host_filter__isnull=False, pending_deletion=False) + changed_inventories = set([]) + for smart_inventory in smart_inventories: + try: + changed = update_smart_memberships_for_inventory(smart_inventory) + if changed: + changed_inventories.add(smart_inventory) + except IntegrityError: + logger.exception('Failed to update smart inventory memberships for {}'.format(smart_inventory.pk)) + # Update computed fields for changed inventories outside atomic action + for smart_inventory in changed_inventories: + smart_inventory.update_computed_fields() + + +@task(queue=get_local_queuename) +def delete_inventory(inventory_id, user_id, retries=5): + # Delete inventory as user + if user_id is None: + user = None + else: + try: + user = User.objects.get(id=user_id) + except Exception: + user = None + with ignore_inventory_computed_fields(), ignore_inventory_group_removal(), impersonate(user): + try: + i = Inventory.objects.get(id=inventory_id) + for host in i.hosts.iterator(): + host.job_events_as_primary_host.update(host=None) + i.delete() + emit_channel_notification('inventories-status_changed', {'group_name': 'inventories', 'inventory_id': inventory_id, 'status': 'deleted'}) + logger.debug('Deleted 
inventory {} as user {}.'.format(inventory_id, user_id)) + except Inventory.DoesNotExist: + logger.exception("Delete Inventory failed due to missing inventory: " + str(inventory_id)) + return + except DatabaseError: + logger.exception('Database error deleting inventory {}, but will retry.'.format(inventory_id)) + if retries > 0: + time.sleep(10) + delete_inventory(inventory_id, user_id, retries=retries - 1) + + +def with_path_cleanup(f): + @functools.wraps(f) + def _wrapped(self, *args, **kwargs): + try: + return f(self, *args, **kwargs) + finally: + for p in self.cleanup_paths: + try: + if os.path.isdir(p): + shutil.rmtree(p, ignore_errors=True) + elif os.path.exists(p): + os.remove(p) + except OSError: + logger.exception("Failed to remove tmp file: {}".format(p)) + self.cleanup_paths = [] + + return _wrapped + + +def _reconstruct_relationships(copy_mapping): + for old_obj, new_obj in copy_mapping.items(): + model = type(old_obj) + for field_name in getattr(model, 'FIELDS_TO_PRESERVE_AT_COPY', []): + field = model._meta.get_field(field_name) + if isinstance(field, ForeignKey): + if getattr(new_obj, field_name, None): + continue + related_obj = getattr(old_obj, field_name) + related_obj = copy_mapping.get(related_obj, related_obj) + setattr(new_obj, field_name, related_obj) + elif field.many_to_many: + for related_obj in getattr(old_obj, field_name).all(): + logger.debug('Deep copy: Adding {} to {}({}).{} relationship'.format(related_obj, new_obj, model, field_name)) + getattr(new_obj, field_name).add(copy_mapping.get(related_obj, related_obj)) + new_obj.save() + + +@task(queue=get_local_queuename) +def deep_copy_model_obj(model_module, model_name, obj_pk, new_obj_pk, user_pk, uuid, permission_check_func=None): + sub_obj_list = cache.get(uuid) + if sub_obj_list is None: + logger.error('Deep copy {} from {} to {} failed unexpectedly.'.format(model_name, obj_pk, new_obj_pk)) + return + + logger.debug('Deep copy {} from {} to {}.'.format(model_name, obj_pk, new_obj_pk)) + from awx.api.generics import CopyAPIView + from awx.main.signals import disable_activity_stream + + model = getattr(importlib.import_module(model_module), model_name, None) + if model is None: + return + try: + obj = model.objects.get(pk=obj_pk) + new_obj = model.objects.get(pk=new_obj_pk) + creater = User.objects.get(pk=user_pk) + except ObjectDoesNotExist: + logger.warning("Object or user no longer exists.") + return + with transaction.atomic(), ignore_inventory_computed_fields(), disable_activity_stream(): + copy_mapping = {} + for sub_obj_setup in sub_obj_list: + sub_model = getattr(importlib.import_module(sub_obj_setup[0]), sub_obj_setup[1], None) + if sub_model is None: + continue + try: + sub_obj = sub_model.objects.get(pk=sub_obj_setup[2]) + except ObjectDoesNotExist: + continue + copy_mapping.update(CopyAPIView.copy_model_obj(obj, new_obj, sub_model, sub_obj, creater)) + _reconstruct_relationships(copy_mapping) + if permission_check_func: + permission_check_func = getattr(getattr(importlib.import_module(permission_check_func[0]), permission_check_func[1]), permission_check_func[2]) + permission_check_func(creater, copy_mapping.values()) + if isinstance(new_obj, Inventory): + update_inventory_computed_fields.delay(new_obj.id) diff --git a/awx/main/tests/functional/api/test_instance.py b/awx/main/tests/functional/api/test_instance.py index c65cea0c01..4184d876aa 100644 --- a/awx/main/tests/functional/api/test_instance.py +++ b/awx/main/tests/functional/api/test_instance.py @@ -62,7 +62,7 @@ def 
test_health_check_throws_error(post, admin_user): # we will simulate a receptor error, similar to this one # https://github.com/ansible/receptor/blob/156e6e24a49fbf868734507f9943ac96208ed8f5/receptorctl/receptorctl/socket_interface.py#L204 # related to issue https://github.com/ansible/tower/issues/5315 - with mock.patch('awx.main.utils.receptor.run_until_complete', side_effect=RuntimeError('Remote error: foobar')): + with mock.patch('awx.main.tasks.receptor.run_until_complete', side_effect=RuntimeError('Remote error: foobar')): post(url=url, user=admin_user, expect=200) instance.refresh_from_db() assert 'Remote error: foobar' in instance.errors diff --git a/awx/main/tests/functional/task_management/test_container_groups.py b/awx/main/tests/functional/task_management/test_container_groups.py index 33a0f72392..9c565f16d0 100644 --- a/awx/main/tests/functional/task_management/test_container_groups.py +++ b/awx/main/tests/functional/task_management/test_container_groups.py @@ -5,7 +5,7 @@ from collections import namedtuple from unittest import mock # noqa import pytest -from awx.main.tasks import AWXReceptorJob +from awx.main.tasks.receptor import AWXReceptorJob from awx.main.utils import ( create_temporary_fifo, ) diff --git a/awx/main/tests/functional/task_management/test_rampart_groups.py b/awx/main/tests/functional/task_management/test_rampart_groups.py index 26b19399d4..db39d30e22 100644 --- a/awx/main/tests/functional/task_management/test_rampart_groups.py +++ b/awx/main/tests/functional/task_management/test_rampart_groups.py @@ -3,7 +3,7 @@ from unittest import mock from datetime import timedelta from awx.main.scheduler import TaskManager from awx.main.models import InstanceGroup, WorkflowJob -from awx.main.tasks import apply_cluster_membership_policies +from awx.main.tasks.system import apply_cluster_membership_policies @pytest.mark.django_db diff --git a/awx/main/tests/functional/test_copy.py b/awx/main/tests/functional/test_copy.py index 86334efd34..41f635dde9 100644 --- a/awx/main/tests/functional/test_copy.py +++ b/awx/main/tests/functional/test_copy.py @@ -5,7 +5,7 @@ from awx.api.versioning import reverse from awx.main.utils import decrypt_field from awx.main.models.workflow import WorkflowJobTemplate, WorkflowJobTemplateNode, WorkflowApprovalTemplate from awx.main.models.jobs import JobTemplate -from awx.main.tasks import deep_copy_model_obj +from awx.main.tasks.system import deep_copy_model_obj @pytest.mark.django_db diff --git a/awx/main/tests/functional/test_instances.py b/awx/main/tests/functional/test_instances.py index 95bbd15014..b669423b9e 100644 --- a/awx/main/tests/functional/test_instances.py +++ b/awx/main/tests/functional/test_instances.py @@ -4,7 +4,7 @@ from unittest import mock from awx.main.models import AdHocCommand, InventoryUpdate, JobTemplate, ProjectUpdate from awx.main.models.activity_stream import ActivityStream from awx.main.models.ha import Instance, InstanceGroup -from awx.main.tasks import apply_cluster_membership_policies +from awx.main.tasks.system import apply_cluster_membership_policies from awx.api.versioning import reverse from django.utils.timezone import now diff --git a/awx/main/tests/functional/test_inventory_source_injectors.py b/awx/main/tests/functional/test_inventory_source_injectors.py index aa84534c16..0d4247feb3 100644 --- a/awx/main/tests/functional/test_inventory_source_injectors.py +++ b/awx/main/tests/functional/test_inventory_source_injectors.py @@ -5,7 +5,7 @@ import json import re from collections import namedtuple -from 
awx.main.tasks import RunInventoryUpdate +from awx.main.tasks.jobs import RunInventoryUpdate from awx.main.models import InventorySource, Credential, CredentialType, UnifiedJob, ExecutionEnvironment from awx.main.constants import CLOUD_PROVIDERS, STANDARD_INVENTORY_UPDATE_ENV from awx.main.tests import data @@ -257,6 +257,6 @@ def test_inventory_update_injected_content(this_kind, inventory, fake_credential # Also do not send websocket status updates with mock.patch.object(UnifiedJob, 'websocket_emit_status', mock.Mock()): # The point of this test is that we replace run with assertions - with mock.patch('awx.main.tasks.AWXReceptorJob.run', substitute_run): + with mock.patch('awx.main.tasks.receptor.AWXReceptorJob.run', substitute_run): # so this sets up everything for a run and then yields control over to substitute_run task.run(inventory_update.pk) diff --git a/awx/main/tests/functional/test_jobs.py b/awx/main/tests/functional/test_jobs.py index 7d97aa0b9b..a6626ce9c6 100644 --- a/awx/main/tests/functional/test_jobs.py +++ b/awx/main/tests/functional/test_jobs.py @@ -4,7 +4,7 @@ from unittest import mock import json from awx.main.models import Job, Instance, JobHostSummary, InventoryUpdate, InventorySource, Project, ProjectUpdate, SystemJob, AdHocCommand -from awx.main.tasks import cluster_node_heartbeat +from awx.main.tasks.system import cluster_node_heartbeat from django.test.utils import override_settings @@ -20,7 +20,7 @@ def test_orphan_unified_job_creation(instance, inventory): @pytest.mark.django_db -@mock.patch('awx.main.tasks.inspect_execution_nodes', lambda *args, **kwargs: None) +@mock.patch('awx.main.tasks.system.inspect_execution_nodes', lambda *args, **kwargs: None) @mock.patch('awx.main.models.ha.get_cpu_effective_capacity', lambda cpu: 8) @mock.patch('awx.main.models.ha.get_mem_effective_capacity', lambda mem: 62) def test_job_capacity_and_with_inactive_node(): diff --git a/awx/main/tests/functional/test_tasks.py b/awx/main/tests/functional/test_tasks.py index 9edf152924..951767d08e 100644 --- a/awx/main/tests/functional/test_tasks.py +++ b/awx/main/tests/functional/test_tasks.py @@ -2,7 +2,8 @@ import pytest from unittest import mock import os -from awx.main.tasks import RunProjectUpdate, RunInventoryUpdate, execution_node_health_check +from awx.main.tasks.jobs import RunProjectUpdate, RunInventoryUpdate +from awx.main.tasks.system import execution_node_health_check from awx.main.models import ProjectUpdate, InventoryUpdate, InventorySource, Instance @@ -49,7 +50,7 @@ class TestDependentInventoryUpdate: scm_inventory_source.scm_last_revision = '' proj_update = ProjectUpdate.objects.create(project=scm_inventory_source.source_project) with mock.patch.object(RunInventoryUpdate, 'run') as iu_run_mock: - with mock.patch('awx.main.tasks.create_partition'): + with mock.patch('awx.main.tasks.jobs.create_partition'): task._update_dependent_inventories(proj_update, [scm_inventory_source]) assert InventoryUpdate.objects.count() == 1 inv_update = InventoryUpdate.objects.first() @@ -73,7 +74,7 @@ class TestDependentInventoryUpdate: ProjectUpdate.objects.all().update(cancel_flag=True) with mock.patch.object(RunInventoryUpdate, 'run') as iu_run_mock: - with mock.patch('awx.main.tasks.create_partition'): + with mock.patch('awx.main.tasks.jobs.create_partition'): iu_run_mock.side_effect = user_cancels_project task._update_dependent_inventories(proj_update, [is1, is2]) # Verify that it bails after 1st update, detecting a cancel diff --git a/awx/main/tests/unit/settings/test_defaults.py 
b/awx/main/tests/unit/settings/test_defaults.py index b7d23a3b3e..a7f5eeeca8 100644 --- a/awx/main/tests/unit/settings/test_defaults.py +++ b/awx/main/tests/unit/settings/test_defaults.py @@ -7,7 +7,7 @@ from datetime import timedelta @pytest.mark.parametrize( "job_name,function_path", [ - ('tower_scheduler', 'awx.main.tasks.awx_periodic_scheduler'), + ('tower_scheduler', 'awx.main.tasks.system.awx_periodic_scheduler'), ], ) def test_CELERYBEAT_SCHEDULE(mocker, job_name, function_path): diff --git a/awx/main/tests/unit/test_tasks.py b/awx/main/tests/unit/test_tasks.py index 15aeb86504..76ec2ec337 100644 --- a/awx/main/tests/unit/test_tasks.py +++ b/awx/main/tests/unit/test_tasks.py @@ -32,7 +32,7 @@ from awx.main.models import ( User, build_safe_env, ) -from awx.main.models.credential import ManagedCredentialType +from awx.main.models.credential import HIDDEN_PASSWORD, ManagedCredentialType from awx.main import tasks from awx.main.utils import encrypt_field, encrypt_value @@ -113,12 +113,12 @@ def adhoc_update_model_wrapper(adhoc_job): def test_send_notifications_not_list(): with pytest.raises(TypeError): - tasks.send_notifications(None) + tasks.system.send_notifications(None) def test_send_notifications_job_id(mocker): with mocker.patch('awx.main.models.UnifiedJob.objects.get'): - tasks.send_notifications([], job_id=1) + tasks.system.send_notifications([], job_id=1) assert UnifiedJob.objects.get.called assert UnifiedJob.objects.get.called_with(id=1) @@ -127,7 +127,7 @@ def test_work_success_callback_missing_job(): task_data = {'type': 'project_update', 'id': 9999} with mock.patch('django.db.models.query.QuerySet.get') as get_mock: get_mock.side_effect = ProjectUpdate.DoesNotExist() - assert tasks.handle_work_success(task_data) is None + assert tasks.system.handle_work_success(task_data) is None @mock.patch('awx.main.models.UnifiedJob.objects.get') @@ -138,7 +138,7 @@ def test_send_notifications_list(mock_notifications_filter, mock_job_get, mocker mock_notifications = [mocker.MagicMock(spec=Notification, subject="test", body={'hello': 'world'})] mock_notifications_filter.return_value = mock_notifications - tasks.send_notifications([1, 2], job_id=1) + tasks.system.send_notifications([1, 2], job_id=1) assert Notification.objects.filter.call_count == 1 assert mock_notifications[0].status == "successful" assert mock_notifications[0].save.called @@ -158,7 +158,7 @@ def test_send_notifications_list(mock_notifications_filter, mock_job_get, mocker ], ) def test_safe_env_filtering(key, value): - assert build_safe_env({key: value})[key] == tasks.HIDDEN_PASSWORD + assert build_safe_env({key: value})[key] == HIDDEN_PASSWORD def test_safe_env_returns_new_copy(): @@ -168,7 +168,7 @@ def test_safe_env_returns_new_copy(): @pytest.mark.parametrize("source,expected", [(None, True), (False, False), (True, True)]) def test_openstack_client_config_generation(mocker, source, expected, private_data_dir): - update = tasks.RunInventoryUpdate() + update = tasks.jobs.RunInventoryUpdate() credential_type = CredentialType.defaults['openstack']() inputs = { 'host': 'https://keystone.openstack.example.org', @@ -208,7 +208,7 @@ def test_openstack_client_config_generation(mocker, source, expected, private_da @pytest.mark.parametrize("source,expected", [(None, True), (False, False), (True, True)]) def test_openstack_client_config_generation_with_project_domain_name(mocker, source, expected, private_data_dir): - update = tasks.RunInventoryUpdate() + update = tasks.jobs.RunInventoryUpdate() credential_type = 
CredentialType.defaults['openstack']() inputs = { 'host': 'https://keystone.openstack.example.org', @@ -250,7 +250,7 @@ def test_openstack_client_config_generation_with_project_domain_name(mocker, sou @pytest.mark.parametrize("source,expected", [(None, True), (False, False), (True, True)]) def test_openstack_client_config_generation_with_region(mocker, source, expected, private_data_dir): - update = tasks.RunInventoryUpdate() + update = tasks.jobs.RunInventoryUpdate() credential_type = CredentialType.defaults['openstack']() inputs = { 'host': 'https://keystone.openstack.example.org', @@ -294,7 +294,7 @@ def test_openstack_client_config_generation_with_region(mocker, source, expected @pytest.mark.parametrize("source,expected", [(False, False), (True, True)]) def test_openstack_client_config_generation_with_private_source_vars(mocker, source, expected, private_data_dir): - update = tasks.RunInventoryUpdate() + update = tasks.jobs.RunInventoryUpdate() credential_type = CredentialType.defaults['openstack']() inputs = { 'host': 'https://keystone.openstack.example.org', @@ -357,7 +357,7 @@ class TestExtraVarSanitation(TestJobExecution): job.created_by = User(pk=123, username='angry-spud') job.inventory = Inventory(pk=123, name='example-inv') - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.build_extra_vars_file(job, private_data_dir) fd = open(os.path.join(private_data_dir, 'env', 'extravars')) @@ -393,7 +393,7 @@ class TestExtraVarSanitation(TestJobExecution): def test_launchtime_vars_unsafe(self, job, private_data_dir): job.extra_vars = json.dumps({'msg': self.UNSAFE}) - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.build_extra_vars_file(job, private_data_dir) @@ -404,7 +404,7 @@ class TestExtraVarSanitation(TestJobExecution): def test_nested_launchtime_vars_unsafe(self, job, private_data_dir): job.extra_vars = json.dumps({'msg': {'a': [self.UNSAFE]}}) - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.build_extra_vars_file(job, private_data_dir) @@ -415,7 +415,7 @@ class TestExtraVarSanitation(TestJobExecution): def test_allowed_jt_extra_vars(self, job, private_data_dir): job.job_template.extra_vars = job.extra_vars = json.dumps({'msg': self.UNSAFE}) - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.build_extra_vars_file(job, private_data_dir) @@ -427,7 +427,7 @@ class TestExtraVarSanitation(TestJobExecution): def test_nested_allowed_vars(self, job, private_data_dir): job.extra_vars = json.dumps({'msg': {'a': {'b': [self.UNSAFE]}}}) job.job_template.extra_vars = job.extra_vars - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.build_extra_vars_file(job, private_data_dir) @@ -441,7 +441,7 @@ class TestExtraVarSanitation(TestJobExecution): # `other_var=SENSITIVE` job.job_template.extra_vars = json.dumps({'msg': self.UNSAFE}) job.extra_vars = json.dumps({'msg': 'other-value', 'other_var': self.UNSAFE}) - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.build_extra_vars_file(job, private_data_dir) @@ -456,7 +456,7 @@ class TestExtraVarSanitation(TestJobExecution): def test_overwritten_jt_extra_vars(self, job, private_data_dir): job.job_template.extra_vars = json.dumps({'msg': 'SAFE'}) job.extra_vars = json.dumps({'msg': self.UNSAFE}) - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.build_extra_vars_file(job, private_data_dir) @@ -472,13 +472,13 @@ class TestGenericRun: job.websocket_emit_status = mock.Mock() job.execution_environment = execution_environment - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job 
task.update_model = mock.Mock(return_value=job) task.model.objects.get = mock.Mock(return_value=job) task.build_private_data_files = mock.Mock(side_effect=OSError()) - with mock.patch('awx.main.tasks.copy_tree'): + with mock.patch('awx.main.tasks.jobs.copy_tree'): with pytest.raises(Exception): task.run(1) @@ -494,13 +494,13 @@ class TestGenericRun: job.send_notification_templates = mock.Mock() job.execution_environment = execution_environment - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job task.update_model = mock.Mock(wraps=update_model_wrapper) task.model.objects.get = mock.Mock(return_value=job) task.build_private_data_files = mock.Mock() - with mock.patch('awx.main.tasks.copy_tree'): + with mock.patch('awx.main.tasks.jobs.copy_tree'): with pytest.raises(Exception): task.run(1) @@ -508,7 +508,7 @@ class TestGenericRun: assert c in task.update_model.call_args_list def test_event_count(self): - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.dispatcher = mock.MagicMock() task.instance = Job() task.event_ct = 0 @@ -518,7 +518,7 @@ class TestGenericRun: assert 20 == task.event_ct def test_finished_callback_eof(self): - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.dispatcher = mock.MagicMock() task.instance = Job(pk=1, id=1) task.event_ct = 17 @@ -529,7 +529,7 @@ class TestGenericRun: class MockMe: pass - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job task.safe_env = {'secret_key': 'redacted_value'} task.update_model = mock.Mock(wraps=update_model_wrapper) @@ -546,7 +546,7 @@ class TestGenericRun: def test_created_by_extra_vars(self): job = Job(created_by=User(pk=123, username='angry-spud')) - task = tasks.RunJob() + task = tasks.jobs.RunJob() task._write_extra_vars_file = mock.Mock() task.build_extra_vars_file(job, None) @@ -563,7 +563,7 @@ class TestGenericRun: job.extra_vars = json.dumps({'super_secret': encrypt_value('CLASSIFIED', pk=None)}) job.survey_passwords = {'super_secret': '$encrypted$'} - task = tasks.RunJob() + task = tasks.jobs.RunJob() task._write_extra_vars_file = mock.Mock() task.build_extra_vars_file(job, None) @@ -576,11 +576,11 @@ class TestGenericRun: job = Job(project=Project(), inventory=Inventory()) job.execution_environment = execution_environment - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job task._write_extra_vars_file = mock.Mock() - with mock.patch('awx.main.tasks.settings.AWX_TASK_ENV', {'FOO': 'BAR'}): + with mock.patch('awx.main.tasks.jobs.settings.AWX_TASK_ENV', {'FOO': 'BAR'}): env = task.build_env(job, private_data_dir) assert env['FOO'] == 'BAR' @@ -595,7 +595,7 @@ class TestAdhocRun(TestJobExecution): adhoc_job.websocket_emit_status = mock.Mock() adhoc_job.send_notification_templates = mock.Mock() - task = tasks.RunAdHocCommand() + task = tasks.jobs.RunAdHocCommand() task.update_model = mock.Mock(wraps=adhoc_update_model_wrapper) task.model.objects.get = mock.Mock(return_value=adhoc_job) task.build_inventory = mock.Mock() @@ -619,7 +619,7 @@ class TestAdhocRun(TestJobExecution): }) #adhoc_job.websocket_emit_status = mock.Mock() - task = tasks.RunAdHocCommand() + task = tasks.jobs.RunAdHocCommand() #task.update_model = mock.Mock(wraps=adhoc_update_model_wrapper) #task.build_inventory = mock.Mock(return_value='/tmp/something.inventory') task._write_extra_vars_file = mock.Mock() @@ -634,7 +634,7 @@ class TestAdhocRun(TestJobExecution): def test_created_by_extra_vars(self): adhoc_job = AdHocCommand(created_by=User(pk=123, username='angry-spud')) - task = 
tasks.RunAdHocCommand() + task = tasks.jobs.RunAdHocCommand() task._write_extra_vars_file = mock.Mock() task.build_extra_vars_file(adhoc_job, None) @@ -693,7 +693,7 @@ class TestJobCredentials(TestJobExecution): } def test_username_jinja_usage(self, job, private_data_dir): - task = tasks.RunJob() + task = tasks.jobs.RunJob() ssh = CredentialType.defaults['ssh']() credential = Credential(pk=1, credential_type=ssh, inputs={'username': '{{ ansible_ssh_pass }}'}) job.credentials.add(credential) @@ -704,7 +704,7 @@ class TestJobCredentials(TestJobExecution): @pytest.mark.parametrize("flag", ['become_username', 'become_method']) def test_become_jinja_usage(self, job, private_data_dir, flag): - task = tasks.RunJob() + task = tasks.jobs.RunJob() ssh = CredentialType.defaults['ssh']() credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'joe', flag: '{{ ansible_ssh_pass }}'}) job.credentials.add(credential) @@ -715,7 +715,7 @@ class TestJobCredentials(TestJobExecution): assert 'Jinja variables are not allowed' in str(e.value) def test_ssh_passwords(self, job, private_data_dir, field, password_name, expected_flag): - task = tasks.RunJob() + task = tasks.jobs.RunJob() ssh = CredentialType.defaults['ssh']() credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'bob', field: 'secret'}) credential.inputs[field] = encrypt_field(credential, field) @@ -732,7 +732,7 @@ class TestJobCredentials(TestJobExecution): assert expected_flag in ' '.join(args) def test_net_ssh_key_unlock(self, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() net = CredentialType.defaults['net']() credential = Credential(pk=1, credential_type=net, inputs={'ssh_key_unlock': 'secret'}) credential.inputs['ssh_key_unlock'] = encrypt_field(credential, 'ssh_key_unlock') @@ -745,7 +745,7 @@ class TestJobCredentials(TestJobExecution): assert 'secret' in expect_passwords.values() def test_net_first_ssh_key_unlock_wins(self, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() for i in range(3): net = CredentialType.defaults['net']() credential = Credential(pk=i, credential_type=net, inputs={'ssh_key_unlock': 'secret{}'.format(i)}) @@ -759,7 +759,7 @@ class TestJobCredentials(TestJobExecution): assert 'secret0' in expect_passwords.values() def test_prefer_ssh_over_net_ssh_key_unlock(self, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() net = CredentialType.defaults['net']() net_credential = Credential(pk=1, credential_type=net, inputs={'ssh_key_unlock': 'net_secret'}) net_credential.inputs['ssh_key_unlock'] = encrypt_field(net_credential, 'ssh_key_unlock') @@ -778,7 +778,7 @@ class TestJobCredentials(TestJobExecution): assert 'ssh_secret' in expect_passwords.values() def test_vault_password(self, private_data_dir, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() vault = CredentialType.defaults['vault']() credential = Credential(pk=1, credential_type=vault, inputs={'vault_password': 'vault-me'}) credential.inputs['vault_password'] = encrypt_field(credential, 'vault_password') @@ -793,7 +793,7 @@ class TestJobCredentials(TestJobExecution): assert '--ask-vault-pass' in ' '.join(args) def test_vault_password_ask(self, private_data_dir, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() vault = CredentialType.defaults['vault']() credential = Credential(pk=1, credential_type=vault, inputs={'vault_password': 'ASK'}) credential.inputs['vault_password'] = encrypt_field(credential, 'vault_password') @@ -808,7 +808,7 @@ class TestJobCredentials(TestJobExecution): assert 
'--ask-vault-pass' in ' '.join(args) def test_multi_vault_password(self, private_data_dir, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() vault = CredentialType.defaults['vault']() for i, label in enumerate(['dev', 'prod', 'dotted.name']): credential = Credential(pk=i, credential_type=vault, inputs={'vault_password': 'pass@{}'.format(label), 'vault_id': label}) @@ -831,7 +831,7 @@ class TestJobCredentials(TestJobExecution): assert '--vault-id dotted.name@prompt' in ' '.join(args) def test_multi_vault_id_conflict(self, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() vault = CredentialType.defaults['vault']() for i in range(2): credential = Credential(pk=i, credential_type=vault, inputs={'vault_password': 'some-pass', 'vault_id': 'conflict'}) @@ -844,7 +844,7 @@ class TestJobCredentials(TestJobExecution): assert 'multiple vault credentials were specified with --vault-id' in str(e.value) def test_multi_vault_password_ask(self, private_data_dir, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() vault = CredentialType.defaults['vault']() for i, label in enumerate(['dev', 'prod']): credential = Credential(pk=i, credential_type=vault, inputs={'vault_password': 'ASK', 'vault_id': label}) @@ -897,7 +897,7 @@ class TestJobCredentials(TestJobExecution): assert env['K8S_AUTH_VERIFY_SSL'] == 'False' assert 'K8S_AUTH_SSL_CA_CERT' not in env - assert safe_env['K8S_AUTH_API_KEY'] == tasks.HIDDEN_PASSWORD + assert safe_env['K8S_AUTH_API_KEY'] == HIDDEN_PASSWORD def test_aws_cloud_credential(self, job, private_data_dir): aws = CredentialType.defaults['aws']() @@ -912,7 +912,7 @@ class TestJobCredentials(TestJobExecution): assert env['AWS_ACCESS_KEY_ID'] == 'bob' assert env['AWS_SECRET_ACCESS_KEY'] == 'secret' assert 'AWS_SECURITY_TOKEN' not in env - assert safe_env['AWS_SECRET_ACCESS_KEY'] == tasks.HIDDEN_PASSWORD + assert safe_env['AWS_SECRET_ACCESS_KEY'] == HIDDEN_PASSWORD def test_aws_cloud_credential_with_sts_token(self, private_data_dir, job): aws = CredentialType.defaults['aws']() @@ -928,7 +928,7 @@ class TestJobCredentials(TestJobExecution): assert env['AWS_ACCESS_KEY_ID'] == 'bob' assert env['AWS_SECRET_ACCESS_KEY'] == 'secret' assert env['AWS_SECURITY_TOKEN'] == 'token' - assert safe_env['AWS_SECRET_ACCESS_KEY'] == tasks.HIDDEN_PASSWORD + assert safe_env['AWS_SECRET_ACCESS_KEY'] == HIDDEN_PASSWORD def test_gce_credentials(self, private_data_dir, job): gce = CredentialType.defaults['gce']() @@ -963,7 +963,7 @@ class TestJobCredentials(TestJobExecution): assert env['AZURE_SECRET'] == 'some-secret' assert env['AZURE_TENANT'] == 'some-tenant' assert env['AZURE_SUBSCRIPTION_ID'] == 'some-subscription' - assert safe_env['AZURE_SECRET'] == tasks.HIDDEN_PASSWORD + assert safe_env['AZURE_SECRET'] == HIDDEN_PASSWORD def test_azure_rm_with_password(self, private_data_dir, job): azure = CredentialType.defaults['azure_rm']() @@ -981,7 +981,7 @@ class TestJobCredentials(TestJobExecution): assert env['AZURE_AD_USER'] == 'bob' assert env['AZURE_PASSWORD'] == 'secret' assert env['AZURE_CLOUD_ENVIRONMENT'] == 'foobar' - assert safe_env['AZURE_PASSWORD'] == tasks.HIDDEN_PASSWORD + assert safe_env['AZURE_PASSWORD'] == HIDDEN_PASSWORD def test_vmware_credentials(self, private_data_dir, job): vmware = CredentialType.defaults['vmware']() @@ -996,10 +996,10 @@ class TestJobCredentials(TestJobExecution): assert env['VMWARE_USER'] == 'bob' assert env['VMWARE_PASSWORD'] == 'secret' assert env['VMWARE_HOST'] == 'https://example.org' - assert safe_env['VMWARE_PASSWORD'] == tasks.HIDDEN_PASSWORD 
+ assert safe_env['VMWARE_PASSWORD'] == HIDDEN_PASSWORD def test_openstack_credentials(self, private_data_dir, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job openstack = CredentialType.defaults['openstack']() credential = Credential( @@ -1067,7 +1067,7 @@ class TestJobCredentials(TestJobExecution): ], ) def test_net_credentials(self, authorize, expected_authorize, job, private_data_dir): - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job net = CredentialType.defaults['net']() inputs = {'username': 'bob', 'password': 'secret', 'ssh_key_data': self.EXAMPLE_PRIVATE_KEY, 'authorize_password': 'authorizeme'} @@ -1089,7 +1089,7 @@ class TestJobCredentials(TestJobExecution): if authorize: assert env['ANSIBLE_NET_AUTH_PASS'] == 'authorizeme' assert open(env['ANSIBLE_NET_SSH_KEYFILE'], 'r').read() == self.EXAMPLE_PRIVATE_KEY - assert safe_env['ANSIBLE_NET_PASSWORD'] == tasks.HIDDEN_PASSWORD + assert safe_env['ANSIBLE_NET_PASSWORD'] == HIDDEN_PASSWORD def test_custom_environment_injectors_with_jinja_syntax_error(self, private_data_dir): some_cloud = CredentialType( @@ -1135,7 +1135,7 @@ class TestJobCredentials(TestJobExecution): assert env['TURBO_BUTTON'] == str(True) def test_custom_environment_injectors_with_reserved_env_var(self, private_data_dir, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job some_cloud = CredentialType( kind='cloud', @@ -1168,10 +1168,10 @@ class TestJobCredentials(TestJobExecution): assert env['MY_CLOUD_PRIVATE_VAR'] == 'SUPER-SECRET-123' assert 'SUPER-SECRET-123' not in safe_env.values() - assert safe_env['MY_CLOUD_PRIVATE_VAR'] == tasks.HIDDEN_PASSWORD + assert safe_env['MY_CLOUD_PRIVATE_VAR'] == HIDDEN_PASSWORD def test_custom_environment_injectors_with_extra_vars(self, private_data_dir, job): - task = tasks.RunJob() + task = tasks.jobs.RunJob() some_cloud = CredentialType( kind='cloud', name='SomeCloud', @@ -1190,7 +1190,7 @@ class TestJobCredentials(TestJobExecution): assert hasattr(extra_vars["api_token"], '__UNSAFE__') def test_custom_environment_injectors_with_boolean_extra_vars(self, job, private_data_dir): - task = tasks.RunJob() + task = tasks.jobs.RunJob() some_cloud = CredentialType( kind='cloud', name='SomeCloud', @@ -1209,7 +1209,7 @@ class TestJobCredentials(TestJobExecution): return ['successful', 0] def test_custom_environment_injectors_with_complicated_boolean_template(self, job, private_data_dir): - task = tasks.RunJob() + task = tasks.jobs.RunJob() some_cloud = CredentialType( kind='cloud', name='SomeCloud', @@ -1230,7 +1230,7 @@ class TestJobCredentials(TestJobExecution): """ extra_vars that contain secret field values should be censored in the DB """ - task = tasks.RunJob() + task = tasks.jobs.RunJob() some_cloud = CredentialType( kind='cloud', name='SomeCloud', @@ -1331,11 +1331,11 @@ class TestJobCredentials(TestJobExecution): assert json_data['client_email'] == 'bob' assert json_data['project_id'] == 'some-project' - assert safe_env['AZURE_PASSWORD'] == tasks.HIDDEN_PASSWORD + assert safe_env['AZURE_PASSWORD'] == HIDDEN_PASSWORD def test_awx_task_env(self, settings, private_data_dir, job): settings.AWX_TASK_ENV = {'FOO': 'BAR'} - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job env = task.build_env(job, private_data_dir) @@ -1362,7 +1362,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution): def test_galaxy_credentials_ignore_certs(self, private_data_dir, project_update, ignore): settings.GALAXY_IGNORE_CERTS = ignore - task = 
tasks.RunProjectUpdate() + task = tasks.jobs.RunProjectUpdate() task.instance = project_update env = task.build_env(project_update, private_data_dir) if ignore: @@ -1371,7 +1371,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution): assert 'ANSIBLE_GALAXY_IGNORE' not in env def test_galaxy_credentials_empty(self, private_data_dir, project_update): - class RunProjectUpdate(tasks.RunProjectUpdate): + class RunProjectUpdate(tasks.jobs.RunProjectUpdate): __vars__ = {} def _write_extra_vars_file(self, private_data_dir, extra_vars, *kw): @@ -1390,7 +1390,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution): assert not k.startswith('ANSIBLE_GALAXY_SERVER') def test_single_public_galaxy(self, private_data_dir, project_update): - class RunProjectUpdate(tasks.RunProjectUpdate): + class RunProjectUpdate(tasks.jobs.RunProjectUpdate): __vars__ = {} def _write_extra_vars_file(self, private_data_dir, extra_vars, *kw): @@ -1439,7 +1439,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution): ) project_update.project.organization.galaxy_credentials.add(public_galaxy) project_update.project.organization.galaxy_credentials.add(rh) - task = tasks.RunProjectUpdate() + task = tasks.jobs.RunProjectUpdate() task.instance = project_update env = task.build_env(project_update, private_data_dir) assert sorted([(k, v) for k, v in env.items() if k.startswith('ANSIBLE_GALAXY')]) == [ @@ -1481,7 +1481,7 @@ class TestProjectUpdateCredentials(TestJobExecution): } def test_username_and_password_auth(self, project_update, scm_type): - task = tasks.RunProjectUpdate() + task = tasks.jobs.RunProjectUpdate() ssh = CredentialType.defaults['ssh']() project_update.scm_type = scm_type project_update.credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'bob', 'password': 'secret'}) @@ -1495,7 +1495,7 @@ class TestProjectUpdateCredentials(TestJobExecution): assert 'secret' in expect_passwords.values() def test_ssh_key_auth(self, project_update, scm_type): - task = tasks.RunProjectUpdate() + task = tasks.jobs.RunProjectUpdate() ssh = CredentialType.defaults['ssh']() project_update.scm_type = scm_type project_update.credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'bob', 'ssh_key_data': self.EXAMPLE_PRIVATE_KEY}) @@ -1509,7 +1509,7 @@ class TestProjectUpdateCredentials(TestJobExecution): def test_awx_task_env(self, project_update, settings, private_data_dir, scm_type, execution_environment): project_update.execution_environment = execution_environment settings.AWX_TASK_ENV = {'FOO': 'BAR'} - task = tasks.RunProjectUpdate() + task = tasks.jobs.RunProjectUpdate() task.instance = project_update project_update.scm_type = scm_type @@ -1524,7 +1524,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): return InventoryUpdate(pk=1, execution_environment=execution_environment, inventory_source=InventorySource(pk=1, inventory=Inventory(pk=1))) def test_source_without_credential(self, mocker, inventory_update, private_data_dir): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update inventory_update.source = 'ec2' inventory_update.get_cloud_credential = mocker.Mock(return_value=None) @@ -1537,7 +1537,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert 'AWS_SECRET_ACCESS_KEY' not in env def test_ec2_source(self, private_data_dir, inventory_update, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update aws = CredentialType.defaults['aws']() 
inventory_update.source = 'ec2' @@ -1558,10 +1558,10 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert env['AWS_ACCESS_KEY_ID'] == 'bob' assert env['AWS_SECRET_ACCESS_KEY'] == 'secret' - assert safe_env['AWS_SECRET_ACCESS_KEY'] == tasks.HIDDEN_PASSWORD + assert safe_env['AWS_SECRET_ACCESS_KEY'] == HIDDEN_PASSWORD def test_vmware_source(self, inventory_update, private_data_dir, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update vmware = CredentialType.defaults['vmware']() inventory_update.source = 'vmware' @@ -1589,7 +1589,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): env["VMWARE_VALIDATE_CERTS"] == "False", def test_azure_rm_source_with_tenant(self, private_data_dir, inventory_update, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update azure_rm = CredentialType.defaults['azure_rm']() inventory_update.source = 'azure_rm' @@ -1622,10 +1622,10 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert env['AZURE_SUBSCRIPTION_ID'] == 'some-subscription' assert env['AZURE_CLOUD_ENVIRONMENT'] == 'foobar' - assert safe_env['AZURE_SECRET'] == tasks.HIDDEN_PASSWORD + assert safe_env['AZURE_SECRET'] == HIDDEN_PASSWORD def test_azure_rm_source_with_password(self, private_data_dir, inventory_update, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update azure_rm = CredentialType.defaults['azure_rm']() inventory_update.source = 'azure_rm' @@ -1651,10 +1651,10 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert env['AZURE_PASSWORD'] == 'secret' assert env['AZURE_CLOUD_ENVIRONMENT'] == 'foobar' - assert safe_env['AZURE_PASSWORD'] == tasks.HIDDEN_PASSWORD + assert safe_env['AZURE_PASSWORD'] == HIDDEN_PASSWORD def test_gce_source(self, inventory_update, private_data_dir, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update gce = CredentialType.defaults['gce']() inventory_update.source = 'gce' @@ -1684,7 +1684,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert json_data['project_id'] == 'some-project' def test_openstack_source(self, inventory_update, private_data_dir, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update openstack = CredentialType.defaults['openstack']() inventory_update.source = 'openstack' @@ -1724,7 +1724,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): ) def test_satellite6_source(self, inventory_update, private_data_dir, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update satellite6 = CredentialType.defaults['satellite6']() inventory_update.source = 'satellite6' @@ -1744,10 +1744,10 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert env["FOREMAN_SERVER"] == "https://example.org" assert env["FOREMAN_USER"] == "bob" assert env["FOREMAN_PASSWORD"] == "secret" - assert safe_env["FOREMAN_PASSWORD"] == tasks.HIDDEN_PASSWORD + assert safe_env["FOREMAN_PASSWORD"] == HIDDEN_PASSWORD def test_insights_source(self, inventory_update, private_data_dir, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update insights = CredentialType.defaults['insights']() inventory_update.source = 'insights' @@ -1772,11 +1772,11 @@ class 
TestInventoryUpdateCredentials(TestJobExecution): assert env["INSIGHTS_USER"] == "bob" assert env["INSIGHTS_PASSWORD"] == "secret" - assert safe_env['INSIGHTS_PASSWORD'] == tasks.HIDDEN_PASSWORD + assert safe_env['INSIGHTS_PASSWORD'] == HIDDEN_PASSWORD @pytest.mark.parametrize('verify', [True, False]) def test_tower_source(self, verify, inventory_update, private_data_dir, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update tower = CredentialType.defaults['controller']() inventory_update.source = 'controller' @@ -1801,10 +1801,10 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert env['CONTROLLER_VERIFY_SSL'] == 'True' else: assert env['CONTROLLER_VERIFY_SSL'] == 'False' - assert safe_env['CONTROLLER_PASSWORD'] == tasks.HIDDEN_PASSWORD + assert safe_env['CONTROLLER_PASSWORD'] == HIDDEN_PASSWORD def test_tower_source_ssl_verify_empty(self, inventory_update, private_data_dir, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update tower = CredentialType.defaults['controller']() inventory_update.source = 'controller' @@ -1832,7 +1832,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): assert env['TOWER_VERIFY_SSL'] == 'False' def test_awx_task_env(self, inventory_update, private_data_dir, settings, mocker): - task = tasks.RunInventoryUpdate() + task = tasks.jobs.RunInventoryUpdate() task.instance = inventory_update gce = CredentialType.defaults['gce']() inventory_update.source = 'gce' @@ -1883,7 +1883,7 @@ def test_aquire_lock_open_fail_logged(logging_getLogger, os_open): logger = mock.Mock() logging_getLogger.return_value = logger - ProjectUpdate = tasks.RunProjectUpdate() + ProjectUpdate = tasks.jobs.RunProjectUpdate() with pytest.raises(OSError): ProjectUpdate.acquire_lock(instance) @@ -1910,7 +1910,7 @@ def test_aquire_lock_acquisition_fail_logged(fcntl_lockf, logging_getLogger, os_ fcntl_lockf.side_effect = err - ProjectUpdate = tasks.RunProjectUpdate() + ProjectUpdate = tasks.jobs.RunProjectUpdate() with pytest.raises(IOError): ProjectUpdate.acquire_lock(instance) os_close.assert_called_with(3) @@ -1920,7 +1920,7 @@ def test_aquire_lock_acquisition_fail_logged(fcntl_lockf, logging_getLogger, os_ @pytest.mark.parametrize('injector_cls', [cls for cls in ManagedCredentialType.registry.values() if cls.injectors]) def test_managed_injector_redaction(injector_cls): """See awx.main.models.inventory.PluginFileInjector._get_shared_env - The ordering within awx.main.tasks.BaseTask and contract with build_env + The ordering within awx.main.tasks.jobs.BaseTask and contract with build_env requires that all managed injectors are safely redacted by the static method build_safe_env without having to employ the safe namespace as in inject_credential @@ -1947,7 +1947,7 @@ def test_notification_job_not_finished(logging_getLogger, mocker): logging_getLogger.return_value = logger with mocker.patch('awx.main.models.UnifiedJob.objects.get', uj): - tasks.handle_success_and_failure_notifications(1) + tasks.system.handle_success_and_failure_notifications(1) assert logger.warn.called_with(f"Failed to even try to send notifications for job '{uj}' due to job not being in finished state.") @@ -1955,7 +1955,7 @@ def test_notification_job_finished(mocker): uj = mocker.MagicMock(send_notification_templates=mocker.MagicMock(), finished=True) with mocker.patch('awx.main.models.UnifiedJob.objects.get', mocker.MagicMock(return_value=uj)): - 
tasks.handle_success_and_failure_notifications(1) + tasks.system.handle_success_and_failure_notifications(1) uj.send_notification_templates.assert_called() @@ -1964,12 +1964,12 @@ def test_job_run_no_ee(): proj = Project(pk=1, organization=org) job = Job(project=proj, organization=org, inventory=Inventory(pk=1)) job.execution_environment = None - task = tasks.RunJob() + task = tasks.jobs.RunJob() task.instance = job task.update_model = mock.Mock(return_value=job) task.model.objects.get = mock.Mock(return_value=job) - with mock.patch('awx.main.tasks.copy_tree'): + with mock.patch('awx.main.tasks.jobs.copy_tree'): with pytest.raises(RuntimeError) as e: task.pre_run_hook(job, private_data_dir) @@ -1983,7 +1983,7 @@ def test_project_update_no_ee(): proj = Project(pk=1, organization=org) project_update = ProjectUpdate(pk=1, project=proj, scm_type='git') project_update.execution_environment = None - task = tasks.RunProjectUpdate() + task = tasks.jobs.RunProjectUpdate() task.instance = project_update with pytest.raises(RuntimeError) as e: diff --git a/awx/main/tests/unit/utils/test_receptor.py b/awx/main/tests/unit/utils/test_receptor.py index 944494ebdc..0a7e182070 100644 --- a/awx/main/tests/unit/utils/test_receptor.py +++ b/awx/main/tests/unit/utils/test_receptor.py @@ -1,4 +1,4 @@ -from awx.main.utils.receptor import _convert_args_to_cli +from awx.main.tasks.receptor import _convert_args_to_cli def test_file_cleanup_scenario(): diff --git a/awx/main/utils/receptor.py b/awx/main/utils/receptor.py deleted file mode 100644 index e1961ca905..0000000000 --- a/awx/main/utils/receptor.py +++ /dev/null @@ -1,230 +0,0 @@ -import logging -import yaml -import time -from enum import Enum, unique - -from receptorctl.socket_interface import ReceptorControl - -from awx.main.exceptions import ReceptorNodeNotFound - -from django.conf import settings - - -logger = logging.getLogger('awx.main.utils.receptor') - -__RECEPTOR_CONF = '/etc/receptor/receptor.conf' - -RECEPTOR_ACTIVE_STATES = ('Pending', 'Running') - - -@unique -class ReceptorConnectionType(Enum): - DATAGRAM = 0 - STREAM = 1 - STREAMTLS = 2 - - -def get_receptor_sockfile(): - with open(__RECEPTOR_CONF, 'r') as f: - data = yaml.safe_load(f) - for section in data: - for entry_name, entry_data in section.items(): - if entry_name == 'control-service': - if 'filename' in entry_data: - return entry_data['filename'] - else: - raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} control-service entry does not have a filename parameter') - else: - raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} does not have control-service entry needed to get sockfile') - - -def get_tls_client(use_stream_tls=None): - if not use_stream_tls: - return None - - with open(__RECEPTOR_CONF, 'r') as f: - data = yaml.safe_load(f) - for section in data: - for entry_name, entry_data in section.items(): - if entry_name == 'tls-client': - if 'name' in entry_data: - return entry_data['name'] - return None - - -def get_receptor_ctl(): - receptor_sockfile = get_receptor_sockfile() - try: - return ReceptorControl(receptor_sockfile, config=__RECEPTOR_CONF, tlsclient=get_tls_client(True)) - except RuntimeError: - return ReceptorControl(receptor_sockfile) - - -def get_conn_type(node_name, receptor_ctl): - all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None) - for node in all_nodes: - if node.get('NodeID') == node_name: - return ReceptorConnectionType(node.get('ConnType')) - raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh') - - -def 
administrative_workunit_reaper(work_list=None): - """ - This releases completed work units that were spawned by actions inside of this module - specifically, this should catch any completed work unit left by - - worker_info - - worker_cleanup - These should ordinarily be released when the method finishes, but this is a - cleanup of last-resort, in case something went awry - """ - receptor_ctl = get_receptor_ctl() - if work_list is None: - work_list = receptor_ctl.simple_command("work list") - - for unit_id, work_data in work_list.items(): - extra_data = work_data.get('ExtraData') - if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'): - continue # if this is not ansible-runner work, we do not want to touch it - params = extra_data.get('RemoteParams', {}).get('params') - if not params: - continue - if not (params == '--worker-info' or params.startswith('cleanup')): - continue # if this is not a cleanup or health check, we do not want to touch it - if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES: - continue # do not want to touch active work units - logger.info(f'Reaping orphaned work unit {unit_id} with params {params}') - receptor_ctl.simple_command(f"work release {unit_id}") - - -class RemoteJobError(RuntimeError): - pass - - -def run_until_complete(node, timing_data=None, **kwargs): - """ - Runs an ansible-runner work_type on remote node, waits until it completes, then returns stdout. - """ - receptor_ctl = get_receptor_ctl() - - use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS" - kwargs.setdefault('tlsclient', get_tls_client(use_stream_tls)) - kwargs.setdefault('ttl', '20s') - kwargs.setdefault('payload', '') - - transmit_start = time.time() - sign_work = False if settings.IS_K8S else True - result = receptor_ctl.submit_work(worktype='ansible-runner', node=node, signwork=sign_work, **kwargs) - - unit_id = result['unitid'] - run_start = time.time() - if timing_data: - timing_data['transmit_timing'] = run_start - transmit_start - run_timing = 0.0 - stdout = '' - - try: - - resultfile = receptor_ctl.get_work_results(unit_id) - - while run_timing < 20.0: - status = receptor_ctl.simple_command(f'work status {unit_id}') - state_name = status.get('StateName') - if state_name not in RECEPTOR_ACTIVE_STATES: - break - run_timing = time.time() - run_start - time.sleep(0.5) - else: - raise RemoteJobError(f'Receptor job timeout on {node} after {run_timing} seconds, state remains in {state_name}') - - if timing_data: - timing_data['run_timing'] = run_timing - - stdout = resultfile.read() - stdout = str(stdout, encoding='utf-8') - - finally: - - if settings.RECEPTOR_RELEASE_WORK: - res = receptor_ctl.simple_command(f"work release {unit_id}") - if res != {'released': unit_id}: - logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}') - - receptor_ctl.close() - - if state_name.lower() == 'failed': - work_detail = status.get('Detail', '') - if work_detail: - raise RemoteJobError(f'Receptor error from {node}, detail:\n{work_detail}') - else: - raise RemoteJobError(f'Unknown ansible-runner error on node {node}, stdout:\n{stdout}') - - return stdout - - -def worker_info(node_name, work_type='ansible-runner'): - error_list = [] - data = {'errors': error_list, 'transmit_timing': 0.0} - - try: - stdout = run_until_complete(node=node_name, timing_data=data, params={"params": "--worker-info"}) - - yaml_stdout = stdout.strip() - remote_data = {} - try: - remote_data = 
yaml.safe_load(yaml_stdout) - except Exception as json_e: - error_list.append(f'Failed to parse node {node_name} --worker-info output as YAML, error: {json_e}, data:\n{yaml_stdout}') - - if not isinstance(remote_data, dict): - error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output:{stdout}') - else: - error_list.extend(remote_data.pop('errors', [])) # merge both error lists - data.update(remote_data) - - except RemoteJobError as exc: - details = exc.args[0] - if 'unrecognized arguments: --worker-info' in details: - error_list.append(f'Old version (2.0.1 or earlier) of ansible-runner on node {node_name} without --worker-info') - else: - error_list.append(details) - - except (ReceptorNodeNotFound, RuntimeError) as exc: - error_list.append(str(exc)) - - # If we have a connection error, missing keys would be trivial consequence of that - if not data['errors']: - # see tasks.py usage of keys - missing_keys = set(('runner_version', 'mem_in_bytes', 'cpu_count')) - set(data.keys()) - if missing_keys: - data['errors'].append('Worker failed to return keys {}'.format(' '.join(missing_keys))) - - return data - - -def _convert_args_to_cli(vargs): - """ - For the ansible-runner worker cleanup command - converts the dictionary (parsed argparse variables) used for python interface - into a string of CLI options, which has to be used on execution nodes. - """ - args = ['cleanup'] - for option in ('exclude_strings', 'remove_images'): - if vargs.get(option): - args.append('--{}={}'.format(option.replace('_', '-'), ' '.join(vargs.get(option)))) - for option in ('file_pattern', 'image_prune', 'process_isolation_executable', 'grace_period'): - if vargs.get(option) is True: - args.append('--{}'.format(option.replace('_', '-'))) - elif vargs.get(option) not in (None, ''): - args.append('--{}={}'.format(option.replace('_', '-'), vargs.get(option))) - return args - - -def worker_cleanup(node_name, vargs, timeout=300.0): - args = _convert_args_to_cli(vargs) - - remote_command = ' '.join(args) - logger.debug(f'Running command over receptor mesh on {node_name}: ansible-runner worker {remote_command}') - - stdout = run_until_complete(node=node_name, params={"params": remote_command}) - - return stdout diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 0c37420c14..feab6832fa 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -425,18 +425,18 @@ EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes check if an BROKER_URL = 'unix:///var/run/redis/redis.sock' CELERYBEAT_SCHEDULE = { - 'tower_scheduler': {'task': 'awx.main.tasks.awx_periodic_scheduler', 'schedule': timedelta(seconds=30), 'options': {'expires': 20}}, + 'tower_scheduler': {'task': 'awx.main.tasks.system.awx_periodic_scheduler', 'schedule': timedelta(seconds=30), 'options': {'expires': 20}}, 'cluster_heartbeat': { - 'task': 'awx.main.tasks.cluster_node_heartbeat', + 'task': 'awx.main.tasks.system.cluster_node_heartbeat', 'schedule': timedelta(seconds=CLUSTER_NODE_HEARTBEAT_PERIOD), 'options': {'expires': 50}, }, - 'gather_analytics': {'task': 'awx.main.tasks.gather_analytics', 'schedule': timedelta(minutes=5)}, + 'gather_analytics': {'task': 'awx.main.tasks.system.gather_analytics', 'schedule': timedelta(minutes=5)}, 'task_manager': {'task': 'awx.main.scheduler.tasks.run_task_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}}, - 'k8s_reaper': {'task': 'awx.main.tasks.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}}, - 
'receptor_reaper': {'task': 'awx.main.tasks.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)}, + 'k8s_reaper': {'task': 'awx.main.tasks.system.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}}, + 'receptor_reaper': {'task': 'awx.main.tasks.system.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)}, 'send_subsystem_metrics': {'task': 'awx.main.analytics.analytics_tasks.send_subsystem_metrics', 'schedule': timedelta(seconds=20)}, - 'cleanup_images': {'task': 'awx.main.tasks.cleanup_images_and_files', 'schedule': timedelta(hours=3)}, + 'cleanup_images': {'task': 'awx.main.tasks.system.cleanup_images_and_files', 'schedule': timedelta(hours=3)}, } # Django Caching Configuration diff --git a/docs/debugging/debugging_misc.md b/docs/debugging/debugging_misc.md index 287bff2f6e..c6b4567df3 100644 --- a/docs/debugging/debugging_misc.md +++ b/docs/debugging/debugging_misc.md @@ -130,7 +130,7 @@ a telnet session: ```python # awx/main/tasks.py - class SomeTask(awx.main.tasks.BaseTask): + class SomeTask(awx.main.tasks.jobs.BaseTask): def run(self, pk, **kwargs): # This will set a breakpoint and open an interactive Python diff --git a/docs/tasks.md b/docs/tasks.md index 2e820c32f4..b627915784 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -86,13 +86,13 @@ appropriate AMQP queue: "uuid": "", "args": [1, 1], "kwargs": {}, - "task": "awx.main.tasks.add" + "task": "awx.main.tasks.system.add" } When a background worker receives the message, it deserializes it and runs the associated Python code: - awx.main.tasks.add(123) + awx.main.tasks.system.add(123) Dispatcher Implementation
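A minimal sketch of the import moves this refactor produces, using only symbols that appear in the hunks above; the grouping comments are illustrative, not part of the patch:

    # before this patch, everything was imported from the single module awx/main/tasks.py:
    #     from awx.main.tasks import send_notifications, RunJob, AWXReceptorJob

    # after this patch, callers import from the sub-package that now defines the symbol:
    from awx.main.tasks.system import send_notifications, deep_copy_model_obj   # periodic / housekeeping tasks
    from awx.main.tasks.jobs import RunJob, RunInventoryUpdate                   # unified job runner classes
    from awx.main.tasks.receptor import AWXReceptorJob, run_until_complete       # receptor helpers, formerly awx.main.utils.receptor

    # mock/patch targets in tests follow the defining module the same way, e.g.:
    #     mock.patch('awx.main.tasks.jobs.create_partition')
    #     mock.patch('awx.main.tasks.receptor.run_until_complete')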