Refactored tasks.py into a package

--- Added 3 new sub-packages: awx.main.tasks.system, awx.main.tasks.jobs, awx.main.tasks.receptor
--- Modified the functional tests and unit tests accordingly
Amol Gautam 2022-01-12 15:40:32 -05:00
parent 11f4b64229
commit a4a3ba65d7
36 changed files with 1607 additions and 1562 deletions
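
The pattern repeated across the modified files below is a one-line import change at each call site: housekeeping and dispatch functions now come from awx.main.tasks.system, while the Run* unified-job classes come from awx.main.tasks.jobs. A minimal sketch of the change as a caller sees it (illustrative only, assuming an installed AWX environment):

```python
# Illustrative only: the import-path change applied at each call site in this commit.
# Previously everything was imported from the single awx.main.tasks module:
#     from awx.main.tasks import send_notifications, RunProjectUpdate
# After the split, system/housekeeping tasks and job-runner classes live apart:
from awx.main.tasks.system import send_notifications   # notifications, health checks, reapers, ...
from awx.main.tasks.jobs import RunProjectUpdate        # Run* unified-job task classes
```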

View File

@@ -817,7 +817,7 @@ class ResourceAccessList(ParentMixin, ListAPIView):
def trigger_delayed_deep_copy(*args, **kwargs):
-from awx.main.tasks import deep_copy_model_obj
+from awx.main.tasks.system import deep_copy_model_obj
connection.on_commit(lambda: deep_copy_model_obj.delay(*args, **kwargs))

View File

@@ -62,7 +62,7 @@ import pytz
from wsgiref.util import FileWrapper
# AWX
-from awx.main.tasks import send_notifications, update_inventory_computed_fields
+from awx.main.tasks.system import send_notifications, update_inventory_computed_fields
from awx.main.access import get_user_queryset, HostAccess
from awx.api.generics import (
APIView,
@@ -431,7 +431,7 @@ class InstanceHealthCheck(GenericAPIView):
obj = self.get_object()
if obj.node_type == 'execution':
-from awx.main.tasks import execution_node_health_check
+from awx.main.tasks.system import execution_node_health_check
runner_data = execution_node_health_check(obj.hostname)
obj.refresh_from_db()
@@ -441,7 +441,7 @@ class InstanceHealthCheck(GenericAPIView):
if extra_field in runner_data:
data[extra_field] = runner_data[extra_field]
else:
-from awx.main.tasks import cluster_node_health_check
+from awx.main.tasks.system import cluster_node_health_check
if settings.CLUSTER_HOST_ID == obj.hostname:
cluster_node_health_check(obj.hostname)

View File

@@ -26,7 +26,7 @@ from awx.api.generics import APIView, GenericAPIView, ListAPIView, RetrieveUpdat
from awx.api.permissions import IsSystemAdminOrAuditor
from awx.api.versioning import reverse
from awx.main.utils import camelcase_to_underscore
-from awx.main.tasks import handle_setting_changes
+from awx.main.tasks.system import handle_setting_changes
from awx.conf.models import Setting
from awx.conf.serializers import SettingCategorySerializer, SettingSingletonSerializer
from awx.conf import settings_registry

View File

@@ -17,7 +17,7 @@ import redis
from awx.main.consumers import emit_channel_notification
from awx.main.models import JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent, UnifiedJob, Job
-from awx.main.tasks import handle_success_and_failure_notifications
+from awx.main.tasks.system import handle_success_and_failure_notifications
from awx.main.models.events import emit_event_detail
from awx.main.utils.profiling import AWXProfiler
import awx.main.analytics.subsystem_metrics as s_metrics

View File

@@ -9,7 +9,7 @@ from kubernetes.config import kube_config
from django.conf import settings
from django_guid.middleware import GuidMiddleware
-from awx.main.tasks import dispatch_startup, inform_cluster_of_shutdown
+from awx.main.tasks.system import dispatch_startup, inform_cluster_of_shutdown
from .base import BaseWorker
@@ -30,8 +30,8 @@ class TaskWorker(BaseWorker):
"""
Transform a dotted notation task into an imported, callable function, e.g.,
-awx.main.tasks.delete_inventory
-awx.main.tasks.RunProjectUpdate
+awx.main.tasks.system.delete_inventory
+awx.main.tasks.jobs.RunProjectUpdate
"""
if not task.startswith('awx.'):
raise ValueError('{} is not a valid awx task'.format(task))
@@ -73,15 +73,15 @@ class TaskWorker(BaseWorker):
'callbacks': [{
'args': [],
'kwargs': {}
-'task': u'awx.main.tasks.handle_work_success'
+'task': u'awx.main.tasks.system.handle_work_success'
}],
'errbacks': [{
'args': [],
'kwargs': {},
-'task': 'awx.main.tasks.handle_work_error'
+'task': 'awx.main.tasks.system.handle_work_error'
}],
'kwargs': {},
-'task': u'awx.main.tasks.RunProjectUpdate'
+'task': u'awx.main.tasks.jobs.RunProjectUpdate'
}
"""
settings.__clean_on_fork__()
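
The TaskWorker docstring above describes turning a dotted task path into an imported callable. A rough standalone sketch of that idea (not the actual TaskWorker code, which has additional handling) could look like this:

```python
# Rough sketch of dotted-path resolution; the real TaskWorker method this
# docstring belongs to has additional handling not shown here.
import importlib

def resolve_callable(task_path: str):
    if not task_path.startswith('awx.'):
        raise ValueError('{} is not a valid awx task'.format(task_path))
    module_name, _, attr_name = task_path.rpartition('.')
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)

# e.g. resolve_callable('awx.main.tasks.jobs.RunProjectUpdate') returns the class object
```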

View File

@@ -1,6 +1,6 @@
from django.core.management.base import BaseCommand
-from awx.main.tasks import profile_sql
+from awx.main.tasks.system import profile_sql
class Command(BaseCommand):

View File

@@ -144,7 +144,7 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin):
@classmethod
def _get_task_class(cls):
-from awx.main.tasks import RunAdHocCommand
+from awx.main.tasks.jobs import RunAdHocCommand
return RunAdHocCommand

View File

@@ -388,7 +388,7 @@ class BasePlaybookEvent(CreatedModifiedModel):
job.get_event_queryset().filter(uuid__in=failed).update(failed=True)
# send success/failure notifications when we've finished handling the playbook_on_stats event
-from awx.main.tasks import handle_success_and_failure_notifications # circular import
+from awx.main.tasks.system import handle_success_and_failure_notifications # circular import
def _send_notifications():
handle_success_and_failure_notifications.apply_async([job.id])

View File

@@ -376,7 +376,7 @@ class TowerScheduleState(SingletonModel):
def schedule_policy_task():
-from awx.main.tasks import apply_cluster_membership_policies
+from awx.main.tasks.system import apply_cluster_membership_policies
connection.on_commit(lambda: apply_cluster_membership_policies.apply_async())

View File

@@ -366,7 +366,7 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin):
@transaction.atomic
def schedule_deletion(self, user_id=None):
-from awx.main.tasks import delete_inventory
+from awx.main.tasks.system import delete_inventory
from awx.main.signals import activity_stream_delete
if self.pending_deletion is True:
@@ -382,7 +382,7 @@ class Inventory(CommonModelNameNotUnique, ResourceMixin, RelatedJobsMixin):
if self.kind == 'smart' and settings.AWX_REBUILD_SMART_MEMBERSHIP:
def on_commit():
-from awx.main.tasks import update_host_smart_inventory_memberships
+from awx.main.tasks.system import update_host_smart_inventory_memberships
update_host_smart_inventory_memberships.delay()
@@ -551,7 +551,7 @@ class Host(CommonModelNameNotUnique, RelatedJobsMixin):
if settings.AWX_REBUILD_SMART_MEMBERSHIP:
def on_commit():
-from awx.main.tasks import update_host_smart_inventory_memberships
+from awx.main.tasks.system import update_host_smart_inventory_memberships
update_host_smart_inventory_memberships.delay()
@@ -631,7 +631,7 @@ class Group(CommonModelNameNotUnique, RelatedJobsMixin):
@transaction.atomic
def delete_recursive(self):
from awx.main.utils import ignore_inventory_computed_fields
-from awx.main.tasks import update_inventory_computed_fields
+from awx.main.tasks.system import update_inventory_computed_fields
from awx.main.signals import disable_activity_stream, activity_stream_delete
def mark_actual():
@@ -1219,7 +1219,7 @@ class InventoryUpdate(UnifiedJob, InventorySourceOptions, JobNotificationMixin,
@classmethod
def _get_task_class(cls):
-from awx.main.tasks import RunInventoryUpdate
+from awx.main.tasks.jobs import RunInventoryUpdate
return RunInventoryUpdate

View File

@@ -583,7 +583,7 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana
@classmethod
def _get_task_class(cls):
-from awx.main.tasks import RunJob
+from awx.main.tasks.jobs import RunJob
return RunJob
@@ -1213,7 +1213,7 @@ class SystemJob(UnifiedJob, SystemJobOptions, JobNotificationMixin):
@classmethod
def _get_task_class(cls):
-from awx.main.tasks import RunSystemJob
+from awx.main.tasks.jobs import RunSystemJob
return RunSystemJob

View File

@@ -508,7 +508,7 @@ class JobNotificationMixin(object):
return (msg, body)
def send_notification_templates(self, status):
-from awx.main.tasks import send_notifications # avoid circular import
+from awx.main.tasks.system import send_notifications # avoid circular import
if status not in ['running', 'succeeded', 'failed']:
raise ValueError(_("status must be either running, succeeded or failed"))

View File

@@ -471,7 +471,7 @@ class Project(UnifiedJobTemplate, ProjectOptions, ResourceMixin, CustomVirtualEn
r = super(Project, self).delete(*args, **kwargs)
for path_to_delete in paths_to_delete:
if self.scm_type and path_to_delete: # non-manual, concrete path
-from awx.main.tasks import delete_project_files
+from awx.main.tasks.system import delete_project_files
delete_project_files.delay(path_to_delete)
return r
@@ -532,7 +532,7 @@ class ProjectUpdate(UnifiedJob, ProjectOptions, JobNotificationMixin, TaskManage
@classmethod
def _get_task_class(cls):
-from awx.main.tasks import RunProjectUpdate
+from awx.main.tasks.jobs import RunProjectUpdate
return RunProjectUpdate

View File

@@ -1046,7 +1046,7 @@ class UnifiedJob(
fd = tempfile.NamedTemporaryFile(
mode='w', prefix='{}-{}-'.format(self.model_to_str(), self.pk), suffix='.out', dir=settings.JOBOUTPUT_ROOT, encoding='utf-8'
)
-from awx.main.tasks import purge_old_stdout_files # circular import
+from awx.main.tasks.system import purge_old_stdout_files # circular import
purge_old_stdout_files.apply_async()

View File

@@ -813,7 +813,7 @@ class WorkflowApproval(UnifiedJob, JobNotificationMixin):
return True
def send_approval_notification(self, approval_status):
-from awx.main.tasks import send_notifications # avoid circular import
+from awx.main.tasks.system import send_notifications # avoid circular import
if self.workflow_job_template is None:
return

View File

@@ -257,7 +257,7 @@ class TaskManager:
if self.start_task_limit == 0:
# schedule another run immediately after this task manager
schedule_task_manager()
-from awx.main.tasks import handle_work_error, handle_work_success
+from awx.main.tasks.system import handle_work_error, handle_work_success
dependent_tasks = dependent_tasks or []

View File

@@ -57,7 +57,7 @@ from awx.main.models import (
from awx.main.constants import CENSOR_VALUE
from awx.main.utils import model_instance_diff, model_to_dict, camelcase_to_underscore, get_current_apps
from awx.main.utils import ignore_inventory_computed_fields, ignore_inventory_group_removal, _inventory_updates
-from awx.main.tasks import update_inventory_computed_fields, handle_removed_image
+from awx.main.tasks.system import update_inventory_computed_fields, handle_removed_image
from awx.main.fields import (
is_implicit_parent,
update_role_parentage_for_instance,

View File

File diff suppressed because it is too large

awx/main/tasks/receptor.py (new file, 534 lines)
View File

@@ -0,0 +1,534 @@
# Python
from base64 import b64encode
from collections import namedtuple
import concurrent.futures
from enum import Enum
import logging
import socket
import sys
import threading
import time
import yaml
# Django
from django.conf import settings
# Runner
import ansible_runner
# AWX
from awx.main.utils.execution_environments import get_default_pod_spec
from awx.main.exceptions import ReceptorNodeNotFound
from awx.main.utils.common import (
deepmerge,
parse_yaml_or_json,
cleanup_new_process,
)
# Receptorctl
from receptorctl.socket_interface import ReceptorControl
logger = logging.getLogger('awx.main.tasks.receptor')
__RECEPTOR_CONF = '/etc/receptor/receptor.conf'
RECEPTOR_ACTIVE_STATES = ('Pending', 'Running')
class ReceptorConnectionType(Enum):
DATAGRAM = 0
STREAM = 1
STREAMTLS = 2
def get_receptor_sockfile():
with open(__RECEPTOR_CONF, 'r') as f:
data = yaml.safe_load(f)
for section in data:
for entry_name, entry_data in section.items():
if entry_name == 'control-service':
if 'filename' in entry_data:
return entry_data['filename']
else:
raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} control-service entry does not have a filename parameter')
else:
raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} does not have control-service entry needed to get sockfile')
def get_tls_client(use_stream_tls=None):
if not use_stream_tls:
return None
with open(__RECEPTOR_CONF, 'r') as f:
data = yaml.safe_load(f)
for section in data:
for entry_name, entry_data in section.items():
if entry_name == 'tls-client':
if 'name' in entry_data:
return entry_data['name']
return None
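
Both get_receptor_sockfile() and get_tls_client() above assume receptor.conf is a YAML list of single-key sections. A self-contained sketch with a made-up config (the paths and names here are hypothetical) shows the structure they walk:

```python
# Hypothetical receptor.conf excerpt, shaped the way the helpers above expect:
# a YAML list whose items are single-key mappings (section name -> settings).
import yaml

example_conf = """
- control-service:
    service: control
    filename: /var/run/awx-receptor/receptor.sock
- tls-client:
    name: tlsclient
"""

for section in yaml.safe_load(example_conf):
    for entry_name, entry_data in section.items():
        if entry_name == 'control-service':
            print('control sockfile:', entry_data.get('filename'))
        elif entry_name == 'tls-client':
            print('tls client name:', entry_data.get('name'))
```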
def get_receptor_ctl():
receptor_sockfile = get_receptor_sockfile()
try:
return ReceptorControl(receptor_sockfile, config=__RECEPTOR_CONF, tlsclient=get_tls_client(True))
except RuntimeError:
return ReceptorControl(receptor_sockfile)
def get_conn_type(node_name, receptor_ctl):
all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None)
for node in all_nodes:
if node.get('NodeID') == node_name:
return ReceptorConnectionType(node.get('ConnType'))
raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh')
def administrative_workunit_reaper(work_list=None):
"""
This releases completed work units that were spawned by actions inside of this module.
Specifically, this should catch any completed work unit left by
- worker_info
- worker_cleanup
These should ordinarily be released when the method finishes, but this is a
cleanup of last-resort, in case something went awry
"""
receptor_ctl = get_receptor_ctl()
if work_list is None:
work_list = receptor_ctl.simple_command("work list")
for unit_id, work_data in work_list.items():
extra_data = work_data.get('ExtraData')
if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'):
continue # if this is not ansible-runner work, we do not want to touch it
params = extra_data.get('RemoteParams', {}).get('params')
if not params:
continue
if not (params == '--worker-info' or params.startswith('cleanup')):
continue # if this is not a cleanup or health check, we do not want to touch it
if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
continue # do not want to touch active work units
logger.info(f'Reaping orphaned work unit {unit_id} with params {params}')
receptor_ctl.simple_command(f"work release {unit_id}")
class RemoteJobError(RuntimeError):
pass
def run_until_complete(node, timing_data=None, **kwargs):
"""
Runs an ansible-runner work_type on remote node, waits until it completes, then returns stdout.
"""
receptor_ctl = get_receptor_ctl()
use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS"
kwargs.setdefault('tlsclient', get_tls_client(use_stream_tls))
kwargs.setdefault('ttl', '20s')
kwargs.setdefault('payload', '')
transmit_start = time.time()
sign_work = False if settings.IS_K8S else True
result = receptor_ctl.submit_work(worktype='ansible-runner', node=node, signwork=sign_work, **kwargs)
unit_id = result['unitid']
run_start = time.time()
if timing_data:
timing_data['transmit_timing'] = run_start - transmit_start
run_timing = 0.0
stdout = ''
try:
resultfile = receptor_ctl.get_work_results(unit_id)
while run_timing < 20.0:
status = receptor_ctl.simple_command(f'work status {unit_id}')
state_name = status.get('StateName')
if state_name not in RECEPTOR_ACTIVE_STATES:
break
run_timing = time.time() - run_start
time.sleep(0.5)
else:
raise RemoteJobError(f'Receptor job timeout on {node} after {run_timing} seconds, state remains in {state_name}')
if timing_data:
timing_data['run_timing'] = run_timing
stdout = resultfile.read()
stdout = str(stdout, encoding='utf-8')
finally:
if settings.RECEPTOR_RELEASE_WORK:
res = receptor_ctl.simple_command(f"work release {unit_id}")
if res != {'released': unit_id}:
logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}')
receptor_ctl.close()
if state_name.lower() == 'failed':
work_detail = status.get('Detail', '')
if work_detail:
raise RemoteJobError(f'Receptor error from {node}, detail:\n{work_detail}')
else:
raise RemoteJobError(f'Unknown ansible-runner error on node {node}, stdout:\n{stdout}')
return stdout
def worker_info(node_name, work_type='ansible-runner'):
error_list = []
data = {'errors': error_list, 'transmit_timing': 0.0}
try:
stdout = run_until_complete(node=node_name, timing_data=data, params={"params": "--worker-info"})
yaml_stdout = stdout.strip()
remote_data = {}
try:
remote_data = yaml.safe_load(yaml_stdout)
except Exception as json_e:
error_list.append(f'Failed to parse node {node_name} --worker-info output as YAML, error: {json_e}, data:\n{yaml_stdout}')
if not isinstance(remote_data, dict):
error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output:{stdout}')
else:
error_list.extend(remote_data.pop('errors', [])) # merge both error lists
data.update(remote_data)
except RemoteJobError as exc:
details = exc.args[0]
if 'unrecognized arguments: --worker-info' in details:
error_list.append(f'Old version (2.0.1 or earlier) of ansible-runner on node {node_name} without --worker-info')
else:
error_list.append(details)
except (ReceptorNodeNotFound, RuntimeError) as exc:
error_list.append(str(exc))
# If we have a connection error, missing keys would be a trivial consequence of that
if not data['errors']:
# see tasks.py usage of keys
missing_keys = set(('runner_version', 'mem_in_bytes', 'cpu_count')) - set(data.keys())
if missing_keys:
data['errors'].append('Worker failed to return keys {}'.format(' '.join(missing_keys)))
return data
def _convert_args_to_cli(vargs):
"""
For the ansible-runner worker cleanup command
converts the dictionary (parsed argparse variables) used for python interface
into a string of CLI options, which has to be used on execution nodes.
"""
args = ['cleanup']
for option in ('exclude_strings', 'remove_images'):
if vargs.get(option):
args.append('--{}={}'.format(option.replace('_', '-'), ' '.join(vargs.get(option))))
for option in ('file_pattern', 'image_prune', 'process_isolation_executable', 'grace_period'):
if vargs.get(option) is True:
args.append('--{}'.format(option.replace('_', '-')))
elif vargs.get(option) not in (None, ''):
args.append('--{}={}'.format(option.replace('_', '-'), vargs.get(option)))
return args
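
For a sense of what _convert_args_to_cli() emits, here is a hypothetical input dictionary and the CLI argument list the logic above would produce (values invented for illustration):

```python
# Hypothetical parsed-argparse input of the kind the docstring above describes.
vargs = {
    'exclude_strings': ['awx_423_', 'awx_582_'],
    'remove_images': ['quay.io/ansible/awx-ee:latest'],
    'file_pattern': '/tmp/awx_*_*',
    'image_prune': True,
    'grace_period': 60,
}
# Following the option handling above, _convert_args_to_cli(vargs) would yield:
# ['cleanup',
#  '--exclude-strings=awx_423_ awx_582_',
#  '--remove-images=quay.io/ansible/awx-ee:latest',
#  '--file-pattern=/tmp/awx_*_*',
#  '--image-prune',
#  '--grace-period=60']
```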
def worker_cleanup(node_name, vargs, timeout=300.0):
args = _convert_args_to_cli(vargs)
remote_command = ' '.join(args)
logger.debug(f'Running command over receptor mesh on {node_name}: ansible-runner worker {remote_command}')
stdout = run_until_complete(node=node_name, params={"params": remote_command})
return stdout
class TransmitterThread(threading.Thread):
def run(self):
self.exc = None
try:
super().run()
except Exception:
self.exc = sys.exc_info()
class AWXReceptorJob:
def __init__(self, task, runner_params=None):
self.task = task
self.runner_params = runner_params
self.unit_id = None
if self.task and not self.task.instance.is_container_group_task:
execution_environment_params = self.task.build_execution_environment_params(self.task.instance, runner_params['private_data_dir'])
self.runner_params.update(execution_environment_params)
if not settings.IS_K8S and self.work_type == 'local' and 'only_transmit_kwargs' not in self.runner_params:
self.runner_params['only_transmit_kwargs'] = True
def run(self):
# We establish a connection to the Receptor socket
receptor_ctl = get_receptor_ctl()
res = None
try:
res = self._run_internal(receptor_ctl)
return res
finally:
# Make sure to always release the work unit if we established it
if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK:
try:
receptor_ctl.simple_command(f"work release {self.unit_id}")
except Exception:
logger.exception(f"Error releasing work unit {self.unit_id}.")
@property
def sign_work(self):
return False if settings.IS_K8S else True
def _run_internal(self, receptor_ctl):
# Create a socketpair, where the left side will be used for writing our payload
# (private data dir, kwargs) and the right side will be passed to Receptor for
# reading.
sockin, sockout = socket.socketpair()
transmitter_thread = TransmitterThread(target=self.transmit, args=[sockin])
transmitter_thread.start()
# submit our work, passing
# in the right side of our socketpair for reading.
_kw = {}
if self.work_type == 'ansible-runner':
_kw['node'] = self.task.instance.execution_node
use_stream_tls = get_conn_type(_kw['node'], receptor_ctl).name == "STREAMTLS"
_kw['tlsclient'] = get_tls_client(use_stream_tls)
result = receptor_ctl.submit_work(worktype=self.work_type, payload=sockout.makefile('rb'), params=self.receptor_params, signwork=self.sign_work, **_kw)
self.unit_id = result['unitid']
# Update the job with the work unit in-memory so that the log_lifecycle
# will print out the work unit that is to be associated with the job in the database
# via the update_model() call.
# We want to log the work_unit_id as early as possible. A failure can happen in between
# when we start the job in receptor and when we associate the job <-> work_unit_id.
# In that case, there will be work running in receptor and Controller will not know
# which Job it is associated with.
# We do not programmatically handle this case. Ideally, we would handle this with a reaper case.
# The two distinct job lifecycle log events below allow for us to at least detect when this
# edge case occurs. If the lifecycle event work_unit_id_received occurs without the
# work_unit_id_assigned event then this case may have occurred.
self.task.instance.work_unit_id = result['unitid'] # Set work_unit_id in-memory only
self.task.instance.log_lifecycle("work_unit_id_received")
self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid'])
self.task.instance.log_lifecycle("work_unit_id_assigned")
sockin.close()
sockout.close()
if transmitter_thread.exc:
raise transmitter_thread.exc[1].with_traceback(transmitter_thread.exc[2])
transmitter_thread.join()
resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True)
# Both "processor" and "cancel_watcher" are spawned in separate threads.
# We wait for the first one to return. If cancel_watcher returns first,
# we yank the socket out from underneath the processor, which will cause it
# to exit. A reference to the processor_future is passed into the cancel_watcher_future,
# which exits if the job has finished normally. The context manager ensures we do not
# leave any threads laying around.
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
processor_future = executor.submit(self.processor, resultfile)
cancel_watcher_future = executor.submit(self.cancel_watcher, processor_future)
futures = [processor_future, cancel_watcher_future]
first_future = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
res = list(first_future.done)[0].result()
if res.status == 'canceled':
receptor_ctl.simple_command(f"work cancel {self.unit_id}")
resultsock.shutdown(socket.SHUT_RDWR)
resultfile.close()
elif res.status == 'error':
try:
unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}')
detail = unit_status.get('Detail', None)
state_name = unit_status.get('StateName', None)
except Exception:
detail = ''
state_name = ''
logger.exception(f'An error was encountered while getting status for work unit {self.unit_id}')
if 'exceeded quota' in detail:
logger.warn(detail)
log_name = self.task.instance.log_format
logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.")
self.task.update_model(self.task.instance.pk, status='pending')
return
# If ansible-runner ran, but an error occurred at runtime, the traceback information
# is saved via the status_handler passed in to the processor.
if state_name == 'Succeeded':
return res
if not self.task.instance.result_traceback:
try:
resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True)
lines = resultsock.readlines()
receptor_output = b"".join(lines).decode()
if receptor_output:
self.task.instance.result_traceback = receptor_output
self.task.instance.save(update_fields=['result_traceback'])
elif detail:
self.task.instance.result_traceback = detail
self.task.instance.save(update_fields=['result_traceback'])
else:
logger.warn(f'No result details or output from {self.task.instance.log_format}, status:\n{unit_status}')
except Exception:
raise RuntimeError(detail)
return res
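
The comments in _run_internal() describe racing a result processor against a cancel watcher and taking whichever future finishes first. A stripped-down, AWX-free illustration of that same concurrent.futures pattern:

```python
# AWX-free illustration of the "first future wins" pattern described above:
# one worker produces the result, another polls for cancellation, and the
# first future to complete decides the outcome.
import concurrent.futures
import time

def process(data):
    time.sleep(2)  # stand-in for streaming results back from Receptor
    return 'processed:' + data

def cancel_watcher(processor_future, is_cancelled):
    while not processor_future.done():
        if is_cancelled():
            return 'canceled'
        time.sleep(0.1)
    return processor_future.result()

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    processor_future = executor.submit(process, 'payload')
    watcher_future = executor.submit(cancel_watcher, processor_future, lambda: False)
    first_done = concurrent.futures.wait(
        [processor_future, watcher_future], return_when=concurrent.futures.FIRST_COMPLETED
    )
    print(list(first_done.done)[0].result())  # 'processed:payload' when nothing cancels
```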
# Spawned in a thread so Receptor can start reading before we finish writing, we
# write our payload to the left side of our socketpair.
@cleanup_new_process
def transmit(self, _socket):
try:
ansible_runner.interface.run(streamer='transmit', _output=_socket.makefile('wb'), **self.runner_params)
finally:
# Socket must be shutdown here, or the reader will hang forever.
_socket.shutdown(socket.SHUT_WR)
@cleanup_new_process
def processor(self, resultfile):
return ansible_runner.interface.run(
streamer='process',
quiet=True,
_input=resultfile,
event_handler=self.task.event_handler,
finished_callback=self.task.finished_callback,
status_handler=self.task.status_handler,
**self.runner_params,
)
@property
def receptor_params(self):
if self.task.instance.is_container_group_task:
spec_yaml = yaml.dump(self.pod_definition, explicit_start=True)
receptor_params = {
"secret_kube_pod": spec_yaml,
"pod_pending_timeout": getattr(settings, 'AWX_CONTAINER_GROUP_POD_PENDING_TIMEOUT', "5m"),
}
if self.credential:
kubeconfig_yaml = yaml.dump(self.kube_config, explicit_start=True)
receptor_params["secret_kube_config"] = kubeconfig_yaml
else:
private_data_dir = self.runner_params['private_data_dir']
if self.work_type == 'ansible-runner' and settings.AWX_CLEANUP_PATHS:
# on execution nodes, we rely on the private data dir being deleted
cli_params = f"--private-data-dir={private_data_dir} --delete"
else:
# on hybrid nodes, we rely on the private data dir NOT being deleted
cli_params = f"--private-data-dir={private_data_dir}"
receptor_params = {"params": cli_params}
return receptor_params
@property
def work_type(self):
if self.task.instance.is_container_group_task:
if self.credential:
return 'kubernetes-runtime-auth'
return 'kubernetes-incluster-auth'
if self.task.instance.execution_node == settings.CLUSTER_HOST_ID or self.task.instance.execution_node == self.task.instance.controller_node:
return 'local'
return 'ansible-runner'
@cleanup_new_process
def cancel_watcher(self, processor_future):
while True:
if processor_future.done():
return processor_future.result()
if self.task.cancel_callback():
result = namedtuple('result', ['status', 'rc'])
return result('canceled', 1)
time.sleep(1)
@property
def pod_definition(self):
ee = self.task.instance.execution_environment
default_pod_spec = get_default_pod_spec()
pod_spec_override = {}
if self.task and self.task.instance.instance_group.pod_spec_override:
pod_spec_override = parse_yaml_or_json(self.task.instance.instance_group.pod_spec_override)
# According to the deepmerge docstring, the second dictionary will override when
# they share keys, which is the desired behavior.
# This allows the user to provide only the elements they want to override, and for us to still provide any
# defaults they don't want to change
pod_spec = deepmerge(default_pod_spec, pod_spec_override)
pod_spec['spec']['containers'][0]['image'] = ee.image
pod_spec['spec']['containers'][0]['args'] = ['ansible-runner', 'worker', '--private-data-dir=/runner']
# Enforce EE Pull Policy
pull_options = {"always": "Always", "missing": "IfNotPresent", "never": "Never"}
if self.task and self.task.instance.execution_environment:
if self.task.instance.execution_environment.pull:
pod_spec['spec']['containers'][0]['imagePullPolicy'] = pull_options[self.task.instance.execution_environment.pull]
if self.task and self.task.instance.is_container_group_task:
# If EE credential is passed, create an imagePullSecret
if self.task.instance.execution_environment and self.task.instance.execution_environment.credential:
# Create pull secret in k8s cluster based on ee cred
from awx.main.scheduler.kubernetes import PodManager # prevent circular import
pm = PodManager(self.task.instance)
secret_name = pm.create_secret(job=self.task.instance)
# Inject secret name into podspec
pod_spec['spec']['imagePullSecrets'] = [{"name": secret_name}]
if self.task:
pod_spec['metadata'] = deepmerge(
pod_spec.get('metadata', {}),
dict(name=self.pod_name, labels={'ansible-awx': settings.INSTALL_UUID, 'ansible-awx-job-id': str(self.task.instance.id)}),
)
return pod_spec
@property
def pod_name(self):
return f"automation-job-{self.task.instance.id}"
@property
def credential(self):
return self.task.instance.instance_group.credential
@property
def namespace(self):
return self.pod_definition['metadata']['namespace']
@property
def kube_config(self):
host_input = self.credential.get_input('host')
config = {
"apiVersion": "v1",
"kind": "Config",
"preferences": {},
"clusters": [{"name": host_input, "cluster": {"server": host_input}}],
"users": [{"name": host_input, "user": {"token": self.credential.get_input('bearer_token')}}],
"contexts": [{"name": host_input, "context": {"cluster": host_input, "user": host_input, "namespace": self.namespace}}],
"current-context": host_input,
}
if self.credential.get_input('verify_ssl') and 'ssl_ca_cert' in self.credential.inputs:
config["clusters"][0]["cluster"]["certificate-authority-data"] = b64encode(
self.credential.get_input('ssl_ca_cert').encode() # encode to bytes
).decode() # decode the base64 data into a str
else:
config["clusters"][0]["cluster"]["insecure-skip-tls-verify"] = True
return config

awx/main/tasks/system.py (new file, 897 lines)
View File

@@ -0,0 +1,897 @@
# Python
from collections import namedtuple
import functools
import importlib
import json
import logging
import os
from io import StringIO
from contextlib import redirect_stdout
import shutil
import time
from distutils.version import LooseVersion as Version
# Django
from django.conf import settings
from django.db import transaction, DatabaseError, IntegrityError
from django.db.models.fields.related import ForeignKey
from django.utils.timezone import now
from django.utils.encoding import smart_str
from django.contrib.auth.models import User
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import gettext_noop
from django.core.cache import cache
from django.core.exceptions import ObjectDoesNotExist
# Django-CRUM
from crum import impersonate
# Runner
import ansible_runner.cleanup
# dateutil
from dateutil.parser import parse as parse_date
# AWX
from awx import __version__ as awx_application_version
from awx.main.access import access_registry
from awx.main.models import (
Schedule,
TowerScheduleState,
Instance,
InstanceGroup,
UnifiedJob,
Notification,
Inventory,
SmartInventoryMembership,
Job,
)
from awx.main.constants import ACTIVE_STATES
from awx.main.dispatch.publish import task
from awx.main.dispatch import get_local_queuename, reaper
from awx.main.utils.common import (
ignore_inventory_computed_fields,
ignore_inventory_group_removal,
schedule_task_manager,
)
from awx.main.utils.external_logging import reconfigure_rsyslog
from awx.main.utils.reload import stop_local_services
from awx.main.utils.pglock import advisory_lock
from awx.main.tasks.receptor import get_receptor_ctl, worker_info, worker_cleanup, administrative_workunit_reaper
from awx.main.consumers import emit_channel_notification
from awx.main import analytics
from awx.conf import settings_registry
from awx.main.analytics.subsystem_metrics import Metrics
from rest_framework.exceptions import PermissionDenied
logger = logging.getLogger('awx.main.tasks.system')
OPENSSH_KEY_ERROR = u'''\
It looks like you're trying to use a private key in OpenSSH format, which \
isn't supported by the installed version of OpenSSH on this instance. \
Try upgrading OpenSSH or providing your private key in a different format. \
'''
def dispatch_startup():
startup_logger = logging.getLogger('awx.main.tasks')
startup_logger.debug("Syncing Schedules")
for sch in Schedule.objects.all():
try:
sch.update_computed_fields()
except Exception:
logger.exception("Failed to rebuild schedule {}.".format(sch))
#
# When the dispatcher starts, if the instance cannot be found in the database,
# automatically register it. This is mostly useful for openshift-based
# deployments where:
#
# 2 Instances come online
# Instance B encounters a network blip, Instance A notices, and
# deprovisions it
# Instance B's connectivity is restored, the dispatcher starts, and it
# re-registers itself
#
# In traditional container-less deployments, instances don't get
# deprovisioned when they miss their heartbeat, so this code is mostly a
# no-op.
#
apply_cluster_membership_policies()
cluster_node_heartbeat()
Metrics().clear_values()
# Update Tower's rsyslog.conf file based on logging settings in the db
reconfigure_rsyslog()
def inform_cluster_of_shutdown():
try:
this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal'))
try:
reaper.reap(this_inst)
except Exception:
logger.exception('failed to reap jobs for {}'.format(this_inst.hostname))
logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname))
except Exception:
logger.exception('Encountered problem with normal shutdown signal.')
@task(queue=get_local_queuename)
def apply_cluster_membership_policies():
from awx.main.signals import disable_activity_stream
started_waiting = time.time()
with advisory_lock('cluster_policy_lock', wait=True):
lock_time = time.time() - started_waiting
if lock_time > 1.0:
to_log = logger.info
else:
to_log = logger.debug
to_log('Waited {} seconds to obtain lock name: cluster_policy_lock'.format(lock_time))
started_compute = time.time()
# Hop nodes should never get assigned to an InstanceGroup.
all_instances = list(Instance.objects.exclude(node_type='hop').order_by('id'))
all_groups = list(InstanceGroup.objects.prefetch_related('instances'))
total_instances = len(all_instances)
actual_groups = []
actual_instances = []
Group = namedtuple('Group', ['obj', 'instances', 'prior_instances'])
Node = namedtuple('Instance', ['obj', 'groups'])
# Process policy instance list first, these will represent manually managed memberships
instance_hostnames_map = {inst.hostname: inst for inst in all_instances}
for ig in all_groups:
group_actual = Group(obj=ig, instances=[], prior_instances=[instance.pk for instance in ig.instances.all()]) # obtained in prefetch
for hostname in ig.policy_instance_list:
if hostname not in instance_hostnames_map:
logger.info("Unknown instance {} in {} policy list".format(hostname, ig.name))
continue
inst = instance_hostnames_map[hostname]
group_actual.instances.append(inst.id)
# NOTE: arguable behavior: policy-list-group is not added to
# instance's group count for consideration in minimum-policy rules
if group_actual.instances:
logger.debug("Policy List, adding Instances {} to Group {}".format(group_actual.instances, ig.name))
actual_groups.append(group_actual)
# Process Instance minimum policies next, since it represents a concrete lower bound to the
# number of instances to make available to instance groups
actual_instances = [Node(obj=i, groups=[]) for i in all_instances if i.managed_by_policy]
logger.debug("Total instances: {}, available for policy: {}".format(total_instances, len(actual_instances)))
for g in sorted(actual_groups, key=lambda x: len(x.instances)):
exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
policy_min_added = []
for i in sorted(actual_instances, key=lambda x: len(x.groups)):
if i.obj.node_type == exclude_type:
continue # never place execution instances in controlplane group or control instances in other groups
if len(g.instances) >= g.obj.policy_instance_minimum:
break
if i.obj.id in g.instances:
# If the instance is already _in_ the group, it was
# applied earlier via the policy list
continue
g.instances.append(i.obj.id)
i.groups.append(g.obj.id)
policy_min_added.append(i.obj.id)
if policy_min_added:
logger.debug("Policy minimum, adding Instances {} to Group {}".format(policy_min_added, g.obj.name))
# Finally, process instance policy percentages
for g in sorted(actual_groups, key=lambda x: len(x.instances)):
exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
candidate_pool_ct = sum(1 for i in actual_instances if i.obj.node_type != exclude_type)
if not candidate_pool_ct:
continue
policy_per_added = []
for i in sorted(actual_instances, key=lambda x: len(x.groups)):
if i.obj.node_type == exclude_type:
continue
if i.obj.id in g.instances:
# If the instance is already _in_ the group, it was
# applied earlier via a minimum policy or policy list
continue
if 100 * float(len(g.instances)) / candidate_pool_ct >= g.obj.policy_instance_percentage:
break
g.instances.append(i.obj.id)
i.groups.append(g.obj.id)
policy_per_added.append(i.obj.id)
if policy_per_added:
logger.debug("Policy percentage, adding Instances {} to Group {}".format(policy_per_added, g.obj.name))
# Determine if any changes need to be made
needs_change = False
for g in actual_groups:
if set(g.instances) != set(g.prior_instances):
needs_change = True
break
if not needs_change:
logger.debug('Cluster policy no-op finished in {} seconds'.format(time.time() - started_compute))
return
# On a differential basis, apply instances to groups
with transaction.atomic():
with disable_activity_stream():
for g in actual_groups:
if g.obj.is_container_group:
logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name))
continue
instances_to_add = set(g.instances) - set(g.prior_instances)
instances_to_remove = set(g.prior_instances) - set(g.instances)
if instances_to_add:
logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name))
g.obj.instances.add(*instances_to_add)
if instances_to_remove:
logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name))
g.obj.instances.remove(*instances_to_remove)
logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute))
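
As a quick check on the percentage rule used above (the numbers here are invented): with a candidate pool of four instances and a policy_instance_percentage of 50, instances are added until 100 * len(instances) / 4 reaches 50, so the group ends up with two members.

```python
# Invented numbers only: how the percentage threshold in the loop above behaves.
candidate_pool_ct = 4
policy_instance_percentage = 50
group_instances = []
for candidate in ['node1', 'node2', 'node3', 'node4']:
    if 100 * float(len(group_instances)) / candidate_pool_ct >= policy_instance_percentage:
        break
    group_instances.append(candidate)
print(group_instances)  # ['node1', 'node2'] -> exactly 50% of the four-node pool
```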
@task(queue='tower_broadcast_all')
def handle_setting_changes(setting_keys):
orig_len = len(setting_keys)
for i in range(orig_len):
for dependent_key in settings_registry.get_dependent_settings(setting_keys[i]):
setting_keys.append(dependent_key)
cache_keys = set(setting_keys)
logger.debug('cache delete_many(%r)', cache_keys)
cache.delete_many(cache_keys)
if any([setting.startswith('LOG_AGGREGATOR') for setting in setting_keys]):
reconfigure_rsyslog()
@task(queue='tower_broadcast_all')
def delete_project_files(project_path):
# TODO: possibly implement some retry logic
lock_file = project_path + '.lock'
if os.path.exists(project_path):
try:
shutil.rmtree(project_path)
logger.debug('Success removing project files {}'.format(project_path))
except Exception:
logger.exception('Could not remove project directory {}'.format(project_path))
if os.path.exists(lock_file):
try:
os.remove(lock_file)
logger.debug('Success removing {}'.format(lock_file))
except Exception:
logger.exception('Could not remove lock file {}'.format(lock_file))
@task(queue='tower_broadcast_all')
def profile_sql(threshold=1, minutes=1):
if threshold <= 0:
cache.delete('awx-profile-sql-threshold')
logger.error('SQL PROFILING DISABLED')
else:
cache.set('awx-profile-sql-threshold', threshold, timeout=minutes * 60)
logger.error('SQL QUERIES >={}s ENABLED FOR {} MINUTE(S)'.format(threshold, minutes))
@task(queue=get_local_queuename)
def send_notifications(notification_list, job_id=None):
if not isinstance(notification_list, list):
raise TypeError("notification_list should be of type list")
if job_id is not None:
job_actual = UnifiedJob.objects.get(id=job_id)
notifications = Notification.objects.filter(id__in=notification_list)
if job_id is not None:
job_actual.notifications.add(*notifications)
for notification in notifications:
update_fields = ['status', 'notifications_sent']
try:
sent = notification.notification_template.send(notification.subject, notification.body)
notification.status = "successful"
notification.notifications_sent = sent
if job_id is not None:
job_actual.log_lifecycle("notifications_sent")
except Exception as e:
logger.exception("Send Notification Failed {}".format(e))
notification.status = "failed"
notification.error = smart_str(e)
update_fields.append('error')
finally:
try:
notification.save(update_fields=update_fields)
except Exception:
logger.exception('Error saving notification {} result.'.format(notification.id))
@task(queue=get_local_queuename)
def gather_analytics():
from awx.conf.models import Setting
from rest_framework.fields import DateTimeField
last_gather = Setting.objects.filter(key='AUTOMATION_ANALYTICS_LAST_GATHER').first()
last_time = DateTimeField().to_internal_value(last_gather.value) if last_gather and last_gather.value else None
gather_time = now()
if not last_time or ((gather_time - last_time).total_seconds() > settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL):
analytics.gather()
@task(queue=get_local_queuename)
def purge_old_stdout_files():
nowtime = time.time()
for f in os.listdir(settings.JOBOUTPUT_ROOT):
if os.path.getctime(os.path.join(settings.JOBOUTPUT_ROOT, f)) < nowtime - settings.LOCAL_STDOUT_EXPIRE_TIME:
os.unlink(os.path.join(settings.JOBOUTPUT_ROOT, f))
logger.debug("Removing {}".format(os.path.join(settings.JOBOUTPUT_ROOT, f)))
def _cleanup_images_and_files(**kwargs):
if settings.IS_K8S:
return
this_inst = Instance.objects.me()
runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs)
if runner_cleanup_kwargs:
stdout = ''
with StringIO() as buffer:
with redirect_stdout(buffer):
ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs)
stdout = buffer.getvalue()
if '(changed: True)' in stdout:
logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}')
# if we are the first instance alphabetically, then run cleanup on execution nodes
checker_instance = Instance.objects.filter(node_type__in=['hybrid', 'control'], enabled=True, capacity__gt=0).order_by('-hostname').first()
if checker_instance and this_inst.hostname == checker_instance.hostname:
for inst in Instance.objects.filter(node_type='execution', enabled=True, capacity__gt=0):
runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs)
if not runner_cleanup_kwargs:
continue
try:
stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs)
if '(changed: True)' in stdout:
logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}')
except RuntimeError:
logger.exception(f'Error running cleanup on execution node {inst.hostname}')
@task(queue='tower_broadcast_all')
def handle_removed_image(remove_images=None):
"""Special broadcast invocation of this method to handle case of deleted EE"""
_cleanup_images_and_files(remove_images=remove_images, file_pattern='')
@task(queue=get_local_queuename)
def cleanup_images_and_files():
_cleanup_images_and_files()
@task(queue=get_local_queuename)
def cluster_node_health_check(node):
"""
Used for the health check endpoint, refreshes the status of the instance, but must be run on the target node
"""
if node == '':
logger.warn('Local health check incorrectly called with blank string')
return
elif node != settings.CLUSTER_HOST_ID:
logger.warn(f'Local health check for {node} incorrectly sent to {settings.CLUSTER_HOST_ID}')
return
try:
this_inst = Instance.objects.me()
except Instance.DoesNotExist:
logger.warn(f'Instance record for {node} missing, could not check capacity.')
return
this_inst.local_health_check()
@task(queue=get_local_queuename)
def execution_node_health_check(node):
if node == '':
logger.warn('Remote health check incorrectly called with blank string')
return
try:
instance = Instance.objects.get(hostname=node)
except Instance.DoesNotExist:
logger.warn(f'Instance record for {node} missing, could not check capacity.')
return
if instance.node_type != 'execution':
raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')
data = worker_info(node)
prior_capacity = instance.capacity
instance.save_health_data(
version='ansible-runner-' + data.get('runner_version', '???'),
cpu=data.get('cpu_count', 0),
memory=data.get('mem_in_bytes', 0),
uuid=data.get('uuid'),
errors='\n'.join(data.get('errors', [])),
)
if data['errors']:
formatted_error = "\n".join(data["errors"])
if prior_capacity:
logger.warn(f'Health check marking execution node {node} as lost, errors:\n{formatted_error}')
else:
logger.info(f'Failed to find capacity of new or lost execution node {node}, errors:\n{formatted_error}')
else:
logger.info('Set capacity of execution node {} to {}, worker info data:\n{}'.format(node, instance.capacity, json.dumps(data, indent=2)))
return data
def inspect_execution_nodes(instance_list):
with advisory_lock('inspect_execution_nodes_lock', wait=False):
node_lookup = {inst.hostname: inst for inst in instance_list}
ctl = get_receptor_ctl()
mesh_status = ctl.simple_command('status')
nowtime = now()
workers = mesh_status['Advertisements']
for ad in workers:
hostname = ad['NodeID']
if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
continue
changed = False
if hostname in node_lookup:
instance = node_lookup[hostname]
else:
logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}")
continue
was_lost = instance.is_lost(ref_time=nowtime)
last_seen = parse_date(ad['Time'])
if instance.last_seen and instance.last_seen >= last_seen:
continue
instance.last_seen = last_seen
instance.save(update_fields=['last_seen'])
if changed:
execution_node_health_check.apply_async([hostname])
elif was_lost:
# if the instance *was* lost, but has appeared again,
# attempt to re-establish the initial capacity and version
# check
logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
execution_node_health_check.apply_async([hostname])
elif instance.capacity == 0 and instance.enabled:
# nodes with a proven connection that still need remediation run health checks at a reduced frequency
if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS:
# Periodically re-run the health check of errored nodes, in case someone fixed it
# TODO: perhaps decrease the frequency of these checks
logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
execution_node_health_check.apply_async([hostname])
@task(queue=get_local_queuename)
def cluster_node_heartbeat():
logger.debug("Cluster node heartbeat task.")
nowtime = now()
instance_list = list(Instance.objects.all())
this_inst = None
lost_instances = []
for inst in instance_list:
if inst.hostname == settings.CLUSTER_HOST_ID:
this_inst = inst
instance_list.remove(inst)
break
else:
(changed, this_inst) = Instance.objects.get_or_register()
if changed:
logger.info("Registered tower control node '{}'".format(this_inst.hostname))
inspect_execution_nodes(instance_list)
for inst in list(instance_list):
if inst.is_lost(ref_time=nowtime):
lost_instances.append(inst)
instance_list.remove(inst)
if this_inst:
startup_event = this_inst.is_lost(ref_time=nowtime)
this_inst.local_health_check()
if startup_event and this_inst.capacity != 0:
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
return
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
# IFF any node has a greater version than we do, then we'll shut down services
for other_inst in instance_list:
if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution':
continue
if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
logger.error(
"Host {} reports version {}, but this node {} is at {}, shutting down".format(
other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
)
)
# Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
# The heartbeat task will reset the capacity to the system capacity after upgrade.
stop_local_services(communicate=False)
raise RuntimeError("Shutting down.")
for other_inst in lost_instances:
try:
reaper.reap(other_inst)
except Exception:
logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
try:
# Capacity could already be 0 because:
# * It's a new node and it never had a heartbeat
# * It was set to 0 by another tower node running this method
# * It was set to 0 by this node, but auto deprovisioning is off
#
# If auto deprovisioning is on, don't bother setting the capacity to 0
# since we will delete the node anyway.
if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES:
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
elif settings.AWX_AUTO_DEPROVISION_INSTANCES:
deprovision_hostname = other_inst.hostname
other_inst.delete()
logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
except DatabaseError as e:
if 'did not affect any rows' in str(e):
logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
else:
logger.exception('Error marking {} as lost'.format(other_inst.hostname))
@task(queue=get_local_queuename)
def awx_receptor_workunit_reaper():
"""
When an AWX job is launched via receptor, files such as status, stdin, and stdout are created
in a specific receptor directory. This directory on disk is a random 8 character string, e.g. qLL2JFNT
This is also called the work Unit ID in receptor, and is used in various receptor commands,
e.g. "work results qLL2JFNT"
After an AWX job executes, the receptor work unit directory is cleaned up by
issuing the work release command. In some cases the release process might fail, or
if AWX crashes during a job's execution, the work release command is never issued to begin with.
As such, this periodic task will obtain a list of all receptor work units, and find which ones
belong to AWX jobs that are in a completed state (status is canceled, error, or succeeded).
This task will call "work release" on each of these work units to clean up the files on disk.
Note that when we call "work release" on a work unit that actually represents remote work
both the local and remote work units are cleaned up.
Since we are cleaning up jobs that the controller considers to be inactive, we take the added
precaution of calling "work cancel" in case the work unit is still active.
"""
if not settings.RECEPTOR_RELEASE_WORK:
return
logger.debug("Checking for unreleased receptor work units")
receptor_ctl = get_receptor_ctl()
receptor_work_list = receptor_ctl.simple_command("work list")
unit_ids = [id for id in receptor_work_list]
jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES)
for job in jobs_with_unreleased_receptor_units:
logger.debug(f"{job.log_format} is not active, reaping receptor work unit {job.work_unit_id}")
receptor_ctl.simple_command(f"work cancel {job.work_unit_id}")
receptor_ctl.simple_command(f"work release {job.work_unit_id}")
administrative_workunit_reaper(receptor_work_list)
@task(queue=get_local_queuename)
def awx_k8s_reaper():
if not settings.RECEPTOR_RELEASE_WORK:
return
from awx.main.scheduler.kubernetes import PodManager # prevent circular import
for group in InstanceGroup.objects.filter(is_container_group=True).iterator():
logger.debug("Checking for orphaned k8s pods for {}.".format(group))
pods = PodManager.list_active_jobs(group)
for job in UnifiedJob.objects.filter(pk__in=pods.keys()).exclude(status__in=ACTIVE_STATES):
logger.debug('{} is no longer active, reaping orphaned k8s pod'.format(job.log_format))
try:
pm = PodManager(job)
pm.kube_api.delete_namespaced_pod(name=pods[job.id], namespace=pm.namespace, _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
except Exception:
logger.exception("Failed to delete orphaned pod {} from {}".format(job.log_format, group))
@task(queue=get_local_queuename)
def awx_periodic_scheduler():
with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired:
if acquired is False:
logger.debug("Not running periodic scheduler, another task holds lock")
return
logger.debug("Starting periodic scheduler")
run_now = now()
state = TowerScheduleState.get_solo()
last_run = state.schedule_last_run
logger.debug("Last scheduler run was: %s", last_run)
state.schedule_last_run = run_now
state.save()
old_schedules = Schedule.objects.enabled().before(last_run)
for schedule in old_schedules:
schedule.update_computed_fields()
schedules = Schedule.objects.enabled().between(last_run, run_now)
invalid_license = False
try:
access_registry[Job](None).check_license(quiet=True)
except PermissionDenied as e:
invalid_license = e
for schedule in schedules:
template = schedule.unified_job_template
schedule.update_computed_fields() # To update next_run timestamp.
if template.cache_timeout_blocked:
logger.warn("Cache timeout is in the future, bypassing schedule for template %s" % str(template.id))
continue
try:
job_kwargs = schedule.get_job_kwargs()
new_unified_job = schedule.unified_job_template.create_unified_job(**job_kwargs)
logger.debug('Spawned {} from schedule {}-{}.'.format(new_unified_job.log_format, schedule.name, schedule.pk))
if invalid_license:
new_unified_job.status = 'failed'
new_unified_job.job_explanation = str(invalid_license)
new_unified_job.save(update_fields=['status', 'job_explanation'])
new_unified_job.websocket_emit_status("failed")
raise invalid_license
can_start = new_unified_job.signal_start()
except Exception:
logger.exception('Error spawning scheduled job.')
continue
if not can_start:
new_unified_job.status = 'failed'
new_unified_job.job_explanation = gettext_noop(
"Scheduled job could not start because it \
was not in the right state or required manual credentials"
)
new_unified_job.save(update_fields=['status', 'job_explanation'])
new_unified_job.websocket_emit_status("failed")
emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules"))
state.save()
@task(queue=get_local_queuename)
def handle_work_success(task_actual):
try:
instance = UnifiedJob.get_instance_by_type(task_actual['type'], task_actual['id'])
except ObjectDoesNotExist:
logger.warning('Missing {} `{}` in success callback.'.format(task_actual['type'], task_actual['id']))
return
if not instance:
return
schedule_task_manager()
@task(queue=get_local_queuename)
def handle_work_error(task_id, *args, **kwargs):
subtasks = kwargs.get('subtasks', None)
logger.debug('Executing error task id %s, subtasks: %s' % (task_id, str(subtasks)))
first_instance = None
first_instance_type = ''
if subtasks is not None:
for each_task in subtasks:
try:
instance = UnifiedJob.get_instance_by_type(each_task['type'], each_task['id'])
if not instance:
# Unknown task type
logger.warn("Unknown task type: {}".format(each_task['type']))
continue
except ObjectDoesNotExist:
logger.warning('Missing {} `{}` in error callback.'.format(each_task['type'], each_task['id']))
continue
if first_instance is None:
first_instance = instance
first_instance_type = each_task['type']
if instance.celery_task_id != task_id and not instance.cancel_flag and not instance.status == 'successful':
instance.status = 'failed'
instance.failed = True
if not instance.job_explanation:
instance.job_explanation = 'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (
first_instance_type,
first_instance.name,
first_instance.id,
)
instance.save()
instance.websocket_emit_status("failed")
# We only send 1 job complete message since all the job completion message
# handling does is trigger the scheduler. If we extend the functionality of
# what the job complete message handler does then we may want to send a
# completion event for each job here.
if first_instance:
schedule_task_manager()
pass
@task(queue=get_local_queuename)
def handle_success_and_failure_notifications(job_id):
uj = UnifiedJob.objects.get(pk=job_id)
retries = 0
while retries < 5:
if uj.finished:
uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed')
return
else:
# wait a few seconds to avoid a race where the
# events are persisted _before_ the UJ.status
# changes from running -> successful
retries += 1
time.sleep(1)
uj = UnifiedJob.objects.get(pk=job_id)
logger.warn(f"Failed to even try to send notifications for job '{uj}' due to job not being in finished state.")
@task(queue=get_local_queuename)
def update_inventory_computed_fields(inventory_id):
"""
Signal handler and wrapper around inventory.update_computed_fields to
prevent unnecessary recursive calls.
"""
i = Inventory.objects.filter(id=inventory_id)
if not i.exists():
logger.error("Update Inventory Computed Fields failed due to missing inventory: " + str(inventory_id))
return
i = i[0]
try:
i.update_computed_fields()
except DatabaseError as e:
if 'did not affect any rows' in str(e):
logger.debug('Exiting duplicate update_inventory_computed_fields task.')
return
raise
def update_smart_memberships_for_inventory(smart_inventory):
current = set(SmartInventoryMembership.objects.filter(inventory=smart_inventory).values_list('host_id', flat=True))
new = set(smart_inventory.hosts.values_list('id', flat=True))
additions = new - current
removals = current - new
if additions or removals:
with transaction.atomic():
if removals:
SmartInventoryMembership.objects.filter(inventory=smart_inventory, host_id__in=removals).delete()
if additions:
add_for_inventory = [SmartInventoryMembership(inventory_id=smart_inventory.id, host_id=host_id) for host_id in additions]
SmartInventoryMembership.objects.bulk_create(add_for_inventory, ignore_conflicts=True)
logger.debug(
'Smart host membership cached for {}, {} additions, {} removals, {} total count.'.format(
smart_inventory.pk, len(additions), len(removals), len(new)
)
)
return True # changed
return False
@task(queue=get_local_queuename)
def update_host_smart_inventory_memberships():
smart_inventories = Inventory.objects.filter(kind='smart', host_filter__isnull=False, pending_deletion=False)
changed_inventories = set([])
for smart_inventory in smart_inventories:
try:
changed = update_smart_memberships_for_inventory(smart_inventory)
if changed:
changed_inventories.add(smart_inventory)
except IntegrityError:
logger.exception('Failed to update smart inventory memberships for {}'.format(smart_inventory.pk))
# Update computed fields for changed inventories outside atomic action
for smart_inventory in changed_inventories:
smart_inventory.update_computed_fields()
@task(queue=get_local_queuename)
def delete_inventory(inventory_id, user_id, retries=5):
# Delete inventory as user
if user_id is None:
user = None
else:
try:
user = User.objects.get(id=user_id)
except Exception:
user = None
with ignore_inventory_computed_fields(), ignore_inventory_group_removal(), impersonate(user):
try:
i = Inventory.objects.get(id=inventory_id)
for host in i.hosts.iterator():
host.job_events_as_primary_host.update(host=None)
i.delete()
emit_channel_notification('inventories-status_changed', {'group_name': 'inventories', 'inventory_id': inventory_id, 'status': 'deleted'})
logger.debug('Deleted inventory {} as user {}.'.format(inventory_id, user_id))
except Inventory.DoesNotExist:
logger.exception("Delete Inventory failed due to missing inventory: " + str(inventory_id))
return
except DatabaseError:
logger.exception('Database error deleting inventory {}, but will retry.'.format(inventory_id))
if retries > 0:
time.sleep(10)
delete_inventory(inventory_id, user_id, retries=retries - 1)
def with_path_cleanup(f):
@functools.wraps(f)
def _wrapped(self, *args, **kwargs):
try:
return f(self, *args, **kwargs)
finally:
for p in self.cleanup_paths:
try:
if os.path.isdir(p):
shutil.rmtree(p, ignore_errors=True)
elif os.path.exists(p):
os.remove(p)
except OSError:
logger.exception("Failed to remove tmp file: {}".format(p))
self.cleanup_paths = []
return _wrapped
def _reconstruct_relationships(copy_mapping):
for old_obj, new_obj in copy_mapping.items():
model = type(old_obj)
for field_name in getattr(model, 'FIELDS_TO_PRESERVE_AT_COPY', []):
field = model._meta.get_field(field_name)
if isinstance(field, ForeignKey):
if getattr(new_obj, field_name, None):
continue
related_obj = getattr(old_obj, field_name)
related_obj = copy_mapping.get(related_obj, related_obj)
setattr(new_obj, field_name, related_obj)
elif field.many_to_many:
for related_obj in getattr(old_obj, field_name).all():
logger.debug('Deep copy: Adding {} to {}({}).{} relationship'.format(related_obj, new_obj, model, field_name))
getattr(new_obj, field_name).add(copy_mapping.get(related_obj, related_obj))
new_obj.save()
@task(queue=get_local_queuename)
def deep_copy_model_obj(model_module, model_name, obj_pk, new_obj_pk, user_pk, uuid, permission_check_func=None):
sub_obj_list = cache.get(uuid)
if sub_obj_list is None:
logger.error('Deep copy {} from {} to {} failed unexpectedly.'.format(model_name, obj_pk, new_obj_pk))
return
logger.debug('Deep copy {} from {} to {}.'.format(model_name, obj_pk, new_obj_pk))
from awx.api.generics import CopyAPIView
from awx.main.signals import disable_activity_stream
model = getattr(importlib.import_module(model_module), model_name, None)
if model is None:
return
try:
obj = model.objects.get(pk=obj_pk)
new_obj = model.objects.get(pk=new_obj_pk)
creater = User.objects.get(pk=user_pk)
except ObjectDoesNotExist:
logger.warning("Object or user no longer exists.")
return
with transaction.atomic(), ignore_inventory_computed_fields(), disable_activity_stream():
copy_mapping = {}
for sub_obj_setup in sub_obj_list:
sub_model = getattr(importlib.import_module(sub_obj_setup[0]), sub_obj_setup[1], None)
if sub_model is None:
continue
try:
sub_obj = sub_model.objects.get(pk=sub_obj_setup[2])
except ObjectDoesNotExist:
continue
copy_mapping.update(CopyAPIView.copy_model_obj(obj, new_obj, sub_model, sub_obj, creater))
_reconstruct_relationships(copy_mapping)
if permission_check_func:
permission_check_func = getattr(getattr(importlib.import_module(permission_check_func[0]), permission_check_func[1]), permission_check_func[2])
permission_check_func(creater, copy_mapping.values())
if isinstance(new_obj, Inventory):
update_inventory_computed_fields.delay(new_obj.id)

View File

@ -62,7 +62,7 @@ def test_health_check_throws_error(post, admin_user):
# we will simulate a receptor error, similar to this one
# https://github.com/ansible/receptor/blob/156e6e24a49fbf868734507f9943ac96208ed8f5/receptorctl/receptorctl/socket_interface.py#L204
# related to issue https://github.com/ansible/tower/issues/5315
with mock.patch('awx.main.utils.receptor.run_until_complete', side_effect=RuntimeError('Remote error: foobar')):
with mock.patch('awx.main.tasks.receptor.run_until_complete', side_effect=RuntimeError('Remote error: foobar')):
post(url=url, user=admin_user, expect=200)
instance.refresh_from_db()
assert 'Remote error: foobar' in instance.errors

View File

@ -5,7 +5,7 @@ from collections import namedtuple
from unittest import mock # noqa
import pytest
from awx.main.tasks import AWXReceptorJob
from awx.main.tasks.receptor import AWXReceptorJob
from awx.main.utils import (
create_temporary_fifo,
)

View File

@ -3,7 +3,7 @@ from unittest import mock
from datetime import timedelta
from awx.main.scheduler import TaskManager
from awx.main.models import InstanceGroup, WorkflowJob
from awx.main.tasks import apply_cluster_membership_policies
from awx.main.tasks.system import apply_cluster_membership_policies
@pytest.mark.django_db

View File

@ -5,7 +5,7 @@ from awx.api.versioning import reverse
from awx.main.utils import decrypt_field
from awx.main.models.workflow import WorkflowJobTemplate, WorkflowJobTemplateNode, WorkflowApprovalTemplate
from awx.main.models.jobs import JobTemplate
from awx.main.tasks import deep_copy_model_obj
from awx.main.tasks.system import deep_copy_model_obj
@pytest.mark.django_db

View File

@ -4,7 +4,7 @@ from unittest import mock
from awx.main.models import AdHocCommand, InventoryUpdate, JobTemplate, ProjectUpdate
from awx.main.models.activity_stream import ActivityStream
from awx.main.models.ha import Instance, InstanceGroup
from awx.main.tasks import apply_cluster_membership_policies
from awx.main.tasks.system import apply_cluster_membership_policies
from awx.api.versioning import reverse
from django.utils.timezone import now

View File

@ -5,7 +5,7 @@ import json
import re
from collections import namedtuple
from awx.main.tasks import RunInventoryUpdate
from awx.main.tasks.jobs import RunInventoryUpdate
from awx.main.models import InventorySource, Credential, CredentialType, UnifiedJob, ExecutionEnvironment
from awx.main.constants import CLOUD_PROVIDERS, STANDARD_INVENTORY_UPDATE_ENV
from awx.main.tests import data
@ -257,6 +257,6 @@ def test_inventory_update_injected_content(this_kind, inventory, fake_credential
# Also do not send websocket status updates
with mock.patch.object(UnifiedJob, 'websocket_emit_status', mock.Mock()):
# The point of this test is that we replace run with assertions
with mock.patch('awx.main.tasks.AWXReceptorJob.run', substitute_run):
with mock.patch('awx.main.tasks.receptor.AWXReceptorJob.run', substitute_run):
# so this sets up everything for a run and then yields control over to substitute_run
task.run(inventory_update.pk)

View File

@ -4,7 +4,7 @@ from unittest import mock
import json
from awx.main.models import Job, Instance, JobHostSummary, InventoryUpdate, InventorySource, Project, ProjectUpdate, SystemJob, AdHocCommand
from awx.main.tasks import cluster_node_heartbeat
from awx.main.tasks.system import cluster_node_heartbeat
from django.test.utils import override_settings
@ -20,7 +20,7 @@ def test_orphan_unified_job_creation(instance, inventory):
@pytest.mark.django_db
@mock.patch('awx.main.tasks.inspect_execution_nodes', lambda *args, **kwargs: None)
@mock.patch('awx.main.tasks.system.inspect_execution_nodes', lambda *args, **kwargs: None)
@mock.patch('awx.main.models.ha.get_cpu_effective_capacity', lambda cpu: 8)
@mock.patch('awx.main.models.ha.get_mem_effective_capacity', lambda mem: 62)
def test_job_capacity_and_with_inactive_node():

View File

@ -2,7 +2,8 @@ import pytest
from unittest import mock
import os
from awx.main.tasks import RunProjectUpdate, RunInventoryUpdate, execution_node_health_check
from awx.main.tasks.jobs import RunProjectUpdate, RunInventoryUpdate
from awx.main.tasks.system import execution_node_health_check
from awx.main.models import ProjectUpdate, InventoryUpdate, InventorySource, Instance
@ -49,7 +50,7 @@ class TestDependentInventoryUpdate:
scm_inventory_source.scm_last_revision = ''
proj_update = ProjectUpdate.objects.create(project=scm_inventory_source.source_project)
with mock.patch.object(RunInventoryUpdate, 'run') as iu_run_mock:
with mock.patch('awx.main.tasks.create_partition'):
with mock.patch('awx.main.tasks.jobs.create_partition'):
task._update_dependent_inventories(proj_update, [scm_inventory_source])
assert InventoryUpdate.objects.count() == 1
inv_update = InventoryUpdate.objects.first()
@ -73,7 +74,7 @@ class TestDependentInventoryUpdate:
ProjectUpdate.objects.all().update(cancel_flag=True)
with mock.patch.object(RunInventoryUpdate, 'run') as iu_run_mock:
with mock.patch('awx.main.tasks.create_partition'):
with mock.patch('awx.main.tasks.jobs.create_partition'):
iu_run_mock.side_effect = user_cancels_project
task._update_dependent_inventories(proj_update, [is1, is2])
# Verify that it bails after 1st update, detecting a cancel

View File

@ -7,7 +7,7 @@ from datetime import timedelta
@pytest.mark.parametrize(
"job_name,function_path",
[
('tower_scheduler', 'awx.main.tasks.awx_periodic_scheduler'),
('tower_scheduler', 'awx.main.tasks.system.awx_periodic_scheduler'),
],
)
def test_CELERYBEAT_SCHEDULE(mocker, job_name, function_path):

View File

@ -32,7 +32,7 @@ from awx.main.models import (
User,
build_safe_env,
)
from awx.main.models.credential import ManagedCredentialType
from awx.main.models.credential import HIDDEN_PASSWORD, ManagedCredentialType
from awx.main import tasks
from awx.main.utils import encrypt_field, encrypt_value
@ -113,12 +113,12 @@ def adhoc_update_model_wrapper(adhoc_job):
def test_send_notifications_not_list():
with pytest.raises(TypeError):
tasks.send_notifications(None)
tasks.system.send_notifications(None)
def test_send_notifications_job_id(mocker):
with mocker.patch('awx.main.models.UnifiedJob.objects.get'):
tasks.send_notifications([], job_id=1)
tasks.system.send_notifications([], job_id=1)
assert UnifiedJob.objects.get.called
assert UnifiedJob.objects.get.called_with(id=1)
@ -127,7 +127,7 @@ def test_work_success_callback_missing_job():
task_data = {'type': 'project_update', 'id': 9999}
with mock.patch('django.db.models.query.QuerySet.get') as get_mock:
get_mock.side_effect = ProjectUpdate.DoesNotExist()
assert tasks.handle_work_success(task_data) is None
assert tasks.system.handle_work_success(task_data) is None
@mock.patch('awx.main.models.UnifiedJob.objects.get')
@ -138,7 +138,7 @@ def test_send_notifications_list(mock_notifications_filter, mock_job_get, mocker
mock_notifications = [mocker.MagicMock(spec=Notification, subject="test", body={'hello': 'world'})]
mock_notifications_filter.return_value = mock_notifications
tasks.send_notifications([1, 2], job_id=1)
tasks.system.send_notifications([1, 2], job_id=1)
assert Notification.objects.filter.call_count == 1
assert mock_notifications[0].status == "successful"
assert mock_notifications[0].save.called
@ -158,7 +158,7 @@ def test_send_notifications_list(mock_notifications_filter, mock_job_get, mocker
],
)
def test_safe_env_filtering(key, value):
assert build_safe_env({key: value})[key] == tasks.HIDDEN_PASSWORD
assert build_safe_env({key: value})[key] == HIDDEN_PASSWORD
def test_safe_env_returns_new_copy():
@ -168,7 +168,7 @@ def test_safe_env_returns_new_copy():
@pytest.mark.parametrize("source,expected", [(None, True), (False, False), (True, True)])
def test_openstack_client_config_generation(mocker, source, expected, private_data_dir):
update = tasks.RunInventoryUpdate()
update = tasks.jobs.RunInventoryUpdate()
credential_type = CredentialType.defaults['openstack']()
inputs = {
'host': 'https://keystone.openstack.example.org',
@ -208,7 +208,7 @@ def test_openstack_client_config_generation(mocker, source, expected, private_da
@pytest.mark.parametrize("source,expected", [(None, True), (False, False), (True, True)])
def test_openstack_client_config_generation_with_project_domain_name(mocker, source, expected, private_data_dir):
update = tasks.RunInventoryUpdate()
update = tasks.jobs.RunInventoryUpdate()
credential_type = CredentialType.defaults['openstack']()
inputs = {
'host': 'https://keystone.openstack.example.org',
@ -250,7 +250,7 @@ def test_openstack_client_config_generation_with_project_domain_name(mocker, sou
@pytest.mark.parametrize("source,expected", [(None, True), (False, False), (True, True)])
def test_openstack_client_config_generation_with_region(mocker, source, expected, private_data_dir):
update = tasks.RunInventoryUpdate()
update = tasks.jobs.RunInventoryUpdate()
credential_type = CredentialType.defaults['openstack']()
inputs = {
'host': 'https://keystone.openstack.example.org',
@ -294,7 +294,7 @@ def test_openstack_client_config_generation_with_region(mocker, source, expected
@pytest.mark.parametrize("source,expected", [(False, False), (True, True)])
def test_openstack_client_config_generation_with_private_source_vars(mocker, source, expected, private_data_dir):
update = tasks.RunInventoryUpdate()
update = tasks.jobs.RunInventoryUpdate()
credential_type = CredentialType.defaults['openstack']()
inputs = {
'host': 'https://keystone.openstack.example.org',
@ -357,7 +357,7 @@ class TestExtraVarSanitation(TestJobExecution):
job.created_by = User(pk=123, username='angry-spud')
job.inventory = Inventory(pk=123, name='example-inv')
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.build_extra_vars_file(job, private_data_dir)
fd = open(os.path.join(private_data_dir, 'env', 'extravars'))
@ -393,7 +393,7 @@ class TestExtraVarSanitation(TestJobExecution):
def test_launchtime_vars_unsafe(self, job, private_data_dir):
job.extra_vars = json.dumps({'msg': self.UNSAFE})
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.build_extra_vars_file(job, private_data_dir)
@ -404,7 +404,7 @@ class TestExtraVarSanitation(TestJobExecution):
def test_nested_launchtime_vars_unsafe(self, job, private_data_dir):
job.extra_vars = json.dumps({'msg': {'a': [self.UNSAFE]}})
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.build_extra_vars_file(job, private_data_dir)
@ -415,7 +415,7 @@ class TestExtraVarSanitation(TestJobExecution):
def test_allowed_jt_extra_vars(self, job, private_data_dir):
job.job_template.extra_vars = job.extra_vars = json.dumps({'msg': self.UNSAFE})
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.build_extra_vars_file(job, private_data_dir)
@ -427,7 +427,7 @@ class TestExtraVarSanitation(TestJobExecution):
def test_nested_allowed_vars(self, job, private_data_dir):
job.extra_vars = json.dumps({'msg': {'a': {'b': [self.UNSAFE]}}})
job.job_template.extra_vars = job.extra_vars
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.build_extra_vars_file(job, private_data_dir)
@ -441,7 +441,7 @@ class TestExtraVarSanitation(TestJobExecution):
# `other_var=SENSITIVE`
job.job_template.extra_vars = json.dumps({'msg': self.UNSAFE})
job.extra_vars = json.dumps({'msg': 'other-value', 'other_var': self.UNSAFE})
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.build_extra_vars_file(job, private_data_dir)
@ -456,7 +456,7 @@ class TestExtraVarSanitation(TestJobExecution):
def test_overwritten_jt_extra_vars(self, job, private_data_dir):
job.job_template.extra_vars = json.dumps({'msg': 'SAFE'})
job.extra_vars = json.dumps({'msg': self.UNSAFE})
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.build_extra_vars_file(job, private_data_dir)
@ -472,13 +472,13 @@ class TestGenericRun:
job.websocket_emit_status = mock.Mock()
job.execution_environment = execution_environment
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
task.update_model = mock.Mock(return_value=job)
task.model.objects.get = mock.Mock(return_value=job)
task.build_private_data_files = mock.Mock(side_effect=OSError())
with mock.patch('awx.main.tasks.copy_tree'):
with mock.patch('awx.main.tasks.jobs.copy_tree'):
with pytest.raises(Exception):
task.run(1)
@ -494,13 +494,13 @@ class TestGenericRun:
job.send_notification_templates = mock.Mock()
job.execution_environment = execution_environment
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
task.update_model = mock.Mock(wraps=update_model_wrapper)
task.model.objects.get = mock.Mock(return_value=job)
task.build_private_data_files = mock.Mock()
with mock.patch('awx.main.tasks.copy_tree'):
with mock.patch('awx.main.tasks.jobs.copy_tree'):
with pytest.raises(Exception):
task.run(1)
@ -508,7 +508,7 @@ class TestGenericRun:
assert c in task.update_model.call_args_list
def test_event_count(self):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.dispatcher = mock.MagicMock()
task.instance = Job()
task.event_ct = 0
@ -518,7 +518,7 @@ class TestGenericRun:
assert 20 == task.event_ct
def test_finished_callback_eof(self):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.dispatcher = mock.MagicMock()
task.instance = Job(pk=1, id=1)
task.event_ct = 17
@ -529,7 +529,7 @@ class TestGenericRun:
class MockMe:
pass
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
task.safe_env = {'secret_key': 'redacted_value'}
task.update_model = mock.Mock(wraps=update_model_wrapper)
@ -546,7 +546,7 @@ class TestGenericRun:
def test_created_by_extra_vars(self):
job = Job(created_by=User(pk=123, username='angry-spud'))
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task._write_extra_vars_file = mock.Mock()
task.build_extra_vars_file(job, None)
@ -563,7 +563,7 @@ class TestGenericRun:
job.extra_vars = json.dumps({'super_secret': encrypt_value('CLASSIFIED', pk=None)})
job.survey_passwords = {'super_secret': '$encrypted$'}
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task._write_extra_vars_file = mock.Mock()
task.build_extra_vars_file(job, None)
@ -576,11 +576,11 @@ class TestGenericRun:
job = Job(project=Project(), inventory=Inventory())
job.execution_environment = execution_environment
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
task._write_extra_vars_file = mock.Mock()
with mock.patch('awx.main.tasks.settings.AWX_TASK_ENV', {'FOO': 'BAR'}):
with mock.patch('awx.main.tasks.jobs.settings.AWX_TASK_ENV', {'FOO': 'BAR'}):
env = task.build_env(job, private_data_dir)
assert env['FOO'] == 'BAR'
@ -595,7 +595,7 @@ class TestAdhocRun(TestJobExecution):
adhoc_job.websocket_emit_status = mock.Mock()
adhoc_job.send_notification_templates = mock.Mock()
task = tasks.RunAdHocCommand()
task = tasks.jobs.RunAdHocCommand()
task.update_model = mock.Mock(wraps=adhoc_update_model_wrapper)
task.model.objects.get = mock.Mock(return_value=adhoc_job)
task.build_inventory = mock.Mock()
@ -619,7 +619,7 @@ class TestAdhocRun(TestJobExecution):
})
#adhoc_job.websocket_emit_status = mock.Mock()
task = tasks.RunAdHocCommand()
task = tasks.jobs.RunAdHocCommand()
#task.update_model = mock.Mock(wraps=adhoc_update_model_wrapper)
#task.build_inventory = mock.Mock(return_value='/tmp/something.inventory')
task._write_extra_vars_file = mock.Mock()
@ -634,7 +634,7 @@ class TestAdhocRun(TestJobExecution):
def test_created_by_extra_vars(self):
adhoc_job = AdHocCommand(created_by=User(pk=123, username='angry-spud'))
task = tasks.RunAdHocCommand()
task = tasks.jobs.RunAdHocCommand()
task._write_extra_vars_file = mock.Mock()
task.build_extra_vars_file(adhoc_job, None)
@ -693,7 +693,7 @@ class TestJobCredentials(TestJobExecution):
}
def test_username_jinja_usage(self, job, private_data_dir):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
ssh = CredentialType.defaults['ssh']()
credential = Credential(pk=1, credential_type=ssh, inputs={'username': '{{ ansible_ssh_pass }}'})
job.credentials.add(credential)
@ -704,7 +704,7 @@ class TestJobCredentials(TestJobExecution):
@pytest.mark.parametrize("flag", ['become_username', 'become_method'])
def test_become_jinja_usage(self, job, private_data_dir, flag):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
ssh = CredentialType.defaults['ssh']()
credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'joe', flag: '{{ ansible_ssh_pass }}'})
job.credentials.add(credential)
@ -715,7 +715,7 @@ class TestJobCredentials(TestJobExecution):
assert 'Jinja variables are not allowed' in str(e.value)
def test_ssh_passwords(self, job, private_data_dir, field, password_name, expected_flag):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
ssh = CredentialType.defaults['ssh']()
credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'bob', field: 'secret'})
credential.inputs[field] = encrypt_field(credential, field)
@ -732,7 +732,7 @@ class TestJobCredentials(TestJobExecution):
assert expected_flag in ' '.join(args)
def test_net_ssh_key_unlock(self, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
net = CredentialType.defaults['net']()
credential = Credential(pk=1, credential_type=net, inputs={'ssh_key_unlock': 'secret'})
credential.inputs['ssh_key_unlock'] = encrypt_field(credential, 'ssh_key_unlock')
@ -745,7 +745,7 @@ class TestJobCredentials(TestJobExecution):
assert 'secret' in expect_passwords.values()
def test_net_first_ssh_key_unlock_wins(self, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
for i in range(3):
net = CredentialType.defaults['net']()
credential = Credential(pk=i, credential_type=net, inputs={'ssh_key_unlock': 'secret{}'.format(i)})
@ -759,7 +759,7 @@ class TestJobCredentials(TestJobExecution):
assert 'secret0' in expect_passwords.values()
def test_prefer_ssh_over_net_ssh_key_unlock(self, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
net = CredentialType.defaults['net']()
net_credential = Credential(pk=1, credential_type=net, inputs={'ssh_key_unlock': 'net_secret'})
net_credential.inputs['ssh_key_unlock'] = encrypt_field(net_credential, 'ssh_key_unlock')
@ -778,7 +778,7 @@ class TestJobCredentials(TestJobExecution):
assert 'ssh_secret' in expect_passwords.values()
def test_vault_password(self, private_data_dir, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
vault = CredentialType.defaults['vault']()
credential = Credential(pk=1, credential_type=vault, inputs={'vault_password': 'vault-me'})
credential.inputs['vault_password'] = encrypt_field(credential, 'vault_password')
@ -793,7 +793,7 @@ class TestJobCredentials(TestJobExecution):
assert '--ask-vault-pass' in ' '.join(args)
def test_vault_password_ask(self, private_data_dir, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
vault = CredentialType.defaults['vault']()
credential = Credential(pk=1, credential_type=vault, inputs={'vault_password': 'ASK'})
credential.inputs['vault_password'] = encrypt_field(credential, 'vault_password')
@ -808,7 +808,7 @@ class TestJobCredentials(TestJobExecution):
assert '--ask-vault-pass' in ' '.join(args)
def test_multi_vault_password(self, private_data_dir, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
vault = CredentialType.defaults['vault']()
for i, label in enumerate(['dev', 'prod', 'dotted.name']):
credential = Credential(pk=i, credential_type=vault, inputs={'vault_password': 'pass@{}'.format(label), 'vault_id': label})
@ -831,7 +831,7 @@ class TestJobCredentials(TestJobExecution):
assert '--vault-id dotted.name@prompt' in ' '.join(args)
def test_multi_vault_id_conflict(self, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
vault = CredentialType.defaults['vault']()
for i in range(2):
credential = Credential(pk=i, credential_type=vault, inputs={'vault_password': 'some-pass', 'vault_id': 'conflict'})
@ -844,7 +844,7 @@ class TestJobCredentials(TestJobExecution):
assert 'multiple vault credentials were specified with --vault-id' in str(e.value)
def test_multi_vault_password_ask(self, private_data_dir, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
vault = CredentialType.defaults['vault']()
for i, label in enumerate(['dev', 'prod']):
credential = Credential(pk=i, credential_type=vault, inputs={'vault_password': 'ASK', 'vault_id': label})
@ -897,7 +897,7 @@ class TestJobCredentials(TestJobExecution):
assert env['K8S_AUTH_VERIFY_SSL'] == 'False'
assert 'K8S_AUTH_SSL_CA_CERT' not in env
assert safe_env['K8S_AUTH_API_KEY'] == tasks.HIDDEN_PASSWORD
assert safe_env['K8S_AUTH_API_KEY'] == HIDDEN_PASSWORD
def test_aws_cloud_credential(self, job, private_data_dir):
aws = CredentialType.defaults['aws']()
@ -912,7 +912,7 @@ class TestJobCredentials(TestJobExecution):
assert env['AWS_ACCESS_KEY_ID'] == 'bob'
assert env['AWS_SECRET_ACCESS_KEY'] == 'secret'
assert 'AWS_SECURITY_TOKEN' not in env
assert safe_env['AWS_SECRET_ACCESS_KEY'] == tasks.HIDDEN_PASSWORD
assert safe_env['AWS_SECRET_ACCESS_KEY'] == HIDDEN_PASSWORD
def test_aws_cloud_credential_with_sts_token(self, private_data_dir, job):
aws = CredentialType.defaults['aws']()
@ -928,7 +928,7 @@ class TestJobCredentials(TestJobExecution):
assert env['AWS_ACCESS_KEY_ID'] == 'bob'
assert env['AWS_SECRET_ACCESS_KEY'] == 'secret'
assert env['AWS_SECURITY_TOKEN'] == 'token'
assert safe_env['AWS_SECRET_ACCESS_KEY'] == tasks.HIDDEN_PASSWORD
assert safe_env['AWS_SECRET_ACCESS_KEY'] == HIDDEN_PASSWORD
def test_gce_credentials(self, private_data_dir, job):
gce = CredentialType.defaults['gce']()
@ -963,7 +963,7 @@ class TestJobCredentials(TestJobExecution):
assert env['AZURE_SECRET'] == 'some-secret'
assert env['AZURE_TENANT'] == 'some-tenant'
assert env['AZURE_SUBSCRIPTION_ID'] == 'some-subscription'
assert safe_env['AZURE_SECRET'] == tasks.HIDDEN_PASSWORD
assert safe_env['AZURE_SECRET'] == HIDDEN_PASSWORD
def test_azure_rm_with_password(self, private_data_dir, job):
azure = CredentialType.defaults['azure_rm']()
@ -981,7 +981,7 @@ class TestJobCredentials(TestJobExecution):
assert env['AZURE_AD_USER'] == 'bob'
assert env['AZURE_PASSWORD'] == 'secret'
assert env['AZURE_CLOUD_ENVIRONMENT'] == 'foobar'
assert safe_env['AZURE_PASSWORD'] == tasks.HIDDEN_PASSWORD
assert safe_env['AZURE_PASSWORD'] == HIDDEN_PASSWORD
def test_vmware_credentials(self, private_data_dir, job):
vmware = CredentialType.defaults['vmware']()
@ -996,10 +996,10 @@ class TestJobCredentials(TestJobExecution):
assert env['VMWARE_USER'] == 'bob'
assert env['VMWARE_PASSWORD'] == 'secret'
assert env['VMWARE_HOST'] == 'https://example.org'
assert safe_env['VMWARE_PASSWORD'] == tasks.HIDDEN_PASSWORD
assert safe_env['VMWARE_PASSWORD'] == HIDDEN_PASSWORD
def test_openstack_credentials(self, private_data_dir, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
openstack = CredentialType.defaults['openstack']()
credential = Credential(
@ -1067,7 +1067,7 @@ class TestJobCredentials(TestJobExecution):
],
)
def test_net_credentials(self, authorize, expected_authorize, job, private_data_dir):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
net = CredentialType.defaults['net']()
inputs = {'username': 'bob', 'password': 'secret', 'ssh_key_data': self.EXAMPLE_PRIVATE_KEY, 'authorize_password': 'authorizeme'}
@ -1089,7 +1089,7 @@ class TestJobCredentials(TestJobExecution):
if authorize:
assert env['ANSIBLE_NET_AUTH_PASS'] == 'authorizeme'
assert open(env['ANSIBLE_NET_SSH_KEYFILE'], 'r').read() == self.EXAMPLE_PRIVATE_KEY
assert safe_env['ANSIBLE_NET_PASSWORD'] == tasks.HIDDEN_PASSWORD
assert safe_env['ANSIBLE_NET_PASSWORD'] == HIDDEN_PASSWORD
def test_custom_environment_injectors_with_jinja_syntax_error(self, private_data_dir):
some_cloud = CredentialType(
@ -1135,7 +1135,7 @@ class TestJobCredentials(TestJobExecution):
assert env['TURBO_BUTTON'] == str(True)
def test_custom_environment_injectors_with_reserved_env_var(self, private_data_dir, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
some_cloud = CredentialType(
kind='cloud',
@ -1168,10 +1168,10 @@ class TestJobCredentials(TestJobExecution):
assert env['MY_CLOUD_PRIVATE_VAR'] == 'SUPER-SECRET-123'
assert 'SUPER-SECRET-123' not in safe_env.values()
assert safe_env['MY_CLOUD_PRIVATE_VAR'] == tasks.HIDDEN_PASSWORD
assert safe_env['MY_CLOUD_PRIVATE_VAR'] == HIDDEN_PASSWORD
def test_custom_environment_injectors_with_extra_vars(self, private_data_dir, job):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
some_cloud = CredentialType(
kind='cloud',
name='SomeCloud',
@ -1190,7 +1190,7 @@ class TestJobCredentials(TestJobExecution):
assert hasattr(extra_vars["api_token"], '__UNSAFE__')
def test_custom_environment_injectors_with_boolean_extra_vars(self, job, private_data_dir):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
some_cloud = CredentialType(
kind='cloud',
name='SomeCloud',
@ -1209,7 +1209,7 @@ class TestJobCredentials(TestJobExecution):
return ['successful', 0]
def test_custom_environment_injectors_with_complicated_boolean_template(self, job, private_data_dir):
task = tasks.RunJob()
task = tasks.jobs.RunJob()
some_cloud = CredentialType(
kind='cloud',
name='SomeCloud',
@ -1230,7 +1230,7 @@ class TestJobCredentials(TestJobExecution):
"""
extra_vars that contain secret field values should be censored in the DB
"""
task = tasks.RunJob()
task = tasks.jobs.RunJob()
some_cloud = CredentialType(
kind='cloud',
name='SomeCloud',
@ -1331,11 +1331,11 @@ class TestJobCredentials(TestJobExecution):
assert json_data['client_email'] == 'bob'
assert json_data['project_id'] == 'some-project'
assert safe_env['AZURE_PASSWORD'] == tasks.HIDDEN_PASSWORD
assert safe_env['AZURE_PASSWORD'] == HIDDEN_PASSWORD
def test_awx_task_env(self, settings, private_data_dir, job):
settings.AWX_TASK_ENV = {'FOO': 'BAR'}
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
env = task.build_env(job, private_data_dir)
@ -1362,7 +1362,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution):
def test_galaxy_credentials_ignore_certs(self, private_data_dir, project_update, ignore):
settings.GALAXY_IGNORE_CERTS = ignore
task = tasks.RunProjectUpdate()
task = tasks.jobs.RunProjectUpdate()
task.instance = project_update
env = task.build_env(project_update, private_data_dir)
if ignore:
@ -1371,7 +1371,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution):
assert 'ANSIBLE_GALAXY_IGNORE' not in env
def test_galaxy_credentials_empty(self, private_data_dir, project_update):
class RunProjectUpdate(tasks.RunProjectUpdate):
class RunProjectUpdate(tasks.jobs.RunProjectUpdate):
__vars__ = {}
def _write_extra_vars_file(self, private_data_dir, extra_vars, *kw):
@ -1390,7 +1390,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution):
assert not k.startswith('ANSIBLE_GALAXY_SERVER')
def test_single_public_galaxy(self, private_data_dir, project_update):
class RunProjectUpdate(tasks.RunProjectUpdate):
class RunProjectUpdate(tasks.jobs.RunProjectUpdate):
__vars__ = {}
def _write_extra_vars_file(self, private_data_dir, extra_vars, *kw):
@ -1439,7 +1439,7 @@ class TestProjectUpdateGalaxyCredentials(TestJobExecution):
)
project_update.project.organization.galaxy_credentials.add(public_galaxy)
project_update.project.organization.galaxy_credentials.add(rh)
task = tasks.RunProjectUpdate()
task = tasks.jobs.RunProjectUpdate()
task.instance = project_update
env = task.build_env(project_update, private_data_dir)
assert sorted([(k, v) for k, v in env.items() if k.startswith('ANSIBLE_GALAXY')]) == [
@ -1481,7 +1481,7 @@ class TestProjectUpdateCredentials(TestJobExecution):
}
def test_username_and_password_auth(self, project_update, scm_type):
task = tasks.RunProjectUpdate()
task = tasks.jobs.RunProjectUpdate()
ssh = CredentialType.defaults['ssh']()
project_update.scm_type = scm_type
project_update.credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'bob', 'password': 'secret'})
@ -1495,7 +1495,7 @@ class TestProjectUpdateCredentials(TestJobExecution):
assert 'secret' in expect_passwords.values()
def test_ssh_key_auth(self, project_update, scm_type):
task = tasks.RunProjectUpdate()
task = tasks.jobs.RunProjectUpdate()
ssh = CredentialType.defaults['ssh']()
project_update.scm_type = scm_type
project_update.credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'bob', 'ssh_key_data': self.EXAMPLE_PRIVATE_KEY})
@ -1509,7 +1509,7 @@ class TestProjectUpdateCredentials(TestJobExecution):
def test_awx_task_env(self, project_update, settings, private_data_dir, scm_type, execution_environment):
project_update.execution_environment = execution_environment
settings.AWX_TASK_ENV = {'FOO': 'BAR'}
task = tasks.RunProjectUpdate()
task = tasks.jobs.RunProjectUpdate()
task.instance = project_update
project_update.scm_type = scm_type
@ -1524,7 +1524,7 @@ class TestInventoryUpdateCredentials(TestJobExecution):
return InventoryUpdate(pk=1, execution_environment=execution_environment, inventory_source=InventorySource(pk=1, inventory=Inventory(pk=1)))
def test_source_without_credential(self, mocker, inventory_update, private_data_dir):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
inventory_update.source = 'ec2'
inventory_update.get_cloud_credential = mocker.Mock(return_value=None)
@ -1537,7 +1537,7 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert 'AWS_SECRET_ACCESS_KEY' not in env
def test_ec2_source(self, private_data_dir, inventory_update, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
aws = CredentialType.defaults['aws']()
inventory_update.source = 'ec2'
@ -1558,10 +1558,10 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert env['AWS_ACCESS_KEY_ID'] == 'bob'
assert env['AWS_SECRET_ACCESS_KEY'] == 'secret'
assert safe_env['AWS_SECRET_ACCESS_KEY'] == tasks.HIDDEN_PASSWORD
assert safe_env['AWS_SECRET_ACCESS_KEY'] == HIDDEN_PASSWORD
def test_vmware_source(self, inventory_update, private_data_dir, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
vmware = CredentialType.defaults['vmware']()
inventory_update.source = 'vmware'
@ -1589,7 +1589,7 @@ class TestInventoryUpdateCredentials(TestJobExecution):
env["VMWARE_VALIDATE_CERTS"] == "False",
def test_azure_rm_source_with_tenant(self, private_data_dir, inventory_update, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
azure_rm = CredentialType.defaults['azure_rm']()
inventory_update.source = 'azure_rm'
@ -1622,10 +1622,10 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert env['AZURE_SUBSCRIPTION_ID'] == 'some-subscription'
assert env['AZURE_CLOUD_ENVIRONMENT'] == 'foobar'
assert safe_env['AZURE_SECRET'] == tasks.HIDDEN_PASSWORD
assert safe_env['AZURE_SECRET'] == HIDDEN_PASSWORD
def test_azure_rm_source_with_password(self, private_data_dir, inventory_update, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
azure_rm = CredentialType.defaults['azure_rm']()
inventory_update.source = 'azure_rm'
@ -1651,10 +1651,10 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert env['AZURE_PASSWORD'] == 'secret'
assert env['AZURE_CLOUD_ENVIRONMENT'] == 'foobar'
assert safe_env['AZURE_PASSWORD'] == tasks.HIDDEN_PASSWORD
assert safe_env['AZURE_PASSWORD'] == HIDDEN_PASSWORD
def test_gce_source(self, inventory_update, private_data_dir, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
gce = CredentialType.defaults['gce']()
inventory_update.source = 'gce'
@ -1684,7 +1684,7 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert json_data['project_id'] == 'some-project'
def test_openstack_source(self, inventory_update, private_data_dir, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
openstack = CredentialType.defaults['openstack']()
inventory_update.source = 'openstack'
@ -1724,7 +1724,7 @@ class TestInventoryUpdateCredentials(TestJobExecution):
)
def test_satellite6_source(self, inventory_update, private_data_dir, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
satellite6 = CredentialType.defaults['satellite6']()
inventory_update.source = 'satellite6'
@ -1744,10 +1744,10 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert env["FOREMAN_SERVER"] == "https://example.org"
assert env["FOREMAN_USER"] == "bob"
assert env["FOREMAN_PASSWORD"] == "secret"
assert safe_env["FOREMAN_PASSWORD"] == tasks.HIDDEN_PASSWORD
assert safe_env["FOREMAN_PASSWORD"] == HIDDEN_PASSWORD
def test_insights_source(self, inventory_update, private_data_dir, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
insights = CredentialType.defaults['insights']()
inventory_update.source = 'insights'
@ -1772,11 +1772,11 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert env["INSIGHTS_USER"] == "bob"
assert env["INSIGHTS_PASSWORD"] == "secret"
assert safe_env['INSIGHTS_PASSWORD'] == tasks.HIDDEN_PASSWORD
assert safe_env['INSIGHTS_PASSWORD'] == HIDDEN_PASSWORD
@pytest.mark.parametrize('verify', [True, False])
def test_tower_source(self, verify, inventory_update, private_data_dir, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
tower = CredentialType.defaults['controller']()
inventory_update.source = 'controller'
@ -1801,10 +1801,10 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert env['CONTROLLER_VERIFY_SSL'] == 'True'
else:
assert env['CONTROLLER_VERIFY_SSL'] == 'False'
assert safe_env['CONTROLLER_PASSWORD'] == tasks.HIDDEN_PASSWORD
assert safe_env['CONTROLLER_PASSWORD'] == HIDDEN_PASSWORD
def test_tower_source_ssl_verify_empty(self, inventory_update, private_data_dir, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
tower = CredentialType.defaults['controller']()
inventory_update.source = 'controller'
@ -1832,7 +1832,7 @@ class TestInventoryUpdateCredentials(TestJobExecution):
assert env['TOWER_VERIFY_SSL'] == 'False'
def test_awx_task_env(self, inventory_update, private_data_dir, settings, mocker):
task = tasks.RunInventoryUpdate()
task = tasks.jobs.RunInventoryUpdate()
task.instance = inventory_update
gce = CredentialType.defaults['gce']()
inventory_update.source = 'gce'
@ -1883,7 +1883,7 @@ def test_aquire_lock_open_fail_logged(logging_getLogger, os_open):
logger = mock.Mock()
logging_getLogger.return_value = logger
ProjectUpdate = tasks.RunProjectUpdate()
ProjectUpdate = tasks.jobs.RunProjectUpdate()
with pytest.raises(OSError):
ProjectUpdate.acquire_lock(instance)
@ -1910,7 +1910,7 @@ def test_aquire_lock_acquisition_fail_logged(fcntl_lockf, logging_getLogger, os_
fcntl_lockf.side_effect = err
ProjectUpdate = tasks.RunProjectUpdate()
ProjectUpdate = tasks.jobs.RunProjectUpdate()
with pytest.raises(IOError):
ProjectUpdate.acquire_lock(instance)
os_close.assert_called_with(3)
@ -1920,7 +1920,7 @@ def test_aquire_lock_acquisition_fail_logged(fcntl_lockf, logging_getLogger, os_
@pytest.mark.parametrize('injector_cls', [cls for cls in ManagedCredentialType.registry.values() if cls.injectors])
def test_managed_injector_redaction(injector_cls):
"""See awx.main.models.inventory.PluginFileInjector._get_shared_env
The ordering within awx.main.tasks.BaseTask and contract with build_env
The ordering within awx.main.tasks.jobs.BaseTask and contract with build_env
requires that all managed injectors are safely redacted by the
static method build_safe_env without having to employ the safe namespace
as in inject_credential
@ -1947,7 +1947,7 @@ def test_notification_job_not_finished(logging_getLogger, mocker):
logging_getLogger.return_value = logger
with mocker.patch('awx.main.models.UnifiedJob.objects.get', uj):
tasks.handle_success_and_failure_notifications(1)
tasks.system.handle_success_and_failure_notifications(1)
assert logger.warn.called_with(f"Failed to even try to send notifications for job '{uj}' due to job not being in finished state.")
@ -1955,7 +1955,7 @@ def test_notification_job_finished(mocker):
uj = mocker.MagicMock(send_notification_templates=mocker.MagicMock(), finished=True)
with mocker.patch('awx.main.models.UnifiedJob.objects.get', mocker.MagicMock(return_value=uj)):
tasks.handle_success_and_failure_notifications(1)
tasks.system.handle_success_and_failure_notifications(1)
uj.send_notification_templates.assert_called()
@ -1964,12 +1964,12 @@ def test_job_run_no_ee():
proj = Project(pk=1, organization=org)
job = Job(project=proj, organization=org, inventory=Inventory(pk=1))
job.execution_environment = None
task = tasks.RunJob()
task = tasks.jobs.RunJob()
task.instance = job
task.update_model = mock.Mock(return_value=job)
task.model.objects.get = mock.Mock(return_value=job)
with mock.patch('awx.main.tasks.copy_tree'):
with mock.patch('awx.main.tasks.jobs.copy_tree'):
with pytest.raises(RuntimeError) as e:
task.pre_run_hook(job, private_data_dir)
@ -1983,7 +1983,7 @@ def test_project_update_no_ee():
proj = Project(pk=1, organization=org)
project_update = ProjectUpdate(pk=1, project=proj, scm_type='git')
project_update.execution_environment = None
task = tasks.RunProjectUpdate()
task = tasks.jobs.RunProjectUpdate()
task.instance = project_update
with pytest.raises(RuntimeError) as e:

View File

@ -1,4 +1,4 @@
from awx.main.utils.receptor import _convert_args_to_cli
from awx.main.tasks.receptor import _convert_args_to_cli
def test_file_cleanup_scenario():

View File

@ -1,230 +0,0 @@
import logging
import yaml
import time
from enum import Enum, unique
from receptorctl.socket_interface import ReceptorControl
from awx.main.exceptions import ReceptorNodeNotFound
from django.conf import settings
logger = logging.getLogger('awx.main.utils.receptor')
__RECEPTOR_CONF = '/etc/receptor/receptor.conf'
RECEPTOR_ACTIVE_STATES = ('Pending', 'Running')
@unique
class ReceptorConnectionType(Enum):
DATAGRAM = 0
STREAM = 1
STREAMTLS = 2
def get_receptor_sockfile():
with open(__RECEPTOR_CONF, 'r') as f:
data = yaml.safe_load(f)
for section in data:
for entry_name, entry_data in section.items():
if entry_name == 'control-service':
if 'filename' in entry_data:
return entry_data['filename']
else:
raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} control-service entry does not have a filename parameter')
else:
raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} does not have control-service entry needed to get sockfile')
def get_tls_client(use_stream_tls=None):
if not use_stream_tls:
return None
with open(__RECEPTOR_CONF, 'r') as f:
data = yaml.safe_load(f)
for section in data:
for entry_name, entry_data in section.items():
if entry_name == 'tls-client':
if 'name' in entry_data:
return entry_data['name']
return None
def get_receptor_ctl():
receptor_sockfile = get_receptor_sockfile()
try:
return ReceptorControl(receptor_sockfile, config=__RECEPTOR_CONF, tlsclient=get_tls_client(True))
except RuntimeError:
return ReceptorControl(receptor_sockfile)
def get_conn_type(node_name, receptor_ctl):
all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None)
for node in all_nodes:
if node.get('NodeID') == node_name:
return ReceptorConnectionType(node.get('ConnType'))
raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh')
def administrative_workunit_reaper(work_list=None):
"""
This releases completed work units that were spawned by actions inside of this module.
Specifically, this should catch any completed work unit left by
- worker_info
- worker_cleanup
These should ordinarily be released when the method finishes, but this is a
cleanup of last resort, in case something went awry.
"""
receptor_ctl = get_receptor_ctl()
if work_list is None:
work_list = receptor_ctl.simple_command("work list")
for unit_id, work_data in work_list.items():
extra_data = work_data.get('ExtraData')
if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'):
continue # if this is not ansible-runner work, we do not want to touch it
params = extra_data.get('RemoteParams', {}).get('params')
if not params:
continue
if not (params == '--worker-info' or params.startswith('cleanup')):
continue # if this is not a cleanup or health check, we do not want to touch it
if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
continue # do not want to touch active work units
logger.info(f'Reaping orphaned work unit {unit_id} with params {params}')
receptor_ctl.simple_command(f"work release {unit_id}")
class RemoteJobError(RuntimeError):
pass
def run_until_complete(node, timing_data=None, **kwargs):
"""
Runs an ansible-runner work_type on a remote node, waits until it completes, then returns stdout.
"""
receptor_ctl = get_receptor_ctl()
use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS"
kwargs.setdefault('tlsclient', get_tls_client(use_stream_tls))
kwargs.setdefault('ttl', '20s')
kwargs.setdefault('payload', '')
transmit_start = time.time()
sign_work = False if settings.IS_K8S else True
result = receptor_ctl.submit_work(worktype='ansible-runner', node=node, signwork=sign_work, **kwargs)
unit_id = result['unitid']
run_start = time.time()
if timing_data:
timing_data['transmit_timing'] = run_start - transmit_start
run_timing = 0.0
stdout = ''
try:
resultfile = receptor_ctl.get_work_results(unit_id)
while run_timing < 20.0:
status = receptor_ctl.simple_command(f'work status {unit_id}')
state_name = status.get('StateName')
if state_name not in RECEPTOR_ACTIVE_STATES:
break
run_timing = time.time() - run_start
time.sleep(0.5)
else:
raise RemoteJobError(f'Receptor job timeout on {node} after {run_timing} seconds, state remains in {state_name}')
if timing_data:
timing_data['run_timing'] = run_timing
stdout = resultfile.read()
stdout = str(stdout, encoding='utf-8')
finally:
if settings.RECEPTOR_RELEASE_WORK:
res = receptor_ctl.simple_command(f"work release {unit_id}")
if res != {'released': unit_id}:
logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}')
receptor_ctl.close()
if state_name.lower() == 'failed':
work_detail = status.get('Detail', '')
if work_detail:
raise RemoteJobError(f'Receptor error from {node}, detail:\n{work_detail}')
else:
raise RemoteJobError(f'Unknown ansible-runner error on node {node}, stdout:\n{stdout}')
return stdout
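# Illustrative usage only (the node name below is hypothetical); worker_info()
# just below calls it the same way:
#   timing = {}
#   stdout = run_until_complete(node='execution-node-1', timing_data=timing, params={"params": "--worker-info"})
#   # timing now holds 'transmit_timing' and 'run_timing' in seconds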
def worker_info(node_name, work_type='ansible-runner'):
error_list = []
data = {'errors': error_list, 'transmit_timing': 0.0}
try:
stdout = run_until_complete(node=node_name, timing_data=data, params={"params": "--worker-info"})
yaml_stdout = stdout.strip()
remote_data = {}
try:
remote_data = yaml.safe_load(yaml_stdout)
except Exception as json_e:
error_list.append(f'Failed to parse node {node_name} --worker-info output as YAML, error: {json_e}, data:\n{yaml_stdout}')
if not isinstance(remote_data, dict):
error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output:{stdout}')
else:
error_list.extend(remote_data.pop('errors', [])) # merge both error lists
data.update(remote_data)
except RemoteJobError as exc:
details = exc.args[0]
if 'unrecognized arguments: --worker-info' in details:
error_list.append(f'Old version (2.0.1 or earlier) of ansible-runner on node {node_name} without --worker-info')
else:
error_list.append(details)
except (ReceptorNodeNotFound, RuntimeError) as exc:
error_list.append(str(exc))
# If we have a connection error, missing keys would be a trivial consequence of that
if not data['errors']:
# see tasks.py usage of keys
missing_keys = set(('runner_version', 'mem_in_bytes', 'cpu_count')) - set(data.keys())
if missing_keys:
data['errors'].append('Worker failed to return keys {}'.format(' '.join(missing_keys)))
return data
def _convert_args_to_cli(vargs):
"""
For the ansible-runner worker cleanup command, converts the dictionary of
parsed argparse variables used by the Python interface into a list of CLI
options, which is what has to be used on execution nodes.
"""
args = ['cleanup']
for option in ('exclude_strings', 'remove_images'):
if vargs.get(option):
args.append('--{}={}'.format(option.replace('_', '-'), ' '.join(vargs.get(option))))
for option in ('file_pattern', 'image_prune', 'process_isolation_executable', 'grace_period'):
if vargs.get(option) is True:
args.append('--{}'.format(option.replace('_', '-')))
elif vargs.get(option) not in (None, ''):
args.append('--{}={}'.format(option.replace('_', '-'), vargs.get(option)))
return args
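# Illustrative only (the argument values are hypothetical): for parsed cleanup arguments such as
#   _convert_args_to_cli({'exclude_strings': ['awx_123_'], 'file_pattern': '/tmp/awx_*', 'image_prune': True, 'grace_period': 60})
# the result is
#   ['cleanup', '--exclude-strings=awx_123_', '--file-pattern=/tmp/awx_*', '--image-prune', '--grace-period=60']
# which worker_cleanup() below joins into the remote 'ansible-runner worker cleanup ...' command.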
def worker_cleanup(node_name, vargs, timeout=300.0):
args = _convert_args_to_cli(vargs)
remote_command = ' '.join(args)
logger.debug(f'Running command over receptor mesh on {node_name}: ansible-runner worker {remote_command}')
stdout = run_until_complete(node=node_name, params={"params": remote_command})
return stdout

View File

@ -425,18 +425,18 @@ EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes check if an
BROKER_URL = 'unix:///var/run/redis/redis.sock'
CELERYBEAT_SCHEDULE = {
'tower_scheduler': {'task': 'awx.main.tasks.awx_periodic_scheduler', 'schedule': timedelta(seconds=30), 'options': {'expires': 20}},
'tower_scheduler': {'task': 'awx.main.tasks.system.awx_periodic_scheduler', 'schedule': timedelta(seconds=30), 'options': {'expires': 20}},
'cluster_heartbeat': {
'task': 'awx.main.tasks.cluster_node_heartbeat',
'task': 'awx.main.tasks.system.cluster_node_heartbeat',
'schedule': timedelta(seconds=CLUSTER_NODE_HEARTBEAT_PERIOD),
'options': {'expires': 50},
},
'gather_analytics': {'task': 'awx.main.tasks.gather_analytics', 'schedule': timedelta(minutes=5)},
'gather_analytics': {'task': 'awx.main.tasks.system.gather_analytics', 'schedule': timedelta(minutes=5)},
'task_manager': {'task': 'awx.main.scheduler.tasks.run_task_manager', 'schedule': timedelta(seconds=20), 'options': {'expires': 20}},
'k8s_reaper': {'task': 'awx.main.tasks.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}},
'receptor_reaper': {'task': 'awx.main.tasks.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)},
'k8s_reaper': {'task': 'awx.main.tasks.system.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}},
'receptor_reaper': {'task': 'awx.main.tasks.system.awx_receptor_workunit_reaper', 'schedule': timedelta(seconds=60)},
'send_subsystem_metrics': {'task': 'awx.main.analytics.analytics_tasks.send_subsystem_metrics', 'schedule': timedelta(seconds=20)},
'cleanup_images': {'task': 'awx.main.tasks.cleanup_images_and_files', 'schedule': timedelta(hours=3)},
'cleanup_images': {'task': 'awx.main.tasks.system.cleanup_images_and_files', 'schedule': timedelta(hours=3)},
}
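# Illustrative only (entry name, task, and interval are hypothetical): any new
# periodic task would be registered the same way, pointing at the relocated
# dotted path, e.g.
#   'my_periodic_task': {'task': 'awx.main.tasks.system.my_periodic_task', 'schedule': timedelta(minutes=10)},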
# Django Caching Configuration

View File

@ -130,7 +130,7 @@ a telnet session:
```python
# awx/main/tasks.py
class SomeTask(awx.main.tasks.BaseTask):
class SomeTask(awx.main.tasks.jobs.BaseTask):
def run(self, pk, **kwargs):
# This will set a breakpoint and open an interactive Python

View File

@ -86,13 +86,13 @@ appropriate AMQP queue:
"uuid": "<some_unique_string>",
"args": [1, 1],
"kwargs": {},
"task": "awx.main.tasks.add"
"task": "awx.main.tasks.system.add"
}
When a background worker receives the message, it deserializes it and runs the
associated Python code:
awx.main.tasks.add(1, 1)
awx.main.tasks.system.add(1, 1)
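To make the round trip concrete, here is a minimal sketch, assuming the `task` decorator exported by `awx.main.dispatch.publish` (the `add` task itself is illustrative and not part of this change):

```python
# awx/main/tasks/system.py -- illustrative only
from awx.main.dispatch.publish import task

@task()
def add(a, b):
    return a + b

# add.apply_async([1, 1]) publishes a message like the one shown above;
# a dispatcher worker resolves the dotted name "awx.main.tasks.system.add",
# imports it, and calls it with the stored args: add(1, 1).
```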
Dispatcher Implementation