# Python
from base64 import b64encode
from collections import namedtuple
import concurrent.futures
from enum import Enum
import logging
import os
import shutil
import socket
import time
import yaml

# Django
from django.conf import settings
from django.db import connections

# Runner
import ansible_runner

# django-ansible-base
from ansible_base.lib.utils.db import advisory_lock

# Dispatcherd
from dispatcherd.publish import task

# AWX
from awx.main.utils.execution_environments import get_default_pod_spec
from awx.main.exceptions import ReceptorNodeNotFound
from awx.main.utils.common import (
    deepmerge,
    parse_yaml_or_json,
    cleanup_new_process,
)
from awx.main.constants import MAX_ISOLATED_PATH_COLON_DELIMITER
from awx.main.tasks.signals import signal_state, signal_callback, SignalExit
from awx.main.models import Instance, InstanceLink, UnifiedJob, ReceptorAddress
from awx.main.dispatch import get_task_queuename

# Receptorctl
from receptorctl.socket_interface import ReceptorControl

from filelock import FileLock

logger = logging.getLogger('awx.main.tasks.receptor')

__RECEPTOR_CONF = '/etc/receptor/receptor.conf'
__RECEPTOR_CONF_LOCKFILE = f'{__RECEPTOR_CONF}.lock'

RECEPTOR_ACTIVE_STATES = ('Pending', 'Running')


class ReceptorConnectionType(Enum):
    DATAGRAM = 0
    STREAM = 1
    STREAMTLS = 2
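
# Note: these enum values mirror the integer 'ConnType' field that receptor
# includes in its status advertisements; get_conn_type() below builds the
# enum directly from that field.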


"""
Translate receptorctl messages that come in over stdout into
structured messages. Currently, these are error messages.
"""


class ReceptorErrorBase:
    _MESSAGE = 'Receptor Error'

    def __init__(self, node: str = 'N/A', state_name: str = 'N/A'):
        self.node = node
        self.state_name = state_name

    def __str__(self):
        return f"{self.__class__.__name__} '{self._MESSAGE}' on node '{self.node}' with state '{self.state_name}'"


class WorkUnitError(ReceptorErrorBase):
    _MESSAGE = 'unknown work unit '

    def __init__(self, work_unit_id: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.work_unit_id = work_unit_id

    def __str__(self):
        return f"{super().__str__()} work unit id '{self.work_unit_id}'"


class WorkUnitCancelError(WorkUnitError):
    _MESSAGE = 'error cancelling remote unit: unknown work unit '


class WorkUnitResultsError(WorkUnitError):
    _MESSAGE = 'Failed to get results: unknown work unit '


class UnknownError(ReceptorErrorBase):
    _MESSAGE = 'Unknown receptor ctl error'

    def __init__(self, msg, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._MESSAGE = msg


class FuzzyError:
    def __new__(cls, e: RuntimeError, node: str, state_name: str):
        """
        At the time of writing this comment, detection of all the sub-classes
        is centralized in this parent class; it acts like a router.
        Someone may find it better to push the error detection logic down into
        each sub-class.
        """
        msg = e.args[0]

        common_startswith = (WorkUnitCancelError, WorkUnitResultsError, WorkUnitError)

        for klass in common_startswith:
            if msg.startswith(klass._MESSAGE):
                work_unit_id = msg[len(klass._MESSAGE) :]
                return klass(work_unit_id, node=node, state_name=state_name)

        return UnknownError(msg, node=node, state_name=state_name)
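
# For illustration (hypothetical message): a RuntimeError such as
#   RuntimeError('Failed to get results: unknown work unit AbC123')
# is routed to WorkUnitResultsError with work_unit_id 'AbC123', while any
# unrecognized message falls through to UnknownError.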


def read_receptor_config():
    # for K8S deployments, getting a lock is necessary as another process
    # may be re-writing the config at this time
    if settings.IS_K8S:
        lock = FileLock(__RECEPTOR_CONF_LOCKFILE)
        with lock:
            with open(__RECEPTOR_CONF, 'r') as f:
                return yaml.safe_load(f)
    else:
        with open(__RECEPTOR_CONF, 'r') as f:
            return yaml.safe_load(f)
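
# The parsed config is a list of single-key mappings (see RECEPTOR_CONFIG_STARTER
# near the bottom of this module), e.g.:
#   [{'control-service': {'service': 'control', 'filename': '/var/run/receptor/receptor.sock'}},
#    {'work-signing': {'privatekey': '/etc/receptor/work_private_key.pem'}}]
# which is why the helpers below iterate over sections and inspect each key.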


def work_signing_enabled(config_data):
    for section in config_data:
        if 'work-signing' in section:
            return True
    return False


def get_receptor_sockfile(config_data):
    for section in config_data:
        for entry_name, entry_data in section.items():
            if entry_name == 'control-service':
                if 'filename' in entry_data:
                    return entry_data['filename']
                else:
                    raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} control-service entry does not have a filename parameter')
    else:
        raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} does not have control-service entry needed to get sockfile')


def get_tls_client(config_data, use_stream_tls=None):
    if not use_stream_tls:
        return None

    for section in config_data:
        for entry_name, entry_data in section.items():
            if entry_name == 'tls-client':
                if 'name' in entry_data:
                    return entry_data['name']
    return None


def get_receptor_ctl(config_data=None):
    if config_data is None:
        config_data = read_receptor_config()
    receptor_sockfile = get_receptor_sockfile(config_data)
    try:
        return ReceptorControl(receptor_sockfile, config=__RECEPTOR_CONF, tlsclient=get_tls_client(config_data, True))
    except RuntimeError:
        return ReceptorControl(receptor_sockfile)
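
# Typical usage, assuming a local receptor daemon with its control socket up:
#   ctl = get_receptor_ctl()
#   mesh_status = ctl.simple_command('status')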


def find_node_in_mesh(node_name, receptor_ctl):
    attempts = 10
    backoff = 1
    for attempt in range(attempts):
        all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None)
        for node in all_nodes:
            if node.get('NodeID') == node_name:
                return node
        else:
            logger.warning(f"Instance {node_name} is not in the receptor mesh. {attempts-attempt} attempts left.")
            time.sleep(backoff)
            backoff += 1
    else:
        raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh')
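
# With attempts=10 and a linearly growing backoff (1s, 2s, ..., 10s), the
# lookup above waits up to 55 seconds in total before raising ReceptorNodeNotFound.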


def get_conn_type(node_name, receptor_ctl):
    node = find_node_in_mesh(node_name, receptor_ctl)
    return ReceptorConnectionType(node.get('ConnType'))


def administrative_workunit_reaper(work_list=None):
    """
    This releases completed work units that were spawned by actions inside of this module.
    Specifically, this should catch any completed work unit left by
    - worker_info
    - worker_cleanup
    These should ordinarily be released when the method finishes, but this is a
    cleanup of last resort, in case something went awry.
    """
    receptor_ctl = get_receptor_ctl()
    if work_list is None:
        work_list = receptor_ctl.simple_command("work list")

    for unit_id, work_data in work_list.items():
        extra_data = work_data.get('ExtraData')
        if extra_data is None:
            continue  # if this is not ansible-runner work, we do not want to touch it
        params = None  # may stay unset for remote work units, where ExtraData is a string
        if isinstance(extra_data, str):
            if not work_data.get('StateName', None) or work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
                continue
        else:
            if extra_data.get('RemoteWorkType') != 'ansible-runner':
                continue
            params = extra_data.get('RemoteParams', {}).get('params')
            if not params:
                continue
            if not (params == '--worker-info' or params.startswith('cleanup')):
                continue  # if this is not a cleanup or health check, we do not want to touch it
        if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
            continue  # do not want to touch active work units
        logger.info(f'Reaping orphaned work unit {unit_id} with params {params}')
        receptor_ctl.simple_command(f"work release {unit_id}")
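
# For illustration (hypothetical unit id and values), an entry the loop above
# would reap looks like:
#   {'xYz123': {'StateName': 'Succeeded',
#               'ExtraData': {'RemoteWorkType': 'ansible-runner',
#                             'RemoteParams': {'params': '--worker-info'}}}}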


class RemoteJobError(RuntimeError):
    pass


def run_until_complete(node, timing_data=None, worktype='ansible-runner', ttl='20s', **kwargs):
    """
    Runs an ansible-runner work_type on a remote node, waits until it completes, then returns stdout.
    """

    config_data = read_receptor_config()
    receptor_ctl = get_receptor_ctl(config_data)

    use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS"
    kwargs.setdefault('tlsclient', get_tls_client(config_data, use_stream_tls))
    if ttl is not None:
        kwargs['ttl'] = ttl
    kwargs.setdefault('payload', '')
    if work_signing_enabled(config_data):
        kwargs['signwork'] = True

    transmit_start = time.time()
    result = receptor_ctl.submit_work(worktype=worktype, node=node, **kwargs)

    unit_id = result['unitid']
    run_start = time.time()
    if timing_data:
        timing_data['transmit_timing'] = run_start - transmit_start
    run_timing = 0.0
    stdout = ''
    state_name = 'local var never set'

    try:
        resultfile = receptor_ctl.get_work_results(unit_id)

        while run_timing < 20.0:
            status = receptor_ctl.simple_command(f'work status {unit_id}')
            state_name = status.get('StateName')
            if state_name not in RECEPTOR_ACTIVE_STATES:
                break
            run_timing = time.time() - run_start
            time.sleep(0.5)
        else:
            raise RemoteJobError(f'Receptor job timeout on {node} after {run_timing} seconds, state remains {state_name}')

        if timing_data:
            timing_data['run_timing'] = run_timing

        stdout = resultfile.read()
        stdout = str(stdout, encoding='utf-8')

    except RuntimeError as e:
        receptor_e = FuzzyError(e, node, state_name)
        if type(receptor_e) in (
            WorkUnitError,
            WorkUnitResultsError,
        ):
            logger.warning(f'While consuming job results: {receptor_e}')
        else:
            raise
    finally:
        if settings.RECEPTOR_RELEASE_WORK:
            try:
                res = receptor_ctl.simple_command(f"work release {unit_id}")

                if res != {'released': unit_id}:
                    logger.warning(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}')

                receptor_ctl.close()
            except RuntimeError as e:
                receptor_e = FuzzyError(e, node, state_name)
                if type(receptor_e) in (
                    WorkUnitError,
                    WorkUnitCancelError,
                ):
                    logger.warning(f"While releasing work: {receptor_e}")
                else:
                    logger.error(f"While releasing work: {receptor_e}")

    if state_name.lower() == 'failed':
        work_detail = status.get('Detail', '')
        if work_detail:
            if stdout:
                raise RemoteJobError(f'Receptor error from {node}, detail:\n{work_detail}\nstdout:\n{stdout}')
            else:
                raise RemoteJobError(f'Receptor error from {node}, detail:\n{work_detail}')
        else:
            raise RemoteJobError(f'Unknown ansible-runner error on node {node}, stdout:\n{stdout}')

    return stdout
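
# Typical usage (see worker_info and worker_cleanup below; node name is
# hypothetical); the remote ansible-runner CLI args are passed through the
# 'params' kwarg:
#   stdout = run_until_complete(node='execution-node-1', params={"params": "--worker-info"})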


def worker_info(node_name, work_type='ansible-runner'):
    error_list = []
    data = {'errors': error_list, 'transmit_timing': 0.0}

    try:
        stdout = run_until_complete(node=node_name, timing_data=data, params={"params": "--worker-info"})

        yaml_stdout = stdout.strip()
        remote_data = {}
        try:
            remote_data = yaml.safe_load(yaml_stdout)
        except Exception as yaml_e:
            error_list.append(f'Failed to parse node {node_name} --worker-info output as YAML, error: {yaml_e}, data:\n{yaml_stdout}')

        if not isinstance(remote_data, dict):
            error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output: {stdout}')
        else:
            error_list.extend(remote_data.pop('errors', []))  # merge both error lists
            data.update(remote_data)

    except RemoteJobError as exc:
        details = exc.args[0]
        if 'unrecognized arguments: --worker-info' in details:
            error_list.append(f'Old version (2.0.1 or earlier) of ansible-runner on node {node_name} without --worker-info')
        else:
            error_list.append(details)

    except Exception as exc:
        error_list.append(str(exc))

    # If we have a connection error, missing keys would be a trivial consequence of that
    if not data['errors']:
        # see tasks.py usage of keys
        missing_keys = set(('runner_version', 'mem_in_bytes', 'cpu_count')) - set(data.keys())
        if missing_keys:
            data['errors'].append('Worker failed to return keys {}'.format(' '.join(missing_keys)))

    return data
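
# On success the returned dict looks roughly like this (values illustrative):
#   {'errors': [], 'transmit_timing': 0.1, 'run_timing': 1.2,
#    'runner_version': '2.4.0', 'mem_in_bytes': 8589934592, 'cpu_count': 4}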


def _convert_args_to_cli(vargs):
    """
    For the ansible-runner worker cleanup command, this converts the
    dictionary of parsed argparse variables used for the python interface
    into a list of CLI options, which is the form execution nodes require.
    """
    args = ['cleanup']
    for option in ('exclude_strings', 'remove_images'):
        if vargs.get(option):
            args.append('--{} {}'.format(option.replace('_', '-'), ' '.join(f'"{item}"' for item in vargs.get(option))))
    for option in ('file_pattern', 'image_prune', 'process_isolation_executable', 'grace_period'):
        if vargs.get(option) is True:
            args.append('--{}'.format(option.replace('_', '-')))
        elif vargs.get(option) not in (None, ''):
            args.append('--{}={}'.format(option.replace('_', '-'), vargs.get(option)))
    return args
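
# For illustration:
#   _convert_args_to_cli({'image_prune': True, 'grace_period': 60})
# returns:
#   ['cleanup', '--image-prune', '--grace-period=60']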


def worker_cleanup(node_name, vargs):
    args = _convert_args_to_cli(vargs)

    remote_command = ' '.join(args)
    logger.debug(f'Running command over receptor mesh on {node_name}: ansible-runner worker {remote_command}')

    stdout = run_until_complete(node=node_name, params={"params": remote_command})

    return stdout


class AWXReceptorJob:
    def __init__(self, task, runner_params=None):
        self.task = task
        self.runner_params = runner_params
        self.unit_id = None

        if self.task and not self.task.instance.is_container_group_task:
            execution_environment_params = self.task.build_execution_environment_params(self.task.instance, runner_params['private_data_dir'])
            self.runner_params.update(execution_environment_params)

        if not settings.IS_K8S and self.work_type == 'local' and 'only_transmit_kwargs' not in self.runner_params:
            self.runner_params['only_transmit_kwargs'] = True

    def run(self):
        # We establish a connection to the Receptor socket
        self.config_data = read_receptor_config()
        receptor_ctl = get_receptor_ctl(self.config_data)

        res = None
        try:
            res = self._run_internal(receptor_ctl)
            return res
        finally:
            status = getattr(res, 'status', 'error')
            self._receptor_release_work(receptor_ctl, status)

    def _receptor_release_work(self, receptor_ctl: ReceptorControl, status: str) -> None:
        """
        Releases the work unit from Receptor if certain conditions are met.
        This method checks several conditions before attempting to release the work unit:
        - If `self.unit_id` is `None`, the method returns immediately.
        - If the `RECEPTOR_RELEASE_WORK` setting is `False`, the method returns immediately.
        - If the `RECEPTOR_KEEP_WORK_ON_ERROR` setting is `True` and the status is 'error', the method returns immediately.
        If none of the above conditions are met, the method attempts to release the work unit using the Receptor control command.
        If an exception occurs during the release process, it logs an error message.
        Args:
            receptor_ctl (ReceptorControl): The Receptor control object used to issue commands.
            status (str): The status of the work unit, which may affect whether it is released.
        """

        if self.unit_id is None:
            logger.debug("No work unit ID to release.")
            return

        if settings.RECEPTOR_RELEASE_WORK is False:
            logger.debug(f"RECEPTOR_RELEASE_WORK is False, not releasing work unit {self.unit_id}.")
            return

        if settings.RECEPTOR_KEEP_WORK_ON_ERROR and status == 'error':
            logger.debug(f"RECEPTOR_KEEP_WORK_ON_ERROR is True and status is 'error', not releasing work unit {self.unit_id}.")
            return

        try:
            receptor_ctl.simple_command(f"work release {self.unit_id}")
            logger.debug(f"Released work unit {self.unit_id}.")
        except Exception:
            logger.exception(f"Error releasing work unit {self.unit_id}.")

    def _run_internal(self, receptor_ctl):
        # Create a socketpair. Where the left side will be used for writing our payload
        # (private data dir, kwargs). The right side will be passed to Receptor for
        # reading.
        sockin, sockout = socket.socketpair()

        # Prepare the submit_work kwargs before creating threads, because references to settings are not thread-safe
        work_submit_kw = dict(worktype=self.work_type, params=self.receptor_params, signwork=self.sign_work)
        if self.work_type == 'ansible-runner':
            work_submit_kw['node'] = self.task.instance.execution_node
            use_stream_tls = get_conn_type(work_submit_kw['node'], receptor_ctl).name == "STREAMTLS"
            work_submit_kw['tlsclient'] = get_tls_client(self.config_data, use_stream_tls)

        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            transmitter_future = executor.submit(self.transmit, sockin)

            # submit our work, passing in the right side of our socketpair for reading.
            result = receptor_ctl.submit_work(payload=sockout.makefile('rb'), **work_submit_kw)

            sockin.close()
            sockout.close()

            self.unit_id = result['unitid']
            # Update the job with the work unit in-memory so that the log_lifecycle
            # will print out the work unit that is to be associated with the job in the database
            # via the update_model() call.
            # We want to log the work_unit_id as early as possible. A failure can happen in between
            # when we start the job in receptor and when we associate the job <-> work_unit_id.
            # In that case, there will be work running in receptor and Controller will not know
            # which Job it is associated with.
            # We do not programmatically handle this case. Ideally, we would handle this with a reaper case.
            # The two distinct job lifecycle log events below allow us to at least detect when this
            # edge case occurs. If the lifecycle event work_unit_id_received occurs without the
            # work_unit_id_assigned event, then this case may have occurred.
            self.task.instance.work_unit_id = result['unitid']  # Set work_unit_id in-memory only
            self.task.instance.log_lifecycle("work_unit_id_received")
            self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid'])
            self.task.instance.log_lifecycle("work_unit_id_assigned")

            # Throws an exception if the transmit failed.
            # Will be caught by the try/except in BaseTask#run.
            transmitter_future.result()

        # Artifacts are an output, but sometimes they are an input as well
        # this is the case with fact cache, where clearing facts deletes a file, and this must be captured
        artifact_dir = os.path.join(self.runner_params['private_data_dir'], 'artifacts')
        if self.work_type != 'local' and os.path.exists(artifact_dir):
            shutil.rmtree(artifact_dir)

        resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True)

        connections.close_all()

        # "processor" and the main thread will be separate threads.
        # If a cancel happens, the main thread will encounter an exception, in which case
        # we yank the socket out from underneath the processor, which will cause it to exit.
        # The ThreadPoolExecutor context manager ensures we do not leave any threads lying around.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            processor_future = executor.submit(self.processor, resultfile)

            try:
                signal_state.raise_exception = True
                # address race condition where SIGTERM was issued after this dispatcher task started
                if signal_callback():
                    raise SignalExit()
                res = processor_future.result()
            except SignalExit:
                receptor_ctl.simple_command(f"work cancel {self.unit_id}")
                resultsock.shutdown(socket.SHUT_RDWR)
                resultfile.close()
                result = namedtuple('result', ['status', 'rc'])
                res = result('canceled', 1)
            finally:
                signal_state.raise_exception = False

        if res.status == 'error':
            # If ansible-runner ran, but an error occurred at runtime, the traceback information
            # is saved via the status_handler passed in to the processor.
            if 'result_traceback' in self.task.runner_callback.extra_update_fields:
                return res

            try:
                unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}')
                detail = unit_status.get('Detail', '')
                state_name = unit_status.get('StateName', '')
                stdout_size = unit_status.get('StdoutSize', 0)
            except Exception:
                detail = ''
                state_name = ''
                stdout_size = 0
                logger.exception(f'An error was encountered while getting status for work unit {self.unit_id}')

            if 'exceeded quota' in detail:
                logger.warning(detail)
                log_name = self.task.instance.log_format
                logger.warning(f"Could not launch pod for {log_name}. Exceeded quota.")
                self.task.update_model(self.task.instance.pk, status='pending')
                return

            try:
                receptor_output = ''
                if state_name == 'Failed' and self.task.runner_callback.event_ct == 0:
                    # if receptor work unit failed and no events were emitted, work results may
                    # contain useful information about why the job failed. In case stdout is
                    # massive, only ask for the last 1000 bytes
                    startpos = max(stdout_size - 1000, 0)
                    resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, startpos=startpos, return_socket=True, return_sockfile=True)
                    lines = resultfile.readlines()
                    receptor_output = b"".join(lines).decode()
                if receptor_output:
                    self.task.runner_callback.delay_update(result_traceback=f'Worker output:\n{receptor_output}')
                elif detail:
                    self.task.runner_callback.delay_update(result_traceback=f'Receptor detail:\n{detail}')
                else:
                    logger.warning(f'No result details or output from {self.task.instance.log_format}, status:\n{state_name}')
            except Exception:
                logger.exception(f'Work results error from job id={self.task.instance.id} work_unit={self.task.instance.work_unit_id}')

            raise RuntimeError(detail)

        return res

    # Spawned in a thread so Receptor can start reading before we finish writing, we
    # write our payload to the left side of our socketpair.
    @cleanup_new_process
    def transmit(self, _socket):
        try:
            ansible_runner.interface.run(streamer='transmit', _output=_socket.makefile('wb'), **self.runner_params)
        finally:
            # Socket must be shutdown here, or the reader will hang forever.
            _socket.shutdown(socket.SHUT_WR)

    @cleanup_new_process
    def processor(self, resultfile):
        return ansible_runner.interface.run(
            streamer='process',
            quiet=True,
            _input=resultfile,
            event_handler=self.task.runner_callback.event_handler,
            finished_callback=self.task.runner_callback.finished_callback,
            status_handler=self.task.runner_callback.status_handler,
            artifacts_handler=self.task.runner_callback.artifacts_handler,
            **self.runner_params,
        )

    @property
    def receptor_params(self):
        if self.task.instance.is_container_group_task:
            spec_yaml = yaml.dump(self.pod_definition, explicit_start=True)

            receptor_params = {
                "secret_kube_pod": spec_yaml,
                "pod_pending_timeout": getattr(settings, 'AWX_CONTAINER_GROUP_POD_PENDING_TIMEOUT', "5m"),
            }

            if self.credential:
                kubeconfig_yaml = yaml.dump(self.kube_config, explicit_start=True)
                receptor_params["secret_kube_config"] = kubeconfig_yaml
        else:
            private_data_dir = self.runner_params['private_data_dir']
            if self.work_type == 'ansible-runner' and settings.AWX_CLEANUP_PATHS:
                # on execution nodes, we rely on the private data dir being deleted
                cli_params = f"--private-data-dir={private_data_dir} --delete"
            else:
                # on hybrid nodes, we rely on the private data dir NOT being deleted
                cli_params = f"--private-data-dir={private_data_dir}"
            receptor_params = {"params": cli_params}

        return receptor_params

    @property
    def sign_work(self):
        if self.work_type in ('ansible-runner', 'local'):
            return work_signing_enabled(self.config_data)
        return False

    @property
    def work_type(self):
        if self.task.instance.is_container_group_task:
            if self.credential:
                return 'kubernetes-runtime-auth'
            return 'kubernetes-incluster-auth'
        if self.task.instance.execution_node == settings.CLUSTER_HOST_ID or self.task.instance.execution_node == self.task.instance.controller_node:
            return 'local'
        return 'ansible-runner'
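
    # In summary: container group tasks map to one of the two kubernetes work
    # types (runtime auth when an explicit credential is set, in-cluster auth
    # otherwise); work on this node or on the job's controller node runs as
    # 'local'; anything else goes over the mesh as 'ansible-runner'.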

    @property
    def pod_definition(self):
        ee = self.task.instance.execution_environment

        default_pod_spec = get_default_pod_spec()

        pod_spec_override = {}
        if self.task and self.task.instance.instance_group.pod_spec_override:
            pod_spec_override = parse_yaml_or_json(self.task.instance.instance_group.pod_spec_override)
        # According to the deepmerge docstring, the second dictionary will override when
        # they share keys, which is the desired behavior.
        # This allows the user to only provide elements they want to override, and for us to still provide any
        # defaults they don't want to change
        pod_spec = deepmerge(default_pod_spec, pod_spec_override)

        pod_spec['spec']['containers'][0]['image'] = ee.image
        pod_spec['spec']['containers'][0]['args'] = ['ansible-runner', 'worker', '--private-data-dir=/runner']

        if settings.AWX_RUNNER_KEEPALIVE_SECONDS:
            pod_spec['spec']['containers'][0].setdefault('env', [])
            pod_spec['spec']['containers'][0]['env'].append({'name': 'ANSIBLE_RUNNER_KEEPALIVE_SECONDS', 'value': str(settings.AWX_RUNNER_KEEPALIVE_SECONDS)})

        # Enforce EE Pull Policy
        pull_options = {"always": "Always", "missing": "IfNotPresent", "never": "Never"}
        if self.task and self.task.instance.execution_environment:
            if self.task.instance.execution_environment.pull:
                pod_spec['spec']['containers'][0]['imagePullPolicy'] = pull_options[self.task.instance.execution_environment.pull]

        # This allows the user to also expose the isolated path list
        # to EEs running in k8s/ocp environments, i.e. container groups.
        # This assumes the node and SA support hostPath volumes.
        # type is not passed due to backward compatibility,
        # which means that no checks will be performed before mounting the hostPath volume.
        if settings.AWX_MOUNT_ISOLATED_PATHS_ON_K8S and settings.AWX_ISOLATION_SHOW_PATHS:
            spec_volume_mounts = []
            spec_volumes = []

            for idx, this_path in enumerate(settings.AWX_ISOLATION_SHOW_PATHS):
                mount_option = None
                if this_path.count(':') == MAX_ISOLATED_PATH_COLON_DELIMITER:
                    src, dest, mount_option = this_path.split(':')
                elif this_path.count(':') == MAX_ISOLATED_PATH_COLON_DELIMITER - 1:
                    src, dest = this_path.split(':')
                else:
                    src = dest = this_path

                # Enforce read-only volume if 'ro' has been explicitly passed
                # We do this so we can use the same configuration for regular scenarios and k8s
                # Since flags like ':O', ':z' or ':Z' are not valid in the k8s realm
                # Example: /data:/data:ro
                read_only = bool('ro' == mount_option)

                # Since type is not being passed, k8s by default will not perform any checks if the
                # hostPath volume exists on the k8s node itself.
                spec_volumes.append({'name': f'volume-{idx}', 'hostPath': {'path': src}})

                spec_volume_mounts.append({'name': f'volume-{idx}', 'mountPath': f'{dest}', 'readOnly': read_only})

            # merge any volumes definition already present in the pod_spec
            if 'volumes' in pod_spec['spec']:
                pod_spec['spec']['volumes'] += spec_volumes
            else:
                pod_spec['spec']['volumes'] = spec_volumes

            # merge any volumeMounts definition already present in the pod_spec
            if 'volumeMounts' in pod_spec['spec']['containers'][0]:
                pod_spec['spec']['containers'][0]['volumeMounts'] += spec_volume_mounts
            else:
                pod_spec['spec']['containers'][0]['volumeMounts'] = spec_volume_mounts

        if self.task and self.task.instance.is_container_group_task:
            # If EE credential is passed, create an imagePullSecret
            if self.task.instance.execution_environment and self.task.instance.execution_environment.credential:
                # Create pull secret in k8s cluster based on ee cred
                from awx.main.scheduler.kubernetes import PodManager  # prevent circular import

                pm = PodManager(self.task.instance)
                secret_name = pm.create_secret(job=self.task.instance)

                # Inject secret name into podspec
                pod_spec['spec']['imagePullSecrets'] = [{"name": secret_name}]

        if self.task:
            pod_spec['metadata'] = deepmerge(
                pod_spec.get('metadata', {}),
                dict(name=self.pod_name, labels={'ansible-awx': settings.INSTALL_UUID, 'ansible-awx-job-id': str(self.task.instance.id)}),
            )

        return pod_spec
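
    # For illustration, assuming MAX_ISOLATED_PATH_COLON_DELIMITER is 2 (as the
    # three-way split above implies), an entry like '/data:/data:ro' in
    # AWX_ISOLATION_SHOW_PATHS becomes:
    #   volumes:      [{'name': 'volume-0', 'hostPath': {'path': '/data'}}]
    #   volumeMounts: [{'name': 'volume-0', 'mountPath': '/data', 'readOnly': True}]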

    @property
    def pod_name(self):
        return f"automation-job-{self.task.instance.id}"

    @property
    def credential(self):
        return self.task.instance.instance_group.credential

    @property
    def namespace(self):
        return self.pod_definition['metadata']['namespace']

    @property
    def kube_config(self):
        host_input = self.credential.get_input('host')
        config = {
            "apiVersion": "v1",
            "kind": "Config",
            "preferences": {},
            "clusters": [{"name": host_input, "cluster": {"server": host_input}}],
            "users": [{"name": host_input, "user": {"token": self.credential.get_input('bearer_token')}}],
            "contexts": [{"name": host_input, "context": {"cluster": host_input, "user": host_input, "namespace": self.namespace}}],
            "current-context": host_input,
        }

        if self.credential.get_input('verify_ssl') and 'ssl_ca_cert' in self.credential.inputs:
            config["clusters"][0]["cluster"]["certificate-authority-data"] = b64encode(
                self.credential.get_input('ssl_ca_cert').encode()  # encode to bytes
            ).decode()  # decode the base64 data into a str
        else:
            config["clusters"][0]["cluster"]["insecure-skip-tls-verify"] = True
        return config


# TODO: receptor reload expects ordering within config items to be preserved
# if python dictionary is not preserving order properly, may need to find a
# solution. yaml.dump does not seem to work well with OrderedDict. below line may help
# yaml.add_representer(OrderedDict, lambda dumper, data: dumper.represent_mapping('tag:yaml.org,2002:map', data.items()))
#
RECEPTOR_CONFIG_STARTER = (
    {'local-only': None},
    {'log-level': settings.RECEPTOR_LOG_LEVEL},
    {'node': {'firewallrules': [{'action': 'reject', 'tonode': settings.CLUSTER_HOST_ID, 'toservice': 'control'}]}},
    {'control-service': {'service': 'control', 'filename': '/var/run/receptor/receptor.sock', 'permissions': '0660'}},
    {'work-command': {'worktype': 'local', 'command': 'ansible-runner', 'params': 'worker', 'allowruntimeparams': True}},
    {'work-signing': {'privatekey': '/etc/receptor/work_private_key.pem', 'tokenexpiration': '1m'}},
    {
        'work-kubernetes': {
            'worktype': 'kubernetes-runtime-auth',
            'authmethod': 'runtime',
            'allowruntimeauth': True,
            'allowruntimepod': True,
            'allowruntimeparams': True,
        }
    },
    {
        'work-kubernetes': {
            'worktype': 'kubernetes-incluster-auth',
            'authmethod': 'incluster',
            'allowruntimeauth': True,
            'allowruntimepod': True,
            'allowruntimeparams': True,
        }
    },
    {
        'tls-client': {
            'name': 'tlsclient',
            'rootcas': '/etc/receptor/tls/ca/mesh-CA.crt',
            'cert': '/etc/receptor/tls/receptor.crt',
            'key': '/etc/receptor/tls/receptor.key',
            'mintls13': False,
        }
    },
)


def should_update_config(new_config):
    '''
    Checks whether the generated config matches the one on disk; in particular,
    that the list of instances matches the list of peer entries in the config.
    '''

    current_config = read_receptor_config()  # this gets receptor conf lock
    for config_entry in current_config:
        if config_entry not in new_config:
            logger.warning(f"{config_entry} should not be in receptor config. Updating.")
            return True
    for config_entry in new_config:
        if config_entry not in current_config:
            logger.warning(f"{config_entry} missing from receptor config. Updating.")
            return True

    return False


def generate_config_data():
    # returns two values
    # receptor config - based on current database peers
    # should_update - If True, receptor_config differs from the receptor conf file on disk
    addresses = ReceptorAddress.objects.filter(peers_from_control_nodes=True)

    receptor_config = list(RECEPTOR_CONFIG_STARTER)
    for address in addresses:
        if address.get_peer_type():
            peer = {
                f'{address.get_peer_type()}': {
                    'address': f'{address.get_full_address()}',
                    'tls': 'tlsclient',
                }
            }
            receptor_config.append(peer)
        else:
            logger.warning(f"Receptor address {address} has unsupported peer type, skipping.")
    should_update = should_update_config(receptor_config)
    return receptor_config, should_update
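
# An appended peer entry looks like this (address illustrative; the key comes
# from ReceptorAddress.get_peer_type(), e.g. 'tcp-peer'):
#   {'tcp-peer': {'address': 'remote-node.example.org:27199', 'tls': 'tlsclient'}}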


def reload_receptor():
    logger.warning("Receptor config changed, reloading receptor")

    # This needs to be outside of the lock because this function itself will acquire the lock.
    receptor_ctl = get_receptor_ctl()

    attempts = 10
    for backoff in range(1, attempts + 1):
        try:
            receptor_ctl.simple_command("reload")
            break
        except ValueError:
            logger.warning(f"Unable to reload Receptor configuration. {attempts-backoff} attempts left.")
            time.sleep(backoff)
    else:
        raise RuntimeError("Receptor reload failed")


@task(on_duplicate='queue_one')
def write_receptor_config():
    """
    This task runs async on each control node, K8S only.
    It is triggered whenever a remote node is added or removed, or if peers_from_control_nodes
    is flipped.
    It is possible for write_receptor_config to be called multiple times,
    for example if new instances are added in quick succession.
    To serialize those writes, each control node first grabs a DB advisory lock specific
    to just that control node (i.e. multiple control nodes can run this function
    at the same time, since it only writes the local receptor config file).
    """
    with advisory_lock(f"{settings.CLUSTER_HOST_ID}_write_receptor_config", wait=True):
        # Config file needs to be updated
        receptor_config, should_update = generate_config_data()
        if should_update:
            lock = FileLock(__RECEPTOR_CONF_LOCKFILE)
            with lock:
                with open(__RECEPTOR_CONF, 'w') as file:
                    yaml.dump(receptor_config, file, default_flow_style=False)
            reload_receptor()


@task(queue=get_task_queuename, on_duplicate='discard')
def remove_deprovisioned_node(hostname):
    InstanceLink.objects.filter(source__hostname=hostname).update(link_state=InstanceLink.States.REMOVING)
    InstanceLink.objects.filter(target__instance__hostname=hostname).update(link_state=InstanceLink.States.REMOVING)

    node_jobs = UnifiedJob.objects.filter(
        execution_node=hostname,
        status__in=(
            'running',
            'waiting',
        ),
    )
    while node_jobs.exists():
        time.sleep(60)

    # As a side effect, this also deletes the InstanceLinks tied to the instance.
    Instance.objects.filter(hostname=hostname).delete()