mirror of
https://github.com/ansible/awx.git
synced 2026-04-28 21:25:25 -02:30
Refactored tasks.py to a package
--- Added 3 new sub-package : awx.main.tasks.system , awx.main.tasks.jobs , awx.main.tasks.receptor --- Modified the functional tests and unit tests accordingly
This commit is contained in:
0
awx/main/tasks/__init__.py
Normal file
0
awx/main/tasks/__init__.py
Normal file
2178
awx/main/tasks/jobs.py
Normal file
2178
awx/main/tasks/jobs.py
Normal file
File diff suppressed because it is too large
Load Diff
534
awx/main/tasks/receptor.py
Normal file
534
awx/main/tasks/receptor.py
Normal file
@@ -0,0 +1,534 @@
|
||||
# Python
|
||||
from base64 import b64encode
|
||||
from collections import namedtuple
|
||||
import concurrent.futures
|
||||
from enum import Enum
|
||||
import logging
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import yaml
|
||||
|
||||
# Django
|
||||
from django.conf import settings
|
||||
|
||||
# Runner
|
||||
import ansible_runner
|
||||
|
||||
# AWX
|
||||
from awx.main.utils.execution_environments import get_default_pod_spec
|
||||
from awx.main.exceptions import ReceptorNodeNotFound
|
||||
from awx.main.utils.common import (
|
||||
deepmerge,
|
||||
parse_yaml_or_json,
|
||||
cleanup_new_process,
|
||||
)
|
||||
|
||||
# Receptorctl
|
||||
from receptorctl.socket_interface import ReceptorControl
|
||||
|
||||
logger = logging.getLogger('awx.main.tasks.receptor')
|
||||
__RECEPTOR_CONF = '/etc/receptor/receptor.conf'
|
||||
RECEPTOR_ACTIVE_STATES = ('Pending', 'Running')
|
||||
|
||||
|
||||
class ReceptorConnectionType(Enum):
|
||||
DATAGRAM = 0
|
||||
STREAM = 1
|
||||
STREAMTLS = 2
|
||||
|
||||
|
||||
def get_receptor_sockfile():
|
||||
with open(__RECEPTOR_CONF, 'r') as f:
|
||||
data = yaml.safe_load(f)
|
||||
for section in data:
|
||||
for entry_name, entry_data in section.items():
|
||||
if entry_name == 'control-service':
|
||||
if 'filename' in entry_data:
|
||||
return entry_data['filename']
|
||||
else:
|
||||
raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} control-service entry does not have a filename parameter')
|
||||
else:
|
||||
raise RuntimeError(f'Receptor conf {__RECEPTOR_CONF} does not have control-service entry needed to get sockfile')
|
||||
|
||||
|
||||
def get_tls_client(use_stream_tls=None):
|
||||
if not use_stream_tls:
|
||||
return None
|
||||
|
||||
with open(__RECEPTOR_CONF, 'r') as f:
|
||||
data = yaml.safe_load(f)
|
||||
for section in data:
|
||||
for entry_name, entry_data in section.items():
|
||||
if entry_name == 'tls-client':
|
||||
if 'name' in entry_data:
|
||||
return entry_data['name']
|
||||
return None
|
||||
|
||||
|
||||
def get_receptor_ctl():
|
||||
receptor_sockfile = get_receptor_sockfile()
|
||||
try:
|
||||
return ReceptorControl(receptor_sockfile, config=__RECEPTOR_CONF, tlsclient=get_tls_client(True))
|
||||
except RuntimeError:
|
||||
return ReceptorControl(receptor_sockfile)
|
||||
|
||||
|
||||
def get_conn_type(node_name, receptor_ctl):
|
||||
all_nodes = receptor_ctl.simple_command("status").get('Advertisements', None)
|
||||
for node in all_nodes:
|
||||
if node.get('NodeID') == node_name:
|
||||
return ReceptorConnectionType(node.get('ConnType'))
|
||||
raise ReceptorNodeNotFound(f'Instance {node_name} is not in the receptor mesh')
|
||||
|
||||
|
||||
def administrative_workunit_reaper(work_list=None):
|
||||
"""
|
||||
This releases completed work units that were spawned by actions inside of this module
|
||||
specifically, this should catch any completed work unit left by
|
||||
- worker_info
|
||||
- worker_cleanup
|
||||
These should ordinarily be released when the method finishes, but this is a
|
||||
cleanup of last-resort, in case something went awry
|
||||
"""
|
||||
receptor_ctl = get_receptor_ctl()
|
||||
if work_list is None:
|
||||
work_list = receptor_ctl.simple_command("work list")
|
||||
|
||||
for unit_id, work_data in work_list.items():
|
||||
extra_data = work_data.get('ExtraData')
|
||||
if (extra_data is None) or (extra_data.get('RemoteWorkType') != 'ansible-runner'):
|
||||
continue # if this is not ansible-runner work, we do not want to touch it
|
||||
params = extra_data.get('RemoteParams', {}).get('params')
|
||||
if not params:
|
||||
continue
|
||||
if not (params == '--worker-info' or params.startswith('cleanup')):
|
||||
continue # if this is not a cleanup or health check, we do not want to touch it
|
||||
if work_data.get('StateName') in RECEPTOR_ACTIVE_STATES:
|
||||
continue # do not want to touch active work units
|
||||
logger.info(f'Reaping orphaned work unit {unit_id} with params {params}')
|
||||
receptor_ctl.simple_command(f"work release {unit_id}")
|
||||
|
||||
|
||||
class RemoteJobError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
def run_until_complete(node, timing_data=None, **kwargs):
|
||||
"""
|
||||
Runs an ansible-runner work_type on remote node, waits until it completes, then returns stdout.
|
||||
"""
|
||||
receptor_ctl = get_receptor_ctl()
|
||||
|
||||
use_stream_tls = getattr(get_conn_type(node, receptor_ctl), 'name', None) == "STREAMTLS"
|
||||
kwargs.setdefault('tlsclient', get_tls_client(use_stream_tls))
|
||||
kwargs.setdefault('ttl', '20s')
|
||||
kwargs.setdefault('payload', '')
|
||||
|
||||
transmit_start = time.time()
|
||||
sign_work = False if settings.IS_K8S else True
|
||||
result = receptor_ctl.submit_work(worktype='ansible-runner', node=node, signwork=sign_work, **kwargs)
|
||||
|
||||
unit_id = result['unitid']
|
||||
run_start = time.time()
|
||||
if timing_data:
|
||||
timing_data['transmit_timing'] = run_start - transmit_start
|
||||
run_timing = 0.0
|
||||
stdout = ''
|
||||
|
||||
try:
|
||||
|
||||
resultfile = receptor_ctl.get_work_results(unit_id)
|
||||
|
||||
while run_timing < 20.0:
|
||||
status = receptor_ctl.simple_command(f'work status {unit_id}')
|
||||
state_name = status.get('StateName')
|
||||
if state_name not in RECEPTOR_ACTIVE_STATES:
|
||||
break
|
||||
run_timing = time.time() - run_start
|
||||
time.sleep(0.5)
|
||||
else:
|
||||
raise RemoteJobError(f'Receptor job timeout on {node} after {run_timing} seconds, state remains in {state_name}')
|
||||
|
||||
if timing_data:
|
||||
timing_data['run_timing'] = run_timing
|
||||
|
||||
stdout = resultfile.read()
|
||||
stdout = str(stdout, encoding='utf-8')
|
||||
|
||||
finally:
|
||||
|
||||
if settings.RECEPTOR_RELEASE_WORK:
|
||||
res = receptor_ctl.simple_command(f"work release {unit_id}")
|
||||
if res != {'released': unit_id}:
|
||||
logger.warn(f'Could not confirm release of receptor work unit id {unit_id} from {node}, data: {res}')
|
||||
|
||||
receptor_ctl.close()
|
||||
|
||||
if state_name.lower() == 'failed':
|
||||
work_detail = status.get('Detail', '')
|
||||
if work_detail:
|
||||
raise RemoteJobError(f'Receptor error from {node}, detail:\n{work_detail}')
|
||||
else:
|
||||
raise RemoteJobError(f'Unknown ansible-runner error on node {node}, stdout:\n{stdout}')
|
||||
|
||||
return stdout
|
||||
|
||||
|
||||
def worker_info(node_name, work_type='ansible-runner'):
|
||||
error_list = []
|
||||
data = {'errors': error_list, 'transmit_timing': 0.0}
|
||||
|
||||
try:
|
||||
stdout = run_until_complete(node=node_name, timing_data=data, params={"params": "--worker-info"})
|
||||
|
||||
yaml_stdout = stdout.strip()
|
||||
remote_data = {}
|
||||
try:
|
||||
remote_data = yaml.safe_load(yaml_stdout)
|
||||
except Exception as json_e:
|
||||
error_list.append(f'Failed to parse node {node_name} --worker-info output as YAML, error: {json_e}, data:\n{yaml_stdout}')
|
||||
|
||||
if not isinstance(remote_data, dict):
|
||||
error_list.append(f'Remote node {node_name} --worker-info output is not a YAML dict, output:{stdout}')
|
||||
else:
|
||||
error_list.extend(remote_data.pop('errors', [])) # merge both error lists
|
||||
data.update(remote_data)
|
||||
|
||||
except RemoteJobError as exc:
|
||||
details = exc.args[0]
|
||||
if 'unrecognized arguments: --worker-info' in details:
|
||||
error_list.append(f'Old version (2.0.1 or earlier) of ansible-runner on node {node_name} without --worker-info')
|
||||
else:
|
||||
error_list.append(details)
|
||||
|
||||
except (ReceptorNodeNotFound, RuntimeError) as exc:
|
||||
error_list.append(str(exc))
|
||||
|
||||
# If we have a connection error, missing keys would be trivial consequence of that
|
||||
if not data['errors']:
|
||||
# see tasks.py usage of keys
|
||||
missing_keys = set(('runner_version', 'mem_in_bytes', 'cpu_count')) - set(data.keys())
|
||||
if missing_keys:
|
||||
data['errors'].append('Worker failed to return keys {}'.format(' '.join(missing_keys)))
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def _convert_args_to_cli(vargs):
|
||||
"""
|
||||
For the ansible-runner worker cleanup command
|
||||
converts the dictionary (parsed argparse variables) used for python interface
|
||||
into a string of CLI options, which has to be used on execution nodes.
|
||||
"""
|
||||
args = ['cleanup']
|
||||
for option in ('exclude_strings', 'remove_images'):
|
||||
if vargs.get(option):
|
||||
args.append('--{}={}'.format(option.replace('_', '-'), ' '.join(vargs.get(option))))
|
||||
for option in ('file_pattern', 'image_prune', 'process_isolation_executable', 'grace_period'):
|
||||
if vargs.get(option) is True:
|
||||
args.append('--{}'.format(option.replace('_', '-')))
|
||||
elif vargs.get(option) not in (None, ''):
|
||||
args.append('--{}={}'.format(option.replace('_', '-'), vargs.get(option)))
|
||||
return args
|
||||
|
||||
|
||||
def worker_cleanup(node_name, vargs, timeout=300.0):
|
||||
args = _convert_args_to_cli(vargs)
|
||||
|
||||
remote_command = ' '.join(args)
|
||||
logger.debug(f'Running command over receptor mesh on {node_name}: ansible-runner worker {remote_command}')
|
||||
|
||||
stdout = run_until_complete(node=node_name, params={"params": remote_command})
|
||||
|
||||
return stdout
|
||||
|
||||
|
||||
class TransmitterThread(threading.Thread):
|
||||
def run(self):
|
||||
self.exc = None
|
||||
|
||||
try:
|
||||
super().run()
|
||||
except Exception:
|
||||
self.exc = sys.exc_info()
|
||||
|
||||
|
||||
class AWXReceptorJob:
|
||||
def __init__(self, task, runner_params=None):
|
||||
self.task = task
|
||||
self.runner_params = runner_params
|
||||
self.unit_id = None
|
||||
|
||||
if self.task and not self.task.instance.is_container_group_task:
|
||||
execution_environment_params = self.task.build_execution_environment_params(self.task.instance, runner_params['private_data_dir'])
|
||||
self.runner_params.update(execution_environment_params)
|
||||
|
||||
if not settings.IS_K8S and self.work_type == 'local' and 'only_transmit_kwargs' not in self.runner_params:
|
||||
self.runner_params['only_transmit_kwargs'] = True
|
||||
|
||||
def run(self):
|
||||
# We establish a connection to the Receptor socket
|
||||
receptor_ctl = get_receptor_ctl()
|
||||
|
||||
res = None
|
||||
try:
|
||||
res = self._run_internal(receptor_ctl)
|
||||
return res
|
||||
finally:
|
||||
# Make sure to always release the work unit if we established it
|
||||
if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK:
|
||||
try:
|
||||
receptor_ctl.simple_command(f"work release {self.unit_id}")
|
||||
except Exception:
|
||||
logger.exception(f"Error releasing work unit {self.unit_id}.")
|
||||
|
||||
@property
|
||||
def sign_work(self):
|
||||
return False if settings.IS_K8S else True
|
||||
|
||||
def _run_internal(self, receptor_ctl):
|
||||
# Create a socketpair. Where the left side will be used for writing our payload
|
||||
# (private data dir, kwargs). The right side will be passed to Receptor for
|
||||
# reading.
|
||||
sockin, sockout = socket.socketpair()
|
||||
|
||||
transmitter_thread = TransmitterThread(target=self.transmit, args=[sockin])
|
||||
transmitter_thread.start()
|
||||
|
||||
# submit our work, passing
|
||||
# in the right side of our socketpair for reading.
|
||||
_kw = {}
|
||||
if self.work_type == 'ansible-runner':
|
||||
_kw['node'] = self.task.instance.execution_node
|
||||
use_stream_tls = get_conn_type(_kw['node'], receptor_ctl).name == "STREAMTLS"
|
||||
_kw['tlsclient'] = get_tls_client(use_stream_tls)
|
||||
result = receptor_ctl.submit_work(worktype=self.work_type, payload=sockout.makefile('rb'), params=self.receptor_params, signwork=self.sign_work, **_kw)
|
||||
self.unit_id = result['unitid']
|
||||
# Update the job with the work unit in-memory so that the log_lifecycle
|
||||
# will print out the work unit that is to be associated with the job in the database
|
||||
# via the update_model() call.
|
||||
# We want to log the work_unit_id as early as possible. A failure can happen in between
|
||||
# when we start the job in receptor and when we associate the job <-> work_unit_id.
|
||||
# In that case, there will be work running in receptor and Controller will not know
|
||||
# which Job it is associated with.
|
||||
# We do not programatically handle this case. Ideally, we would handle this with a reaper case.
|
||||
# The two distinct job lifecycle log events below allow for us to at least detect when this
|
||||
# edge case occurs. If the lifecycle event work_unit_id_received occurs without the
|
||||
# work_unit_id_assigned event then this case may have occured.
|
||||
self.task.instance.work_unit_id = result['unitid'] # Set work_unit_id in-memory only
|
||||
self.task.instance.log_lifecycle("work_unit_id_received")
|
||||
self.task.update_model(self.task.instance.pk, work_unit_id=result['unitid'])
|
||||
self.task.instance.log_lifecycle("work_unit_id_assigned")
|
||||
|
||||
sockin.close()
|
||||
sockout.close()
|
||||
|
||||
if transmitter_thread.exc:
|
||||
raise transmitter_thread.exc[1].with_traceback(transmitter_thread.exc[2])
|
||||
|
||||
transmitter_thread.join()
|
||||
|
||||
resultsock, resultfile = receptor_ctl.get_work_results(self.unit_id, return_socket=True, return_sockfile=True)
|
||||
# Both "processor" and "cancel_watcher" are spawned in separate threads.
|
||||
# We wait for the first one to return. If cancel_watcher returns first,
|
||||
# we yank the socket out from underneath the processor, which will cause it
|
||||
# to exit. A reference to the processor_future is passed into the cancel_watcher_future,
|
||||
# Which exits if the job has finished normally. The context manager ensures we do not
|
||||
# leave any threads laying around.
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
processor_future = executor.submit(self.processor, resultfile)
|
||||
cancel_watcher_future = executor.submit(self.cancel_watcher, processor_future)
|
||||
futures = [processor_future, cancel_watcher_future]
|
||||
first_future = concurrent.futures.wait(futures, return_when=concurrent.futures.FIRST_COMPLETED)
|
||||
|
||||
res = list(first_future.done)[0].result()
|
||||
if res.status == 'canceled':
|
||||
receptor_ctl.simple_command(f"work cancel {self.unit_id}")
|
||||
resultsock.shutdown(socket.SHUT_RDWR)
|
||||
resultfile.close()
|
||||
elif res.status == 'error':
|
||||
try:
|
||||
unit_status = receptor_ctl.simple_command(f'work status {self.unit_id}')
|
||||
detail = unit_status.get('Detail', None)
|
||||
state_name = unit_status.get('StateName', None)
|
||||
except Exception:
|
||||
detail = ''
|
||||
state_name = ''
|
||||
logger.exception(f'An error was encountered while getting status for work unit {self.unit_id}')
|
||||
|
||||
if 'exceeded quota' in detail:
|
||||
logger.warn(detail)
|
||||
log_name = self.task.instance.log_format
|
||||
logger.warn(f"Could not launch pod for {log_name}. Exceeded quota.")
|
||||
self.task.update_model(self.task.instance.pk, status='pending')
|
||||
return
|
||||
# If ansible-runner ran, but an error occured at runtime, the traceback information
|
||||
# is saved via the status_handler passed in to the processor.
|
||||
if state_name == 'Succeeded':
|
||||
return res
|
||||
|
||||
if not self.task.instance.result_traceback:
|
||||
try:
|
||||
resultsock = receptor_ctl.get_work_results(self.unit_id, return_sockfile=True)
|
||||
lines = resultsock.readlines()
|
||||
receptor_output = b"".join(lines).decode()
|
||||
if receptor_output:
|
||||
self.task.instance.result_traceback = receptor_output
|
||||
self.task.instance.save(update_fields=['result_traceback'])
|
||||
elif detail:
|
||||
self.task.instance.result_traceback = detail
|
||||
self.task.instance.save(update_fields=['result_traceback'])
|
||||
else:
|
||||
logger.warn(f'No result details or output from {self.task.instance.log_format}, status:\n{unit_status}')
|
||||
except Exception:
|
||||
raise RuntimeError(detail)
|
||||
|
||||
return res
|
||||
|
||||
# Spawned in a thread so Receptor can start reading before we finish writing, we
|
||||
# write our payload to the left side of our socketpair.
|
||||
@cleanup_new_process
|
||||
def transmit(self, _socket):
|
||||
try:
|
||||
ansible_runner.interface.run(streamer='transmit', _output=_socket.makefile('wb'), **self.runner_params)
|
||||
finally:
|
||||
# Socket must be shutdown here, or the reader will hang forever.
|
||||
_socket.shutdown(socket.SHUT_WR)
|
||||
|
||||
@cleanup_new_process
|
||||
def processor(self, resultfile):
|
||||
return ansible_runner.interface.run(
|
||||
streamer='process',
|
||||
quiet=True,
|
||||
_input=resultfile,
|
||||
event_handler=self.task.event_handler,
|
||||
finished_callback=self.task.finished_callback,
|
||||
status_handler=self.task.status_handler,
|
||||
**self.runner_params,
|
||||
)
|
||||
|
||||
@property
|
||||
def receptor_params(self):
|
||||
if self.task.instance.is_container_group_task:
|
||||
spec_yaml = yaml.dump(self.pod_definition, explicit_start=True)
|
||||
|
||||
receptor_params = {
|
||||
"secret_kube_pod": spec_yaml,
|
||||
"pod_pending_timeout": getattr(settings, 'AWX_CONTAINER_GROUP_POD_PENDING_TIMEOUT', "5m"),
|
||||
}
|
||||
|
||||
if self.credential:
|
||||
kubeconfig_yaml = yaml.dump(self.kube_config, explicit_start=True)
|
||||
receptor_params["secret_kube_config"] = kubeconfig_yaml
|
||||
else:
|
||||
private_data_dir = self.runner_params['private_data_dir']
|
||||
if self.work_type == 'ansible-runner' and settings.AWX_CLEANUP_PATHS:
|
||||
# on execution nodes, we rely on the private data dir being deleted
|
||||
cli_params = f"--private-data-dir={private_data_dir} --delete"
|
||||
else:
|
||||
# on hybrid nodes, we rely on the private data dir NOT being deleted
|
||||
cli_params = f"--private-data-dir={private_data_dir}"
|
||||
receptor_params = {"params": cli_params}
|
||||
|
||||
return receptor_params
|
||||
|
||||
@property
|
||||
def work_type(self):
|
||||
if self.task.instance.is_container_group_task:
|
||||
if self.credential:
|
||||
return 'kubernetes-runtime-auth'
|
||||
return 'kubernetes-incluster-auth'
|
||||
if self.task.instance.execution_node == settings.CLUSTER_HOST_ID or self.task.instance.execution_node == self.task.instance.controller_node:
|
||||
return 'local'
|
||||
return 'ansible-runner'
|
||||
|
||||
@cleanup_new_process
|
||||
def cancel_watcher(self, processor_future):
|
||||
while True:
|
||||
if processor_future.done():
|
||||
return processor_future.result()
|
||||
|
||||
if self.task.cancel_callback():
|
||||
result = namedtuple('result', ['status', 'rc'])
|
||||
return result('canceled', 1)
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
@property
|
||||
def pod_definition(self):
|
||||
ee = self.task.instance.execution_environment
|
||||
|
||||
default_pod_spec = get_default_pod_spec()
|
||||
|
||||
pod_spec_override = {}
|
||||
if self.task and self.task.instance.instance_group.pod_spec_override:
|
||||
pod_spec_override = parse_yaml_or_json(self.task.instance.instance_group.pod_spec_override)
|
||||
# According to the deepmerge docstring, the second dictionary will override when
|
||||
# they share keys, which is the desired behavior.
|
||||
# This allows user to only provide elements they want to override, and for us to still provide any
|
||||
# defaults they don't want to change
|
||||
pod_spec = deepmerge(default_pod_spec, pod_spec_override)
|
||||
|
||||
pod_spec['spec']['containers'][0]['image'] = ee.image
|
||||
pod_spec['spec']['containers'][0]['args'] = ['ansible-runner', 'worker', '--private-data-dir=/runner']
|
||||
|
||||
# Enforce EE Pull Policy
|
||||
pull_options = {"always": "Always", "missing": "IfNotPresent", "never": "Never"}
|
||||
if self.task and self.task.instance.execution_environment:
|
||||
if self.task.instance.execution_environment.pull:
|
||||
pod_spec['spec']['containers'][0]['imagePullPolicy'] = pull_options[self.task.instance.execution_environment.pull]
|
||||
|
||||
if self.task and self.task.instance.is_container_group_task:
|
||||
# If EE credential is passed, create an imagePullSecret
|
||||
if self.task.instance.execution_environment and self.task.instance.execution_environment.credential:
|
||||
# Create pull secret in k8s cluster based on ee cred
|
||||
from awx.main.scheduler.kubernetes import PodManager # prevent circular import
|
||||
|
||||
pm = PodManager(self.task.instance)
|
||||
secret_name = pm.create_secret(job=self.task.instance)
|
||||
|
||||
# Inject secret name into podspec
|
||||
pod_spec['spec']['imagePullSecrets'] = [{"name": secret_name}]
|
||||
|
||||
if self.task:
|
||||
pod_spec['metadata'] = deepmerge(
|
||||
pod_spec.get('metadata', {}),
|
||||
dict(name=self.pod_name, labels={'ansible-awx': settings.INSTALL_UUID, 'ansible-awx-job-id': str(self.task.instance.id)}),
|
||||
)
|
||||
|
||||
return pod_spec
|
||||
|
||||
@property
|
||||
def pod_name(self):
|
||||
return f"automation-job-{self.task.instance.id}"
|
||||
|
||||
@property
|
||||
def credential(self):
|
||||
return self.task.instance.instance_group.credential
|
||||
|
||||
@property
|
||||
def namespace(self):
|
||||
return self.pod_definition['metadata']['namespace']
|
||||
|
||||
@property
|
||||
def kube_config(self):
|
||||
host_input = self.credential.get_input('host')
|
||||
config = {
|
||||
"apiVersion": "v1",
|
||||
"kind": "Config",
|
||||
"preferences": {},
|
||||
"clusters": [{"name": host_input, "cluster": {"server": host_input}}],
|
||||
"users": [{"name": host_input, "user": {"token": self.credential.get_input('bearer_token')}}],
|
||||
"contexts": [{"name": host_input, "context": {"cluster": host_input, "user": host_input, "namespace": self.namespace}}],
|
||||
"current-context": host_input,
|
||||
}
|
||||
|
||||
if self.credential.get_input('verify_ssl') and 'ssl_ca_cert' in self.credential.inputs:
|
||||
config["clusters"][0]["cluster"]["certificate-authority-data"] = b64encode(
|
||||
self.credential.get_input('ssl_ca_cert').encode() # encode to bytes
|
||||
).decode() # decode the base64 data into a str
|
||||
else:
|
||||
config["clusters"][0]["cluster"]["insecure-skip-tls-verify"] = True
|
||||
return config
|
||||
897
awx/main/tasks/system.py
Normal file
897
awx/main/tasks/system.py
Normal file
@@ -0,0 +1,897 @@
|
||||
# Python
|
||||
from collections import namedtuple
|
||||
import functools
|
||||
import importlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from io import StringIO
|
||||
from contextlib import redirect_stdout
|
||||
import shutil
|
||||
import time
|
||||
from distutils.version import LooseVersion as Version
|
||||
|
||||
# Django
|
||||
from django.conf import settings
|
||||
from django.db import transaction, DatabaseError, IntegrityError
|
||||
from django.db.models.fields.related import ForeignKey
|
||||
from django.utils.timezone import now
|
||||
from django.utils.encoding import smart_str
|
||||
from django.contrib.auth.models import User
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import gettext_noop
|
||||
from django.core.cache import cache
|
||||
from django.core.exceptions import ObjectDoesNotExist
|
||||
|
||||
# Django-CRUM
|
||||
from crum import impersonate
|
||||
|
||||
|
||||
# Runner
|
||||
import ansible_runner.cleanup
|
||||
|
||||
# dateutil
|
||||
from dateutil.parser import parse as parse_date
|
||||
|
||||
# AWX
|
||||
from awx import __version__ as awx_application_version
|
||||
from awx.main.access import access_registry
|
||||
from awx.main.models import (
|
||||
Schedule,
|
||||
TowerScheduleState,
|
||||
Instance,
|
||||
InstanceGroup,
|
||||
UnifiedJob,
|
||||
Notification,
|
||||
Inventory,
|
||||
SmartInventoryMembership,
|
||||
Job,
|
||||
)
|
||||
from awx.main.constants import ACTIVE_STATES
|
||||
from awx.main.dispatch.publish import task
|
||||
from awx.main.dispatch import get_local_queuename, reaper
|
||||
from awx.main.utils.common import (
|
||||
ignore_inventory_computed_fields,
|
||||
ignore_inventory_group_removal,
|
||||
schedule_task_manager,
|
||||
)
|
||||
|
||||
from awx.main.utils.external_logging import reconfigure_rsyslog
|
||||
from awx.main.utils.reload import stop_local_services
|
||||
from awx.main.utils.pglock import advisory_lock
|
||||
from awx.main.tasks.receptor import get_receptor_ctl, worker_info, worker_cleanup, administrative_workunit_reaper
|
||||
from awx.main.consumers import emit_channel_notification
|
||||
from awx.main import analytics
|
||||
from awx.conf import settings_registry
|
||||
from awx.main.analytics.subsystem_metrics import Metrics
|
||||
|
||||
from rest_framework.exceptions import PermissionDenied
|
||||
|
||||
logger = logging.getLogger('awx.main.tasks.system')
|
||||
|
||||
OPENSSH_KEY_ERROR = u'''\
|
||||
It looks like you're trying to use a private key in OpenSSH format, which \
|
||||
isn't supported by the installed version of OpenSSH on this instance. \
|
||||
Try upgrading OpenSSH or providing your private key in an different format. \
|
||||
'''
|
||||
|
||||
|
||||
def dispatch_startup():
|
||||
startup_logger = logging.getLogger('awx.main.tasks')
|
||||
startup_logger.debug("Syncing Schedules")
|
||||
for sch in Schedule.objects.all():
|
||||
try:
|
||||
sch.update_computed_fields()
|
||||
except Exception:
|
||||
logger.exception("Failed to rebuild schedule {}.".format(sch))
|
||||
|
||||
#
|
||||
# When the dispatcher starts, if the instance cannot be found in the database,
|
||||
# automatically register it. This is mostly useful for openshift-based
|
||||
# deployments where:
|
||||
#
|
||||
# 2 Instances come online
|
||||
# Instance B encounters a network blip, Instance A notices, and
|
||||
# deprovisions it
|
||||
# Instance B's connectivity is restored, the dispatcher starts, and it
|
||||
# re-registers itself
|
||||
#
|
||||
# In traditional container-less deployments, instances don't get
|
||||
# deprovisioned when they miss their heartbeat, so this code is mostly a
|
||||
# no-op.
|
||||
#
|
||||
apply_cluster_membership_policies()
|
||||
cluster_node_heartbeat()
|
||||
Metrics().clear_values()
|
||||
|
||||
# Update Tower's rsyslog.conf file based on loggins settings in the db
|
||||
reconfigure_rsyslog()
|
||||
|
||||
|
||||
def inform_cluster_of_shutdown():
|
||||
try:
|
||||
this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
|
||||
this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal'))
|
||||
try:
|
||||
reaper.reap(this_inst)
|
||||
except Exception:
|
||||
logger.exception('failed to reap jobs for {}'.format(this_inst.hostname))
|
||||
logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname))
|
||||
except Exception:
|
||||
logger.exception('Encountered problem with normal shutdown signal.')
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def apply_cluster_membership_policies():
|
||||
from awx.main.signals import disable_activity_stream
|
||||
|
||||
started_waiting = time.time()
|
||||
with advisory_lock('cluster_policy_lock', wait=True):
|
||||
lock_time = time.time() - started_waiting
|
||||
if lock_time > 1.0:
|
||||
to_log = logger.info
|
||||
else:
|
||||
to_log = logger.debug
|
||||
to_log('Waited {} seconds to obtain lock name: cluster_policy_lock'.format(lock_time))
|
||||
started_compute = time.time()
|
||||
# Hop nodes should never get assigned to an InstanceGroup.
|
||||
all_instances = list(Instance.objects.exclude(node_type='hop').order_by('id'))
|
||||
all_groups = list(InstanceGroup.objects.prefetch_related('instances'))
|
||||
|
||||
total_instances = len(all_instances)
|
||||
actual_groups = []
|
||||
actual_instances = []
|
||||
Group = namedtuple('Group', ['obj', 'instances', 'prior_instances'])
|
||||
Node = namedtuple('Instance', ['obj', 'groups'])
|
||||
|
||||
# Process policy instance list first, these will represent manually managed memberships
|
||||
instance_hostnames_map = {inst.hostname: inst for inst in all_instances}
|
||||
for ig in all_groups:
|
||||
group_actual = Group(obj=ig, instances=[], prior_instances=[instance.pk for instance in ig.instances.all()]) # obtained in prefetch
|
||||
for hostname in ig.policy_instance_list:
|
||||
if hostname not in instance_hostnames_map:
|
||||
logger.info("Unknown instance {} in {} policy list".format(hostname, ig.name))
|
||||
continue
|
||||
inst = instance_hostnames_map[hostname]
|
||||
group_actual.instances.append(inst.id)
|
||||
# NOTE: arguable behavior: policy-list-group is not added to
|
||||
# instance's group count for consideration in minimum-policy rules
|
||||
if group_actual.instances:
|
||||
logger.debug("Policy List, adding Instances {} to Group {}".format(group_actual.instances, ig.name))
|
||||
|
||||
actual_groups.append(group_actual)
|
||||
|
||||
# Process Instance minimum policies next, since it represents a concrete lower bound to the
|
||||
# number of instances to make available to instance groups
|
||||
actual_instances = [Node(obj=i, groups=[]) for i in all_instances if i.managed_by_policy]
|
||||
logger.debug("Total instances: {}, available for policy: {}".format(total_instances, len(actual_instances)))
|
||||
for g in sorted(actual_groups, key=lambda x: len(x.instances)):
|
||||
exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
|
||||
policy_min_added = []
|
||||
for i in sorted(actual_instances, key=lambda x: len(x.groups)):
|
||||
if i.obj.node_type == exclude_type:
|
||||
continue # never place execution instances in controlplane group or control instances in other groups
|
||||
if len(g.instances) >= g.obj.policy_instance_minimum:
|
||||
break
|
||||
if i.obj.id in g.instances:
|
||||
# If the instance is already _in_ the group, it was
|
||||
# applied earlier via the policy list
|
||||
continue
|
||||
g.instances.append(i.obj.id)
|
||||
i.groups.append(g.obj.id)
|
||||
policy_min_added.append(i.obj.id)
|
||||
if policy_min_added:
|
||||
logger.debug("Policy minimum, adding Instances {} to Group {}".format(policy_min_added, g.obj.name))
|
||||
|
||||
# Finally, process instance policy percentages
|
||||
for g in sorted(actual_groups, key=lambda x: len(x.instances)):
|
||||
exclude_type = 'execution' if g.obj.name == settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME else 'control'
|
||||
candidate_pool_ct = sum(1 for i in actual_instances if i.obj.node_type != exclude_type)
|
||||
if not candidate_pool_ct:
|
||||
continue
|
||||
policy_per_added = []
|
||||
for i in sorted(actual_instances, key=lambda x: len(x.groups)):
|
||||
if i.obj.node_type == exclude_type:
|
||||
continue
|
||||
if i.obj.id in g.instances:
|
||||
# If the instance is already _in_ the group, it was
|
||||
# applied earlier via a minimum policy or policy list
|
||||
continue
|
||||
if 100 * float(len(g.instances)) / candidate_pool_ct >= g.obj.policy_instance_percentage:
|
||||
break
|
||||
g.instances.append(i.obj.id)
|
||||
i.groups.append(g.obj.id)
|
||||
policy_per_added.append(i.obj.id)
|
||||
if policy_per_added:
|
||||
logger.debug("Policy percentage, adding Instances {} to Group {}".format(policy_per_added, g.obj.name))
|
||||
|
||||
# Determine if any changes need to be made
|
||||
needs_change = False
|
||||
for g in actual_groups:
|
||||
if set(g.instances) != set(g.prior_instances):
|
||||
needs_change = True
|
||||
break
|
||||
if not needs_change:
|
||||
logger.debug('Cluster policy no-op finished in {} seconds'.format(time.time() - started_compute))
|
||||
return
|
||||
|
||||
# On a differential basis, apply instances to groups
|
||||
with transaction.atomic():
|
||||
with disable_activity_stream():
|
||||
for g in actual_groups:
|
||||
if g.obj.is_container_group:
|
||||
logger.debug('Skipping containerized group {} for policy calculation'.format(g.obj.name))
|
||||
continue
|
||||
instances_to_add = set(g.instances) - set(g.prior_instances)
|
||||
instances_to_remove = set(g.prior_instances) - set(g.instances)
|
||||
if instances_to_add:
|
||||
logger.debug('Adding instances {} to group {}'.format(list(instances_to_add), g.obj.name))
|
||||
g.obj.instances.add(*instances_to_add)
|
||||
if instances_to_remove:
|
||||
logger.debug('Removing instances {} from group {}'.format(list(instances_to_remove), g.obj.name))
|
||||
g.obj.instances.remove(*instances_to_remove)
|
||||
logger.debug('Cluster policy computation finished in {} seconds'.format(time.time() - started_compute))
|
||||
|
||||
|
||||
@task(queue='tower_broadcast_all')
|
||||
def handle_setting_changes(setting_keys):
|
||||
orig_len = len(setting_keys)
|
||||
for i in range(orig_len):
|
||||
for dependent_key in settings_registry.get_dependent_settings(setting_keys[i]):
|
||||
setting_keys.append(dependent_key)
|
||||
cache_keys = set(setting_keys)
|
||||
logger.debug('cache delete_many(%r)', cache_keys)
|
||||
cache.delete_many(cache_keys)
|
||||
|
||||
if any([setting.startswith('LOG_AGGREGATOR') for setting in setting_keys]):
|
||||
reconfigure_rsyslog()
|
||||
|
||||
|
||||
@task(queue='tower_broadcast_all')
|
||||
def delete_project_files(project_path):
|
||||
# TODO: possibly implement some retry logic
|
||||
lock_file = project_path + '.lock'
|
||||
if os.path.exists(project_path):
|
||||
try:
|
||||
shutil.rmtree(project_path)
|
||||
logger.debug('Success removing project files {}'.format(project_path))
|
||||
except Exception:
|
||||
logger.exception('Could not remove project directory {}'.format(project_path))
|
||||
if os.path.exists(lock_file):
|
||||
try:
|
||||
os.remove(lock_file)
|
||||
logger.debug('Success removing {}'.format(lock_file))
|
||||
except Exception:
|
||||
logger.exception('Could not remove lock file {}'.format(lock_file))
|
||||
|
||||
|
||||
@task(queue='tower_broadcast_all')
|
||||
def profile_sql(threshold=1, minutes=1):
|
||||
if threshold <= 0:
|
||||
cache.delete('awx-profile-sql-threshold')
|
||||
logger.error('SQL PROFILING DISABLED')
|
||||
else:
|
||||
cache.set('awx-profile-sql-threshold', threshold, timeout=minutes * 60)
|
||||
logger.error('SQL QUERIES >={}s ENABLED FOR {} MINUTE(S)'.format(threshold, minutes))
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def send_notifications(notification_list, job_id=None):
|
||||
if not isinstance(notification_list, list):
|
||||
raise TypeError("notification_list should be of type list")
|
||||
if job_id is not None:
|
||||
job_actual = UnifiedJob.objects.get(id=job_id)
|
||||
|
||||
notifications = Notification.objects.filter(id__in=notification_list)
|
||||
if job_id is not None:
|
||||
job_actual.notifications.add(*notifications)
|
||||
|
||||
for notification in notifications:
|
||||
update_fields = ['status', 'notifications_sent']
|
||||
try:
|
||||
sent = notification.notification_template.send(notification.subject, notification.body)
|
||||
notification.status = "successful"
|
||||
notification.notifications_sent = sent
|
||||
if job_id is not None:
|
||||
job_actual.log_lifecycle("notifications_sent")
|
||||
except Exception as e:
|
||||
logger.exception("Send Notification Failed {}".format(e))
|
||||
notification.status = "failed"
|
||||
notification.error = smart_str(e)
|
||||
update_fields.append('error')
|
||||
finally:
|
||||
try:
|
||||
notification.save(update_fields=update_fields)
|
||||
except Exception:
|
||||
logger.exception('Error saving notification {} result.'.format(notification.id))
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def gather_analytics():
|
||||
from awx.conf.models import Setting
|
||||
from rest_framework.fields import DateTimeField
|
||||
|
||||
last_gather = Setting.objects.filter(key='AUTOMATION_ANALYTICS_LAST_GATHER').first()
|
||||
last_time = DateTimeField().to_internal_value(last_gather.value) if last_gather and last_gather.value else None
|
||||
gather_time = now()
|
||||
|
||||
if not last_time or ((gather_time - last_time).total_seconds() > settings.AUTOMATION_ANALYTICS_GATHER_INTERVAL):
|
||||
analytics.gather()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def purge_old_stdout_files():
|
||||
nowtime = time.time()
|
||||
for f in os.listdir(settings.JOBOUTPUT_ROOT):
|
||||
if os.path.getctime(os.path.join(settings.JOBOUTPUT_ROOT, f)) < nowtime - settings.LOCAL_STDOUT_EXPIRE_TIME:
|
||||
os.unlink(os.path.join(settings.JOBOUTPUT_ROOT, f))
|
||||
logger.debug("Removing {}".format(os.path.join(settings.JOBOUTPUT_ROOT, f)))
|
||||
|
||||
|
||||
def _cleanup_images_and_files(**kwargs):
|
||||
if settings.IS_K8S:
|
||||
return
|
||||
this_inst = Instance.objects.me()
|
||||
runner_cleanup_kwargs = this_inst.get_cleanup_task_kwargs(**kwargs)
|
||||
if runner_cleanup_kwargs:
|
||||
stdout = ''
|
||||
with StringIO() as buffer:
|
||||
with redirect_stdout(buffer):
|
||||
ansible_runner.cleanup.run_cleanup(runner_cleanup_kwargs)
|
||||
stdout = buffer.getvalue()
|
||||
if '(changed: True)' in stdout:
|
||||
logger.info(f'Performed local cleanup with kwargs {kwargs}, output:\n{stdout}')
|
||||
|
||||
# if we are the first instance alphabetically, then run cleanup on execution nodes
|
||||
checker_instance = Instance.objects.filter(node_type__in=['hybrid', 'control'], enabled=True, capacity__gt=0).order_by('-hostname').first()
|
||||
if checker_instance and this_inst.hostname == checker_instance.hostname:
|
||||
for inst in Instance.objects.filter(node_type='execution', enabled=True, capacity__gt=0):
|
||||
runner_cleanup_kwargs = inst.get_cleanup_task_kwargs(**kwargs)
|
||||
if not runner_cleanup_kwargs:
|
||||
continue
|
||||
try:
|
||||
stdout = worker_cleanup(inst.hostname, runner_cleanup_kwargs)
|
||||
if '(changed: True)' in stdout:
|
||||
logger.info(f'Performed cleanup on execution node {inst.hostname} with output:\n{stdout}')
|
||||
except RuntimeError:
|
||||
logger.exception(f'Error running cleanup on execution node {inst.hostname}')
|
||||
|
||||
|
||||
@task(queue='tower_broadcast_all')
|
||||
def handle_removed_image(remove_images=None):
|
||||
"""Special broadcast invocation of this method to handle case of deleted EE"""
|
||||
_cleanup_images_and_files(remove_images=remove_images, file_pattern='')
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def cleanup_images_and_files():
|
||||
_cleanup_images_and_files()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def cluster_node_health_check(node):
|
||||
"""
|
||||
Used for the health check endpoint, refreshes the status of the instance, but must be ran on target node
|
||||
"""
|
||||
if node == '':
|
||||
logger.warn('Local health check incorrectly called with blank string')
|
||||
return
|
||||
elif node != settings.CLUSTER_HOST_ID:
|
||||
logger.warn(f'Local health check for {node} incorrectly sent to {settings.CLUSTER_HOST_ID}')
|
||||
return
|
||||
try:
|
||||
this_inst = Instance.objects.me()
|
||||
except Instance.DoesNotExist:
|
||||
logger.warn(f'Instance record for {node} missing, could not check capacity.')
|
||||
return
|
||||
this_inst.local_health_check()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def execution_node_health_check(node):
|
||||
if node == '':
|
||||
logger.warn('Remote health check incorrectly called with blank string')
|
||||
return
|
||||
try:
|
||||
instance = Instance.objects.get(hostname=node)
|
||||
except Instance.DoesNotExist:
|
||||
logger.warn(f'Instance record for {node} missing, could not check capacity.')
|
||||
return
|
||||
|
||||
if instance.node_type != 'execution':
|
||||
raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')
|
||||
|
||||
data = worker_info(node)
|
||||
|
||||
prior_capacity = instance.capacity
|
||||
|
||||
instance.save_health_data(
|
||||
version='ansible-runner-' + data.get('runner_version', '???'),
|
||||
cpu=data.get('cpu_count', 0),
|
||||
memory=data.get('mem_in_bytes', 0),
|
||||
uuid=data.get('uuid'),
|
||||
errors='\n'.join(data.get('errors', [])),
|
||||
)
|
||||
|
||||
if data['errors']:
|
||||
formatted_error = "\n".join(data["errors"])
|
||||
if prior_capacity:
|
||||
logger.warn(f'Health check marking execution node {node} as lost, errors:\n{formatted_error}')
|
||||
else:
|
||||
logger.info(f'Failed to find capacity of new or lost execution node {node}, errors:\n{formatted_error}')
|
||||
else:
|
||||
logger.info('Set capacity of execution node {} to {}, worker info data:\n{}'.format(node, instance.capacity, json.dumps(data, indent=2)))
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def inspect_execution_nodes(instance_list):
|
||||
with advisory_lock('inspect_execution_nodes_lock', wait=False):
|
||||
node_lookup = {inst.hostname: inst for inst in instance_list}
|
||||
|
||||
ctl = get_receptor_ctl()
|
||||
mesh_status = ctl.simple_command('status')
|
||||
|
||||
nowtime = now()
|
||||
workers = mesh_status['Advertisements']
|
||||
for ad in workers:
|
||||
hostname = ad['NodeID']
|
||||
if not any(cmd['WorkType'] == 'ansible-runner' for cmd in ad['WorkCommands'] or []):
|
||||
continue
|
||||
|
||||
changed = False
|
||||
if hostname in node_lookup:
|
||||
instance = node_lookup[hostname]
|
||||
else:
|
||||
logger.warn(f"Unrecognized node on mesh advertising ansible-runner work type: {hostname}")
|
||||
continue
|
||||
|
||||
was_lost = instance.is_lost(ref_time=nowtime)
|
||||
last_seen = parse_date(ad['Time'])
|
||||
|
||||
if instance.last_seen and instance.last_seen >= last_seen:
|
||||
continue
|
||||
instance.last_seen = last_seen
|
||||
instance.save(update_fields=['last_seen'])
|
||||
|
||||
if changed:
|
||||
execution_node_health_check.apply_async([hostname])
|
||||
elif was_lost:
|
||||
# if the instance *was* lost, but has appeared again,
|
||||
# attempt to re-establish the initial capacity and version
|
||||
# check
|
||||
logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
|
||||
execution_node_health_check.apply_async([hostname])
|
||||
elif instance.capacity == 0 and instance.enabled:
|
||||
# nodes with proven connection but need remediation run health checks are reduced frequency
|
||||
if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS:
|
||||
# Periodically re-run the health check of errored nodes, in case someone fixed it
|
||||
# TODO: perhaps decrease the frequency of these checks
|
||||
logger.debug(f'Restarting health check for execution node {hostname} with known errors.')
|
||||
execution_node_health_check.apply_async([hostname])
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def cluster_node_heartbeat():
|
||||
logger.debug("Cluster node heartbeat task.")
|
||||
nowtime = now()
|
||||
instance_list = list(Instance.objects.all())
|
||||
this_inst = None
|
||||
lost_instances = []
|
||||
|
||||
for inst in instance_list:
|
||||
if inst.hostname == settings.CLUSTER_HOST_ID:
|
||||
this_inst = inst
|
||||
instance_list.remove(inst)
|
||||
break
|
||||
else:
|
||||
(changed, this_inst) = Instance.objects.get_or_register()
|
||||
if changed:
|
||||
logger.info("Registered tower control node '{}'".format(this_inst.hostname))
|
||||
|
||||
inspect_execution_nodes(instance_list)
|
||||
|
||||
for inst in list(instance_list):
|
||||
if inst.is_lost(ref_time=nowtime):
|
||||
lost_instances.append(inst)
|
||||
instance_list.remove(inst)
|
||||
|
||||
if this_inst:
|
||||
startup_event = this_inst.is_lost(ref_time=nowtime)
|
||||
this_inst.local_health_check()
|
||||
if startup_event and this_inst.capacity != 0:
|
||||
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
|
||||
return
|
||||
else:
|
||||
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
|
||||
# IFF any node has a greater version than we do, then we'll shutdown services
|
||||
for other_inst in instance_list:
|
||||
if other_inst.version == "" or other_inst.version.startswith('ansible-runner') or other_inst.node_type == 'execution':
|
||||
continue
|
||||
if Version(other_inst.version.split('-', 1)[0]) > Version(awx_application_version.split('-', 1)[0]) and not settings.DEBUG:
|
||||
logger.error(
|
||||
"Host {} reports version {}, but this node {} is at {}, shutting down".format(
|
||||
other_inst.hostname, other_inst.version, this_inst.hostname, this_inst.version
|
||||
)
|
||||
)
|
||||
# Shutdown signal will set the capacity to zero to ensure no Jobs get added to this instance.
|
||||
# The heartbeat task will reset the capacity to the system capacity after upgrade.
|
||||
stop_local_services(communicate=False)
|
||||
raise RuntimeError("Shutting down.")
|
||||
|
||||
for other_inst in lost_instances:
|
||||
try:
|
||||
reaper.reap(other_inst)
|
||||
except Exception:
|
||||
logger.exception('failed to reap jobs for {}'.format(other_inst.hostname))
|
||||
try:
|
||||
# Capacity could already be 0 because:
|
||||
# * It's a new node and it never had a heartbeat
|
||||
# * It was set to 0 by another tower node running this method
|
||||
# * It was set to 0 by this node, but auto deprovisioning is off
|
||||
#
|
||||
# If auto deprovisining is on, don't bother setting the capacity to 0
|
||||
# since we will delete the node anyway.
|
||||
if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
||||
other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
|
||||
logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))
|
||||
elif settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
||||
deprovision_hostname = other_inst.hostname
|
||||
other_inst.delete()
|
||||
logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
|
||||
except DatabaseError as e:
|
||||
if 'did not affect any rows' in str(e):
|
||||
logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname))
|
||||
else:
|
||||
logger.exception('Error marking {} as lost'.format(other_inst.hostname))
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def awx_receptor_workunit_reaper():
|
||||
"""
|
||||
When an AWX job is launched via receptor, files such as status, stdin, and stdout are created
|
||||
in a specific receptor directory. This directory on disk is a random 8 character string, e.g. qLL2JFNT
|
||||
This is also called the work Unit ID in receptor, and is used in various receptor commands,
|
||||
e.g. "work results qLL2JFNT"
|
||||
After an AWX job executes, the receptor work unit directory is cleaned up by
|
||||
issuing the work release command. In some cases the release process might fail, or
|
||||
if AWX crashes during a job's execution, the work release command is never issued to begin with.
|
||||
As such, this periodic task will obtain a list of all receptor work units, and find which ones
|
||||
belong to AWX jobs that are in a completed state (status is canceled, error, or succeeded).
|
||||
This task will call "work release" on each of these work units to clean up the files on disk.
|
||||
|
||||
Note that when we call "work release" on a work unit that actually represents remote work
|
||||
both the local and remote work units are cleaned up.
|
||||
|
||||
Since we are cleaning up jobs that controller considers to be inactive, we take the added
|
||||
precaution of calling "work cancel" in case the work unit is still active.
|
||||
"""
|
||||
if not settings.RECEPTOR_RELEASE_WORK:
|
||||
return
|
||||
logger.debug("Checking for unreleased receptor work units")
|
||||
receptor_ctl = get_receptor_ctl()
|
||||
receptor_work_list = receptor_ctl.simple_command("work list")
|
||||
|
||||
unit_ids = [id for id in receptor_work_list]
|
||||
jobs_with_unreleased_receptor_units = UnifiedJob.objects.filter(work_unit_id__in=unit_ids).exclude(status__in=ACTIVE_STATES)
|
||||
for job in jobs_with_unreleased_receptor_units:
|
||||
logger.debug(f"{job.log_format} is not active, reaping receptor work unit {job.work_unit_id}")
|
||||
receptor_ctl.simple_command(f"work cancel {job.work_unit_id}")
|
||||
receptor_ctl.simple_command(f"work release {job.work_unit_id}")
|
||||
|
||||
administrative_workunit_reaper(receptor_work_list)
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def awx_k8s_reaper():
|
||||
if not settings.RECEPTOR_RELEASE_WORK:
|
||||
return
|
||||
|
||||
from awx.main.scheduler.kubernetes import PodManager # prevent circular import
|
||||
|
||||
for group in InstanceGroup.objects.filter(is_container_group=True).iterator():
|
||||
logger.debug("Checking for orphaned k8s pods for {}.".format(group))
|
||||
pods = PodManager.list_active_jobs(group)
|
||||
for job in UnifiedJob.objects.filter(pk__in=pods.keys()).exclude(status__in=ACTIVE_STATES):
|
||||
logger.debug('{} is no longer active, reaping orphaned k8s pod'.format(job.log_format))
|
||||
try:
|
||||
pm = PodManager(job)
|
||||
pm.kube_api.delete_namespaced_pod(name=pods[job.id], namespace=pm.namespace, _request_timeout=settings.AWX_CONTAINER_GROUP_K8S_API_TIMEOUT)
|
||||
except Exception:
|
||||
logger.exception("Failed to delete orphaned pod {} from {}".format(job.log_format, group))
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def awx_periodic_scheduler():
|
||||
with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired:
|
||||
if acquired is False:
|
||||
logger.debug("Not running periodic scheduler, another task holds lock")
|
||||
return
|
||||
logger.debug("Starting periodic scheduler")
|
||||
|
||||
run_now = now()
|
||||
state = TowerScheduleState.get_solo()
|
||||
last_run = state.schedule_last_run
|
||||
logger.debug("Last scheduler run was: %s", last_run)
|
||||
state.schedule_last_run = run_now
|
||||
state.save()
|
||||
|
||||
old_schedules = Schedule.objects.enabled().before(last_run)
|
||||
for schedule in old_schedules:
|
||||
schedule.update_computed_fields()
|
||||
schedules = Schedule.objects.enabled().between(last_run, run_now)
|
||||
|
||||
invalid_license = False
|
||||
try:
|
||||
access_registry[Job](None).check_license(quiet=True)
|
||||
except PermissionDenied as e:
|
||||
invalid_license = e
|
||||
|
||||
for schedule in schedules:
|
||||
template = schedule.unified_job_template
|
||||
schedule.update_computed_fields() # To update next_run timestamp.
|
||||
if template.cache_timeout_blocked:
|
||||
logger.warn("Cache timeout is in the future, bypassing schedule for template %s" % str(template.id))
|
||||
continue
|
||||
try:
|
||||
job_kwargs = schedule.get_job_kwargs()
|
||||
new_unified_job = schedule.unified_job_template.create_unified_job(**job_kwargs)
|
||||
logger.debug('Spawned {} from schedule {}-{}.'.format(new_unified_job.log_format, schedule.name, schedule.pk))
|
||||
|
||||
if invalid_license:
|
||||
new_unified_job.status = 'failed'
|
||||
new_unified_job.job_explanation = str(invalid_license)
|
||||
new_unified_job.save(update_fields=['status', 'job_explanation'])
|
||||
new_unified_job.websocket_emit_status("failed")
|
||||
raise invalid_license
|
||||
can_start = new_unified_job.signal_start()
|
||||
except Exception:
|
||||
logger.exception('Error spawning scheduled job.')
|
||||
continue
|
||||
if not can_start:
|
||||
new_unified_job.status = 'failed'
|
||||
new_unified_job.job_explanation = gettext_noop(
|
||||
"Scheduled job could not start because it \
|
||||
was not in the right state or required manual credentials"
|
||||
)
|
||||
new_unified_job.save(update_fields=['status', 'job_explanation'])
|
||||
new_unified_job.websocket_emit_status("failed")
|
||||
emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules"))
|
||||
state.save()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def handle_work_success(task_actual):
|
||||
try:
|
||||
instance = UnifiedJob.get_instance_by_type(task_actual['type'], task_actual['id'])
|
||||
except ObjectDoesNotExist:
|
||||
logger.warning('Missing {} `{}` in success callback.'.format(task_actual['type'], task_actual['id']))
|
||||
return
|
||||
if not instance:
|
||||
return
|
||||
|
||||
schedule_task_manager()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def handle_work_error(task_id, *args, **kwargs):
|
||||
subtasks = kwargs.get('subtasks', None)
|
||||
logger.debug('Executing error task id %s, subtasks: %s' % (task_id, str(subtasks)))
|
||||
first_instance = None
|
||||
first_instance_type = ''
|
||||
if subtasks is not None:
|
||||
for each_task in subtasks:
|
||||
try:
|
||||
instance = UnifiedJob.get_instance_by_type(each_task['type'], each_task['id'])
|
||||
if not instance:
|
||||
# Unknown task type
|
||||
logger.warn("Unknown task type: {}".format(each_task['type']))
|
||||
continue
|
||||
except ObjectDoesNotExist:
|
||||
logger.warning('Missing {} `{}` in error callback.'.format(each_task['type'], each_task['id']))
|
||||
continue
|
||||
|
||||
if first_instance is None:
|
||||
first_instance = instance
|
||||
first_instance_type = each_task['type']
|
||||
|
||||
if instance.celery_task_id != task_id and not instance.cancel_flag and not instance.status == 'successful':
|
||||
instance.status = 'failed'
|
||||
instance.failed = True
|
||||
if not instance.job_explanation:
|
||||
instance.job_explanation = 'Previous Task Failed: {"job_type": "%s", "job_name": "%s", "job_id": "%s"}' % (
|
||||
first_instance_type,
|
||||
first_instance.name,
|
||||
first_instance.id,
|
||||
)
|
||||
instance.save()
|
||||
instance.websocket_emit_status("failed")
|
||||
|
||||
# We only send 1 job complete message since all the job completion message
|
||||
# handling does is trigger the scheduler. If we extend the functionality of
|
||||
# what the job complete message handler does then we may want to send a
|
||||
# completion event for each job here.
|
||||
if first_instance:
|
||||
schedule_task_manager()
|
||||
pass
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def handle_success_and_failure_notifications(job_id):
|
||||
uj = UnifiedJob.objects.get(pk=job_id)
|
||||
retries = 0
|
||||
while retries < 5:
|
||||
if uj.finished:
|
||||
uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed')
|
||||
return
|
||||
else:
|
||||
# wait a few seconds to avoid a race where the
|
||||
# events are persisted _before_ the UJ.status
|
||||
# changes from running -> successful
|
||||
retries += 1
|
||||
time.sleep(1)
|
||||
uj = UnifiedJob.objects.get(pk=job_id)
|
||||
|
||||
logger.warn(f"Failed to even try to send notifications for job '{uj}' due to job not being in finished state.")
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def update_inventory_computed_fields(inventory_id):
|
||||
"""
|
||||
Signal handler and wrapper around inventory.update_computed_fields to
|
||||
prevent unnecessary recursive calls.
|
||||
"""
|
||||
i = Inventory.objects.filter(id=inventory_id)
|
||||
if not i.exists():
|
||||
logger.error("Update Inventory Computed Fields failed due to missing inventory: " + str(inventory_id))
|
||||
return
|
||||
i = i[0]
|
||||
try:
|
||||
i.update_computed_fields()
|
||||
except DatabaseError as e:
|
||||
if 'did not affect any rows' in str(e):
|
||||
logger.debug('Exiting duplicate update_inventory_computed_fields task.')
|
||||
return
|
||||
raise
|
||||
|
||||
|
||||
def update_smart_memberships_for_inventory(smart_inventory):
|
||||
current = set(SmartInventoryMembership.objects.filter(inventory=smart_inventory).values_list('host_id', flat=True))
|
||||
new = set(smart_inventory.hosts.values_list('id', flat=True))
|
||||
additions = new - current
|
||||
removals = current - new
|
||||
if additions or removals:
|
||||
with transaction.atomic():
|
||||
if removals:
|
||||
SmartInventoryMembership.objects.filter(inventory=smart_inventory, host_id__in=removals).delete()
|
||||
if additions:
|
||||
add_for_inventory = [SmartInventoryMembership(inventory_id=smart_inventory.id, host_id=host_id) for host_id in additions]
|
||||
SmartInventoryMembership.objects.bulk_create(add_for_inventory, ignore_conflicts=True)
|
||||
logger.debug(
|
||||
'Smart host membership cached for {}, {} additions, {} removals, {} total count.'.format(
|
||||
smart_inventory.pk, len(additions), len(removals), len(new)
|
||||
)
|
||||
)
|
||||
return True # changed
|
||||
return False
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def update_host_smart_inventory_memberships():
|
||||
smart_inventories = Inventory.objects.filter(kind='smart', host_filter__isnull=False, pending_deletion=False)
|
||||
changed_inventories = set([])
|
||||
for smart_inventory in smart_inventories:
|
||||
try:
|
||||
changed = update_smart_memberships_for_inventory(smart_inventory)
|
||||
if changed:
|
||||
changed_inventories.add(smart_inventory)
|
||||
except IntegrityError:
|
||||
logger.exception('Failed to update smart inventory memberships for {}'.format(smart_inventory.pk))
|
||||
# Update computed fields for changed inventories outside atomic action
|
||||
for smart_inventory in changed_inventories:
|
||||
smart_inventory.update_computed_fields()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def delete_inventory(inventory_id, user_id, retries=5):
|
||||
# Delete inventory as user
|
||||
if user_id is None:
|
||||
user = None
|
||||
else:
|
||||
try:
|
||||
user = User.objects.get(id=user_id)
|
||||
except Exception:
|
||||
user = None
|
||||
with ignore_inventory_computed_fields(), ignore_inventory_group_removal(), impersonate(user):
|
||||
try:
|
||||
i = Inventory.objects.get(id=inventory_id)
|
||||
for host in i.hosts.iterator():
|
||||
host.job_events_as_primary_host.update(host=None)
|
||||
i.delete()
|
||||
emit_channel_notification('inventories-status_changed', {'group_name': 'inventories', 'inventory_id': inventory_id, 'status': 'deleted'})
|
||||
logger.debug('Deleted inventory {} as user {}.'.format(inventory_id, user_id))
|
||||
except Inventory.DoesNotExist:
|
||||
logger.exception("Delete Inventory failed due to missing inventory: " + str(inventory_id))
|
||||
return
|
||||
except DatabaseError:
|
||||
logger.exception('Database error deleting inventory {}, but will retry.'.format(inventory_id))
|
||||
if retries > 0:
|
||||
time.sleep(10)
|
||||
delete_inventory(inventory_id, user_id, retries=retries - 1)
|
||||
|
||||
|
||||
def with_path_cleanup(f):
|
||||
@functools.wraps(f)
|
||||
def _wrapped(self, *args, **kwargs):
|
||||
try:
|
||||
return f(self, *args, **kwargs)
|
||||
finally:
|
||||
for p in self.cleanup_paths:
|
||||
try:
|
||||
if os.path.isdir(p):
|
||||
shutil.rmtree(p, ignore_errors=True)
|
||||
elif os.path.exists(p):
|
||||
os.remove(p)
|
||||
except OSError:
|
||||
logger.exception("Failed to remove tmp file: {}".format(p))
|
||||
self.cleanup_paths = []
|
||||
|
||||
return _wrapped
|
||||
|
||||
|
||||
def _reconstruct_relationships(copy_mapping):
|
||||
for old_obj, new_obj in copy_mapping.items():
|
||||
model = type(old_obj)
|
||||
for field_name in getattr(model, 'FIELDS_TO_PRESERVE_AT_COPY', []):
|
||||
field = model._meta.get_field(field_name)
|
||||
if isinstance(field, ForeignKey):
|
||||
if getattr(new_obj, field_name, None):
|
||||
continue
|
||||
related_obj = getattr(old_obj, field_name)
|
||||
related_obj = copy_mapping.get(related_obj, related_obj)
|
||||
setattr(new_obj, field_name, related_obj)
|
||||
elif field.many_to_many:
|
||||
for related_obj in getattr(old_obj, field_name).all():
|
||||
logger.debug('Deep copy: Adding {} to {}({}).{} relationship'.format(related_obj, new_obj, model, field_name))
|
||||
getattr(new_obj, field_name).add(copy_mapping.get(related_obj, related_obj))
|
||||
new_obj.save()
|
||||
|
||||
|
||||
@task(queue=get_local_queuename)
|
||||
def deep_copy_model_obj(model_module, model_name, obj_pk, new_obj_pk, user_pk, uuid, permission_check_func=None):
|
||||
sub_obj_list = cache.get(uuid)
|
||||
if sub_obj_list is None:
|
||||
logger.error('Deep copy {} from {} to {} failed unexpectedly.'.format(model_name, obj_pk, new_obj_pk))
|
||||
return
|
||||
|
||||
logger.debug('Deep copy {} from {} to {}.'.format(model_name, obj_pk, new_obj_pk))
|
||||
from awx.api.generics import CopyAPIView
|
||||
from awx.main.signals import disable_activity_stream
|
||||
|
||||
model = getattr(importlib.import_module(model_module), model_name, None)
|
||||
if model is None:
|
||||
return
|
||||
try:
|
||||
obj = model.objects.get(pk=obj_pk)
|
||||
new_obj = model.objects.get(pk=new_obj_pk)
|
||||
creater = User.objects.get(pk=user_pk)
|
||||
except ObjectDoesNotExist:
|
||||
logger.warning("Object or user no longer exists.")
|
||||
return
|
||||
with transaction.atomic(), ignore_inventory_computed_fields(), disable_activity_stream():
|
||||
copy_mapping = {}
|
||||
for sub_obj_setup in sub_obj_list:
|
||||
sub_model = getattr(importlib.import_module(sub_obj_setup[0]), sub_obj_setup[1], None)
|
||||
if sub_model is None:
|
||||
continue
|
||||
try:
|
||||
sub_obj = sub_model.objects.get(pk=sub_obj_setup[2])
|
||||
except ObjectDoesNotExist:
|
||||
continue
|
||||
copy_mapping.update(CopyAPIView.copy_model_obj(obj, new_obj, sub_model, sub_obj, creater))
|
||||
_reconstruct_relationships(copy_mapping)
|
||||
if permission_check_func:
|
||||
permission_check_func = getattr(getattr(importlib.import_module(permission_check_func[0]), permission_check_func[1]), permission_check_func[2])
|
||||
permission_check_func(creater, copy_mapping.values())
|
||||
if isinstance(new_obj, Inventory):
|
||||
update_inventory_computed_fields.delay(new_obj.id)
|
||||
Reference in New Issue
Block a user