From efabc052708b67674781ab1fe8bfa7b2e7ada1ff Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Tue, 13 Apr 2021 13:58:35 -0400 Subject: [PATCH 1/7] Chop out the dev environment isolated node --- Makefile | 7 +-- awx/main/tests/functional/test_tasks.py | 64 +---------------------- tools/docker-isolated-override.yml | 17 ------- tools/docker-isolated/Dockerfile | 20 -------- tools/docker-isolated/README.md | 67 ------------------------- 5 files changed, 3 insertions(+), 172 deletions(-) delete mode 100644 tools/docker-isolated-override.yml delete mode 100644 tools/docker-isolated/Dockerfile delete mode 100644 tools/docker-isolated/README.md diff --git a/Makefile b/Makefile index 3b3b220c60..839b1cf749 100644 --- a/Makefile +++ b/Makefile @@ -174,12 +174,7 @@ init: . $(VENV_BASE)/awx/bin/activate; \ fi; \ $(MANAGEMENT_COMMAND) provision_instance --hostname=$(COMPOSE_HOST); \ - $(MANAGEMENT_COMMAND) register_queue --queuename=tower --instance_percent=100;\ - if [ "$(AWX_GROUP_QUEUES)" == "tower,thepentagon" ]; then \ - $(MANAGEMENT_COMMAND) provision_instance --hostname=isolated; \ - $(MANAGEMENT_COMMAND) register_queue --queuename='thepentagon' --hostnames=isolated --controller=tower; \ - $(MANAGEMENT_COMMAND) generate_isolated_key > /awx_devel/awx/main/isolated/authorized_keys; \ - fi; + $(MANAGEMENT_COMMAND) register_queue --queuename=tower --instance_percent=100; # Refresh development environment after pulling new code. refresh: clean requirements_dev version_file develop migrate diff --git a/awx/main/tests/functional/test_tasks.py b/awx/main/tests/functional/test_tasks.py index 23a79076bc..70223a09b7 100644 --- a/awx/main/tests/functional/test_tasks.py +++ b/awx/main/tests/functional/test_tasks.py @@ -2,10 +2,8 @@ import pytest from unittest import mock import os -from django.utils.timezone import now, timedelta - -from awx.main.tasks import RunProjectUpdate, RunInventoryUpdate, awx_isolated_heartbeat, isolated_manager -from awx.main.models import ProjectUpdate, InventoryUpdate, InventorySource, Instance, InstanceGroup +from awx.main.tasks import RunProjectUpdate, RunInventoryUpdate +from awx.main.models import ProjectUpdate, InventoryUpdate, InventorySource @pytest.fixture @@ -70,61 +68,3 @@ class TestDependentInventoryUpdate: # Verify that it bails after 1st update, detecting a cancel assert is2.inventory_updates.count() == 0 iu_run_mock.assert_called_once() - - -class MockSettings: - AWX_ISOLATED_PERIODIC_CHECK = 60 - CLUSTER_HOST_ID = 'tower_1' - - -@pytest.mark.django_db -class TestIsolatedManagementTask: - @pytest.fixture - def control_group(self): - return InstanceGroup.objects.create(name='alpha') - - @pytest.fixture - def control_instance(self, control_group): - return control_group.instances.create(hostname='tower_1') - - @pytest.fixture - def needs_updating(self, control_group): - ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group) - inst = ig.instances.create(hostname='isolated', capacity=103) - inst.last_isolated_check = now() - timedelta(seconds=MockSettings.AWX_ISOLATED_PERIODIC_CHECK) - inst.save() - return ig - - @pytest.fixture - def just_updated(self, control_group): - ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group) - inst = ig.instances.create(hostname='isolated', capacity=103) - inst.last_isolated_check = now() - inst.save() - return inst - - @pytest.fixture - def old_version(self, control_group): - ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group) - inst = 
ig.instances.create(hostname='isolated-old', capacity=103) - inst.save() - return inst - - def test_takes_action(self, control_instance, needs_updating): - original_isolated_instance = needs_updating.instances.all().first() - with mock.patch('awx.main.tasks.settings', MockSettings()): - with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock: - awx_isolated_heartbeat() - iso_instance = Instance.objects.get(hostname='isolated') - call_args, _ = check_mock.call_args - assert call_args[0][0] == iso_instance - assert iso_instance.last_isolated_check > original_isolated_instance.last_isolated_check - assert iso_instance.modified == original_isolated_instance.modified - - def test_does_not_take_action(self, control_instance, just_updated): - with mock.patch('awx.main.tasks.settings', MockSettings()): - with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock: - awx_isolated_heartbeat() - iso_instance = Instance.objects.get(hostname='isolated') - check_mock.assert_not_called() - assert iso_instance.capacity == 103 diff --git a/tools/docker-isolated-override.yml b/tools/docker-isolated-override.yml deleted file mode 100644 index 9bca79da9c..0000000000 --- a/tools/docker-isolated-override.yml +++ /dev/null @@ -1,17 +0,0 @@ ---- -version: '2' -services: - # Primary Tower Development Container link - awx: - environment: - AWX_GROUP_QUEUES: tower,thepentagon - links: - - isolated - # Isolated Rampart Container - isolated: - image: ${DEV_DOCKER_TAG_BASE}/awx_isolated:${TAG} - container_name: tools_isolated_1 - hostname: isolated - volumes: - - "../awx/main/isolated:/awx_devel" - privileged: true diff --git a/tools/docker-isolated/Dockerfile b/tools/docker-isolated/Dockerfile deleted file mode 100644 index 3080117e5b..0000000000 --- a/tools/docker-isolated/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -ARG TAG=latest -FROM ansible/awx_devel:${TAG} - -RUN dnf install -y gcc python36-devel openssh-server -RUN python3 -m ensurepip && pip3 install "virtualenv < 20" ansible-runner -RUN dnf remove -y gcc python36-devel && rm -rf /var/cache/dnf - -RUN rm -f /etc/ssh/ssh_host_ecdsa_key /etc/ssh/ssh_host_rsa_key -RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_ecdsa_key -RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key -RUN sed -i "s/#UsePrivilegeSeparation.*/UsePrivilegeSeparation no/g" /etc/ssh/sshd_config -RUN sed -i "s/UsePAM.*/UsePAM yes/g" /etc/ssh/sshd_config -RUN sed -i "s/#StrictModes.*/StrictModes no/g" /etc/ssh/sshd_config -RUN mkdir -p /root/.ssh -RUN ln -s /awx_devel/authorized_keys /root/.ssh/authorized_keys - -ENTRYPOINT ["tini", "--"] -CMD ["/usr/sbin/sshd", "-D"] - -EXPOSE 22 diff --git a/tools/docker-isolated/README.md b/tools/docker-isolated/README.md deleted file mode 100644 index 6a09d59bb4..0000000000 --- a/tools/docker-isolated/README.md +++ /dev/null @@ -1,67 +0,0 @@ -## Instructions on using an isolated node - -The building of the isolated node is done in the `make docker-compose-build` -target. Its image uses a different tag from the tools_awx container. - -Given that the images are built, you can run the combined docker compose target. This uses -the base `docker-compose.yml` with modifications found in `docker-isolated-override.yml`. -You will still need to give COMPOSE_TAG with whatever your intended -base branch is. 
For example: - -```bash -make docker-isolated COMPOSE_TAG=devel -``` - -This will automatically exchange the keys in order for the `tools_awx_1` -container to access the `tools_isolated_1` container over ssh. -After that, it will bring up all the containers like the normal docker-compose -workflow. - -### Running a job on the Isolated Node - -Create a job template that runs normally. Add the id of the instance -group named `thepentagon` to the JT's instance groups. To do this, POST -the id (probably id=2) to `/api/v2/job_templates/N/instance_groups/`. -After that, run the job template. - -The models are automatically created when running the Makefile target, -and they are structured as follows: - - +-------+ +-------------+ - | tower |<----+ thepentagon | - +-------+ +-------------+ - ^ ^ - | | - | | - +---+---+ +-----+----+ - | tower | | isolated | - +-------+ +----------+ - -The `controller` for the group "thepentagon" and all hosts therein is -determined by a ForeignKey within the instance group. - -### Run a playbook - -In order to run an isolated job, associate the instance group `thepentagon` with -a job template, inventory, or organization, then run a job that derives from -that resource. You should be able to confirm success by inspecting the -`instance_group` of the job. - -#### Advanced Manual Testing - -If you want to run a job manually inside of the isolated container with this -tooling, you need a private data directory. Normal isolated job runs will -clean up their private data directory, but you can temporarily disable this -by disabling some parts of the cleanup_isolated.yml playbook. - -Example location of a private data directory: - -`/tmp/awx_29_OM6Mnx/` - -The following command would run the playbook corresponding to that job. - -```bash -ansible-runner start /tmp/awx_29_OM6Mnx/ -p some_playbook.yml -``` - -Other ansible-runner commands include `start`, `is-alive`, and `stop`. From b0cdfe7625bc2f185b3ec05b37d13b544978f947 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Thu, 15 Apr 2021 14:15:30 -0400 Subject: [PATCH 2/7] Clean up the management commands --- .../commands/generate_isolated_key.py | 38 --------------- .../management/commands/list_instances.py | 7 +-- .../management/commands/provision_instance.py | 14 ++---- .../management/commands/register_queue.py | 31 ++---------- .../management/commands/run_wsbroadcast.py | 1 + .../commands/test_isolated_connection.py | 47 ------------------- awx/main/managers.py | 2 +- awx/main/wsbroadcast.py | 1 + 8 files changed, 12 insertions(+), 129 deletions(-) delete mode 100644 awx/main/management/commands/generate_isolated_key.py delete mode 100644 awx/main/management/commands/test_isolated_connection.py diff --git a/awx/main/management/commands/generate_isolated_key.py b/awx/main/management/commands/generate_isolated_key.py deleted file mode 100644 index 51112ea3d7..0000000000 --- a/awx/main/management/commands/generate_isolated_key.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2015 Ansible, Inc. 
-# All Rights Reserved -import datetime -from django.utils.encoding import smart_str - -from cryptography.hazmat.backends import default_backend -from cryptography.hazmat.primitives import serialization -from cryptography.hazmat.primitives.asymmetric import rsa -from django.conf import settings -from django.core.management.base import BaseCommand - -from awx.conf.models import Setting - - -class Command(BaseCommand): - """Generate and store a randomized RSA key for SSH traffic to isolated instances""" - - help = 'Generates and stores a randomized RSA key for SSH traffic to isolated instances' - - def handle(self, *args, **kwargs): - if getattr(settings, 'AWX_ISOLATED_PRIVATE_KEY', False): - print(settings.AWX_ISOLATED_PUBLIC_KEY) - return - - key = rsa.generate_private_key(public_exponent=65537, key_size=4096, backend=default_backend()) - Setting.objects.create( - key='AWX_ISOLATED_PRIVATE_KEY', - value=key.private_bytes( - encoding=serialization.Encoding.PEM, format=serialization.PrivateFormat.TraditionalOpenSSL, encryption_algorithm=serialization.NoEncryption() - ), - ).save() - pemfile = Setting.objects.create( - key='AWX_ISOLATED_PUBLIC_KEY', - value=smart_str(key.public_key().public_bytes(encoding=serialization.Encoding.OpenSSH, format=serialization.PublicFormat.OpenSSH)) - + " generated-by-awx@%s" % datetime.datetime.utcnow().isoformat(), - ) - pemfile.save() - print(pemfile.value) diff --git a/awx/main/management/commands/list_instances.py b/awx/main/management/commands/list_instances.py index 95807cb5a9..473f0e379e 100644 --- a/awx/main/management/commands/list_instances.py +++ b/awx/main/management/commands/list_instances.py @@ -10,7 +10,6 @@ class Ungrouped(object): name = 'ungrouped' policy_instance_percentage = None policy_instance_minimum = None - controller = None @property def instances(self): @@ -18,7 +17,7 @@ class Ungrouped(object): @property def capacity(self): - return sum([x.capacity for x in self.instances]) + return sum(x.capacity for x in self.instances) class Command(BaseCommand): @@ -38,8 +37,6 @@ class Command(BaseCommand): fmt += ' policy={0.policy_instance_percentage}%' if instance_group.policy_instance_minimum: fmt += ' policy>={0.policy_instance_minimum}' - if instance_group.controller: - fmt += ' controller={0.controller.name}' print((fmt + ']').format(instance_group)) for x in instance_group.instances.all(): color = '\033[92m' @@ -48,8 +45,6 @@ class Command(BaseCommand): if x.enabled is False: color = '\033[90m[DISABLED] ' fmt = '\t' + color + '{0.hostname} capacity={0.capacity} version={1}' - if x.last_isolated_check: - fmt += ' last_isolated_check="{0.last_isolated_check:%Y-%m-%d %H:%M:%S}"' if x.capacity: fmt += ' heartbeat="{0.modified:%Y-%m-%d %H:%M:%S}"' print((fmt + '\033[0m').format(x, x.version or '?')) diff --git a/awx/main/management/commands/provision_instance.py b/awx/main/management/commands/provision_instance.py index 02435ee167..06ca4470ac 100644 --- a/awx/main/management/commands/provision_instance.py +++ b/awx/main/management/commands/provision_instance.py @@ -1,13 +1,11 @@ # Copyright (c) 2015 Ansible, Inc. 
# All Rights Reserved -from uuid import uuid4 +from django.conf import settings +from django.core.management.base import BaseCommand, CommandError +from django.db import transaction from awx.main.models import Instance -from django.conf import settings - -from django.db import transaction -from django.core.management.base import BaseCommand, CommandError class Command(BaseCommand): @@ -20,7 +18,6 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--hostname', dest='hostname', type=str, help='Hostname used during provisioning') - parser.add_argument('--is-isolated', dest='is_isolated', action='store_true', help='Specify whether the instance is isolated') def _register_hostname(self, hostname): if not hostname: @@ -36,10 +33,7 @@ class Command(BaseCommand): def handle(self, **options): if not options.get('hostname'): raise CommandError("Specify `--hostname` to use this command.") - if options['is_isolated']: - self.uuid = str(uuid4()) - else: - self.uuid = settings.SYSTEM_UUID + self.uuid = settings.SYSTEM_UUID self.changed = False self._register_hostname(options.get('hostname')) if self.changed: diff --git a/awx/main/management/commands/register_queue.py b/awx/main/management/commands/register_queue.py index 5369e4fe06..9c05020545 100644 --- a/awx/main/management/commands/register_queue.py +++ b/awx/main/management/commands/register_queue.py @@ -17,10 +17,9 @@ class InstanceNotFound(Exception): class RegisterQueue: - def __init__(self, queuename, controller, instance_percent, inst_min, hostname_list, is_container_group=None): + def __init__(self, queuename, instance_percent, inst_min, hostname_list, is_container_group=None): self.instance_not_found_err = None self.queuename = queuename - self.controller = controller self.instance_percent = instance_percent self.instance_min = inst_min self.hostname_list = hostname_list @@ -46,20 +45,6 @@ class RegisterQueue: return (ig, created, changed) - def update_instance_group_controller(self, ig): - changed = False - control_ig = None - - if self.controller: - control_ig = InstanceGroup.objects.filter(name=self.controller).first() - - if control_ig and ig.controller_id != control_ig.pk: - ig.controller = control_ig - ig.save() - changed = True - - return (control_ig, changed) - def add_instances_to_group(self, ig): changed = False @@ -88,26 +73,20 @@ class RegisterQueue: with advisory_lock('cluster_policy_lock'): with transaction.atomic(): changed2 = False - changed3 = False (ig, created, changed1) = self.get_create_update_instance_group() if created: print("Creating instance group {}".format(ig.name)) elif not created: print("Instance Group already registered {}".format(ig.name)) - if self.controller: - (ig_ctrl, changed2) = self.update_instance_group_controller(ig) - if changed2: - print("Set controller group {} on {}.".format(self.controller, self.queuename)) - try: - (instances, changed3) = self.add_instances_to_group(ig) + (instances, changed2) = self.add_instances_to_group(ig) for i in instances: print("Added instance {} to {}".format(i.hostname, ig.name)) except InstanceNotFound as e: self.instance_not_found_err = e - if any([changed1, changed2, changed3]): + if changed1 or changed2: print('(changed: True)') @@ -117,7 +96,6 @@ class Command(BaseCommand): parser.add_argument( '--hostnames', dest='hostnames', type=str, help='Comma-Delimited Hosts to add to the Queue (will not remove already assigned instances)' ) - parser.add_argument('--controller', dest='controller', type=str, default='', help='The controlling 
group (makes this an isolated group)') parser.add_argument( '--instance_percent', dest='instance_percent', type=int, default=0, help='The percentage of active instances that will be assigned to this group' ), @@ -133,14 +111,13 @@ class Command(BaseCommand): queuename = options.get('queuename') if not queuename: raise CommandError("Specify `--queuename` to use this command.") - ctrl = options.get('controller') inst_per = options.get('instance_percent') instance_min = options.get('instance_minimum') hostname_list = [] if options.get('hostnames'): hostname_list = options.get('hostnames').split(",") - rq = RegisterQueue(queuename, ctrl, inst_per, instance_min, hostname_list) + rq = RegisterQueue(queuename, inst_per, instance_min, hostname_list) rq.register() if rq.instance_not_found_err: print(rq.instance_not_found_err.message) diff --git a/awx/main/management/commands/run_wsbroadcast.py b/awx/main/management/commands/run_wsbroadcast.py index 60f262c86c..56a91f2a93 100644 --- a/awx/main/management/commands/run_wsbroadcast.py +++ b/awx/main/management/commands/run_wsbroadcast.py @@ -140,6 +140,7 @@ class Command(BaseCommand): data[family.name] = family.samples[0].value me = Instance.objects.me() + # TODO: drop the isolated groups exclusion when the model is updated hostnames = [i.hostname for i in Instance.objects.exclude(Q(hostname=me.hostname) | Q(rampart_groups__controller__isnull=False))] host_stats = Command.get_connection_status(me, hostnames, data) diff --git a/awx/main/management/commands/test_isolated_connection.py b/awx/main/management/commands/test_isolated_connection.py deleted file mode 100644 index c89b71a892..0000000000 --- a/awx/main/management/commands/test_isolated_connection.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -import shutil -import sys -import tempfile - -from django.conf import settings -from django.core.management.base import BaseCommand, CommandError - -import ansible_runner - -from awx.main.isolated.manager import set_pythonpath - - -class Command(BaseCommand): - """Tests SSH connectivity between a controller and target isolated node""" - - help = 'Tests SSH connectivity between a controller and target isolated node' - - def add_arguments(self, parser): - parser.add_argument('--hostname', dest='hostname', type=str, help='Hostname of an isolated node') - - def handle(self, *args, **options): - hostname = options.get('hostname') - if not hostname: - raise CommandError("--hostname is a required argument") - - try: - path = tempfile.mkdtemp(prefix='awx_isolated_ssh', dir=settings.AWX_ISOLATION_BASE_PATH) - ssh_key = None - if all([getattr(settings, 'AWX_ISOLATED_KEY_GENERATION', False) is True, getattr(settings, 'AWX_ISOLATED_PRIVATE_KEY', None)]): - ssh_key = settings.AWX_ISOLATED_PRIVATE_KEY - env = dict(os.environ.items()) - env['ANSIBLE_HOST_KEY_CHECKING'] = str(settings.AWX_ISOLATED_HOST_KEY_CHECKING) - set_pythonpath(os.path.join(settings.ANSIBLE_VENV_PATH, 'lib'), env) - res = ansible_runner.interface.run( - private_data_dir=path, - host_pattern='all', - inventory='{} ansible_ssh_user={}'.format(hostname, settings.AWX_ISOLATED_USERNAME), - module='shell', - module_args='ansible-runner --version', - envvars=env, - verbosity=3, - ssh_key=ssh_key, - ) - sys.exit(res.rc) - finally: - shutil.rmtree(path) diff --git a/awx/main/managers.py b/awx/main/managers.py index 473ff6523b..b3ff96783a 100644 --- a/awx/main/managers.py +++ b/awx/main/managers.py @@ -142,7 +142,7 @@ class InstanceManager(models.Manager): pod_ip = os.environ.get('MY_POD_IP') registered = 
self.register(ip_address=pod_ip) is_container_group = settings.IS_K8S - RegisterQueue('tower', None, 100, 0, [], is_container_group).register() + RegisterQueue('tower', 100, 0, [], is_container_group).register() return registered else: return (False, self.me()) diff --git a/awx/main/wsbroadcast.py b/awx/main/wsbroadcast.py index 184ae06122..7e0ca3a7e2 100644 --- a/awx/main/wsbroadcast.py +++ b/awx/main/wsbroadcast.py @@ -33,6 +33,7 @@ def unwrap_broadcast_msg(payload: dict): def get_broadcast_hosts(): Instance = apps.get_model('main', 'Instance') instances = ( + # TODO: no longer filter for non-isolated after the models change Instance.objects.filter(rampart_groups__controller__isnull=True) .exclude(hostname=Instance.objects.me().hostname) .order_by('hostname') From 6a599695db4bf54e697289a85e611e6aee8dcbc8 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Mon, 19 Apr 2021 17:22:45 -0400 Subject: [PATCH 3/7] Remove the IsolatedManager and its associated playbooks and plugins --- awx/main/analytics/collectors.py | 11 +- awx/main/analytics/metrics.py | 1 - awx/main/isolated/.gitignore | 1 - awx/main/isolated/__init__.py | 0 awx/main/isolated/manager.py | 365 ------------------ .../management/commands/run_wsbroadcast.py | 4 +- awx/main/managers.py | 3 - awx/main/tasks.py | 111 ++---- awx/main/tests/unit/test_tasks.py | 118 +----- awx/main/wsbroadcast.py | 9 +- awx/playbooks/check_isolated.yml | 70 ---- awx/playbooks/clean_isolated.yml | 31 -- awx/playbooks/heartbeat_isolated.yml | 12 - awx/playbooks/run_isolated.yml | 61 --- awx/plugins/isolated/awx_capacity.py | 73 ---- awx/plugins/isolated/awx_isolated_cleanup.py | 68 ---- awx/plugins/isolated/mkfifo.py | 31 -- awx/settings/defaults.py | 11 +- awx/settings/development.py | 14 - awx/settings/production.py | 10 - .../roles/dockerfile/files/settings.py | 1 - 21 files changed, 41 insertions(+), 964 deletions(-) delete mode 100644 awx/main/isolated/.gitignore delete mode 100644 awx/main/isolated/__init__.py delete mode 100644 awx/main/isolated/manager.py delete mode 100644 awx/playbooks/check_isolated.yml delete mode 100644 awx/playbooks/clean_isolated.yml delete mode 100644 awx/playbooks/heartbeat_isolated.yml delete mode 100644 awx/playbooks/run_isolated.yml delete mode 100644 awx/plugins/isolated/awx_capacity.py delete mode 100644 awx/plugins/isolated/awx_isolated_cleanup.py delete mode 100755 awx/plugins/isolated/mkfifo.py diff --git a/awx/main/analytics/collectors.py b/awx/main/analytics/collectors.py index e7fc7a8c90..ab59db5983 100644 --- a/awx/main/analytics/collectors.py +++ b/awx/main/analytics/collectors.py @@ -220,17 +220,11 @@ def projects_by_scm_type(since, **kwargs): return counts -def _get_isolated_datetime(last_check): - if last_check: - return last_check.isoformat() - return last_check - - -@register('instance_info', '1.0', description=_('Cluster topology and capacity')) +@register('instance_info', '1.1', description=_('Cluster topology and capacity')) def instance_info(since, include_hostnames=False, **kwargs): info = {} instances = models.Instance.objects.values_list('hostname').values( - 'uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'hostname', 'last_isolated_check', 'enabled' + 'uuid', 'version', 'capacity', 'cpu', 'memory', 'managed_by_policy', 'hostname', 'enabled' ) for instance in instances: consumed_capacity = sum(x.task_impact for x in models.UnifiedJob.objects.filter(execution_node=instance['hostname'], status__in=('running', 'waiting'))) @@ -241,7 +235,6 @@ def instance_info(since, 
include_hostnames=False, **kwargs): 'cpu': instance['cpu'], 'memory': instance['memory'], 'managed_by_policy': instance['managed_by_policy'], - 'last_isolated_check': _get_isolated_datetime(instance['last_isolated_check']), 'enabled': instance['enabled'], 'consumed_capacity': consumed_capacity, 'remaining_capacity': instance['capacity'] - consumed_capacity, diff --git a/awx/main/analytics/metrics.py b/awx/main/analytics/metrics.py index 0af34a60ea..03ea674db8 100644 --- a/awx/main/analytics/metrics.py +++ b/awx/main/analytics/metrics.py @@ -184,7 +184,6 @@ def metrics(): INSTANCE_INFO.labels(hostname=hostname, instance_uuid=uuid).info( { 'enabled': str(instance_data[uuid]['enabled']), - 'last_isolated_check': getattr(instance_data[uuid], 'last_isolated_check', 'None'), 'managed_by_policy': str(instance_data[uuid]['managed_by_policy']), 'version': instance_data[uuid]['version'], } diff --git a/awx/main/isolated/.gitignore b/awx/main/isolated/.gitignore deleted file mode 100644 index 05b023b41d..0000000000 --- a/awx/main/isolated/.gitignore +++ /dev/null @@ -1 +0,0 @@ -authorized_keys diff --git a/awx/main/isolated/__init__.py b/awx/main/isolated/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/awx/main/isolated/manager.py b/awx/main/isolated/manager.py deleted file mode 100644 index 3fbda06ab8..0000000000 --- a/awx/main/isolated/manager.py +++ /dev/null @@ -1,365 +0,0 @@ -import fnmatch -import json -import os -import shutil -import stat -import tempfile -import time -import logging -import datetime - -from django.conf import settings -import ansible_runner - -import awx -from awx.main.utils import get_system_task_capacity - -logger = logging.getLogger('awx.isolated.manager') -playbook_logger = logging.getLogger('awx.isolated.manager.playbooks') - - -def set_pythonpath(venv_libdir, env): - env.pop('PYTHONPATH', None) # default to none if no python_ver matches - for version in os.listdir(venv_libdir): - if fnmatch.fnmatch(version, 'python[23].*'): - if os.path.isdir(os.path.join(venv_libdir, version)): - env['PYTHONPATH'] = os.path.join(venv_libdir, version, "site-packages") + ":" - break - - -class IsolatedManager(object): - def __init__(self, event_handler, canceled_callback=None, check_callback=None): - """ - :param event_handler: a callable used to persist event data from isolated nodes - :param canceled_callback: a callable - which returns `True` or `False` - - signifying if the job has been prematurely - canceled - """ - self.event_handler = event_handler - self.canceled_callback = canceled_callback - self.check_callback = check_callback - self.started_at = None - self.captured_command_artifact = False - self.instance = None - - def build_inventory(self, hosts): - inventory = '\n'.join(['{} ansible_ssh_user={}'.format(host, settings.AWX_ISOLATED_USERNAME) for host in hosts]) - - return inventory - - def build_runner_params(self, hosts, verbosity=1): - env = dict(os.environ.items()) - env['ANSIBLE_RETRY_FILES_ENABLED'] = 'False' - env['ANSIBLE_HOST_KEY_CHECKING'] = str(settings.AWX_ISOLATED_HOST_KEY_CHECKING) - env['ANSIBLE_LIBRARY'] = os.path.join(os.path.dirname(awx.__file__), 'plugins', 'isolated') - env['ANSIBLE_COLLECTIONS_PATHS'] = settings.AWX_ANSIBLE_COLLECTIONS_PATHS - set_pythonpath(os.path.join(settings.ANSIBLE_VENV_PATH, 'lib'), env) - - def finished_callback(runner_obj): - if runner_obj.status == 'failed' and runner_obj.config.playbook != 'check_isolated.yml': - # failed for clean_isolated.yml just means the playbook hasn't - # exited on the 
isolated host - stdout = runner_obj.stdout.read() - playbook_logger.error(stdout) - elif runner_obj.status == 'timeout': - # this means that the default idle timeout of - # (2 * AWX_ISOLATED_CONNECTION_TIMEOUT) was exceeded - # (meaning, we tried to sync with an isolated node, and we got - # no new output for 2 * AWX_ISOLATED_CONNECTION_TIMEOUT seconds) - # this _usually_ means SSH key auth from the controller -> - # isolated didn't work, and ssh is hung waiting on interactive - # input e.g., - # - # awx@isolated's password: - stdout = runner_obj.stdout.read() - playbook_logger.error(stdout) - else: - playbook_logger.info(runner_obj.stdout.read()) - - return { - 'project_dir': os.path.abspath(os.path.join(os.path.dirname(awx.__file__), 'playbooks')), - 'inventory': self.build_inventory(hosts), - 'envvars': env, - 'finished_callback': finished_callback, - 'verbosity': verbosity, - 'cancel_callback': self.canceled_callback, - 'settings': { - 'job_timeout': settings.AWX_ISOLATED_LAUNCH_TIMEOUT, - 'suppress_ansible_output': True, - }, - } - - def path_to(self, *args): - return os.path.join(self.private_data_dir, *args) - - def run_management_playbook(self, playbook, private_data_dir, idle_timeout=None, **kw): - iso_dir = tempfile.mkdtemp(prefix=playbook, dir=private_data_dir) - params = self.runner_params.copy() - params.get('envvars', dict())['ANSIBLE_CALLBACK_WHITELIST'] = 'profile_tasks' - params['playbook'] = playbook - params['private_data_dir'] = iso_dir - if idle_timeout: - params['settings']['idle_timeout'] = idle_timeout - else: - params['settings'].pop('idle_timeout', None) - params.update(**kw) - if all([getattr(settings, 'AWX_ISOLATED_KEY_GENERATION', False) is True, getattr(settings, 'AWX_ISOLATED_PRIVATE_KEY', None)]): - params['ssh_key'] = settings.AWX_ISOLATED_PRIVATE_KEY - return ansible_runner.interface.run(**params) - - def dispatch(self, playbook=None, module=None, module_args=None): - """ - Ship the runner payload to a remote host for isolated execution. - """ - self.handled_events = set() - self.started_at = time.time() - - # exclude certain files from the rsync - rsync_exclude = [ - # don't rsync source control metadata (it can be huge!) 
- '- /project/.git', - '- /project/.svn', - # don't rsync job events that are in the process of being written - '- /artifacts/job_events/*-partial.json.tmp', - # don't rsync the ssh_key FIFO - '- /env/ssh_key', - # don't rsync kube config files - '- .kubeconfig*', - ] - - for filename, data in (['.rsync-filter', '\n'.join(rsync_exclude)],): - path = self.path_to(filename) - with open(path, 'w') as f: - f.write(data) - os.chmod(path, stat.S_IRUSR) - - extravars = { - 'src': self.private_data_dir, - 'dest': settings.AWX_ISOLATION_BASE_PATH, - 'ident': self.ident, - 'job_id': self.instance.id, - } - if playbook: - extravars['playbook'] = playbook - if module and module_args: - extravars['module'] = module - extravars['module_args'] = module_args - - logger.debug('Starting job {} on isolated host with `run_isolated.yml` playbook.'.format(self.instance.id)) - runner_obj = self.run_management_playbook( - 'run_isolated.yml', self.private_data_dir, idle_timeout=max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT), extravars=extravars - ) - - if runner_obj.status == 'failed': - self.instance.result_traceback = runner_obj.stdout.read() - self.instance.save(update_fields=['result_traceback']) - return 'error', runner_obj.rc - - return runner_obj.status, runner_obj.rc - - def check(self, interval=None): - """ - Repeatedly poll the isolated node to determine if the job has run. - - On success, copy job artifacts to the controlling node. - On failure, continue to poll the isolated node (until the job timeout - is exceeded). - - For a completed job run, this function returns (status, rc), - representing the status and return code of the isolated - `ansible-playbook` run. - - :param interval: an interval (in seconds) to wait between status polls - """ - interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL - extravars = {'src': self.private_data_dir, 'job_id': self.instance.id} - status = 'failed' - rc = None - last_check = time.time() - - while status == 'failed': - canceled = self.canceled_callback() if self.canceled_callback else False - if not canceled and time.time() - last_check < interval: - # If the job isn't canceled, but we haven't waited `interval` seconds, wait longer - time.sleep(1) - continue - - if canceled: - logger.warning('Isolated job {} was manually canceled.'.format(self.instance.id)) - - logger.debug('Checking on isolated job {} with `check_isolated.yml`.'.format(self.instance.id)) - time_start = datetime.datetime.now() - runner_obj = self.run_management_playbook('check_isolated.yml', self.private_data_dir, extravars=extravars) - time_end = datetime.datetime.now() - time_diff = time_end - time_start - logger.debug('Finished checking on isolated job {} with `check_isolated.yml` took {} seconds.'.format(self.instance.id, time_diff.total_seconds())) - status, rc = runner_obj.status, runner_obj.rc - - if self.check_callback is not None and not self.captured_command_artifact: - command_path = self.path_to('artifacts', self.ident, 'command') - # If the configuration artifact has been synced back, update the model - if os.path.exists(command_path): - try: - with open(command_path, 'r') as f: - data = json.load(f) - self.check_callback(data) - self.captured_command_artifact = True - except json.decoder.JSONDecodeError: # Just in case it's not fully here yet. 
- pass - - self.consume_events() - - last_check = time.time() - - if status == 'successful': - status_path = self.path_to('artifacts', self.ident, 'status') - rc_path = self.path_to('artifacts', self.ident, 'rc') - if os.path.exists(status_path): - with open(status_path, 'r') as f: - status = f.readline() - with open(rc_path, 'r') as f: - rc = int(f.readline()) - else: - # if there's no status file, it means that runner _probably_ - # exited with a traceback (which should be logged to - # daemon.log) Record it so we can see how runner failed. - daemon_path = self.path_to('daemon.log') - if os.path.exists(daemon_path): - with open(daemon_path, 'r') as f: - self.instance.result_traceback = f.read() - self.instance.save(update_fields=['result_traceback']) - else: - logger.error('Failed to rsync daemon.log (is ansible-runner installed on the isolated host?)') - status = 'failed' - rc = 1 - - # consume events one last time just to be sure we didn't miss anything - # in the final sync - self.consume_events() - - return status, rc - - def consume_events(self): - # discover new events and ingest them - events_path = self.path_to('artifacts', self.ident, 'job_events') - - # it's possible that `events_path` doesn't exist *yet*, because runner - # hasn't actually written any events yet (if you ran e.g., a sleep 30) - # only attempt to consume events if any were rsynced back - if os.path.exists(events_path): - for event in set(os.listdir(events_path)) - self.handled_events: - path = os.path.join(events_path, event) - if os.path.exists(path) and os.path.isfile(path): - try: - event_data = json.load(open(os.path.join(events_path, event), 'r')) - except json.decoder.JSONDecodeError: - # This means the event we got back isn't valid JSON - # that can happen if runner is still partially - # writing an event file while it's rsyncing - # these event writes are _supposed_ to be atomic - # but it doesn't look like they actually are in - # practice - # in this scenario, just ignore this event and try it - # again on the next sync - continue - self.event_handler(event_data) - self.handled_events.add(event) - - def cleanup(self): - extravars = { - 'private_data_dir': self.private_data_dir, - 'cleanup_dirs': [ - self.private_data_dir, - ], - } - logger.debug('Cleaning up job {} on isolated host with `clean_isolated.yml` playbook.'.format(self.instance.id)) - self.run_management_playbook('clean_isolated.yml', self.private_data_dir, extravars=extravars) - - @classmethod - def update_capacity(cls, instance, task_result): - instance.version = 'ansible-runner-{}'.format(task_result['version']) - - if instance.capacity == 0 and task_result['capacity_cpu']: - logger.warning('Isolated instance {} has re-joined.'.format(instance.hostname)) - instance.cpu = int(task_result['cpu']) - instance.memory = int(task_result['mem']) - instance.cpu_capacity = int(task_result['capacity_cpu']) - instance.mem_capacity = int(task_result['capacity_mem']) - instance.capacity = get_system_task_capacity( - scale=instance.capacity_adjustment, cpu_capacity=int(task_result['capacity_cpu']), mem_capacity=int(task_result['capacity_mem']) - ) - instance.save(update_fields=['cpu', 'memory', 'cpu_capacity', 'mem_capacity', 'capacity', 'version', 'modified']) - - def health_check(self, instance_qs): - """ - :param instance_qs: List of Django objects representing the - isolated instances to manage - Runs playbook that will - - determine if instance is reachable - - find the instance capacity - - clean up orphaned private files - Performs save on each 
instance to update its capacity. - """ - instance_qs = [i for i in instance_qs if i.enabled] - if not len(instance_qs): - return - try: - private_data_dir = tempfile.mkdtemp(prefix='awx_iso_heartbeat_', dir=settings.AWX_ISOLATION_BASE_PATH) - self.runner_params = self.build_runner_params([instance.hostname for instance in instance_qs]) - self.runner_params['private_data_dir'] = private_data_dir - self.runner_params['forks'] = len(instance_qs) - runner_obj = self.run_management_playbook('heartbeat_isolated.yml', private_data_dir) - - for instance in instance_qs: - task_result = {} - try: - task_result = runner_obj.get_fact_cache(instance.hostname) - except Exception: - logger.exception('Failed to read status from isolated instances') - if 'awx_capacity_cpu' in task_result and 'awx_capacity_mem' in task_result: - task_result = { - 'cpu': task_result['awx_cpu'], - 'mem': task_result['awx_mem'], - 'capacity_cpu': task_result['awx_capacity_cpu'], - 'capacity_mem': task_result['awx_capacity_mem'], - 'version': task_result['awx_capacity_version'], - } - IsolatedManager.update_capacity(instance, task_result) - logger.debug('Isolated instance {} successful heartbeat'.format(instance.hostname)) - elif instance.capacity == 0: - logger.debug('Isolated instance {} previously marked as lost, could not re-join.'.format(instance.hostname)) - else: - logger.warning('Could not update status of isolated instance {}'.format(instance.hostname)) - if instance.is_lost(isolated=True): - instance.capacity = 0 - instance.save(update_fields=['capacity']) - logger.error('Isolated instance {} last checked in at {}, marked as lost.'.format(instance.hostname, instance.modified)) - finally: - if os.path.exists(private_data_dir): - shutil.rmtree(private_data_dir) - - def run(self, instance, private_data_dir, playbook, module, module_args, ident=None): - """ - Run a job on an isolated host. - - :param instance: a `model.Job` instance - :param private_data_dir: an absolute path on the local file system - where job-specific data should be written - (i.e., `/tmp/awx_N_xyz/`) - :param playbook: the playbook to run - :param module: the module to run - :param module_args: the module args to use - - For a completed job run, this function returns (status, rc), - representing the status and return code of the isolated - `ansible-playbook` run. 
- """ - self.ident = ident - self.instance = instance - self.private_data_dir = private_data_dir - self.runner_params = self.build_runner_params([instance.execution_node], verbosity=min(5, self.instance.verbosity)) - - status, rc = self.dispatch(playbook, module, module_args) - if status == 'successful': - status, rc = self.check() - return status, rc diff --git a/awx/main/management/commands/run_wsbroadcast.py b/awx/main/management/commands/run_wsbroadcast.py index 56a91f2a93..7ca47d77ff 100644 --- a/awx/main/management/commands/run_wsbroadcast.py +++ b/awx/main/management/commands/run_wsbroadcast.py @@ -10,7 +10,6 @@ from datetime import datetime as dt from django.core.management.base import BaseCommand from django.db import connection -from django.db.models import Q from django.db.migrations.executor import MigrationExecutor from awx.main.analytics.broadcast_websocket import ( @@ -140,8 +139,7 @@ class Command(BaseCommand): data[family.name] = family.samples[0].value me = Instance.objects.me() - # TODO: drop the isolated groups exclusion when the model is updated - hostnames = [i.hostname for i in Instance.objects.exclude(Q(hostname=me.hostname) | Q(rampart_groups__controller__isnull=False))] + hostnames = [i.hostname for i in Instance.objects.exclude(hostname=me.hostname)] host_stats = Command.get_connection_status(me, hostnames, data) lines = Command._format_lines(host_stats) diff --git a/awx/main/managers.py b/awx/main/managers.py index b3ff96783a..ada38ddd18 100644 --- a/awx/main/managers.py +++ b/awx/main/managers.py @@ -155,9 +155,6 @@ class InstanceManager(models.Manager): # NOTE: TODO: Likely to repurpose this once standalone ramparts are a thing return "tower" - def all_non_isolated(self): - return self.exclude(rampart_groups__controller__isnull=False) - class InstanceGroupManager(models.Manager): """A custom manager class for the Instance model. 
diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 063b5f47d7..a937ec6e59 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -33,7 +33,7 @@ import subprocess from django.conf import settings from django.db import transaction, DatabaseError, IntegrityError, ProgrammingError, connection from django.db.models.fields.related import ForeignKey -from django.utils.timezone import now, timedelta +from django.utils.timezone import now from django.utils.encoding import smart_str from django.contrib.auth.models import User from django.utils.translation import ugettext_lazy as _, gettext_noop @@ -84,7 +84,6 @@ from awx.main.models import ( from awx.main.constants import ACTIVE_STATES from awx.main.exceptions import AwxTaskError, PostRunError from awx.main.queue import CallbackQueueDispatcher -from awx.main.isolated import manager as isolated_manager from awx.main.dispatch.publish import task from awx.main.dispatch import get_local_queuename, reaper from awx.main.utils import ( @@ -170,8 +169,6 @@ def dispatch_startup(): # apply_cluster_membership_policies() cluster_node_heartbeat() - if Instance.objects.me().is_controller(): - awx_isolated_heartbeat() Metrics().clear_values() # Update Tower's rsyslog.conf file based on loggins settings in the db @@ -205,13 +202,8 @@ def apply_cluster_membership_policies(): started_compute = time.time() all_instances = list(Instance.objects.order_by('id')) all_groups = list(InstanceGroup.objects.prefetch_related('instances')) - iso_hostnames = set([]) - for ig in all_groups: - if ig.controller_id is not None: - iso_hostnames.update(ig.policy_instance_list) - considered_instances = [inst for inst in all_instances if inst.hostname not in iso_hostnames] - total_instances = len(considered_instances) + total_instances = len(all_instances) actual_groups = [] actual_instances = [] Group = namedtuple('Group', ['obj', 'instances', 'prior_instances']) @@ -232,18 +224,12 @@ def apply_cluster_membership_policies(): if group_actual.instances: logger.debug("Policy List, adding Instances {} to Group {}".format(group_actual.instances, ig.name)) - if ig.controller_id is None: - actual_groups.append(group_actual) - else: - # For isolated groups, _only_ apply the policy_instance_list - # do not add to in-memory list, so minimum rules not applied - logger.debug('Committing instances to isolated group {}'.format(ig.name)) - ig.instances.set(group_actual.instances) + actual_groups.append(group_actual) # Process Instance minimum policies next, since it represents a concrete lower bound to the # number of instances to make available to instance groups - actual_instances = [Node(obj=i, groups=[]) for i in considered_instances if i.managed_by_policy] - logger.debug("Total non-isolated instances:{} available for policy: {}".format(total_instances, len(actual_instances))) + actual_instances = [Node(obj=i, groups=[]) for i in all_instances if i.managed_by_policy] + logger.debug("Total instances: {}, available for policy: {}".format(total_instances, len(actual_instances))) for g in sorted(actual_groups, key=lambda x: len(x.instances)): policy_min_added = [] for i in sorted(actual_instances, key=lambda x: len(x.groups)): @@ -285,7 +271,7 @@ def apply_cluster_membership_policies(): logger.debug('Cluster policy no-op finished in {} seconds'.format(time.time() - started_compute)) return - # On a differential basis, apply instances to non-isolated groups + # On a differential basis, apply instances to groups with transaction.atomic(): for g in actual_groups: if g.obj.is_container_group: @@ 
-419,7 +405,7 @@ def cleanup_execution_environment_images(): def cluster_node_heartbeat(): logger.debug("Cluster node heartbeat task.") nowtime = now() - instance_list = list(Instance.objects.all_non_isolated()) + instance_list = list(Instance.objects.all()) this_inst = None lost_instances = [] @@ -503,30 +489,6 @@ def awx_k8s_reaper(): logger.exception("Failed to delete orphaned pod {} from {}".format(job.log_format, group)) -@task(queue=get_local_queuename) -def awx_isolated_heartbeat(): - local_hostname = settings.CLUSTER_HOST_ID - logger.debug("Controlling node checking for any isolated management tasks.") - poll_interval = settings.AWX_ISOLATED_PERIODIC_CHECK - # Get isolated instances not checked since poll interval - some buffer - nowtime = now() - accept_before = nowtime - timedelta(seconds=(poll_interval - 10)) - isolated_instance_qs = Instance.objects.filter( - rampart_groups__controller__instances__hostname=local_hostname, - ) - isolated_instance_qs = isolated_instance_qs.filter(last_isolated_check__lt=accept_before) | isolated_instance_qs.filter(last_isolated_check=None) - # Fast pass of isolated instances, claiming the nodes to update - with transaction.atomic(): - for isolated_instance in isolated_instance_qs: - isolated_instance.last_isolated_check = nowtime - # Prevent modified time from being changed, as in normal heartbeat - isolated_instance.save(update_fields=['last_isolated_check']) - # Slow pass looping over isolated IGs and their isolated instances - if len(isolated_instance_qs) > 0: - logger.debug("Managing isolated instances {}.".format(','.join([inst.hostname for inst in isolated_instance_qs]))) - isolated_manager.IsolatedManager(CallbackQueueDispatcher.dispatch).health_check(isolated_instance_qs) - - @task(queue=get_local_queuename) def awx_periodic_scheduler(): with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired: @@ -1017,7 +979,7 @@ class BaseTask(object): else: env['PATH'] = os.path.join(settings.AWX_VENV_PATH, "bin") - def build_env(self, instance, private_data_dir, isolated, private_data_files=None): + def build_env(self, instance, private_data_dir, private_data_files=None): """ Build environment dictionary for ansible-playbook. """ @@ -1138,7 +1100,7 @@ class BaseTask(object): """ instance.log_lifecycle("post_run") - def final_run_hook(self, instance, status, private_data_dir, fact_modification_times, isolated_manager_instance=None): + def final_run_hook(self, instance, status, private_data_dir, fact_modification_times): """ Hook for any steps to run after job/task is marked as complete. 
""" @@ -1280,16 +1242,6 @@ class BaseTask(object): job_env[k] = v self.instance = self.update_model(self.instance.pk, job_args=json.dumps(runner_config.command), job_cwd=runner_config.cwd, job_env=job_env) - def check_handler(self, config): - """ - IsolatedManager callback triggered by the repeated checks of the isolated node - """ - job_env = build_safe_env(config['env']) - for k, v in self.safe_cred_env.items(): - if k in job_env: - job_env[k] = v - self.instance = self.update_model(self.instance.pk, job_args=json.dumps(config['command']), job_cwd=config['cwd'], job_env=job_env) - @with_path_cleanup def run(self, pk, **kwargs): """ @@ -1317,7 +1269,6 @@ class BaseTask(object): self.safe_env = {} self.safe_cred_env = {} private_data_dir = None - isolated_manager_instance = None # store a reference to the parent workflow job (if any) so we can include # it in event data JSON @@ -1325,7 +1276,6 @@ class BaseTask(object): self.parent_workflow_job_id = self.instance.get_workflow_job().id try: - isolated = self.instance.is_isolated() self.instance.send_notification_templates("running") private_data_dir = self.build_private_data_dir(self.instance) self.pre_run_hook(self.instance, private_data_dir) @@ -1359,7 +1309,7 @@ class BaseTask(object): passwords = self.build_passwords(self.instance, kwargs) self.build_extra_vars_file(self.instance, private_data_dir) args = self.build_args(self.instance, private_data_dir, passwords) - env = self.build_env(self.instance, private_data_dir, isolated, private_data_files=private_data_files) + env = self.build_env(self.instance, private_data_dir, private_data_files=private_data_files) self.safe_env = build_safe_env(env) credentials = self.build_credentials_list(self.instance) @@ -1460,7 +1410,7 @@ class BaseTask(object): self.instance = self.update_model(pk, status=status, emitted_events=self.event_ct, **extra_update_fields) try: - self.final_run_hook(self.instance, status, private_data_dir, fact_modification_times, isolated_manager_instance=isolated_manager_instance) + self.final_run_hook(self.instance, status, private_data_dir, fact_modification_times) except Exception: logger.exception('{} Final run hook errored.'.format(self.instance.log_format)) @@ -1545,11 +1495,11 @@ class RunJob(BaseTask): return passwords - def build_env(self, job, private_data_dir, isolated=False, private_data_files=None): + def build_env(self, job, private_data_dir, private_data_files=None): """ Build environment dictionary for ansible-playbook. 
""" - env = super(RunJob, self).build_env(job, private_data_dir, isolated=isolated, private_data_files=private_data_files) + env = super(RunJob, self).build_env(job, private_data_dir, private_data_files=private_data_files) if private_data_files is None: private_data_files = {} # Set environment variables needed for inventory and job event @@ -1560,10 +1510,9 @@ class RunJob(BaseTask): env['PROJECT_REVISION'] = job.project.scm_revision env['ANSIBLE_RETRY_FILES_ENABLED'] = "False" env['MAX_EVENT_RES'] = str(settings.MAX_EVENT_RES_DATA) - if not isolated: - if hasattr(settings, 'AWX_ANSIBLE_CALLBACK_PLUGINS') and settings.AWX_ANSIBLE_CALLBACK_PLUGINS: - env['ANSIBLE_CALLBACK_PLUGINS'] = ':'.join(settings.AWX_ANSIBLE_CALLBACK_PLUGINS) - env['AWX_HOST'] = settings.TOWER_URL_BASE + if hasattr(settings, 'AWX_ANSIBLE_CALLBACK_PLUGINS') and settings.AWX_ANSIBLE_CALLBACK_PLUGINS: + env['ANSIBLE_CALLBACK_PLUGINS'] = ':'.join(settings.AWX_ANSIBLE_CALLBACK_PLUGINS) + env['AWX_HOST'] = settings.TOWER_URL_BASE # Create a directory for ControlPath sockets that is unique to each job cp_dir = os.path.join(private_data_dir, 'cp') @@ -1794,9 +1743,6 @@ class RunJob(BaseTask): if sync_needs: pu_ig = job.instance_group pu_en = job.execution_node - if job.is_isolated() is True: - pu_ig = pu_ig.controller - pu_en = settings.CLUSTER_HOST_ID sync_metafields = dict( launch_type="sync", @@ -1851,7 +1797,7 @@ class RunJob(BaseTask): # ran inside of the event saving code update_smart_memberships_for_inventory(job.inventory) - def final_run_hook(self, job, status, private_data_dir, fact_modification_times, isolated_manager_instance=None): + def final_run_hook(self, job, status, private_data_dir, fact_modification_times): super(RunJob, self).final_run_hook(job, status, private_data_dir, fact_modification_times) if not private_data_dir: # If there's no private data dir, that means we didn't get into the @@ -1863,8 +1809,6 @@ class RunJob(BaseTask): os.path.join(private_data_dir, 'artifacts', 'fact_cache'), fact_modification_times, ) - if isolated_manager_instance and not job.is_container_group_task: - isolated_manager_instance.cleanup() try: inventory = job.inventory @@ -1928,11 +1872,11 @@ class RunProjectUpdate(BaseTask): passwords['scm_password'] = project_update.credential.get_input('password', default='') return passwords - def build_env(self, project_update, private_data_dir, isolated=False, private_data_files=None): + def build_env(self, project_update, private_data_dir, private_data_files=None): """ Build environment dictionary for ansible-playbook. """ - env = super(RunProjectUpdate, self).build_env(project_update, private_data_dir, isolated=isolated, private_data_files=private_data_files) + env = super(RunProjectUpdate, self).build_env(project_update, private_data_dir, private_data_files=private_data_files) env['ANSIBLE_RETRY_FILES_ENABLED'] = str(False) env['ANSIBLE_ASK_PASS'] = str(False) env['ANSIBLE_BECOME_ASK_PASS'] = str(False) @@ -2387,14 +2331,14 @@ class RunInventoryUpdate(BaseTask): injector = InventorySource.injectors[inventory_update.source]() return injector.build_private_data(inventory_update, private_data_dir) - def build_env(self, inventory_update, private_data_dir, isolated, private_data_files=None): + def build_env(self, inventory_update, private_data_dir, private_data_files=None): """Build environment dictionary for ansible-inventory. 
Most environment variables related to credentials or configuration are accomplished by the inventory source injectors (in this method) or custom credential type injectors (in main run method). """ - env = super(RunInventoryUpdate, self).build_env(inventory_update, private_data_dir, isolated, private_data_files=private_data_files) + env = super(RunInventoryUpdate, self).build_env(inventory_update, private_data_dir, private_data_files=private_data_files) if private_data_files is None: private_data_files = {} @@ -2711,11 +2655,11 @@ class RunAdHocCommand(BaseTask): passwords[field] = value return passwords - def build_env(self, ad_hoc_command, private_data_dir, isolated=False, private_data_files=None): + def build_env(self, ad_hoc_command, private_data_dir, private_data_files=None): """ Build environment dictionary for ansible. """ - env = super(RunAdHocCommand, self).build_env(ad_hoc_command, private_data_dir, isolated=isolated, private_data_files=private_data_files) + env = super(RunAdHocCommand, self).build_env(ad_hoc_command, private_data_dir, private_data_files=private_data_files) # Set environment variables needed for inventory and ad hoc event # callbacks to work. env['AD_HOC_COMMAND_ID'] = str(ad_hoc_command.pk) @@ -2821,11 +2765,6 @@ class RunAdHocCommand(BaseTask): d[r'Password:\s*?$'] = 'ssh_password' return d - def final_run_hook(self, adhoc_job, status, private_data_dir, fact_modification_times, isolated_manager_instance=None): - super(RunAdHocCommand, self).final_run_hook(adhoc_job, status, private_data_dir, fact_modification_times) - if isolated_manager_instance: - isolated_manager_instance.cleanup() - @task(queue=get_local_queuename) class RunSystemJob(BaseTask): @@ -2867,8 +2806,8 @@ class RunSystemJob(BaseTask): os.chmod(path, stat.S_IRUSR) return path - def build_env(self, instance, private_data_dir, isolated=False, private_data_files=None): - base_env = super(RunSystemJob, self).build_env(instance, private_data_dir, isolated=isolated, private_data_files=private_data_files) + def build_env(self, instance, private_data_dir, private_data_files=None): + base_env = super(RunSystemJob, self).build_env(instance, private_data_dir, private_data_files=private_data_files) # TODO: this is able to run by turning off isolation # the goal is to run it a container instead env = dict(os.environ.items()) diff --git a/awx/main/tests/unit/test_tasks.py b/awx/main/tests/unit/test_tasks.py index e6460dbc07..1cd7dd9069 100644 --- a/awx/main/tests/unit/test_tasks.py +++ b/awx/main/tests/unit/test_tasks.py @@ -645,102 +645,6 @@ class TestAdhocRun(TestJobExecution): assert extra_vars['awx_user_name'] == "angry-spud" -@pytest.mark.skip(reason="Isolated code path needs updating after runner integration") -class TestIsolatedExecution(TestJobExecution): - - ISOLATED_HOST = 'some-isolated-host' - ISOLATED_CONTROLLER_HOST = 'some-isolated-controller-host' - - @pytest.fixture - def job(self): - job = Job(pk=1, id=1, project=Project(), inventory=Inventory(), job_template=JobTemplate(id=1, name='foo')) - job.controller_node = self.ISOLATED_CONTROLLER_HOST - job.execution_node = self.ISOLATED_HOST - return job - - def test_with_ssh_credentials(self, job): - ssh = CredentialType.defaults['ssh']() - credential = Credential(pk=1, credential_type=ssh, inputs={'username': 'bob', 'password': 'secret', 'ssh_key_data': self.EXAMPLE_PRIVATE_KEY}) - credential.inputs['password'] = encrypt_field(credential, 'password') - job.credentials.add(credential) - - private_data = tempfile.mkdtemp(prefix='awx_') - 
self.task.build_private_data_dir = mock.Mock(return_value=private_data) - - def _mock_job_artifacts(*args, **kw): - artifacts = os.path.join(private_data, 'artifacts') - if not os.path.exists(artifacts): - os.makedirs(artifacts) - if 'run_isolated.yml' in args[0]: - for filename, data in ( - ['status', 'successful'], - ['rc', '0'], - ['stdout', 'IT WORKED!'], - ): - with open(os.path.join(artifacts, filename), 'w') as f: - f.write(data) - return ('successful', 0) - - self.run_pexpect.side_effect = _mock_job_artifacts - self.task.run(self.pk) - - playbook_run = self.run_pexpect.call_args_list[0][0] - assert ' '.join(playbook_run[0]).startswith( - ' '.join( - [ - 'ansible-playbook', - 'run_isolated.yml', - '-u', - settings.AWX_ISOLATED_USERNAME, - '-T', - str(settings.AWX_ISOLATED_CONNECTION_TIMEOUT), - '-i', - self.ISOLATED_HOST + ',', - '-e', - ] - ) - ) - extra_vars = playbook_run[0][playbook_run[0].index('-e') + 1] - extra_vars = json.loads(extra_vars) - assert extra_vars['dest'] == '/tmp' - assert extra_vars['src'] == private_data - - def test_systemctl_failure(self): - # If systemctl fails, read the contents of `artifacts/systemctl_logs` - mock_get = mock.Mock() - ssh = CredentialType.defaults['ssh']() - credential = Credential( - pk=1, - credential_type=ssh, - inputs={ - 'username': 'bob', - }, - ) - self.instance.credentials.add(credential) - - private_data = tempfile.mkdtemp(prefix='awx_') - self.task.build_private_data_dir = mock.Mock(return_value=private_data) - inventory = json.dumps({"all": {"hosts": ["localhost"]}}) - - def _mock_job_artifacts(*args, **kw): - artifacts = os.path.join(private_data, 'artifacts') - if not os.path.exists(artifacts): - os.makedirs(artifacts) - if 'run_isolated.yml' in args[0]: - for filename, data in (['daemon.log', 'ERROR IN RUN.PY'],): - with open(os.path.join(artifacts, filename), 'w') as f: - f.write(data) - return ('successful', 0) - - self.run_pexpect.side_effect = _mock_job_artifacts - - with mock.patch('time.sleep'): - with mock.patch('requests.get') as mock_get: - mock_get.return_value = mock.Mock(content=inventory) - with pytest.raises(Exception): - self.task.run(self.pk, self.ISOLATED_HOST) - - class TestJobCredentials(TestJobExecution): @pytest.fixture def job(self, execution_environment): @@ -1625,7 +1529,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, private_data_dir, private_data_files) assert 'AWS_ACCESS_KEY_ID' not in env assert 'AWS_SECRET_ACCESS_KEY' not in env @@ -1645,7 +1549,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, private_data_dir, private_data_files) safe_env = build_safe_env(env) @@ -1669,7 +1573,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, 
private_data_dir, private_data_files) safe_env = {} credentials = task.build_credentials_list(inventory_update) @@ -1706,7 +1610,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, private_data_dir, private_data_files) safe_env = build_safe_env(env) @@ -1736,7 +1640,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, private_data_dir, private_data_files) safe_env = build_safe_env(env) @@ -1763,7 +1667,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): def run(expected_gce_zone): private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, private_data_dir, private_data_files) safe_env = {} credentials = task.build_credentials_list(inventory_update) for credential in credentials: @@ -1797,7 +1701,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, private_data_dir, private_data_files) path = os.path.join(private_data_dir, os.path.basename(env['OS_CLIENT_CONFIG_FILE'])) shade_config = open(path, 'r').read() @@ -1832,7 +1736,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = task.build_env(inventory_update, private_data_dir, private_data_files) safe_env = build_safe_env(env) assert env["FOREMAN_SERVER"] == "https://example.org" @@ -1856,7 +1760,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_cloud_credential = get_cred inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) - env = task.build_env(inventory_update, private_data_dir, False) + env = task.build_env(inventory_update, private_data_dir) safe_env = build_safe_env(env) @@ -1888,7 +1792,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): inventory_update.get_cloud_credential = get_cred inventory_update.get_extra_credentials = mocker.Mock(return_value=[]) - env = task.build_env(inventory_update, private_data_dir, False) + env = task.build_env(inventory_update, private_data_dir) safe_env = {} credentials = task.build_credentials_list(inventory_update) for credential in credentials: @@ -1919,7 +1823,7 @@ class TestInventoryUpdateCredentials(TestJobExecution): settings.AWX_TASK_ENV = {'FOO': 'BAR'} private_data_files = task.build_private_data_files(inventory_update, private_data_dir) - env = task.build_env(inventory_update, private_data_dir, False, private_data_files) + env = 
task.build_env(inventory_update, private_data_dir, private_data_files) assert env['FOO'] == 'BAR' diff --git a/awx/main/wsbroadcast.py b/awx/main/wsbroadcast.py index 7e0ca3a7e2..b35747aee3 100644 --- a/awx/main/wsbroadcast.py +++ b/awx/main/wsbroadcast.py @@ -32,14 +32,7 @@ def unwrap_broadcast_msg(payload: dict): def get_broadcast_hosts(): Instance = apps.get_model('main', 'Instance') - instances = ( - # TODO: no longer filter for non-isolated after the models change - Instance.objects.filter(rampart_groups__controller__isnull=True) - .exclude(hostname=Instance.objects.me().hostname) - .order_by('hostname') - .values('hostname', 'ip_address') - .distinct() - ) + instances = Instance.objects.exclude(hostname=Instance.objects.me().hostname).order_by('hostname').values('hostname', 'ip_address').distinct() return {i['hostname']: i['ip_address'] or i['hostname'] for i in instances} diff --git a/awx/playbooks/check_isolated.yml b/awx/playbooks/check_isolated.yml deleted file mode 100644 index 472b772fbb..0000000000 --- a/awx/playbooks/check_isolated.yml +++ /dev/null @@ -1,70 +0,0 @@ ---- -# The following variables will be set by the runner of this playbook: -# src: /tmp/some/path/private_data_dir/ - -- name: Poll for status of active job. - hosts: all - gather_facts: false - collections: - - ansible.posix - - tasks: - - name: "Output job the playbook is running for" - debug: - msg: "Checking on job {{ job_id }}" - - - name: Determine if daemon process is alive. - shell: "ansible-runner is-alive {{src}}" - register: is_alive - ignore_errors: true - - - name: Copy artifacts from the isolated host. - synchronize: - src: "{{src}}/artifacts/" - dest: "{{src}}/artifacts/" - mode: pull - delete: true - recursive: true - when: ansible_kubectl_config is not defined - - - name: Copy daemon log from the isolated host - synchronize: - src: "{{src}}/daemon.log" - dest: "{{src}}/daemon.log" - mode: pull - when: ansible_kubectl_config is not defined - - - name: Copy artifacts from pod - synchronize: - src: "{{src}}/artifacts/" - dest: "{{src}}/artifacts/" - mode: pull - delete: true - recursive: true - set_remote_user: false - rsync_opts: - - "--blocking-io" - - "--rsh=$RSH" - environment: - RSH: "oc rsh --config={{ ansible_kubectl_config }}" - delegate_to: localhost - when: ansible_kubectl_config is defined - - - name: Copy daemon log from pod - synchronize: - src: "{{src}}/daemon.log" - dest: "{{src}}/daemon.log" - mode: pull - set_remote_user: false - rsync_opts: - - "--blocking-io" - - "--rsh=$RSH" - environment: - RSH: "oc rsh --config={{ ansible_kubectl_config }}" - delegate_to: localhost - when: ansible_kubectl_config is defined - - - name: Fail if previous check determined that process is not alive. - fail: - msg: "isolated task is still running" - when: "is_alive.rc == 0" diff --git a/awx/playbooks/clean_isolated.yml b/awx/playbooks/clean_isolated.yml deleted file mode 100644 index 63c044b8a1..0000000000 --- a/awx/playbooks/clean_isolated.yml +++ /dev/null @@ -1,31 +0,0 @@ ---- - -# The following variables will be set by the runner of this playbook: -# cleanup_dirs: ['/tmp/path/private_data_dir/', '/tmp//path/proot_temp_dir/'] -# private_data_dir: '/tmp/path/private_data_dir/' - -- name: Clean up from isolated job run. 
- hosts: all - gather_facts: false - - tasks: - - - name: cancel the job - command: "ansible-runner stop {{private_data_dir}}" - ignore_errors: true - - - name: remove build artifacts - file: - path: '{{item}}' - state: absent - register: result - with_items: "{{cleanup_dirs}}" - until: result is succeeded - ignore_errors: true - retries: 3 - delay: 5 - - - name: fail if build artifacts were not cleaned - fail: - msg: 'Unable to cleanup build artifacts' - when: not result is succeeded diff --git a/awx/playbooks/heartbeat_isolated.yml b/awx/playbooks/heartbeat_isolated.yml deleted file mode 100644 index 7963d5fbe2..0000000000 --- a/awx/playbooks/heartbeat_isolated.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- -- name: Periodic background status check of isolated instances. - hosts: all - gather_facts: false - - tasks: - - - name: Get capacity of the instance - awx_capacity: - - - name: Remove any stale temporary files - awx_isolated_cleanup: diff --git a/awx/playbooks/run_isolated.yml b/awx/playbooks/run_isolated.yml deleted file mode 100644 index 76ea42d17c..0000000000 --- a/awx/playbooks/run_isolated.yml +++ /dev/null @@ -1,61 +0,0 @@ ---- - -# The following variables will be set by the runner of this playbook: -# src: /tmp/some/path/private_data_dir -# dest: /tmp/some/path/ - -- name: Prepare data, dispatch job in isolated environment. - hosts: all - gather_facts: false - vars: - secret: "{{ lookup('pipe', 'cat ' + src + '/env/ssh_key') }}" - collections: - - ansible.posix - - tasks: - - name: "Output job the playbook is running for" - debug: - msg: "Checking on job {{ job_id }}" - - - name: synchronize job environment with isolated host - synchronize: - copy_links: true - src: "{{ src }}" - dest: "{{ dest }}" - when: ansible_kubectl_config is not defined - - - name: synchronize job environment with remote job container - synchronize: - copy_links: true - src: "{{ src }}" - dest: "{{ dest }}" - set_remote_user: false - rsync_opts: - - "--blocking-io" - - "--rsh=$RSH" - environment: - RSH: "oc rsh --config={{ ansible_kubectl_config }}" - delegate_to: localhost - when: ansible_kubectl_config is defined - - - local_action: stat path="{{src}}/env/ssh_key" - register: key - - - name: create a named pipe for secret environment data - command: "mkfifo {{src}}/env/ssh_key" - when: key.stat.exists - - - name: spawn the playbook - command: "ansible-runner start {{src}} -p '{{playbook}}' -i {{ident}}" - when: playbook is defined - - - name: spawn the adhoc command - command: "ansible-runner start {{src}} -m {{module}} -a {{module_args}} -i {{ident}}" - when: module is defined - - - name: write the secret environment data - mkfifo: - content: "{{secret}}" - path: "{{src}}/env/ssh_key" - when: key.stat.exists - no_log: true diff --git a/awx/plugins/isolated/awx_capacity.py b/awx/plugins/isolated/awx_capacity.py deleted file mode 100644 index 2f33a8ffad..0000000000 --- a/awx/plugins/isolated/awx_capacity.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2017 Ansible by Red Hat -# -# This file is part of Ansible Tower, but depends on code imported from Ansible. -# -# Ansible is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Ansible is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Ansible. If not, see . - -from ansible.module_utils._text import to_text -from ansible.module_utils.basic import AnsibleModule - -import subprocess -import os -import psutil - - -def get_cpu_capacity(): - env_forkcpu = os.getenv('SYSTEM_TASK_FORKS_CPU', None) - cpu = psutil.cpu_count() - - if env_forkcpu: - forkcpu = int(env_forkcpu) - else: - forkcpu = 4 - return (cpu, cpu * forkcpu) - - -def get_mem_capacity(): - env_forkmem = os.getenv('SYSTEM_TASK_FORKS_MEM', None) - if env_forkmem: - forkmem = int(env_forkmem) - else: - forkmem = 100 - - mem = psutil.virtual_memory().total - return (mem, max(1, ((mem / 1024 / 1024) - 2048) / forkmem)) - - -def main(): - module = AnsibleModule(argument_spec=dict()) - - ar = module.get_bin_path('ansible-runner', required=True) - - try: - version = subprocess.check_output([ar, '--version'], stderr=subprocess.STDOUT).strip() - except subprocess.CalledProcessError as e: - module.fail_json(msg=to_text(e)) - return - # NOTE: Duplicated with awx.main.utils.common capacity utilities - cpu, capacity_cpu = get_cpu_capacity() - mem, capacity_mem = get_mem_capacity() - - # Module never results in a change - module.exit_json( - changed=False, - capacity_cpu=capacity_cpu, - capacity_mem=capacity_mem, - version=version, - ansible_facts=dict(awx_cpu=cpu, awx_mem=mem, awx_capacity_cpu=capacity_cpu, awx_capacity_mem=capacity_mem, awx_capacity_version=version), - ) - - -if __name__ == '__main__': - main() diff --git a/awx/plugins/isolated/awx_isolated_cleanup.py b/awx/plugins/isolated/awx_isolated_cleanup.py deleted file mode 100644 index 7f58a1f74a..0000000000 --- a/awx/plugins/isolated/awx_isolated_cleanup.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2017 Ansible by Red Hat -# -# This file is part of Ansible Tower, but depends on code imported from Ansible. -# -# Ansible is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Ansible is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Ansible. If not, see . 
- -from ansible.module_utils.basic import AnsibleModule - -import glob -import os -import re -import shutil -import datetime -import subprocess - - -def main(): - module = AnsibleModule(argument_spec=dict()) - changed = False - paths_removed = set([]) - - # If a folder was last modified before this datetime, it will always be deleted - folder_cutoff = datetime.datetime.now() - datetime.timedelta(days=7) - # If a folder does not have an associated job running and is older than - # this datetime, then it will be deleted because its job has finished - job_cutoff = datetime.datetime.now() - datetime.timedelta(hours=1) - - for search_pattern in ['/tmp/awx_[0-9]*_*', '/tmp/ansible_runner_pi_*']: - for path in glob.iglob(search_pattern): - st = os.stat(path) - modtime = datetime.datetime.fromtimestamp(st.st_mtime) - - if modtime > job_cutoff: - continue - elif modtime > folder_cutoff: - try: - re_match = re.match(r'\/tmp\/awx_\d+_.+', path) - if re_match is not None: - try: - if subprocess.check_call(['ansible-runner', 'is-alive', path]) == 0: - continue - except subprocess.CalledProcessError: - # the job isn't running anymore, clean up this path - module.debug('Deleting path {} its job has completed.'.format(path)) - except (ValueError, IndexError): - continue - else: - module.debug('Deleting path {} because modification date is too old.'.format(path)) - changed = True - paths_removed.add(path) - shutil.rmtree(path) - - module.exit_json(changed=changed, paths_removed=list(paths_removed)) - - -if __name__ == '__main__': - main() diff --git a/awx/plugins/isolated/mkfifo.py b/awx/plugins/isolated/mkfifo.py deleted file mode 100755 index 45741c2ad3..0000000000 --- a/awx/plugins/isolated/mkfifo.py +++ /dev/null @@ -1,31 +0,0 @@ -import os -import stat - -from ansible.module_utils.basic import AnsibleModule - - -# -# the purpose of this plugin is to call mkfifo and -# write raw SSH key data into the fifo created on the remote isolated host -# - - -def main(): - module = AnsibleModule(argument_spec={'path': {'required': True, 'type': 'str'}, 'content': {'required': True, 'type': 'str'}}, supports_check_mode=False) - - path = module.params['path'] - os.chmod(path, stat.S_IRUSR | stat.S_IWUSR) - with open(path, 'w') as fifo: - data = module.params['content'] - if 'OPENSSH PRIVATE KEY' in data and not data.endswith('\n'): - # we use ansible's lookup() to read this file from the disk, - # but ansible's lookup() *strips* newlines - # OpenSSH wants certain private keys to end with a newline (or it - # won't accept them) - data += '\n' - fifo.write(data) - module.exit_json(dest=path, changed=True) - - -if __name__ == '__main__': - main() diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index aa6e0ae4a0..4c758b5ca1 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -126,7 +126,7 @@ LOGIN_URL = '/api/login/' PROJECTS_ROOT = '/var/lib/awx/projects/' # Absolute filesystem path to the directory to host collections for -# running inventory imports, isolated playbooks +# running inventory imports AWX_ANSIBLE_COLLECTIONS_PATHS = os.path.join(BASE_DIR, 'vendor', 'awx_ansible_collections') # Absolute filesystem path to the directory for job status stdout (default for @@ -440,7 +440,6 @@ CELERYBEAT_SCHEDULE = { 'k8s_reaper': {'task': 'awx.main.tasks.awx_k8s_reaper', 'schedule': timedelta(seconds=60), 'options': {'expires': 50}}, 'send_subsystem_metrics': {'task': 'awx.main.analytics.analytics_tasks.send_subsystem_metrics', 'schedule': timedelta(seconds=20)}, 'cleanup_images': 
{'task': 'awx.main.tasks.cleanup_execution_environment_images', 'schedule': timedelta(hours=3)}, - # 'isolated_heartbeat': set up at the end of production.py and development.py } # Django Caching Configuration @@ -854,12 +853,6 @@ LOGGING = { 'filename': os.path.join(LOG_ROOT, 'tower_rbac_migrations.log'), 'formatter': 'simple', }, - 'isolated_manager': { - 'level': 'WARNING', - 'class': 'logging.handlers.WatchedFileHandler', - 'filename': os.path.join(LOG_ROOT, 'isolated_manager.log'), - 'formatter': 'simple', - }, 'job_lifecycle': { 'level': 'DEBUG', 'class': 'logging.handlers.WatchedFileHandler', @@ -881,8 +874,6 @@ LOGGING = { 'awx.main.dispatch': {'handlers': ['dispatcher']}, 'awx.main.consumers': {'handlers': ['console', 'file', 'tower_warnings'], 'level': 'INFO'}, 'awx.main.wsbroadcast': {'handlers': ['wsbroadcast']}, - 'awx.isolated.manager': {'level': 'WARNING', 'handlers': ['console', 'file', 'isolated_manager'], 'propagate': True}, - 'awx.isolated.manager.playbooks': {'handlers': ['management_playbooks'], 'propagate': False}, 'awx.main.commands.inventory_import': {'handlers': ['inventory_import'], 'propagate': False}, 'awx.main.tasks': {'handlers': ['task_system', 'external_logger'], 'propagate': False}, 'awx.main.analytics': {'handlers': ['task_system', 'external_logger'], 'level': 'INFO', 'propagate': False}, diff --git a/awx/settings/development.py b/awx/settings/development.py index 66dc12f50f..e836a723f6 100644 --- a/awx/settings/development.py +++ b/awx/settings/development.py @@ -35,9 +35,6 @@ LOGGING['handlers']['console']['()'] = 'awx.main.utils.handlers.ColorHandler' # LOGGING['handlers']['task_system'] = LOGGING['handlers']['console'].copy() # noqa COLOR_LOGS = True -# Pipe management playbook output to console -LOGGING['loggers']['awx.isolated.manager.playbooks']['propagate'] = True # noqa - # celery is annoyingly loud when docker containers start LOGGING['loggers'].pop('celery', None) # noqa # avoid awx.main.dispatch WARNING-level scaling worker up/down messages @@ -136,17 +133,6 @@ if "pytest" in sys.modules: } } - -CELERYBEAT_SCHEDULE.update( - { # noqa - 'isolated_heartbeat': { - 'task': 'awx.main.tasks.awx_isolated_heartbeat', - 'schedule': timedelta(seconds=AWX_ISOLATED_PERIODIC_CHECK), # noqa - 'options': {'expires': AWX_ISOLATED_PERIODIC_CHECK * 2}, # noqa - } - } -) - CLUSTER_HOST_ID = socket.gethostname() AWX_CALLBACK_PROFILE = True diff --git a/awx/settings/production.py b/awx/settings/production.py index f5a1bd7a7f..c6511cb5b1 100644 --- a/awx/settings/production.py +++ b/awx/settings/production.py @@ -91,14 +91,4 @@ except IOError: # The below runs AFTER all of the custom settings are imported. 
-CELERYBEAT_SCHEDULE.update( - { # noqa - 'isolated_heartbeat': { - 'task': 'awx.main.tasks.awx_isolated_heartbeat', - 'schedule': timedelta(seconds=AWX_ISOLATED_PERIODIC_CHECK), # noqa - 'options': {'expires': AWX_ISOLATED_PERIODIC_CHECK * 2}, # noqa - } - } -) - DATABASES['default'].setdefault('OPTIONS', dict()).setdefault('application_name', f'{CLUSTER_HOST_ID}-{os.getpid()}-{" ".join(sys.argv)}'[:63]) # noqa diff --git a/tools/ansible/roles/dockerfile/files/settings.py b/tools/ansible/roles/dockerfile/files/settings.py index c2abeb2df2..e2bbd8cb5b 100644 --- a/tools/ansible/roles/dockerfile/files/settings.py +++ b/tools/ansible/roles/dockerfile/files/settings.py @@ -58,7 +58,6 @@ LOGGING['loggers']['django_auth_ldap']['handlers'] = ['console'] LOGGING['loggers']['social']['handlers'] = ['console'] LOGGING['loggers']['system_tracking_migrations']['handlers'] = ['console'] LOGGING['loggers']['rbac_migrations']['handlers'] = ['console'] -LOGGING['loggers']['awx.isolated.manager.playbooks']['handlers'] = ['console'] LOGGING['handlers']['callback_receiver'] = {'class': 'logging.NullHandler'} LOGGING['handlers']['task_system'] = {'class': 'logging.NullHandler'} LOGGING['handlers']['tower_warnings'] = {'class': 'logging.NullHandler'} From 1819a7963a7117c48a4813a782e3b586e7c141d5 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Wed, 21 Apr 2021 11:03:38 -0400 Subject: [PATCH 4/7] Make the necessary changes to the models - remove InstanceGroup.controller - remove Instance.last_isolated_check - remove .is_isolated and .is_controller methods/properties - remove .choose_online_controller_node() method - remove .supports_isolation() and replace with .can_run_containerized - simplify .can_run_containerized --- awx/main/migrations/0139_isolated_removal.py | 26 ++++++++++++++ awx/main/models/ad_hoc_commands.py | 4 --- awx/main/models/ha.py | 37 ++------------------ awx/main/models/jobs.py | 6 +--- awx/main/models/unified_jobs.py | 14 ++------ awx/main/scheduler/task_manager.py | 37 ++++---------------- 6 files changed, 39 insertions(+), 85 deletions(-) create mode 100644 awx/main/migrations/0139_isolated_removal.py diff --git a/awx/main/migrations/0139_isolated_removal.py b/awx/main/migrations/0139_isolated_removal.py new file mode 100644 index 0000000000..06bd0521cb --- /dev/null +++ b/awx/main/migrations/0139_isolated_removal.py @@ -0,0 +1,26 @@ +# Generated by Django 2.2.16 on 2021-04-21 15:02 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('main', '0138_custom_inventory_scripts_removal'), + ] + + operations = [ + migrations.RemoveField( + model_name='instance', + name='last_isolated_check', + ), + migrations.RemoveField( + model_name='instancegroup', + name='controller', + ), + migrations.AlterField( + model_name='unifiedjob', + name='controller_node', + field=models.TextField(blank=True, default='', editable=False, help_text='The instance that managed the execution environment.'), + ), + ] diff --git a/awx/main/models/ad_hoc_commands.py b/awx/main/models/ad_hoc_commands.py index 105991a8a0..5ee08857f6 100644 --- a/awx/main/models/ad_hoc_commands.py +++ b/awx/main/models/ad_hoc_commands.py @@ -146,10 +146,6 @@ class AdHocCommand(UnifiedJob, JobNotificationMixin): return RunAdHocCommand - @classmethod - def supports_isolation(cls): - return True - @property def is_container_group_task(self): return bool(self.instance_group and self.instance_group.is_container_group) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 
b8e5ab27a6..fd5ec56596 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -1,7 +1,6 @@ # Copyright (c) 2015 Ansible, Inc. # All Rights Reserved. -import random from decimal import Decimal from django.core.validators import MinValueValidator @@ -63,10 +62,6 @@ class Instance(HasPolicyEditsMixin, BaseModel): ) created = models.DateTimeField(auto_now_add=True) modified = models.DateTimeField(auto_now=True) - last_isolated_check = models.DateTimeField( - null=True, - editable=False, - ) version = models.CharField(max_length=120, blank=True) capacity = models.PositiveIntegerField( default=100, @@ -128,20 +123,12 @@ class Instance(HasPolicyEditsMixin, BaseModel): def jobs_total(self): return UnifiedJob.objects.filter(execution_node=self.hostname).count() - def is_lost(self, ref_time=None, isolated=False): + def is_lost(self, ref_time=None): if ref_time is None: ref_time = now() grace_period = 120 - if isolated: - grace_period = settings.AWX_ISOLATED_PERIODIC_CHECK * 2 return self.modified < ref_time - timedelta(seconds=grace_period) - def is_controller(self): - return Instance.objects.filter(rampart_groups__controller__instances=self).exists() - - def is_isolated(self): - return self.rampart_groups.filter(controller__isnull=False).exists() - def refresh_capacity(self): if settings.IS_K8S: self.capacity = self.cpu = self.memory = self.cpu_capacity = self.mem_capacity = 0 # noqa @@ -185,15 +172,6 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): editable=False, help_text=_('Instances that are members of this InstanceGroup'), ) - controller = models.ForeignKey( - 'InstanceGroup', - related_name='controlled_groups', - help_text=_('Instance Group to remotely control this group.'), - editable=False, - default=None, - null=True, - on_delete=models.CASCADE, - ) is_container_group = models.BooleanField(default=False) credential = models.ForeignKey( 'Credential', @@ -215,7 +193,7 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): default=[], blank=True, help_text=_("List of exact-match Instances that will always be automatically assigned to this group") ) - POLICY_FIELDS = frozenset(('policy_instance_list', 'policy_instance_minimum', 'policy_instance_percentage', 'controller')) + POLICY_FIELDS = frozenset(('policy_instance_list', 'policy_instance_minimum', 'policy_instance_percentage')) def get_absolute_url(self, request=None): return reverse('api:instance_group_detail', kwargs={'pk': self.pk}, request=request) @@ -232,14 +210,6 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): def jobs_total(self): return UnifiedJob.objects.filter(instance_group=self).count() - @property - def is_controller(self): - return self.controlled_groups.exists() - - @property - def is_isolated(self): - return bool(self.controller) - ''' RelatedJobsMixin ''' @@ -271,9 +241,6 @@ class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin): largest_instance = i return largest_instance - def choose_online_controller_node(self): - return random.choice(list(self.controller.instances.filter(capacity__gt=0, enabled=True).values_list('hostname', flat=True))) - def set_default_policy_fields(self): self.policy_instance_list = [] self.policy_instance_minimum = 0 diff --git a/awx/main/models/jobs.py b/awx/main/models/jobs.py index 807c276930..2b0bb6fd20 100644 --- a/awx/main/models/jobs.py +++ b/awx/main/models/jobs.py @@ -587,10 +587,6 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana return RunJob - @classmethod 
- def supports_isolation(cls): - return True - def _global_timeout_setting(self): return 'DEFAULT_JOB_TIMEOUT' @@ -759,7 +755,7 @@ class Job(UnifiedJob, JobOptions, SurveyJobMixin, JobNotificationMixin, TaskMana @property def can_run_containerized(self): - return any([ig for ig in self.preferred_instance_groups if ig.is_container_group]) + return True @property def is_container_group_task(self): diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 6bc85e3162..65c8c8f39b 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -588,7 +588,7 @@ class UnifiedJob( blank=True, default='', editable=False, - help_text=_("The instance that managed the isolated execution environment."), + help_text=_("The instance that managed the execution environment."), ) notifications = models.ManyToManyField( 'Notification', @@ -737,10 +737,6 @@ class UnifiedJob( def _get_task_class(cls): raise NotImplementedError # Implement in subclasses. - @classmethod - def supports_isolation(cls): - return False - @property def can_run_containerized(self): return False @@ -1402,12 +1398,11 @@ class UnifiedJob( @property def preferred_instance_groups(self): """ - Return Instance/Rampart Groups preferred by this unified job templates + Return Instance/Rampart Groups preferred by this unified job template """ if not self.unified_job_template: return [] - template_groups = [x for x in self.unified_job_template.instance_groups.all()] - return template_groups + return list(self.unified_job_template.instance_groups.all()) @property def global_instance_groups(self): @@ -1467,9 +1462,6 @@ class UnifiedJob( def get_queue_name(self): return self.controller_node or self.execution_node or get_local_queuename() - def is_isolated(self): - return bool(self.controller_node) - @property def is_container_group_task(self): return False diff --git a/awx/main/scheduler/task_manager.py b/awx/main/scheduler/task_manager.py index 0757132cf9..115838c66c 100644 --- a/awx/main/scheduler/task_manager.py +++ b/awx/main/scheduler/task_manager.py @@ -6,7 +6,6 @@ from datetime import timedelta import logging import uuid import json -import random from types import SimpleNamespace # Django @@ -253,14 +252,6 @@ class TaskManager: } dependencies = [{'type': get_type_for_model(type(t)), 'id': t.id} for t in dependent_tasks] - controller_node = None - if task.supports_isolation() and rampart_group.controller_id: - try: - controller_node = rampart_group.choose_online_controller_node() - except IndexError: - logger.debug("No controllers available in group {} to run {}".format(rampart_group.name, task.log_format)) - return - task.status = 'waiting' (start_status, opts) = task.pre_start() @@ -277,38 +268,24 @@ class TaskManager: task.send_notification_templates('running') logger.debug('Transitioning %s to running status.', task.log_format) schedule_task_manager() - elif not task.supports_isolation() and rampart_group.controller_id: - # non-Ansible jobs on isolated instances run on controller - task.instance_group = rampart_group.controller - task.execution_node = random.choice(list(rampart_group.controller.instances.all().values_list('hostname', flat=True))) - logger.debug('Submitting isolated {} to queue {} on node {}.'.format(task.log_format, task.instance_group.name, task.execution_node)) - elif controller_node: - task.instance_group = rampart_group - task.execution_node = instance.hostname - task.controller_node = controller_node - logger.debug('Submitting isolated {} to queue {} controlled by 
{}.'.format(task.log_format, task.execution_node, controller_node)) elif rampart_group.is_container_group: # find one real, non-containerized instance with capacity to # act as the controller for k8s API interaction match = None - for group in InstanceGroup.objects.all(): - if group.is_container_group or group.controller_id: - continue + for group in InstanceGroup.objects.filter(is_container_group=False): match = group.fit_task_to_most_remaining_capacity_instance(task, group.instances.all()) if match: break task.instance_group = rampart_group if match is None: logger.warn('No available capacity to run containerized <{}>.'.format(task.log_format)) + elif task.can_run_containerized and any(ig.is_container_group for ig in task.preferred_instance_groups): + task.controller_node = match.hostname else: - if task.supports_isolation(): - task.controller_node = match.hostname - else: - # project updates and inventory updates don't *actually* run in pods, - # so just pick *any* non-isolated, non-containerized host and use it - # as the execution node - task.execution_node = match.hostname - logger.debug('Submitting containerized {} to queue {}.'.format(task.log_format, task.execution_node)) + # project updates and inventory updates don't *actually* run in pods, so + # just pick *any* non-containerized host and use it as the execution node + task.execution_node = match.hostname + logger.debug('Submitting containerized {} to queue {}.'.format(task.log_format, task.execution_node)) else: task.instance_group = rampart_group if instance is not None: From a17c34f041c0990d999358779f7cabb179e5aa78 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Wed, 21 Apr 2021 11:42:35 -0400 Subject: [PATCH 5/7] Remove the isolation-specific settings - AWX_ISOLATED_PUBLIC_KEY - AWX_ISOLATED_PRIVATE_KEY - AWX_ISOLATED_KEY_GENERATION - AWX_ISOLATED_HOST_KEY_CHECKING - AWX_ISOLATED_USERNAME - AWX_ISOLATED_CONNECTION_TIMEOUT - AWX_ISOLATED_LAUNCH_TIMEOUT - AWX_ISOLATED_PERIODIC_CHECK - AWX_ISOLATED_CHECK_INTERVAL --- awx/conf/registry.py | 6 +- awx/conf/serializers.py | 6 +- awx/conf/settings.py | 9 +- awx/main/conf.py | 89 ------------------- .../tests/functional/api/test_settings.py | 56 ------------ awx/settings/defaults.py | 17 ---- awx/settings/development.py | 4 - awx/settings/production.py | 2 - 8 files changed, 5 insertions(+), 184 deletions(-) diff --git a/awx/conf/registry.py b/awx/conf/registry.py index 627099a57a..36f6eba6d2 100644 --- a/awx/conf/registry.py +++ b/awx/conf/registry.py @@ -92,11 +92,7 @@ class SettingsRegistry(object): continue if kwargs.get('category_slug', None) in slugs_to_ignore: continue - if ( - read_only in {True, False} - and kwargs.get('read_only', False) != read_only - and setting not in ('INSTALL_UUID', 'AWX_ISOLATED_PRIVATE_KEY', 'AWX_ISOLATED_PUBLIC_KEY') - ): + if read_only in {True, False} and kwargs.get('read_only', False) != read_only and setting != 'INSTALL_UUID': # Note: Doesn't catch fields that set read_only via __init__; # read-only field kwargs should always include read_only=True. 
continue diff --git a/awx/conf/serializers.py b/awx/conf/serializers.py index 838a636aaa..03e0f7e714 100644 --- a/awx/conf/serializers.py +++ b/awx/conf/serializers.py @@ -81,10 +81,8 @@ class SettingSingletonSerializer(serializers.Serializer): if self.instance and not hasattr(self.instance, key): continue extra_kwargs = {} - # Make LICENSE and AWX_ISOLATED_KEY_GENERATION read-only here; - # LICENSE is only updated via /api/v2/config/ - # AWX_ISOLATED_KEY_GENERATION is only set/unset via the setup playbook - if key in ('LICENSE', 'AWX_ISOLATED_KEY_GENERATION'): + # Make LICENSE read-only here; LICENSE is only updated via /api/v2/config/ + if key == 'LICENSE': extra_kwargs['read_only'] = True field = settings_registry.get_setting_field(key, mixin_class=SettingFieldMixin, for_user=bool(category_slug == 'user'), **extra_kwargs) fields[key] = field diff --git a/awx/conf/settings.py b/awx/conf/settings.py index 57d3265d72..cd8b2efe16 100644 --- a/awx/conf/settings.py +++ b/awx/conf/settings.py @@ -350,13 +350,8 @@ class SettingsWrapper(UserSettingsHolder): if value is empty: setting = None setting_id = None - if not field.read_only or name in ( - # these values are read-only - however - we *do* want - # to fetch their value from the database - 'INSTALL_UUID', - 'AWX_ISOLATED_PRIVATE_KEY', - 'AWX_ISOLATED_PUBLIC_KEY', - ): + # this value is read-only, however we *do* want to fetch its value from the database + if not field.read_only or name == 'INSTALL_UUID': setting = Setting.objects.filter(key=name, user__isnull=True).order_by('pk').first() if setting: if getattr(field, 'encrypted', False): diff --git a/awx/main/conf.py b/awx/main/conf.py index f50f813533..2644e2cdd7 100644 --- a/awx/main/conf.py +++ b/awx/main/conf.py @@ -250,95 +250,6 @@ register( category_slug='jobs', ) -register( - 'AWX_ISOLATED_CHECK_INTERVAL', - field_class=fields.IntegerField, - min_value=0, - label=_('Isolated status check interval'), - help_text=_('The number of seconds to sleep between status checks for jobs running on isolated instances.'), - category=_('Jobs'), - category_slug='jobs', - unit=_('seconds'), -) - -register( - 'AWX_ISOLATED_LAUNCH_TIMEOUT', - field_class=fields.IntegerField, - min_value=0, - label=_('Isolated launch timeout'), - help_text=_( - 'The timeout (in seconds) for launching jobs on isolated instances. ' - 'This includes the time needed to copy source control files (playbooks) to the isolated instance.' - ), - category=_('Jobs'), - category_slug='jobs', - unit=_('seconds'), -) - -register( - 'AWX_ISOLATED_CONNECTION_TIMEOUT', - field_class=fields.IntegerField, - min_value=0, - default=10, - label=_('Isolated connection timeout'), - help_text=_( - 'Ansible SSH connection timeout (in seconds) to use when communicating with isolated instances. ' - 'Value should be substantially greater than expected network latency.' - ), - category=_('Jobs'), - category_slug='jobs', - unit=_('seconds'), -) - -register( - 'AWX_ISOLATED_HOST_KEY_CHECKING', - field_class=fields.BooleanField, - label=_('Isolated host key checking'), - help_text=_('When set to True, AWX will enforce strict host key checking for communication with isolated nodes.'), - category=_('Jobs'), - category_slug='jobs', - default=False, -) - -register( - 'AWX_ISOLATED_KEY_GENERATION', - field_class=fields.BooleanField, - default=True, - label=_('Generate RSA keys for isolated instances'), - help_text=_( - 'If set, a random RSA key will be generated and distributed to ' - 'isolated instances. 
To disable this behavior and manage authentication '
-        'for isolated instances outside of Tower, disable this setting.'
-    ),  # noqa
-    category=_('Jobs'),
-    category_slug='jobs',
-)
-
-register(
-    'AWX_ISOLATED_PRIVATE_KEY',
-    field_class=fields.CharField,
-    default='',
-    allow_blank=True,
-    encrypted=True,
-    read_only=True,
-    label=_('The RSA private key for SSH traffic to isolated instances'),
-    help_text=_('The RSA private key for SSH traffic to isolated instances'),  # noqa
-    category=_('Jobs'),
-    category_slug='jobs',
-)
-
-register(
-    'AWX_ISOLATED_PUBLIC_KEY',
-    field_class=fields.CharField,
-    default='',
-    allow_blank=True,
-    read_only=True,
-    label=_('The RSA public key for SSH traffic to isolated instances'),
-    help_text=_('The RSA public key for SSH traffic to isolated instances'),  # noqa
-    category=_('Jobs'),
-    category_slug='jobs',
-)
-
 register(
     'AWX_TASK_ENV',
     field_class=fields.KeyValueField,
diff --git a/awx/main/tests/functional/api/test_settings.py b/awx/main/tests/functional/api/test_settings.py
index 84bfff2d18..a1ae7398a5 100644
--- a/awx/main/tests/functional/api/test_settings.py
+++ b/awx/main/tests/functional/api/test_settings.py
@@ -5,8 +5,6 @@
 # Python
 import pytest
 
-from django.conf import settings
-
 # AWX
 from awx.api.versioning import reverse
 from awx.conf.models import Setting
@@ -322,60 +320,6 @@ def test_logging_aggregator_connection_test_valid(put, post, admin):
     post(url, {}, user=admin, expect=202)
 
 
-@pytest.mark.django_db
-@pytest.mark.parametrize(
-    'setting_name',
-    [
-        'AWX_ISOLATED_CHECK_INTERVAL',
-        'AWX_ISOLATED_LAUNCH_TIMEOUT',
-        'AWX_ISOLATED_CONNECTION_TIMEOUT',
-    ],
-)
-def test_isolated_job_setting_validation(get, patch, admin, setting_name):
-    url = reverse('api:setting_singleton_detail', kwargs={'category_slug': 'jobs'})
-    patch(url, user=admin, data={setting_name: -1}, expect=400)
-
-    data = get(url, user=admin).data
-    assert data[setting_name] != -1
-
-
-@pytest.mark.django_db
-@pytest.mark.parametrize(
-    'key, expected',
-    [
-        ['AWX_ISOLATED_PRIVATE_KEY', '$encrypted$'],
-        ['AWX_ISOLATED_PUBLIC_KEY', 'secret'],
-    ],
-)
-def test_isolated_keys_readonly(get, patch, delete, admin, key, expected):
-    Setting.objects.create(key=key, value='secret').save()
-    assert getattr(settings, key) == 'secret'
-
-    url = reverse('api:setting_singleton_detail', kwargs={'category_slug': 'jobs'})
-    resp = get(url, user=admin)
-    assert resp.data[key] == expected
-
-    patch(url, user=admin, data={key: 'new-secret'})
-    assert getattr(settings, key) == 'secret'
-
-    delete(url, user=admin)
-    assert getattr(settings, key) == 'secret'
-
-
-@pytest.mark.django_db
-def test_isolated_key_flag_readonly(get, patch, delete, admin):
-    settings.AWX_ISOLATED_KEY_GENERATION = True
-    url = reverse('api:setting_singleton_detail', kwargs={'category_slug': 'jobs'})
-    resp = get(url, user=admin)
-    assert resp.data['AWX_ISOLATED_KEY_GENERATION'] is True
-
-    patch(url, user=admin, data={'AWX_ISOLATED_KEY_GENERATION': False})
-    assert settings.AWX_ISOLATED_KEY_GENERATION is True
-
-    delete(url, user=admin)
-    assert settings.AWX_ISOLATED_KEY_GENERATION is True
-
-
 @pytest.mark.django_db
 @pytest.mark.parametrize('headers', [True, False])
 def test_saml_x509cert_validation(patch, get, admin, headers):
diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py
index 4c758b5ca1..01d572cbe0 100644
--- a/awx/settings/defaults.py
+++ b/awx/settings/defaults.py
@@ -408,23 +408,6 @@ AUTH_BASIC_ENABLED = True
 
 # when trying to access a UI page that requires authentication.
LOGIN_REDIRECT_OVERRIDE = '' -# Default to skipping isolated host key checking (the initial connection will -# hang on an interactive "The authenticity of host example.org can't be -# established" message) -AWX_ISOLATED_HOST_KEY_CHECKING = False - -# The number of seconds to sleep between status checks for jobs running on isolated nodes -AWX_ISOLATED_CHECK_INTERVAL = 30 - -# The timeout (in seconds) for launching jobs on isolated nodes -AWX_ISOLATED_LAUNCH_TIMEOUT = 600 - -# Ansible connection timeout (in seconds) for communicating with isolated instances -AWX_ISOLATED_CONNECTION_TIMEOUT = 10 - -# The time (in seconds) between the periodic isolated heartbeat status check -AWX_ISOLATED_PERIODIC_CHECK = 600 - DEVSERVER_DEFAULT_ADDR = '0.0.0.0' DEVSERVER_DEFAULT_PORT = '8013' diff --git a/awx/settings/development.py b/awx/settings/development.py index e836a723f6..12658ed602 100644 --- a/awx/settings/development.py +++ b/awx/settings/development.py @@ -64,10 +64,6 @@ CALLBACK_QUEUE = "callback_tasks" # Note: This setting may be overridden by database settings. AWX_ROLES_ENABLED = True -AWX_ISOLATED_USERNAME = 'root' -AWX_ISOLATED_CHECK_INTERVAL = 1 -AWX_ISOLATED_PERIODIC_CHECK = 30 - # Disable Pendo on the UI for development/test. # Note: This setting may be overridden by database settings. PENDO_TRACKING_STATE = "off" diff --git a/awx/settings/production.py b/awx/settings/production.py index c6511cb5b1..d74f1a4a85 100644 --- a/awx/settings/production.py +++ b/awx/settings/production.py @@ -40,8 +40,6 @@ ANSIBLE_VENV_PATH = os.path.join(BASE_VENV_PATH, "ansible") # Tower base virtualenv paths and enablement AWX_VENV_PATH = os.path.join(BASE_VENV_PATH, "awx") -AWX_ISOLATED_USERNAME = 'awx' - # Store a snapshot of default settings at this point before loading any # customizable config files. DEFAULTS_SNAPSHOT = {} From 17e3279f1ca6e63dc9dc3c17af374a6d628cd86b Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Wed, 21 Apr 2021 13:49:40 -0400 Subject: [PATCH 6/7] Remove isolated nodes from the API views and serializers --- awx/api/serializers.py | 15 +-- awx/api/views/__init__.py | 8 -- awx/api/views/mixin.py | 9 -- .../functional/api/test_instance_group.py | 96 +------------------ 4 files changed, 3 insertions(+), 125 deletions(-) diff --git a/awx/api/serializers.py b/awx/api/serializers.py index 5b50bc999c..941209ed99 100644 --- a/awx/api/serializers.py +++ b/awx/api/serializers.py @@ -169,7 +169,7 @@ SUMMARIZABLE_FK_FIELDS = { 'inventory_source': ('id', 'name', 'source', 'last_updated', 'status'), 'role': ('id', 'role_field'), 'notification_template': DEFAULT_SUMMARY_FIELDS, - 'instance_group': ('id', 'name', 'controller_id', 'is_container_group'), + 'instance_group': ('id', 'name', 'is_container_group'), 'insights_credential': DEFAULT_SUMMARY_FIELDS, 'source_credential': DEFAULT_SUMMARY_FIELDS + ('kind', 'cloud', 'credential_type_id'), 'target_credential': DEFAULT_SUMMARY_FIELDS + ('kind', 'cloud', 'credential_type_id'), @@ -4816,10 +4816,6 @@ class InstanceGroupSerializer(BaseSerializer): ) jobs_total = serializers.IntegerField(help_text=_('Count of all jobs that target this instance group'), read_only=True) instances = serializers.SerializerMethodField() - is_controller = serializers.BooleanField(help_text=_('Indicates whether instance group controls any other group'), read_only=True) - is_isolated = serializers.BooleanField( - help_text=_('Indicates whether instances in this group are isolated.' 
'Isolated groups have a designated controller group.'), read_only=True - ) is_container_group = serializers.BooleanField( required=False, help_text=_('Indicates whether instances in this group are containerized.' 'Containerized groups have a designated Openshift or Kubernetes cluster.'), @@ -4867,9 +4863,6 @@ class InstanceGroupSerializer(BaseSerializer): "jobs_running", "jobs_total", "instances", - "controller", - "is_controller", - "is_isolated", "is_container_group", "credential", "policy_instance_percentage", @@ -4883,8 +4876,6 @@ class InstanceGroupSerializer(BaseSerializer): res = super(InstanceGroupSerializer, self).get_related(obj) res['jobs'] = self.reverse('api:instance_group_unified_jobs_list', kwargs={'pk': obj.pk}) res['instances'] = self.reverse('api:instance_group_instance_list', kwargs={'pk': obj.pk}) - if obj.controller_id: - res['controller'] = self.reverse('api:instance_group_detail', kwargs={'pk': obj.controller_id}) if obj.credential: res['credential'] = self.reverse('api:credential_detail', kwargs={'pk': obj.credential_id}) @@ -4896,10 +4887,6 @@ class InstanceGroupSerializer(BaseSerializer): raise serializers.ValidationError(_('Duplicate entry {}.').format(instance_name)) if not Instance.objects.filter(hostname=instance_name).exists(): raise serializers.ValidationError(_('{} is not a valid hostname of an existing instance.').format(instance_name)) - if Instance.objects.get(hostname=instance_name).is_isolated(): - raise serializers.ValidationError(_('Isolated instances may not be added or removed from instances groups via the API.')) - if self.instance and self.instance.controller_id is not None: - raise serializers.ValidationError(_('Isolated instance group membership may not be managed via the API.')) if value and self.instance and self.instance.is_container_group: raise serializers.ValidationError(_('Containerized instances may not be managed via the API')) return value diff --git a/awx/api/views/__init__.py b/awx/api/views/__init__.py index 789abe11e6..58c6746b6c 100644 --- a/awx/api/views/__init__.py +++ b/awx/api/views/__init__.py @@ -418,14 +418,6 @@ class InstanceGroupDetail(RelatedJobsPreventDeleteMixin, RetrieveUpdateDestroyAP data.pop('policy_instance_list', None) return super(InstanceGroupDetail, self).update_raw_data(data) - def destroy(self, request, *args, **kwargs): - instance = self.get_object() - if instance.controller is not None: - raise PermissionDenied(detail=_("Isolated Groups can not be removed from the API")) - if instance.controlled_groups.count(): - raise PermissionDenied(detail=_("Instance Groups acting as a controller for an Isolated Group can not be removed from the API")) - return super(InstanceGroupDetail, self).destroy(request, *args, **kwargs) - class InstanceGroupUnifiedJobsList(SubListAPIView): diff --git a/awx/api/views/mixin.py b/awx/api/views/mixin.py index ea2e8b38d4..0ab35e71b5 100644 --- a/awx/api/views/mixin.py +++ b/awx/api/views/mixin.py @@ -85,15 +85,6 @@ class InstanceGroupMembershipMixin(object): ig_obj.save(update_fields=['policy_instance_list']) return response - def is_valid_relation(self, parent, sub, created=False): - if sub.is_isolated(): - return {'error': _('Isolated instances may not be added or removed from instances groups via the API.')} - if self.parent_model is InstanceGroup: - ig_obj = self.get_parent_object() - if ig_obj.controller_id is not None: - return {'error': _('Isolated instance group membership may not be managed via the API.')} - return None - def unattach_validate(self, request): (sub_id, res) 
= super(InstanceGroupMembershipMixin, self).unattach_validate(request) if res: diff --git a/awx/main/tests/functional/api/test_instance_group.py b/awx/main/tests/functional/api/test_instance_group.py index c3cf44fd74..884856eed0 100644 --- a/awx/main/tests/functional/api/test_instance_group.py +++ b/awx/main/tests/functional/api/test_instance_group.py @@ -20,13 +20,7 @@ def tower_instance_group(): @pytest.fixture def instance(): - instance = Instance.objects.create(hostname='iso') - return instance - - -@pytest.fixture -def non_iso_instance(): - return Instance.objects.create(hostname='iamnotanisolatedinstance') + return Instance.objects.create(hostname='instance') @pytest.fixture @@ -36,15 +30,6 @@ def instance_group(job_factory): return ig -@pytest.fixture -def isolated_instance_group(instance_group, instance): - ig = InstanceGroup(name="iso", controller=instance_group) - ig.save() - ig.instances.set([instance]) - ig.save() - return ig - - @pytest.fixture def containerized_instance_group(instance_group, kube_credential): ig = InstanceGroup(name="container") @@ -97,26 +82,6 @@ def source_model(request): return request.getfixturevalue(request.param) -@pytest.mark.django_db -def test_instance_group_is_controller(instance_group, isolated_instance_group, non_iso_instance): - assert not isolated_instance_group.is_controller - assert instance_group.is_controller - - instance_group.instances.set([non_iso_instance]) - - assert instance_group.is_controller - - -@pytest.mark.django_db -def test_instance_group_is_isolated(instance_group, isolated_instance_group): - assert not instance_group.is_isolated - assert isolated_instance_group.is_isolated - - isolated_instance_group.instances.set([]) - - assert isolated_instance_group.is_isolated - - @pytest.mark.django_db def test_delete_instance_group_jobs(delete, instance_group_jobs_successful, instance_group, admin): url = reverse("api:instance_group_detail", kwargs={'pk': instance_group.pk}) @@ -160,61 +125,6 @@ def test_delete_rename_tower_instance_group_prevented(delete, options, tower_ins patch(url, {'name': 'foobar'}, super_user, expect=200) -@pytest.mark.django_db -def test_prevent_delete_iso_and_control_groups(delete, isolated_instance_group, admin): - iso_url = reverse("api:instance_group_detail", kwargs={'pk': isolated_instance_group.pk}) - controller_url = reverse("api:instance_group_detail", kwargs={'pk': isolated_instance_group.controller.pk}) - delete(iso_url, None, admin, expect=403) - delete(controller_url, None, admin, expect=403) - - -@pytest.mark.django_db -def test_prevent_isolated_instance_added_to_non_isolated_instance_group(post, admin, instance, instance_group, isolated_instance_group): - url = reverse("api:instance_group_instance_list", kwargs={'pk': instance_group.pk}) - - assert True is instance.is_isolated() - resp = post(url, {'associate': True, 'id': instance.id}, admin, expect=400) - assert u"Isolated instances may not be added or removed from instances groups via the API." 
== resp.data['error'] - - -@pytest.mark.django_db -def test_prevent_isolated_instance_added_to_non_isolated_instance_group_via_policy_list(patch, admin, instance, instance_group, isolated_instance_group): - url = reverse("api:instance_group_detail", kwargs={'pk': instance_group.pk}) - - assert True is instance.is_isolated() - resp = patch(url, {'policy_instance_list': [instance.hostname]}, user=admin, expect=400) - assert [u"Isolated instances may not be added or removed from instances groups via the API."] == resp.data['policy_instance_list'] - assert instance_group.policy_instance_list == [] - - -@pytest.mark.django_db -def test_prevent_isolated_instance_removal_from_isolated_instance_group(post, admin, instance, instance_group, isolated_instance_group): - url = reverse("api:instance_group_instance_list", kwargs={'pk': isolated_instance_group.pk}) - - assert True is instance.is_isolated() - resp = post(url, {'disassociate': True, 'id': instance.id}, admin, expect=400) - assert u"Isolated instances may not be added or removed from instances groups via the API." == resp.data['error'] - - -@pytest.mark.django_db -def test_prevent_non_isolated_instance_added_to_isolated_instance_group(post, admin, non_iso_instance, isolated_instance_group): - url = reverse("api:instance_group_instance_list", kwargs={'pk': isolated_instance_group.pk}) - - assert False is non_iso_instance.is_isolated() - resp = post(url, {'associate': True, 'id': non_iso_instance.id}, admin, expect=400) - assert u"Isolated instance group membership may not be managed via the API." == resp.data['error'] - - -@pytest.mark.django_db -def test_prevent_non_isolated_instance_added_to_isolated_instance_group_via_policy_list(patch, admin, non_iso_instance, isolated_instance_group): - url = reverse("api:instance_group_detail", kwargs={'pk': isolated_instance_group.pk}) - - assert False is non_iso_instance.is_isolated() - resp = patch(url, {'policy_instance_list': [non_iso_instance.hostname]}, user=admin, expect=400) - assert [u"Isolated instance group membership may not be managed via the API."] == resp.data['policy_instance_list'] - assert isolated_instance_group.policy_instance_list == [] - - @pytest.mark.django_db @pytest.mark.parametrize('source_model', ['job_template', 'inventory', 'organization'], indirect=True) def test_instance_group_order_persistence(get, post, admin, source_model): @@ -222,7 +132,7 @@ def test_instance_group_order_persistence(get, post, admin, source_model): total = 5 pks = list(range(total)) random.shuffle(pks) - instances = [InstanceGroup.objects.create(name='iso-%d' % i) for i in pks] + instances = [InstanceGroup.objects.create(name='group-%d' % i) for i in pks] view_name = camelcase_to_underscore(source_model.__class__.__name__) url = reverse('api:{}_instance_groups_list'.format(view_name), kwargs={'pk': source_model.pk}) @@ -252,7 +162,6 @@ def test_instance_group_update_fields(patch, instance, instance_group, admin, co # instance group (not containerized) ig_url = reverse("api:instance_group_detail", kwargs={'pk': instance_group.pk}) assert not instance_group.is_container_group - assert not containerized_instance_group.is_isolated resp = patch(ig_url, {'policy_instance_percentage': 15}, admin, expect=200) assert 15 == resp.data['policy_instance_percentage'] resp = patch(ig_url, {'policy_instance_minimum': 15}, admin, expect=200) @@ -263,7 +172,6 @@ def test_instance_group_update_fields(patch, instance, instance_group, admin, co # containerized instance group cg_url = reverse("api:instance_group_detail", 
kwargs={'pk': containerized_instance_group.pk}) assert containerized_instance_group.is_container_group - assert not containerized_instance_group.is_isolated resp = patch(cg_url, {'policy_instance_percentage': 15}, admin, expect=400) assert ["Containerized instances may not be managed via the API"] == resp.data['policy_instance_percentage'] resp = patch(cg_url, {'policy_instance_minimum': 15}, admin, expect=400) From 65cee65faddaadc38d00e82cb3b0b5f5f9a65608 Mon Sep 17 00:00:00 2001 From: Jeff Bradberry Date: Wed, 21 Apr 2021 15:26:21 -0400 Subject: [PATCH 7/7] Update the docs to remove references to isolated nodes --- docs/clustering.md | 56 --------------------------------- docs/container_groups.md | 14 ++++----- docs/container_groups/README.md | 14 ++++----- docs/tasks.md | 11 ------- 4 files changed, 12 insertions(+), 83 deletions(-) diff --git a/docs/clustering.md b/docs/clustering.md index 93c2d55ec0..c65882b46e 100644 --- a/docs/clustering.md +++ b/docs/clustering.md @@ -71,62 +71,6 @@ Recommendations and constraints: - Do not name any instance the same as a group name. -### Security-Isolated Rampart Groups - -In Tower versions 3.2+, customers may optionally define isolated groups inside of security-restricted networking zones from which to run jobs and ad hoc commands. Instances in these groups will _not_ have a full install of Tower, but will have a minimal set of utilities used to run jobs. Isolated groups must be specified in the inventory file prefixed with `isolated_group_`. An example inventory file is shown below: - -``` -[tower] -towerA -towerB -towerC - -[instance_group_security] -towerB -towerC - -[isolated_group_govcloud] -isolatedA -isolatedB - -[isolated_group_govcloud:vars] -controller=security -``` - -In the isolated rampart model, "controller" instances interact with "isolated" instances via a series of Ansible playbooks over SSH. At installation time, a randomized RSA key is generated and distributed as an authorized key to all "isolated" instances. The private half of the key is encrypted and stored within Tower, and is used to authenticate from "controller" instances to "isolated" instances when jobs are run. - -When a job is scheduled to run on an "isolated" instance: - -* The "controller" instance compiles metadata required to run the job and copies it to the "isolated" instance via `rsync` (any related project or inventory updates are run on the controller instance). This metadata includes: - - - the entire SCM checkout directory for the project - - a static inventory file - - pexpect passwords - - environment variables - - the `ansible`/`ansible-playbook` command invocation, _i.e._, `ansible-playbook -i /path/to/inventory /path/to/playbook.yml -e ...` - -* Once the metadata has been `rsync`ed to the isolated host, the "controller instance" starts a process on the "isolated" instance which consumes the metadata and starts running `ansible`/`ansible-playbook`. As the playbook runs, job artifacts (such as `stdout` and job events) are written to disk on the "isolated" instance. - -* While the job runs on the "isolated" instance, the "controller" instance periodically copies job artifacts (`stdout` and job events) from the "isolated" instance using `rsync`. It consumes these until the job finishes running on the "isolated" instance. 
-
-Isolated groups are architected such that they may exist inside of a VPC with security rules that _only_ permit the instances in its `controller` group to access them; only ingress SSH traffic from "controller" instances to "isolated" instances is required.
-
-Recommendations for system configuration with isolated groups:
-  - Do not create a group named `isolated_group_tower`.
-  - Do not put any isolated instances inside the `tower` group or other ordinary instance groups.
-  - Define the `controller` variable as either a group var or as a hostvar on all the instances in the isolated group. Please _do not_ allow isolated instances in the same group have a different value for this variable - the behavior in this case can not be predicted.
-  - Do not put an isolated instance in more than one isolated group.
-
-
-Isolated Instance Authentication
---------------------------------
-At installation time, by default, a randomized RSA key is generated and distributed as an authorized key to all "isolated" instances. The private half of the key is encrypted and stored within Tower, and is used to authenticate from "controller" instances to "isolated" instances when jobs are run.
-
-For users who wish to manage SSH authentication from controlling instances to isolated instances via some system _outside_ of Tower (such as externally-managed, password-less SSH keys), this behavior can be disabled by unsetting two Tower API settings values:
-
-`HTTP PATCH /api/v2/settings/jobs/ {'AWX_ISOLATED_PRIVATE_KEY': '', 'AWX_ISOLATED_PUBLIC_KEY': ''}`
-
-
 ### Provisioning and Deprovisioning Instances and Groups
 
 * **Provisioning** - Provisioning Instances after installation is supported by updating the `inventory` file and re-running the setup playbook. It's important that this file contain all passwords and information used when installing the cluster, or other instances may be reconfigured (this can be done intentionally).
diff --git a/docs/container_groups.md b/docs/container_groups.md
index 5a9d88e58c..30f1f869be 100644
--- a/docs/container_groups.md
+++ b/docs/container_groups.md
@@ -1,13 +1,11 @@
 # Container Groups
 
-In a traditional AWX installation, jobs (ansible-playbook runs) are executed
-either directly on a member of the cluster or on a pre-provisioned "isolated"
-node.
-
-The concept of a Container Group (working name) allows for job environments to
-be provisioned on-demand as a Pod that exists only for the duration of the
-playbook run. This is known as the ephemeral execution model and ensures a clean
-environment for every job run.
+In a traditional AWX installation, jobs (ansible-playbook runs) are
+executed directly on a member of the cluster. The concept of a
+Container Group (working name) allows for job environments to be
+provisioned on-demand as a Pod that exists only for the duration of
+the playbook run. This is known as the ephemeral execution model and
+ensures a clean environment for every job run.
 
 In some cases it is desirable to have the execution environment be
 "always-on"; this is done by manually creating an instance through the AWX API or UI.
diff --git a/docs/container_groups/README.md b/docs/container_groups/README.md
index a13644abb2..0949379bed 100644
--- a/docs/container_groups/README.md
+++ b/docs/container_groups/README.md
@@ -1,13 +1,11 @@
 # Container Groups
 
-In a traditional AWX installation, jobs (ansible-playbook runs) are executed
-either directly on a member of the cluster or on a pre-provisioned "isolated"
-node.
-
-The concept of a Container Group (working name) allows for job environments to
-be provisioned on-demand as a Pod that exists only for the duration of the
-playbook run. This is known as the ephemeral execution model and ensures a clean
-environment for every job run.
+In a traditional AWX installation, jobs (ansible-playbook runs) are
+executed directly on a member of the cluster. The concept of a
+Container Group (working name) allows for job environments to be
+provisioned on-demand as a Pod that exists only for the duration of
+the playbook run. This is known as the ephemeral execution model and
+ensures a clean environment for every job run.
 
 ## Configuration
 
diff --git a/docs/tasks.md b/docs/tasks.md
index f2e29ec777..7ff5847052 100644
--- a/docs/tasks.md
+++ b/docs/tasks.md
@@ -157,17 +157,6 @@ One of the most important tasks in a clustered AWX installation is the periodic
 
 If a node in an AWX cluster discovers that one of its peers has not updated its heartbeat within a certain grace period, it is assumed to be offline, and its capacity is set to zero to avoid scheduling new tasks on that node. Additionally, jobs allegedly running or scheduled to run on that node are assumed to be lost, and "reaped", or marked as failed.
 
-#### Isolated Tasks and Their Heartbeats
-
-AWX reports as much status as it can via the browsable API at `/api/v2/ping` in order to provide validation of the health of an instance, including the timestamps of the last heartbeat. Since isolated nodes don't have access to the AWX database, their heartbeats are performed by controller nodes instead. A periodic task, `awx_isolated_heartbeat`, is responsible for periodically connecting from a controller to each isolated node and retrieving its capacity (via SSH).
-
-When a job is scheduled to run on an isolated instance, the controller instance puts together the metadata required to run the job and then transfers it to the isolated instance. Once the metadata has been synchronized to the isolated host, the controller instance starts a process on the isolated instance, which consumes the metadata and starts running `ansible/ansible-playbook`. As the playbook runs, job artifacts (such as `stdout` and job events) are written to disk on the isolated instance.
-
-Alternatively: "While the job runs on the isolated instance, the controller instance periodically checks for and copies the job artifacts (_e.g._, `stdout` and job events) that it produces. It processes these until the job finishes running."
-
-To read more about Isolated Instances, refer to the [Isolated Instance Groups](https://docs.ansible.com/ansible-tower/latest/html/administration/clustering.html#isolated-instance-groups) section of the Clustering page in the Ansible Tower Administration guide.
-
-
 ## AWX Jobs
 
 ### Unified Jobs
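
The liveness logic that survives this series is the fixed heartbeat grace period in `Instance.is_lost()` (see the `awx/main/models/ha.py` hunk in PATCH 4/7): with the `isolated` parameter gone, every node is judged against the same 120-second window instead of an `AWX_ISOLATED_PERIODIC_CHECK`-derived one. A minimal standalone sketch of that check, written here as a free function with hypothetical names rather than the actual model method:

```python
# Sketch of the simplified liveness check; the real Instance.is_lost() compares
# the instance's `modified` heartbeat timestamp the same way.
from datetime import datetime, timedelta, timezone

GRACE_PERIOD = timedelta(seconds=120)  # matches the hardcoded value in is_lost()


def is_lost(last_heartbeat, ref_time=None):
    """Return True when the last heartbeat is older than the grace period."""
    if ref_time is None:
        ref_time = datetime.now(timezone.utc)
    return last_heartbeat < ref_time - GRACE_PERIOD


# A node whose heartbeat is three minutes old is considered lost; the cluster
# then zeroes its capacity and reaps the jobs recorded against it.
assert is_lost(datetime.now(timezone.utc) - timedelta(minutes=3))
assert not is_lost(datetime.now(timezone.utc) - timedelta(seconds=30))
```

Dropping the `isolated` branch is also what lets callers such as the cluster heartbeat task treat every remaining node uniformly.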