From dd1a261bc35853eaf4b1668d4dddfcb8d3ff7d89 Mon Sep 17 00:00:00 2001
From: AlanCoding
Date: Mon, 12 Jun 2017 08:43:46 -0400
Subject: [PATCH 1/5] setup playbook and heartbeat for isolated deployments

* Allow isolated_group_ use in setup playbook
* Tweaks to host/queue registration commands complementing setup
* Create isolated heartbeat task and check capacity
* Add content about isolated instances to acceptance docs
---
 Makefile                                      |  4 +-
 awx/lib/management_modules/tower_capacity.py  | 40 +++++++++++
 awx/main/isolated/isolated_manager.py         | 34 ++++++++++
 .../management/commands/deprovision_node.py   |  2 +
 .../management/commands/register_instance.py  | 28 ++++++--
 .../management/commands/register_queue.py     | 24 +++++--
 .../management/commands/unregister_queue.py   |  1 +
 .../migrations/0043_v320_instancegroups.py    |  5 ++
 awx/main/models/ha.py                         |  5 ++
 awx/main/models/unified_jobs.py               | 12 ++++
 awx/main/tasks.py                             | 30 ++++++++-
 .../functional/models/test_unified_job.py    | 15 ++++-
 awx/main/tests/functional/test_tasks.py       | 67 ++++++++++++++++++-
 awx/playbooks/heartbeat_isolated.yml          | 16 +++++
 awx/settings/defaults.py                      |  2 +
 awx/settings/development.py                   |  7 ++
 docs/clustering.md                            | 52 ++++++++++++++
 17 files changed, 325 insertions(+), 19 deletions(-)
 create mode 100644 awx/lib/management_modules/tower_capacity.py
 create mode 100644 awx/playbooks/heartbeat_isolated.yml

diff --git a/Makefile b/Makefile
index 9d8d6f665b..b5e070b6ec 100644
--- a/Makefile
+++ b/Makefile
@@ -950,10 +950,10 @@ docker-isolated:
 	TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml create
 	docker start tools_tower_1
 	docker start tools_isolated_1
-	if [ "`docker exec -i -t tools_isolated_1 cat /root/.ssh/authorized_keys`" != "" ]; then \
+	if [ "`docker exec -i -t tools_isolated_1 cat /root/.ssh/authorized_keys`" = "`docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub`" ]; then \
 		echo "SSH keys already copied to isolated instance"; \
 	else \
-		docker exec "tools_isolated_1" bash -c "mkdir -p /root/.ssh && echo $$(docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub) >> /root/.ssh/authorized_keys"; \
+		docker exec "tools_isolated_1" bash -c "mkdir -p /root/.ssh && rm -f /root/.ssh/authorized_keys && echo $$(docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub) >> /root/.ssh/authorized_keys"; \
 	fi
 	TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml up
diff --git a/awx/lib/management_modules/tower_capacity.py b/awx/lib/management_modules/tower_capacity.py
new file mode 100644
index 0000000000..bd8dcab80e
--- /dev/null
+++ b/awx/lib/management_modules/tower_capacity.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2017 Ansible by Red Hat
+#
+# This file is part of Ansible Tower, but depends on code imported from Ansible.
+#
+# Ansible is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Ansible is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
+
+from ansible.module_utils.basic import AnsibleModule
+
+import subprocess
+
+
+def main():
+    module = AnsibleModule(
+        argument_spec = dict()
+    )
+    # Duplicated with awx.main.utils.common.get_system_task_capacity
+    proc = subprocess.Popen(['free', '-m'], stdout=subprocess.PIPE)
+    out,err = proc.communicate()
+    total_mem_value = out.split()[7]
+    if int(total_mem_value) <= 2048:
+        cap = 50
+    else:
+        cap = 50 + ((int(total_mem_value) / 1024) - 2) * 75
+
+    # Module never results in a change and (hopefully) never fails
+    module.exit_json(changed=False, capacity=cap)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/awx/main/isolated/isolated_manager.py b/awx/main/isolated/isolated_manager.py
index caa2e1db01..44aa11e1a3 100644
--- a/awx/main/isolated/isolated_manager.py
+++ b/awx/main/isolated/isolated_manager.py
@@ -304,6 +304,40 @@ class IsolatedManager(object):
             # stdout_handle is closed by this point so writing output to logs is our only option
             logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
 
+    @staticmethod
+    def health_check(instance, cutoff_pk=0):
+        '''
+        :param instance: Django object representing the isolated instance
+        :param cutoff_pk: Job id of the oldest job still in the running state
+        Returns the instance's capacity, or 0 if it is not reachable
+        '''
+        start_delimiter = 'wNqCXG6uul'
+        end_delimiter = 'n6kmoFyyAP'
+        extra_vars = dict(
+            cutoff_pk=cutoff_pk,
+            start_delimiter=start_delimiter,
+            end_delimiter=end_delimiter
+        )
+        args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
+                '%s,' % instance.hostname, 'heartbeat_isolated.yml', '-e',
+                json.dumps(extra_vars)]
+        module_path = os.path.join(os.path.dirname(awx.__file__), 'lib', 'management_modules')
+        playbook_path = os.path.join(os.path.dirname(awx.__file__), 'playbooks')
+        env = {'ANSIBLE_LIBRARY': module_path}
+        buff = cStringIO.StringIO()
+        status, rc = run.run_pexpect(
+            args, playbook_path, env, buff,
+            idle_timeout=60, job_timeout=60,
+            pexpect_timeout=5
+        )
+        output = buff.getvalue()
+        if status != 'successful':
+            return 0  # recognized by task manager as 'unreachable'
+        result = re.search('{}(.*){}'.format(start_delimiter, end_delimiter), output)
+        cap = result.group(1)
+        return cap
+
     @staticmethod
     def wrap_stdout_handle(instance, private_data_dir, stdout_handle):
         dispatcher = CallbackQueueDispatcher()
diff --git a/awx/main/management/commands/deprovision_node.py b/awx/main/management/commands/deprovision_node.py
index 8412b5bd86..d64a780434 100644
--- a/awx/main/management/commands/deprovision_node.py
+++ b/awx/main/management/commands/deprovision_node.py
@@ -23,11 +23,13 @@ class Command(BaseCommand):
         instance = Instance.objects.filter(hostname=options.get('name'))
         if instance.exists():
             instance.delete()
+            print("Instance Removed")
             result = subprocess.Popen("rabbitmqctl forget_cluster_node rabbitmq@{}".format(options.get('name')), shell=True).wait()
             if result != 0:
                 print("Node deprovisioning may have failed when attempting to remove the RabbitMQ instance from the cluster")
             else:
                 print('Successfully deprovisioned {}'.format(options.get('name')))
+                print('(changed: True)')
         else:
             print('No instance found matching name {}'.format(options.get('name')))
diff --git a/awx/main/management/commands/register_instance.py b/awx/main/management/commands/register_instance.py
index 7ce6be787b..6895aa644f 100644
--- a/awx/main/management/commands/register_instance.py
+++ b/awx/main/management/commands/register_instance.py
@@ -17,14 +17,32 @@ class Command(BaseCommand):
     option_list = BaseCommand.option_list + (
         make_option('--hostname', dest='hostname', type='string',
                     help='Hostname used during provisioning'),
+        make_option('--hostnames', dest='hostnames', type='string',
+                    help='Alternatively, hostnames can be provided with '
+                         'this option as a comma-delimited list'),
     )
 
-    def handle(self, **options):
-        uuid = settings.SYSTEM_UUID
-        instance = Instance.objects.filter(hostname=options.get('hostname'))
+    def _register_hostname(self, hostname):
+        if not hostname:
+            return
+        instance = Instance.objects.filter(hostname=hostname)
         if instance.exists():
             print("Instance already registered {}".format(instance[0]))
             return
-        instance = Instance(uuid=uuid, hostname=options.get('hostname'))
+        instance = Instance(uuid=self.uuid, hostname=hostname)
         instance.save()
-        print('Successfully registered instance {}'.format(instance))
+        print('Successfully registered instance {}'.format(hostname))
+        self.changed = True
+
+    def handle(self, **options):
+        self.uuid = settings.SYSTEM_UUID
+        self.changed = False
+        self._register_hostname(options.get('hostname'))
+        hostname_list = []
+        if options.get('hostnames'):
+            hostname_list = options.get('hostnames').split(",")
+        instance_list = [x.strip() for x in hostname_list if x]
+        for inst_name in instance_list:
+            self._register_hostname(inst_name)
+        if self.changed:
+            print('(changed: True)')
diff --git a/awx/main/management/commands/register_queue.py b/awx/main/management/commands/register_queue.py
index e0ca862a37..3601b009d4 100644
--- a/awx/main/management/commands/register_queue.py
+++ b/awx/main/management/commands/register_queue.py
@@ -5,7 +5,7 @@ import sys
 from awx.main.models import Instance, InstanceGroup
 from optparse import make_option
 
-from django.core.management.base import BaseCommand
+from django.core.management.base import BaseCommand, CommandError
 
 
 class Command(BaseCommand):
@@ -20,34 +20,44 @@ class Command(BaseCommand):
     )
 
     def handle(self, **options):
+        if not options.get('queuename'):
+            raise CommandError("Specify `--queuename` to use this command.")
+        changed = False
         ig = InstanceGroup.objects.filter(name=options.get('queuename'))
         control_ig = None
         if options.get('controller'):
             control_ig = InstanceGroup.objects.filter(name=options.get('controller')).first()
         if ig.exists():
-            print("Instance Group already registered {}".format(ig[0]))
+            print("Instance Group already registered {}".format(ig[0].name))
             ig = ig[0]
             if control_ig and ig.controller_id != control_ig.pk:
                 ig.controller = control_ig
                 ig.save()
-                print("Set controller group {} on {}.".format(control_ig, ig))
+                print("Set controller group {} on {}.".format(control_ig.name, ig.name))
+                changed = True
         else:
             print("Creating instance group {}".format(options.get('queuename')))
             ig = InstanceGroup(name=options.get('queuename'))
             if control_ig:
                 ig.controller = control_ig
             ig.save()
+            changed = True
         hostname_list = []
         if options.get('hostnames'):
             hostname_list = options.get('hostnames').split(",")
-        instance_list = [x.strip() for x in hostname_list]
+        instance_list = [x.strip() for x in hostname_list if x]
         for inst_name in instance_list:
             instance = Instance.objects.filter(hostname=inst_name)
-            if instance.exists() and instance not in ig.instances.all():
+            if instance.exists() and instance[0] not in ig.instances.all():
                 ig.instances.add(instance[0])
-                print("Added instance {} to {}".format(instance[0], ig))
+                print("Added instance {} to {}".format(instance[0].hostname, ig.name))
+                changed = True
             elif not instance.exists():
print("Instance does not exist: {}".format(inst_name)) + if changed: + print('(changed: True)') sys.exit(1) else: - print("Instance already registered {}".format(instance[0])) + print("Instance already registered {}".format(instance[0].hostname)) + if changed: + print('(changed: True)') diff --git a/awx/main/management/commands/unregister_queue.py b/awx/main/management/commands/unregister_queue.py index 388a8f0588..335ce38dbc 100644 --- a/awx/main/management/commands/unregister_queue.py +++ b/awx/main/management/commands/unregister_queue.py @@ -30,3 +30,4 @@ class Command(BaseCommand): ig = ig.first() ig.delete() print("Instance Group Removed") + print('(changed: True)') diff --git a/awx/main/migrations/0043_v320_instancegroups.py b/awx/main/migrations/0043_v320_instancegroups.py index bf0585acf6..3f7d279307 100644 --- a/awx/main/migrations/0043_v320_instancegroups.py +++ b/awx/main/migrations/0043_v320_instancegroups.py @@ -48,4 +48,9 @@ class Migration(migrations.Migration): name='instance_group', field=models.ManyToManyField(to='main.InstanceGroup', blank=True), ), + migrations.AddField( + model_name='instance', + name='last_isolated_check', + field=models.DateTimeField(auto_now_add=True, null=True), + ), ] diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 134646d2cb..600a72af27 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -26,6 +26,11 @@ class Instance(models.Model): hostname = models.CharField(max_length=250, unique=True) created = models.DateTimeField(auto_now_add=True) modified = models.DateTimeField(auto_now=True) + last_isolated_check = models.DateTimeField( + null=True, + editable=False, + auto_now_add=True + ) version = models.CharField(max_length=24, blank=True) capacity = models.PositiveIntegerField( default=100, diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index 55c53f1d8f..c8259c7590 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -726,6 +726,18 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique pass super(UnifiedJob, self).delete() + @classmethod + def lowest_running_id(cls): + oldest_running_job = cls.objects.filter(status__in=ACTIVE_STATES).order_by('id').only('id').first() + if oldest_running_job is not None: + return oldest_running_job.id + else: + newest_finished_job = cls.objects.order_by('id').only('id').last() + if newest_finished_job is None: + return 1 # System has no finished jobs + else: + return newest_finished_job.id + 1 + def copy_unified_job(self): ''' Returns saved object, including related fields. 
diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 8445e6ce19..27f16a5d38 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -19,7 +19,6 @@ import traceback import urlparse import uuid from distutils.version import LooseVersion as Version -from datetime import timedelta import yaml import fcntl try: @@ -34,7 +33,7 @@ from celery.signals import celeryd_init, worker_process_init # Django from django.conf import settings from django.db import transaction, DatabaseError, IntegrityError -from django.utils.timezone import now +from django.utils.timezone import now, timedelta from django.utils.encoding import smart_str from django.core.mail import send_mail from django.contrib.auth.models import User @@ -197,6 +196,33 @@ def cluster_node_heartbeat(self): stop_local_services(['uwsgi', 'celery', 'beat', 'callback', 'fact']) +@task(bind=True) +def tower_isolated_heartbeat(self): + local_hostname = settings.CLUSTER_HOST_ID + logger.debug("Controlling node checking for any isolated management tasks.") + poll_interval = settings.AWX_ISOLATED_PERIODIC_CHECK + # Add in some task buffer time + nowtime = now() + accept_before = nowtime - timedelta(seconds=(poll_interval - 10)) + isolated_instance_qs = Instance.objects.filter( + rampart_groups__controller__instances__hostname=local_hostname, + last_isolated_check__lt=accept_before + ) + # Fast pass of isolated instances, claiming the nodes to update + with transaction.atomic(): + for isolated_instance in isolated_instance_qs: + isolated_instance.last_isolated_check = nowtime + isolated_instance.save(update_fields=['last_isolated_check']) + # Find the oldest job in the system and pass that to the cleanup + if not isolated_instance_qs: + return + cutoff_pk = UnifiedJob.lowest_running_id() + # Slow pass looping over isolated IGs and their isolated instances + for isolated_instance in isolated_instance_qs: + logger.debug("Managing isolated instance {}.".format(isolated_instance.hostname)) + isolated_instance.capacity = isolated_manager.IsolatedManager.health_check(isolated_instance, cutoff_pk=cutoff_pk) + isolated_instance.save(update_fields=['capacity']) + @task(bind=True, queue='tower') def tower_periodic_scheduler(self): diff --git a/awx/main/tests/functional/models/test_unified_job.py b/awx/main/tests/functional/models/test_unified_job.py index 5b89644457..358dfc02ab 100644 --- a/awx/main/tests/functional/models/test_unified_job.py +++ b/awx/main/tests/functional/models/test_unified_job.py @@ -4,7 +4,7 @@ import pytest from django.contrib.contenttypes.models import ContentType # AWX -from awx.main.models import UnifiedJobTemplate, Job, JobTemplate, WorkflowJobTemplate, Project +from awx.main.models import UnifiedJobTemplate, Job, JobTemplate, WorkflowJobTemplate, Project, UnifiedJob @pytest.mark.django_db @@ -65,3 +65,16 @@ class TestCreateUnifiedJob: assert second_job.inventory == job_with_links.inventory assert second_job.limit == 'my_server' assert net_credential in second_job.extra_credentials.all() + + +@pytest.mark.django_db +def test_lowest_running_id(): + assert UnifiedJob.lowest_running_id() == 1 + Job.objects.create(status='finished') + old_job = Job.objects.create(status='finished') + assert UnifiedJob.lowest_running_id() == old_job.id + 1 + old_running_job = Job.objects.create(status='running') + Job.objects.create(status='running') + assert UnifiedJob.lowest_running_id() == old_running_job.id + Job.objects.create(status='finished') + assert UnifiedJob.lowest_running_id() == old_running_job.id diff --git 
a/awx/main/tests/functional/test_tasks.py b/awx/main/tests/functional/test_tasks.py
index f09847c2d2..da1f774047 100644
--- a/awx/main/tests/functional/test_tasks.py
+++ b/awx/main/tests/functional/test_tasks.py
@@ -2,8 +2,17 @@ import pytest
 import mock
 import os
 
-from awx.main.tasks import RunProjectUpdate, RunInventoryUpdate
-from awx.main.models import ProjectUpdate, InventoryUpdate, InventorySource
+from django.utils.timezone import now, timedelta
+
+from awx.main.tasks import (
+    RunProjectUpdate, RunInventoryUpdate,
+    tower_isolated_heartbeat,
+    isolated_manager
+)
+from awx.main.models import (
+    ProjectUpdate, InventoryUpdate, InventorySource,
+    Instance, InstanceGroup
+)
 
 
 @pytest.fixture
@@ -73,3 +82,57 @@ class TestDependentInventoryUpdate:
         # Verify that it bails after 1st update, detecting a cancel
         assert is2.inventory_updates.count() == 0
         iu_run_mock.assert_called_once()
+
+
+
+class MockSettings:
+    AWX_ISOLATED_PERIODIC_CHECK = 60
+    CLUSTER_HOST_ID = 'tower_1'
+
+
+@pytest.mark.django_db
+class TestIsolatedManagementTask:
+
+    @pytest.fixture
+    def control_group(self):
+        return InstanceGroup.objects.create(name='alpha')
+
+    @pytest.fixture
+    def control_instance(self, control_group):
+        return control_group.instances.create(hostname='tower_1')
+
+    @pytest.fixture
+    def needs_updating(self, control_group):
+        ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group)
+        inst = ig.instances.create(
+            hostname='isolated', capacity=103)
+        inst.last_isolated_check=now() - timedelta(seconds=MockSettings.AWX_ISOLATED_PERIODIC_CHECK)
+        inst.save()
+        return ig
+
+    @pytest.fixture
+    def just_updated(self, control_group):
+        ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group)
+        inst = ig.instances.create(
+            hostname='isolated', capacity=103)
+        inst.last_isolated_check=now()
+        inst.save()
+        return inst
+
+    def test_takes_action(self, control_instance, needs_updating):
+        with mock.patch('awx.main.tasks.settings', MockSettings()):
+            with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
+                check_mock.return_value = 98
+                tower_isolated_heartbeat()
+        iso_instance = Instance.objects.get(hostname='isolated')
+        check_mock.assert_called_once_with(iso_instance, cutoff_pk=mock.ANY)
+        assert iso_instance.capacity == 98
+
+    def test_does_not_take_action(self, control_instance, just_updated):
+        with mock.patch('awx.main.tasks.settings', MockSettings()):
+            with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
+                check_mock.return_value = 98
+                tower_isolated_heartbeat()
+        iso_instance = Instance.objects.get(hostname='isolated')
+        check_mock.assert_not_called()
+        assert iso_instance.capacity == 103
diff --git a/awx/playbooks/heartbeat_isolated.yml b/awx/playbooks/heartbeat_isolated.yml
new file mode 100644
index 0000000000..403e3d7439
--- /dev/null
+++ b/awx/playbooks/heartbeat_isolated.yml
@@ -0,0 +1,16 @@
+---
+
+# The following variables will be set by the runner of this playbook:
+# cutoff_pk, start_delimiter, end_delimiter
+
+- hosts: all
+  gather_facts: false
+
+  tasks:
+
+    - name: Get capacity of the instance
+      tower_capacity:
+      register: result
+
+    - name: Print capacity in escaped string to scrape out
+      debug: msg="{{ start_delimiter|default('') }}{{ result['capacity'] }}{{ end_delimiter|default('') }}"
diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py
index 6530331863..537f5f07b6 100644
--- a/awx/settings/defaults.py
+++ b/awx/settings/defaults.py
@@ -609,6 +609,8 @@ AWX_ISOLATED_CHECK_INTERVAL = 30
 # The timeout (in seconds) for launching jobs on isolated nodes
 AWX_ISOLATED_LAUNCH_TIMEOUT = 600
+# The interval (in seconds) between background isolated heartbeat status checks
+AWX_ISOLATED_PERIODIC_CHECK = 600
 
 # Enable Pendo on the UI, possible values are 'off', 'anonymous', and 'detailed'
 # Note: This setting may be overridden by database settings.
diff --git a/awx/settings/development.py b/awx/settings/development.py
index 414bcfb48f..0edf353f6a 100644
--- a/awx/settings/development.py
+++ b/awx/settings/development.py
@@ -114,6 +114,13 @@ except ImportError:
 CLUSTER_HOST_ID = socket.gethostname()
 CELERY_ROUTES['awx.main.tasks.cluster_node_heartbeat'] = {'queue': CLUSTER_HOST_ID, 'routing_key': CLUSTER_HOST_ID}
 
+# Production runs this schedule only on controlling nodes,
+# but development will just run it on all nodes
+CELERYBEAT_SCHEDULE['isolated_heartbeat'] = {
+    'task': 'awx.main.tasks.tower_isolated_heartbeat',
+    'schedule': timedelta(seconds=AWX_ISOLATED_PERIODIC_CHECK),
+    'options': {'expires': AWX_ISOLATED_PERIODIC_CHECK * 2}
+}
 
 # Supervisor service name dictionary used for programatic restart
 SERVICE_NAME_DICT = {
diff --git a/docs/clustering.md b/docs/clustering.md
index 56fe723cf0..05dba666a1 100644
--- a/docs/clustering.md
+++ b/docs/clustering.md
@@ -112,6 +112,58 @@ rabbitmq_use_long_name=false
 rabbitmq_enable_manager=false
 ```
 
+### Isolated Instance Groups (Rampart Groups)
+
+In Tower versions 3.2+, customers may optionally define isolated groups
+inside security-restricted networking zones from which to run jobs.
+Instances in these groups will _not_ have a full install of Tower, but will
+have a minimal set of utilities used to run jobs on them. Isolated groups
+must be specified in the inventory file with the `isolated_group_` prefix.
+An example inventory file is shown below.
+
+```
+[tower]
+towerA
+towerB
+towerC
+
+[instance_group_security]
+towerB
+towerC
+
+[isolated_group_govcloud]
+isolatedA
+isolatedB
+
+[isolated_group_govcloud:vars]
+controller=security
+```
+
+In this example, when a job runs inside the `govcloud` isolated group, a
+managing task runs simultaneously on one of the two instances in the
+ordinary `security` instance group.
+
+Networking security rules must allow connections to all nodes in an
+isolated group from all nodes in its controller group. The system is
+designed such that isolated instances never make requests to any of
+their controllers. The controlling instance for a particular job will
+send management commands to a daemon that runs the job, and will slurp
+job artifacts.
+
+Isolated groups are architected such that they may exist inside of a VPC
+with security rules that _only_ permit the instances in its `controller`
+group to access them.
+
+Recommendations for system configuration with isolated groups:
+ - Do not put any isolated instances inside the `tower` group or other
+   ordinary instance groups.
+ - Define the `controller` variable as either a group var or as a hostvar
+   on all the instances in the isolated group. Please _do not_ allow
+   isolated instances in the same group to have different values for this
+   variable; the behavior in this case cannot be predicted.
+ - Do not put an isolated instance in more than one isolated group.
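+
+The controlling nodes discover which isolated instances they manage through
+the instance group `controller` relationship. As a rough sketch (illustrative
+only, not a public API), the periodic heartbeat task on a controller selects
+its isolated instances with an ORM filter along these lines:
+
+```
+# Sketch of the controller-side selection used by the isolated heartbeat
+# task (see awx/main/tasks.py in this patch set); CLUSTER_HOST_ID is the
+# local node's hostname, and rampart_groups is the related name linking
+# instances to their instance groups.
+from django.conf import settings
+from awx.main.models import Instance
+
+Instance.objects.filter(
+    rampart_groups__controller__instances__hostname=settings.CLUSTER_HOST_ID
+)
+```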
+
+
 ### Provisioning and Deprovisioning Instances and Groups
 
 * Provisioning

From 1d9f2be0e4067fc2d79e01c06892f77ddbd2ac90 Mon Sep 17 00:00:00 2001
From: AlanCoding
Date: Mon, 19 Jun 2017 09:23:02 -0400
Subject: [PATCH 2/5] add task in isolated heartbeat to delete rogue files

---
 .../tower_isolated_cleanup.py        | 75 +++++++++++++++++++
 awx/playbooks/heartbeat_isolated.yml |  6 +-
 2 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 awx/lib/management_modules/tower_isolated_cleanup.py

diff --git a/awx/lib/management_modules/tower_isolated_cleanup.py b/awx/lib/management_modules/tower_isolated_cleanup.py
new file mode 100644
index 0000000000..94b4f4063a
--- /dev/null
+++ b/awx/lib/management_modules/tower_isolated_cleanup.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2017 Ansible by Red Hat
+#
+# This file is part of Ansible Tower, but depends on code imported from Ansible.
+#
+# Ansible is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Ansible is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
+
+from ansible.module_utils.basic import AnsibleModule
+
+import glob
+import os
+import re
+import shutil
+import datetime
+
+
+def main():
+    module = AnsibleModule(
+        argument_spec = dict(
+            cutoff_pk = dict(required=False, default=0, type='int'),
+        )
+    )
+    cutoff_pk = module.params.get('cutoff_pk')
+    changed = False
+    jobs_removed = set([])
+
+    cutoff_time = datetime.datetime.now() - datetime.timedelta(days=7)
+
+    for search_pattern, extract_pattern in [
+        ('/tmp/ansible_tower/jobs/*', r'\/tmp\/ansible_tower\/jobs\/(?P<job_id>\d+)'),
+        ('/tmp/ansible_tower_*', r'\/tmp\/ansible_tower_(?P<job_id>\d+)_*'),
+        ('/tmp/ansible_tower_proot_*', None),
+    ]:
+        for path in glob.iglob(search_pattern):
+            st = os.stat(path)
+            modtime = datetime.datetime.fromtimestamp(st.st_mtime)
+            if modtime > cutoff_time:
+                # If job's pk value is lower than threshold, we delete it
+                try:
+                    if extract_pattern is None:
+                        continue
+                    re_match = re.match(extract_pattern, path)
+                    if re_match is None:
+                        continue
+                    job_id = int(re_match.group('job_id'))
+                    if job_id >= cutoff_pk:
+                        module.debug('Skipping job {}, which may still be running.'.format(job_id))
+                        continue
+                except (ValueError, IndexError):
+                    continue
+            else:
+                module.debug('Deleting path {} because modification date is too old.'.format(path))
+                job_id = 'unknown'
+            changed = True
+            jobs_removed.add(job_id)
+            if os.path.islink(path):
+                os.remove(path)
+            else:
+                shutil.rmtree(path)
+
+    module.exit_json(changed=changed, jobs_removed=[j for j in jobs_removed])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/awx/playbooks/heartbeat_isolated.yml b/awx/playbooks/heartbeat_isolated.yml
index 403e3d7439..3232254a20 100644
--- a/awx/playbooks/heartbeat_isolated.yml
+++ b/awx/playbooks/heartbeat_isolated.yml
@@ -12,5 +12,9 @@
       tower_capacity:
       register: result
 
-    - name: Print capacity in escaped string to scrape out
+    - name: Print capacity in escaped string to scrape
       debug: msg="{{ start_delimiter|default('') }}{{ result['capacity'] }}{{ end_delimiter|default('') }}"
+
+    - name: Remove any stale temporary files
+      tower_isolated_cleanup:
+        cutoff_pk: "{{ cutoff_pk | default(0) }}"

From 7eb8fdd0f161c2ee3d88e194a5972b9b66ed0964 Mon Sep 17 00:00:00 2001
From: AlanCoding
Date: Mon, 19 Jun 2017 12:05:26 -0400
Subject: [PATCH 3/5] Change use of subprocess lib for calculating system capacity

Suggested by Ryan Petrello as a part of the PR feedback for the isolated
instance setup work.
---
 awx/lib/management_modules/tower_capacity.py | 7 +++++--
 awx/main/utils/common.py                     | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/awx/lib/management_modules/tower_capacity.py b/awx/lib/management_modules/tower_capacity.py
index bd8dcab80e..2c3785fe1a 100644
--- a/awx/lib/management_modules/tower_capacity.py
+++ b/awx/lib/management_modules/tower_capacity.py
@@ -25,8 +25,11 @@ def main():
         argument_spec = dict()
     )
     # Duplicated with awx.main.utils.common.get_system_task_capacity
-    proc = subprocess.Popen(['free', '-m'], stdout=subprocess.PIPE)
-    out,err = proc.communicate()
+    try:
+        out = subprocess.check_output(['free', '-m'])
+    except subprocess.CalledProcessError as e:
+        module.fail_json(msg=str(e))
+        return
     total_mem_value = out.split()[7]
     if int(total_mem_value) <= 2048:
         cap = 50
diff --git a/awx/main/utils/common.py b/awx/main/utils/common.py
index b78f5b4ba5..d8c9e11f20 100644
--- a/awx/main/utils/common.py
+++ b/awx/main/utils/common.py
@@ -543,8 +543,11 @@ def get_system_task_capacity():
     from django.conf import settings
     if hasattr(settings, 'SYSTEM_TASK_CAPACITY'):
         return settings.SYSTEM_TASK_CAPACITY
-    proc = subprocess.Popen(['free', '-m'], stdout=subprocess.PIPE)
-    out,err = proc.communicate()
+    try:
+        out = subprocess.check_output(['free', '-m'])
+    except subprocess.CalledProcessError as e:
+        logger.error('Problem obtaining capacity from system, error:\n{}'.format(str(e)))
+        return 0
     total_mem_value = out.split()[7]
     if int(total_mem_value) <= 2048:
         return 50

From f371dd71b259908a9f5fd49727e46b24f397a356 Mon Sep 17 00:00:00 2001
From: AlanCoding
Date: Mon, 19 Jun 2017 12:07:40 -0400
Subject: [PATCH 4/5] Run isolated heartbeat against all hosts at once

Previously we were running the playbook on a host-by-host basis, but this
changes it to pass in the list of all isolated instances the machine is
responsible for.

Using the `json` Ansible stdout callback plugin, we are able to parse the
output for information on each host.
---
 awx/main/isolated/isolated_manager.py | 33 ++++++++++++++++-----------
 awx/main/tasks.py                     |  7 +++---
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/awx/main/isolated/isolated_manager.py b/awx/main/isolated/isolated_manager.py
index 44aa11e1a3..1a08b98549 100644
--- a/awx/main/isolated/isolated_manager.py
+++ b/awx/main/isolated/isolated_manager.py
@@ -305,26 +305,29 @@ class IsolatedManager(object):
             logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
 
     @staticmethod
-    def health_check(instance, cutoff_pk=0):
+    def health_check(instance_qs, cutoff_pk=0):
         '''
-        :param instance: Django object representing the isolated instance
+        :param instance_qs: List of Django objects representing the
+            isolated instances to manage
         :param cutoff_pk: Job id of the oldest job still in the running state
         Returns the instance's capacity, or 0 if it is not reachable
         '''
-        start_delimiter = 'wNqCXG6uul'
-        end_delimiter = 'n6kmoFyyAP'
         extra_vars = dict(
             cutoff_pk=cutoff_pk,
-            start_delimiter=start_delimiter,
-            end_delimiter=end_delimiter
         )
+        hostname_string = ''
+        for instance in instance_qs:
+            hostname_string += '{},'.format(instance.hostname)
         args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
-                '%s,' % instance.hostname, 'heartbeat_isolated.yml', '-e',
+                hostname_string, 'heartbeat_isolated.yml', '-e',
                 json.dumps(extra_vars)]
         module_path = os.path.join(os.path.dirname(awx.__file__), 'lib', 'management_modules')
         playbook_path = os.path.join(os.path.dirname(awx.__file__), 'playbooks')
-        env = {'ANSIBLE_LIBRARY': module_path}
+        env = {
+            'ANSIBLE_LIBRARY': module_path,
+            'ANSIBLE_STDOUT_CALLBACK': 'json'
+        }
         buff = cStringIO.StringIO()
         status, rc = run.run_pexpect(
             args, playbook_path, env, buff,
@@ -332,11 +335,15 @@ class IsolatedManager(object):
             pexpect_timeout=5
         )
         output = buff.getvalue()
-        if status != 'successful':
-            return 0  # recognized by task manager as 'unreachable'
-        result = re.search('{}(.*){}'.format(start_delimiter, end_delimiter), output)
-        cap = result.group(1)
-        return cap
+        output = output[output.find('{'):]  # Remove starting log statements
+        result = json.loads(output)
+        for instance in instance_qs:
+            task_result = result['plays'][0]['tasks'][0]['hosts'][instance.hostname]
+            if 'capacity' in task_result:
+                instance.capacity = int(task_result['capacity'])
+                instance.save(update_fields=['capacity'])
+            elif 'msg' in task_result:
+                logger.warning('Could not update capacity of {}, msg={}'.format(instance.hostname, task_result['msg']))
 
     @staticmethod
     def wrap_stdout_handle(instance, private_data_dir, stdout_handle):
diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index 27f16a5d38..09e32c25f5 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -218,10 +218,9 @@ def tower_isolated_heartbeat(self):
         return
     cutoff_pk = UnifiedJob.lowest_running_id()
     # Slow pass looping over isolated IGs and their isolated instances
-    for isolated_instance in isolated_instance_qs:
-        logger.debug("Managing isolated instance {}.".format(isolated_instance.hostname))
-        isolated_instance.capacity = isolated_manager.IsolatedManager.health_check(isolated_instance, cutoff_pk=cutoff_pk)
-        isolated_instance.save(update_fields=['capacity'])
+    if len(isolated_instance_qs) > 0:
+        logger.debug("Managing isolated instances {}.".format(','.join([inst.hostname for inst in isolated_instance_qs])))
+        isolated_manager.IsolatedManager.health_check(isolated_instance_qs, cutoff_pk=cutoff_pk)
 
 
 @task(bind=True, queue='tower')

From 40287d8e78b7b7e4820139c6d28d1ad3f3095476 Mon Sep 17 00:00:00 2001
From: AlanCoding
Date: Mon, 19 Jun 2017 12:13:03 -0400
Subject: [PATCH 5/5] multi-host isolated heartbeat w tower-isolated check

* use tower-expect command to determine job status when running the
  isolated heartbeat playbook
* grok JSON output of playbook to obtain result information
* run playbook against multiple isolated hosts at the same time
  (addresses scalability concerns)
---
 awx/lib/management_modules/tower_capacity.py |  2 +-
 .../tower_isolated_cleanup.py                | 51 ++++++------
 awx/main/isolated/isolated_manager.py        | 77 +++++++++++--------
 awx/main/models/unified_jobs.py              | 12 ---
 awx/main/tasks.py                            |  9 +--
 .../functional/models/test_unified_job.py    | 15 +---
 awx/main/tests/functional/test_tasks.py      | 15 ++--
 awx/main/utils/common.py                     |  4 +-
 awx/playbooks/heartbeat_isolated.yml         |  9 ---
 9 files changed, 84 insertions(+), 110 deletions(-)
diff --git a/awx/lib/management_modules/tower_capacity.py b/awx/lib/management_modules/tower_capacity.py
index 2c3785fe1a..03bbb0cecd 100644
--- a/awx/lib/management_modules/tower_capacity.py
+++ b/awx/lib/management_modules/tower_capacity.py
@@ -35,7 +35,7 @@ def main():
         cap = 50
     else:
         cap = 50 + ((int(total_mem_value) / 1024) - 2) * 75
 
-    # Module never results in a change and (hopefully) never fails
+    # Module never results in a change
     module.exit_json(changed=False, capacity=cap)
 
 
diff --git a/awx/lib/management_modules/tower_isolated_cleanup.py b/awx/lib/management_modules/tower_isolated_cleanup.py
index 94b4f4063a..529a24fd9d 100644
--- a/awx/lib/management_modules/tower_isolated_cleanup.py
+++ b/awx/lib/management_modules/tower_isolated_cleanup.py
@@ -22,53 +22,48 @@ import os
 import re
 import shutil
 import datetime
+import subprocess
 
 
 def main():
     module = AnsibleModule(
-        argument_spec = dict(
-            cutoff_pk = dict(required=False, default=0, type='int'),
-        )
+        argument_spec = dict()
     )
-    cutoff_pk = module.params.get('cutoff_pk')
     changed = False
-    jobs_removed = set([])
+    paths_removed = set([])
 
-    cutoff_time = datetime.datetime.now() - datetime.timedelta(days=7)
+    # If a folder was last modified before this datetime, it will always be deleted
+    folder_cutoff = datetime.datetime.now() - datetime.timedelta(days=7)
+    # If a folder does not have an associated job running and is older than
+    # this datetime, then it will be deleted because its job has finished
+    job_cutoff = datetime.datetime.now() - datetime.timedelta(hours=1)
 
-    for search_pattern, extract_pattern in [
-        ('/tmp/ansible_tower/jobs/*', r'\/tmp\/ansible_tower\/jobs\/(?P<job_id>\d+)'),
-        ('/tmp/ansible_tower_*', r'\/tmp\/ansible_tower_(?P<job_id>\d+)_*'),
-        ('/tmp/ansible_tower_proot_*', None),
+    for search_pattern in [
+        '/tmp/ansible_tower_[0-9]*_*', '/tmp/ansible_tower_proot_*',
     ]:
         for path in glob.iglob(search_pattern):
             st = os.stat(path)
             modtime = datetime.datetime.fromtimestamp(st.st_mtime)
-            if modtime > cutoff_time:
-                # If job's pk value is lower than threshold, we delete it
+
+            if modtime > job_cutoff:
+                continue
+            elif modtime > folder_cutoff:
                 try:
-                    if extract_pattern is None:
-                        continue
-                    re_match = re.match(extract_pattern, path)
-                    if re_match is None:
-                        continue
-                    job_id = int(re_match.group('job_id'))
-                    if job_id >= cutoff_pk:
-                        module.debug('Skipping job {}, which may still be running.'.format(job_id))
-                        continue
+                    re_match = re.match(r'\/tmp\/ansible_tower_\d+_.+', path)
+                    if re_match is not None:
+                        if subprocess.call(['tower-expect', 'is-alive', path]) == 0:
+                            continue
+                        else:
+                            module.debug('Deleting path {} because its job has completed.'.format(path))
                 except (ValueError, IndexError):
                     continue
             else:
                 module.debug('Deleting path {} because modification date is too old.'.format(path))
-                job_id = 'unknown'
             changed = True
-            jobs_removed.add(job_id)
-            if os.path.islink(path):
-                os.remove(path)
-            else:
-                shutil.rmtree(path)
+            paths_removed.add(path)
+            shutil.rmtree(path)
 
-    module.exit_json(changed=changed, jobs_removed=[j for j in jobs_removed])
+    module.exit_json(changed=changed, paths_removed=list(paths_removed))
 
 
 if __name__ == '__main__':
diff --git a/awx/main/isolated/isolated_manager.py b/awx/main/isolated/isolated_manager.py
index 1a08b98549..139231271b 100644
--- a/awx/main/isolated/isolated_manager.py
+++ b/awx/main/isolated/isolated_manager.py
@@ -57,9 +57,7 @@ class IsolatedManager(object):
         self.cwd = cwd
         self.env = env.copy()
         # Do not use callbacks for controller's management jobs
-        self.env['ANSIBLE_CALLBACK_PLUGINS'] = ''
-        self.env['CALLBACK_QUEUE'] = ''
-        self.env['CALLBACK_CONNECTION'] = ''
+        self.env.update(self._base_management_env())
         self.stdout_handle = stdout_handle
         self.ssh_key_path = ssh_key_path
         self.expect_passwords = {k.pattern: v for k, v in expect_passwords.items()}
@@ -71,8 +69,18 @@ class IsolatedManager(object):
         self.proot_cmd = proot_cmd
         self.started_at = None
 
-    @property
-    def awx_playbook_path(self):
+    @staticmethod
+    def _base_management_env():
+        return {
+            'ANSIBLE_CALLBACK_PLUGINS': '',
+            'CALLBACK_QUEUE': '',
+            'CALLBACK_CONNECTION': '',
+            'ANSIBLE_RETRY_FILES_ENABLED': 'False',
+            'ANSIBLE_HOST_KEY_CHECKING': 'False'
+        }
+
+    @classmethod
+    def awx_playbook_path(cls):
         return os.path.join(
             os.path.dirname(awx.__file__),
             'playbooks'
@@ -134,7 +142,7 @@ class IsolatedManager(object):
         buff = StringIO.StringIO()
         logger.debug('Starting job on isolated host with `run_isolated.yml` playbook.')
         status, rc = run.run_pexpect(
-            args, self.awx_playbook_path, self.env, buff,
+            args, self.awx_playbook_path(), self.env, buff,
             expect_passwords={
                 re.compile(r'Secret:\s*?$', re.M): base64.b64encode(json.dumps(secrets))
             },
@@ -244,7 +252,7 @@ class IsolatedManager(object):
         buff = cStringIO.StringIO()
         logger.debug('Checking job on isolated host with `check_isolated.yml` playbook.')
         status, rc = run.run_pexpect(
-            args, self.awx_playbook_path, self.env, buff,
+            args, self.awx_playbook_path(), self.env, buff,
             cancelled_callback=self.cancelled_callback,
             idle_timeout=remaining,
             job_timeout=remaining,
@@ -295,7 +303,7 @@ class IsolatedManager(object):
         logger.debug('Cleaning up job on isolated host with `clean_isolated.yml` playbook.')
         buff = cStringIO.StringIO()
         status, rc = run.run_pexpect(
-            args, self.awx_playbook_path, self.env, buff,
+            args, self.awx_playbook_path(), self.env, buff,
             idle_timeout=60, job_timeout=60,
             pexpect_timeout=5
         )
@@ -304,46 +312,55 @@ class IsolatedManager(object):
             # stdout_handle is closed by this point so writing output to logs is our only option
             logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
 
-    @staticmethod
-    def health_check(instance_qs, cutoff_pk=0):
+    @classmethod
+    def health_check(cls, instance_qs):
         '''
         :param instance_qs: List of Django objects representing the
             isolated instances to manage
-        :param cutoff_pk: Job id of the oldest job still in the running state
-        Returns the instance's capacity, or 0 if it is not reachable
+        Runs a playbook that will
+        - determine if instance is reachable
+        - find the instance capacity
+        - clean up orphaned private files
+        Performs save on each instance to update its capacity.
''' - extra_vars = dict( - cutoff_pk=cutoff_pk, - ) hostname_string = '' for instance in instance_qs: hostname_string += '{},'.format(instance.hostname) args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i', - hostname_string, 'heartbeat_isolated.yml', '-e', - json.dumps(extra_vars)] - module_path = os.path.join(os.path.dirname(awx.__file__), 'lib', 'management_modules') - playbook_path = os.path.join(os.path.dirname(awx.__file__), 'playbooks') - env = { - 'ANSIBLE_LIBRARY': module_path, - 'ANSIBLE_STDOUT_CALLBACK': 'json' - } + hostname_string, 'heartbeat_isolated.yml'] + env = cls._base_management_env() + env['ANSIBLE_LIBRARY'] = os.path.join(os.path.dirname(awx.__file__), 'lib', 'management_modules') + env['ANSIBLE_STDOUT_CALLBACK'] = 'json' + buff = cStringIO.StringIO() status, rc = run.run_pexpect( - args, playbook_path, env, buff, + args, cls.awx_playbook_path(), env, buff, idle_timeout=60, job_timeout=60, pexpect_timeout=5 ) output = buff.getvalue() - output = output[output.find('{'):] # Remove starting log statements - result = json.loads(output) + buff.close() + + try: + result = json.loads(output) + if not isinstance(result, dict): + raise TypeError('Expected a dict but received {}.'.format(str(type(result)))) + except (ValueError, AssertionError, TypeError): + logger.exception('Failed to read status from isolated instances, output:\n {}'.format(output)) + return + for instance in instance_qs: - task_result = result['plays'][0]['tasks'][0]['hosts'][instance.hostname] + try: + task_result = result['plays'][0]['tasks'][0]['hosts'][instance.hostname] + except (KeyError, IndexError): + logger.exception('Failed to read status from isolated instance {}.'.format(instance.hostname)) + continue if 'capacity' in task_result: instance.capacity = int(task_result['capacity']) instance.save(update_fields=['capacity']) - elif 'msg' in task_result: - logger.warning('Could not update capacity of {}, msg={}'.format(instance.hostname, task_result['msg'])) + else: + logger.warning('Could not update capacity of {}, msg={}'.format( + instance.hostname, task_result.get('msg', 'unknown failure'))) @staticmethod def wrap_stdout_handle(instance, private_data_dir, stdout_handle): diff --git a/awx/main/models/unified_jobs.py b/awx/main/models/unified_jobs.py index c8259c7590..55c53f1d8f 100644 --- a/awx/main/models/unified_jobs.py +++ b/awx/main/models/unified_jobs.py @@ -726,18 +726,6 @@ class UnifiedJob(PolymorphicModel, PasswordFieldsModel, CommonModelNameNotUnique pass super(UnifiedJob, self).delete() - @classmethod - def lowest_running_id(cls): - oldest_running_job = cls.objects.filter(status__in=ACTIVE_STATES).order_by('id').only('id').first() - if oldest_running_job is not None: - return oldest_running_job.id - else: - newest_finished_job = cls.objects.order_by('id').only('id').last() - if newest_finished_job is None: - return 1 # System has no finished jobs - else: - return newest_finished_job.id + 1 - def copy_unified_job(self): ''' Returns saved object, including related fields. 
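
Before the tasks.py changes below, it helps to see the shape that `health_check` now expects from Ansible's `json` stdout callback: per-host module results are nested under plays, then tasks, then hosts. A trimmed illustration of that structure (the hostname and capacity value here are made up, and a real payload carries many more keys, such as `stats`):

```
import json

# Abbreviated `json` stdout-callback payload of the kind health_check parses;
# the first task's per-host result carries the tower_capacity module's return.
sample = json.loads('''
{
  "plays": [
    {"tasks": [
      {"hosts": {"isolated": {"changed": false, "capacity": 200}}}
    ]}
  ]
}
''')

task_result = sample['plays'][0]['tasks'][0]['hosts']['isolated']
assert int(task_result['capacity']) == 200
```
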
diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 09e32c25f5..19bd26bb1f 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -201,7 +201,7 @@ def tower_isolated_heartbeat(self): local_hostname = settings.CLUSTER_HOST_ID logger.debug("Controlling node checking for any isolated management tasks.") poll_interval = settings.AWX_ISOLATED_PERIODIC_CHECK - # Add in some task buffer time + # Get isolated instances not checked since poll interval - some buffer nowtime = now() accept_before = nowtime - timedelta(seconds=(poll_interval - 10)) isolated_instance_qs = Instance.objects.filter( @@ -212,15 +212,12 @@ def tower_isolated_heartbeat(self): with transaction.atomic(): for isolated_instance in isolated_instance_qs: isolated_instance.last_isolated_check = nowtime + # Prevent modified time from being changed, as in normal heartbeat isolated_instance.save(update_fields=['last_isolated_check']) - # Find the oldest job in the system and pass that to the cleanup - if not isolated_instance_qs: - return - cutoff_pk = UnifiedJob.lowest_running_id() # Slow pass looping over isolated IGs and their isolated instances if len(isolated_instance_qs) > 0: logger.debug("Managing isolated instances {}.".format(','.join([inst.hostname for inst in isolated_instance_qs]))) - isolated_manager.IsolatedManager.health_check(isolated_instance_qs, cutoff_pk=cutoff_pk) + isolated_manager.IsolatedManager.health_check(isolated_instance_qs) @task(bind=True, queue='tower') diff --git a/awx/main/tests/functional/models/test_unified_job.py b/awx/main/tests/functional/models/test_unified_job.py index 358dfc02ab..5b89644457 100644 --- a/awx/main/tests/functional/models/test_unified_job.py +++ b/awx/main/tests/functional/models/test_unified_job.py @@ -4,7 +4,7 @@ import pytest from django.contrib.contenttypes.models import ContentType # AWX -from awx.main.models import UnifiedJobTemplate, Job, JobTemplate, WorkflowJobTemplate, Project, UnifiedJob +from awx.main.models import UnifiedJobTemplate, Job, JobTemplate, WorkflowJobTemplate, Project @pytest.mark.django_db @@ -65,16 +65,3 @@ class TestCreateUnifiedJob: assert second_job.inventory == job_with_links.inventory assert second_job.limit == 'my_server' assert net_credential in second_job.extra_credentials.all() - - -@pytest.mark.django_db -def test_lowest_running_id(): - assert UnifiedJob.lowest_running_id() == 1 - Job.objects.create(status='finished') - old_job = Job.objects.create(status='finished') - assert UnifiedJob.lowest_running_id() == old_job.id + 1 - old_running_job = Job.objects.create(status='running') - Job.objects.create(status='running') - assert UnifiedJob.lowest_running_id() == old_running_job.id - Job.objects.create(status='finished') - assert UnifiedJob.lowest_running_id() == old_running_job.id diff --git a/awx/main/tests/functional/test_tasks.py b/awx/main/tests/functional/test_tasks.py index da1f774047..065d979819 100644 --- a/awx/main/tests/functional/test_tasks.py +++ b/awx/main/tests/functional/test_tasks.py @@ -104,8 +104,7 @@ class TestIsolatedManagementTask: @pytest.fixture def needs_updating(self, control_group): ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group) - inst = ig.instances.create( - hostname='isolated', capacity=103) + inst = ig.instances.create(hostname='isolated', capacity=103) inst.last_isolated_check=now() - timedelta(seconds=MockSettings.AWX_ISOLATED_PERIODIC_CHECK) inst.save() return ig @@ -113,25 +112,25 @@ class TestIsolatedManagementTask: @pytest.fixture def just_updated(self, control_group): 
         ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group)
-        inst = ig.instances.create(
-            hostname='isolated', capacity=103)
+        inst = ig.instances.create(hostname='isolated', capacity=103)
         inst.last_isolated_check=now()
         inst.save()
         return inst
 
     def test_takes_action(self, control_instance, needs_updating):
+        original_isolated_instance = needs_updating.instances.all().first()
         with mock.patch('awx.main.tasks.settings', MockSettings()):
             with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
-                check_mock.return_value = 98
                 tower_isolated_heartbeat()
         iso_instance = Instance.objects.get(hostname='isolated')
-        check_mock.assert_called_once_with(iso_instance, cutoff_pk=mock.ANY)
-        assert iso_instance.capacity == 98
+        call_args, _ = check_mock.call_args
+        assert call_args[0][0] == iso_instance
+        assert iso_instance.last_isolated_check > original_isolated_instance.last_isolated_check
+        assert iso_instance.modified == original_isolated_instance.modified
 
     def test_does_not_take_action(self, control_instance, just_updated):
         with mock.patch('awx.main.tasks.settings', MockSettings()):
             with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
-                check_mock.return_value = 98
                 tower_isolated_heartbeat()
         iso_instance = Instance.objects.get(hostname='isolated')
         check_mock.assert_not_called()
         assert iso_instance.capacity == 103
diff --git a/awx/main/utils/common.py b/awx/main/utils/common.py
index d8c9e11f20..c10f44a800 100644
--- a/awx/main/utils/common.py
+++ b/awx/main/utils/common.py
@@ -545,8 +545,8 @@ def get_system_task_capacity():
         return settings.SYSTEM_TASK_CAPACITY
     try:
         out = subprocess.check_output(['free', '-m'])
-    except subprocess.CalledProcessError as e:
-        logger.error('Problem obtaining capacity from system, error:\n{}'.format(str(e)))
+    except subprocess.CalledProcessError:
+        logger.exception('Problem obtaining capacity from system.')
         return 0
     total_mem_value = out.split()[7]
     if int(total_mem_value) <= 2048:
diff --git a/awx/playbooks/heartbeat_isolated.yml b/awx/playbooks/heartbeat_isolated.yml
index 3232254a20..58b2f52b3c 100644
--- a/awx/playbooks/heartbeat_isolated.yml
+++ b/awx/playbooks/heartbeat_isolated.yml
@@ -1,8 +1,4 @@
 ---
-
-# The following variables will be set by the runner of this playbook:
-# cutoff_pk, start_delimiter, end_delimiter
-
 - hosts: all
   gather_facts: false
 
@@ -10,11 +6,6 @@
 
     - name: Get capacity of the instance
       tower_capacity:
-      register: result
-
-    - name: Print capacity in escaped string to scrape
-      debug: msg="{{ start_delimiter|default('') }}{{ result['capacity'] }}{{ end_delimiter|default('') }}"
-
     - name: Remove any stale temporary files
       tower_isolated_cleanup:
-        cutoff_pk: "{{ cutoff_pk | default(0) }}"
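
A closing note on the final cleanup policy in `tower_isolated_cleanup.py`: anything modified within the past hour is left alone, job directories between one hour and seven days old are kept only while `tower-expect is-alive` reports their job still running, and anything older than seven days is always removed. A condensed restatement of that decision, with the `tower-expect` call abstracted into a boolean (the real module also filters paths by glob pattern and reports `paths_removed` through `exit_json`):

```
import datetime


def should_delete(modtime, now, job_is_alive):
    # Condensed restatement of the deletion rules in tower_isolated_cleanup.py.
    job_cutoff = now - datetime.timedelta(hours=1)
    folder_cutoff = now - datetime.timedelta(days=7)
    if modtime > job_cutoff:
        return False             # modified in the last hour; assume still in use
    if modtime > folder_cutoff:
        return not job_is_alive  # 1 hour to 7 days old: defer to tower-expect
    return True                  # older than 7 days: always delete
```
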