diff --git a/Makefile b/Makefile
index 0d3673bbba..c346268767 100644
--- a/Makefile
+++ b/Makefile
@@ -950,10 +950,10 @@ docker-isolated:
TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml create
docker start tools_tower_1
docker start tools_isolated_1
- if [ "`docker exec -i -t tools_isolated_1 cat /root/.ssh/authorized_keys`" != "" ]; then \
+ if [ "`docker exec -i -t tools_isolated_1 cat /root/.ssh/authorized_keys`" == "`docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub`" ]; then \
echo "SSH keys already copied to isolated instance"; \
else \
- docker exec "tools_isolated_1" bash -c "mkdir -p /root/.ssh && echo $$(docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub) >> /root/.ssh/authorized_keys"; \
+ docker exec "tools_isolated_1" bash -c "mkdir -p /root/.ssh && rm -f /root/.ssh/authorized_keys && echo $$(docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub) >> /root/.ssh/authorized_keys"; \
fi
TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml up
diff --git a/awx/lib/management_modules/tower_capacity.py b/awx/lib/management_modules/tower_capacity.py
new file mode 100644
index 0000000000..03bbb0cecd
--- /dev/null
+++ b/awx/lib/management_modules/tower_capacity.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2017 Ansible by Red Hat
+#
+# This file is part of Ansible Tower, but depends on code imported from Ansible.
+#
+# Ansible is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Ansible is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
+
+from ansible.module_utils.basic import AnsibleModule
+
+import subprocess
+
+
+def main():
+ module = AnsibleModule(
+ argument_spec = dict()
+ )
+ # Duplicated with awx.main.utils.common.get_system_task_capacity
+ try:
+ out = subprocess.check_output(['free', '-m'])
+ except subprocess.CalledProcessError as e:
+ module.fail_json(msg=str(e))
+ return
+ total_mem_value = out.split()[7]
+ if int(total_mem_value) <= 2048:
+ cap = 50
+ else:
+ cap = 50 + ((int(total_mem_value) / 1024) - 2) * 75
+
+ # Module never results in a change
+ module.exit_json(changed=False, capacity=cap)
+
+
+if __name__ == '__main__':
+ main()
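For reference, here is a quick sketch of the arithmetic the module above (and `get_system_task_capacity`, patched further down) applies; the helper name and memory sizes are illustrative only, not part of the patch:

```python
# Sketch of the capacity formula used by tower_capacity; illustrative only.
def capacity_for(total_mem_mb):
    if total_mem_mb <= 2048:
        return 50
    return 50 + ((total_mem_mb / 1024) - 2) * 75

print(capacity_for(1024))   # 50   -- hosts at or below 2048 MB get the floor value
print(capacity_for(4096))   # 200  -- 50 + (4 - 2) * 75
print(capacity_for(16384))  # 1100 -- 50 + (16 - 2) * 75
```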
diff --git a/awx/lib/management_modules/tower_isolated_cleanup.py b/awx/lib/management_modules/tower_isolated_cleanup.py
new file mode 100644
index 0000000000..529a24fd9d
--- /dev/null
+++ b/awx/lib/management_modules/tower_isolated_cleanup.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2017 Ansible by Red Hat
+#
+# This file is part of Ansible Tower, but depends on code imported from Ansible.
+#
+# Ansible is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Ansible is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
+
+from ansible.module_utils.basic import AnsibleModule
+
+import glob
+import os
+import re
+import shutil
+import datetime
+import subprocess
+
+
+def main():
+ module = AnsibleModule(
+ argument_spec = dict()
+ )
+ changed = False
+ paths_removed = set([])
+
+ # If a folder was last modified before this datetime, it will always be deleted
+ folder_cutoff = datetime.datetime.now() - datetime.timedelta(days=7)
+ # If a folder does not have an associated job running and is older than
+ # this datetime, then it will be deleted because its job has finished
+ job_cutoff = datetime.datetime.now() - datetime.timedelta(hours=1)
+
+ for search_pattern in [
+ '/tmp/ansible_tower_[0-9]*_*', '/tmp/ansible_tower_proot_*',
+ ]:
+ for path in glob.iglob(search_pattern):
+ st = os.stat(path)
+ modtime = datetime.datetime.fromtimestamp(st.st_mtime)
+
+ if modtime > job_cutoff:
+ continue
+ elif modtime > folder_cutoff:
+ try:
+ re_match = re.match(r'\/tmp\/ansible_tower_\d+_.+', path)
+ if re_match is not None:
+ if subprocess.call(['tower-expect', 'is-alive', path]) == 0:
+ continue
+ else:
+ module.debug('Deleting path {} because its job has completed.'.format(path))
+ except (ValueError, IndexError):
+ continue
+ else:
+ module.debug('Deleting path {} because modification date is too old.'.format(path))
+ changed = True
+ paths_removed.add(path)
+ shutil.rmtree(path)
+
+ module.exit_json(changed=changed, paths_removed=list(paths_removed))
+
+
+if __name__ == '__main__':
+ main()
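The cleanup rule above can be summarized as: paths untouched for more than seven days are always removed, and paths between one hour and seven days old are removed only once their job has finished. A small sketch of that rule (the `should_remove` helper stands in for the module's loop, and `job_is_alive` for the `tower-expect is-alive` check; neither is part of the patch):

```python
# Sketch of the age-based decision rule in tower_isolated_cleanup; illustrative only.
import datetime

def should_remove(modtime, job_is_alive, now=None):
    now = now or datetime.datetime.now()
    job_cutoff = now - datetime.timedelta(hours=1)
    folder_cutoff = now - datetime.timedelta(days=7)
    if modtime > job_cutoff:
        return False              # too recent; the job may still be using the path
    if modtime > folder_cutoff:
        return not job_is_alive   # old enough, but only remove once the job is done
    return True                   # older than a week; always remove
```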
diff --git a/awx/main/isolated/isolated_manager.py b/awx/main/isolated/isolated_manager.py
index caa2e1db01..139231271b 100644
--- a/awx/main/isolated/isolated_manager.py
+++ b/awx/main/isolated/isolated_manager.py
@@ -57,9 +57,7 @@ class IsolatedManager(object):
self.cwd = cwd
self.env = env.copy()
# Do not use callbacks for controller's management jobs
- self.env['ANSIBLE_CALLBACK_PLUGINS'] = ''
- self.env['CALLBACK_QUEUE'] = ''
- self.env['CALLBACK_CONNECTION'] = ''
+ self.env.update(self._base_management_env())
self.stdout_handle = stdout_handle
self.ssh_key_path = ssh_key_path
self.expect_passwords = {k.pattern: v for k, v in expect_passwords.items()}
@@ -71,8 +69,18 @@ class IsolatedManager(object):
self.proot_cmd = proot_cmd
self.started_at = None
- @property
- def awx_playbook_path(self):
+ @staticmethod
+ def _base_management_env():
+ return {
+ 'ANSIBLE_CALLBACK_PLUGINS': '',
+ 'CALLBACK_QUEUE': '',
+ 'CALLBACK_CONNECTION': '',
+ 'ANSIBLE_RETRY_FILES_ENABLED': 'False',
+ 'ANSIBLE_HOST_KEY_CHECKING': 'False'
+ }
+
+ @classmethod
+ def awx_playbook_path(cls):
return os.path.join(
os.path.dirname(awx.__file__),
'playbooks'
@@ -134,7 +142,7 @@ class IsolatedManager(object):
buff = StringIO.StringIO()
logger.debug('Starting job on isolated host with `run_isolated.yml` playbook.')
status, rc = run.run_pexpect(
- args, self.awx_playbook_path, self.env, buff,
+ args, self.awx_playbook_path(), self.env, buff,
expect_passwords={
re.compile(r'Secret:\s*?$', re.M): base64.b64encode(json.dumps(secrets))
},
@@ -244,7 +252,7 @@ class IsolatedManager(object):
buff = cStringIO.StringIO()
logger.debug('Checking job on isolated host with `check_isolated.yml` playbook.')
status, rc = run.run_pexpect(
- args, self.awx_playbook_path, self.env, buff,
+ args, self.awx_playbook_path(), self.env, buff,
cancelled_callback=self.cancelled_callback,
idle_timeout=remaining,
job_timeout=remaining,
@@ -295,7 +303,7 @@ class IsolatedManager(object):
logger.debug('Cleaning up job on isolated host with `clean_isolated.yml` playbook.')
buff = cStringIO.StringIO()
status, rc = run.run_pexpect(
- args, self.awx_playbook_path, self.env, buff,
+ args, self.awx_playbook_path(), self.env, buff,
idle_timeout=60, job_timeout=60,
pexpect_timeout=5
)
@@ -304,6 +312,56 @@ class IsolatedManager(object):
# stdout_handle is closed by this point so writing output to logs is our only option
logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
+ @classmethod
+ def health_check(cls, instance_qs):
+ '''
+ :param instance_qs: List of Django objects representing the
+ isolated instances to manage
+ Runs a playbook that will:
+ - determine if the instance is reachable
+ - find the instance capacity
+ - clean up orphaned private files
+ Performs a save on each instance to update its capacity.
+ '''
+ hostname_string = ''
+ for instance in instance_qs:
+ hostname_string += '{},'.format(instance.hostname)
+ args = ['ansible-playbook', '-u', settings.AWX_ISOLATED_USERNAME, '-i',
+ hostname_string, 'heartbeat_isolated.yml']
+ env = cls._base_management_env()
+ env['ANSIBLE_LIBRARY'] = os.path.join(os.path.dirname(awx.__file__), 'lib', 'management_modules')
+ env['ANSIBLE_STDOUT_CALLBACK'] = 'json'
+
+ buff = cStringIO.StringIO()
+ status, rc = run.run_pexpect(
+ args, cls.awx_playbook_path(), env, buff,
+ idle_timeout=60, job_timeout=60,
+ pexpect_timeout=5
+ )
+ output = buff.getvalue()
+ buff.close()
+
+ try:
+ result = json.loads(output)
+ if not isinstance(result, dict):
+ raise TypeError('Expected a dict but received {}.'.format(str(type(result))))
+ except (ValueError, AssertionError, TypeError):
+ logger.exception('Failed to read status from isolated instances, output:\n {}'.format(output))
+ return
+
+ for instance in instance_qs:
+ try:
+ task_result = result['plays'][0]['tasks'][0]['hosts'][instance.hostname]
+ except (KeyError, IndexError):
+ logger.exception('Failed to read status from isolated instance {}.'.format(instance.hostname))
+ continue
+ if 'capacity' in task_result:
+ instance.capacity = int(task_result['capacity'])
+ instance.save(update_fields=['capacity'])
+ else:
+ logger.warning('Could not update capacity of {}, msg={}'.format(
+ instance.hostname, task_result.get('msg', 'unknown failure')))
+
@staticmethod
def wrap_stdout_handle(instance, private_data_dir, stdout_handle):
dispatcher = CallbackQueueDispatcher()
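For context on the parsing in `health_check` above, this is roughly the shape of the `json` stdout-callback output it indexes into; the hostname and values are illustrative:

```python
# Illustrative shape of the Ansible `json` callback output consumed by health_check;
# only the keys the code touches are shown.
result = {
    'plays': [{
        'tasks': [
            {'hosts': {'isolatedA': {'changed': False, 'capacity': 200}}},     # tower_capacity
            {'hosts': {'isolatedA': {'changed': True, 'paths_removed': []}}},  # tower_isolated_cleanup
        ]
    }]
}

task_result = result['plays'][0]['tasks'][0]['hosts']['isolatedA']
print(task_result['capacity'])  # 200
```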
diff --git a/awx/main/management/commands/deprovision_node.py b/awx/main/management/commands/deprovision_node.py
index 8412b5bd86..d64a780434 100644
--- a/awx/main/management/commands/deprovision_node.py
+++ b/awx/main/management/commands/deprovision_node.py
@@ -23,11 +23,13 @@ class Command(BaseCommand):
instance = Instance.objects.filter(hostname=options.get('name'))
if instance.exists():
instance.delete()
+ print("Instance Removed")
result = subprocess.Popen("rabbitmqctl forget_cluster_node rabbitmq@{}".format(options.get('name')), shell=True).wait()
if result != 0:
print("Node deprovisioning may have failed when attempting to remove the RabbitMQ instance from the cluster")
else:
print('Successfully deprovisioned {}'.format(options.get('name')))
+ print('(changed: True)')
else:
print('No instance found matching name {}'.format(options.get('name')))
diff --git a/awx/main/management/commands/register_instance.py b/awx/main/management/commands/register_instance.py
index 7ce6be787b..6895aa644f 100644
--- a/awx/main/management/commands/register_instance.py
+++ b/awx/main/management/commands/register_instance.py
@@ -17,14 +17,32 @@ class Command(BaseCommand):
option_list = BaseCommand.option_list + (
make_option('--hostname', dest='hostname', type='string',
help='Hostname used during provisioning'),
+ make_option('--hostnames', dest='hostnames', type='string',
+ help='Alternatively, hostnames can be provided with '
+ 'this option as a comma-delimited list'),
)
- def handle(self, **options):
- uuid = settings.SYSTEM_UUID
- instance = Instance.objects.filter(hostname=options.get('hostname'))
+ def _register_hostname(self, hostname):
+ if not hostname:
+ return
+ instance = Instance.objects.filter(hostname=hostname)
if instance.exists():
print("Instance already registered {}".format(instance[0]))
return
- instance = Instance(uuid=uuid, hostname=options.get('hostname'))
+ instance = Instance(uuid=self.uuid, hostname=hostname)
instance.save()
- print('Successfully registered instance {}'.format(instance))
+ print('Successfully registered instance {}'.format(hostname))
+ self.changed = True
+
+ def handle(self, **options):
+ self.uuid = settings.SYSTEM_UUID
+ self.changed = False
+ self._register_hostname(options.get('hostname'))
+ hostname_list = []
+ if options.get('hostnames'):
+ hostname_list = options.get('hostnames').split(",")
+ instance_list = [x.strip() for x in hostname_list if x]
+ for inst_name in instance_list:
+ self._register_hostname(inst_name)
+ if self.changed:
+ print('(changed: True)')
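A minimal sketch of exercising the new `--hostnames` handling through Django's management API (equivalent to passing the option on the command line; the hostnames reuse the clustering example further down and are assumed not to be registered yet):

```python
# Sketch: registering two isolated instances via call_command; illustrative only.
from django.core.management import call_command

call_command('register_instance', hostnames='isolatedA,isolatedB')
# Prints "Successfully registered instance ..." for each new host,
# followed by "(changed: True)" if anything was added.
```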
diff --git a/awx/main/management/commands/register_queue.py b/awx/main/management/commands/register_queue.py
index e0ca862a37..3601b009d4 100644
--- a/awx/main/management/commands/register_queue.py
+++ b/awx/main/management/commands/register_queue.py
@@ -5,7 +5,7 @@ import sys
from awx.main.models import Instance, InstanceGroup
from optparse import make_option
-from django.core.management.base import BaseCommand
+from django.core.management.base import BaseCommand, CommandError
class Command(BaseCommand):
@@ -20,34 +20,44 @@ class Command(BaseCommand):
)
def handle(self, **options):
+ if not options.get('queuename'):
+ raise CommandError("Specify `--queuename` to use this command.")
+ changed = False
ig = InstanceGroup.objects.filter(name=options.get('queuename'))
control_ig = None
if options.get('controller'):
control_ig = InstanceGroup.objects.filter(name=options.get('controller')).first()
if ig.exists():
- print("Instance Group already registered {}".format(ig[0]))
+ print("Instance Group already registered {}".format(ig[0].name))
ig = ig[0]
if control_ig and ig.controller_id != control_ig.pk:
ig.controller = control_ig
ig.save()
- print("Set controller group {} on {}.".format(control_ig, ig))
+ print("Set controller group {} on {}.".format(control_ig.name, ig.name))
+ changed = True
else:
print("Creating instance group {}".format(options.get('queuename')))
ig = InstanceGroup(name=options.get('queuename'))
if control_ig:
ig.controller = control_ig
ig.save()
+ changed = True
hostname_list = []
if options.get('hostnames'):
hostname_list = options.get('hostnames').split(",")
- instance_list = [x.strip() for x in hostname_list]
+ instance_list = [x.strip() for x in hostname_list if x]
for inst_name in instance_list:
instance = Instance.objects.filter(hostname=inst_name)
- if instance.exists() and instance not in ig.instances.all():
+ if instance.exists() and instance[0] not in ig.instances.all():
ig.instances.add(instance[0])
- print("Added instance {} to {}".format(instance[0], ig))
+ print("Added instance {} to {}".format(instance[0].hostname, ig.name))
+ changed = True
elif not instance.exists():
print("Instance does not exist: {}".format(inst_name))
+ if changed:
+ print('(changed: True)')
sys.exit(1)
else:
- print("Instance already registered {}".format(instance[0]))
+ print("Instance already registered {}".format(instance[0].hostname))
+ if changed:
+ print('(changed: True)')
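Likewise for `register_queue`, which now fails fast when `--queuename` is omitted. A sketch, assuming the isolated instances were registered as above and that a `security` instance group exists to act as controller:

```python
# Sketch: creating an isolated queue and attaching instances to it; illustrative only.
from django.core.management import call_command
from django.core.management.base import CommandError

call_command('register_queue', queuename='govcloud', controller='security',
             hostnames='isolatedA,isolatedB')

try:
    call_command('register_queue')  # --queuename is now required
except CommandError as exc:
    print(exc)  # "Specify `--queuename` to use this command."
```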
diff --git a/awx/main/management/commands/unregister_queue.py b/awx/main/management/commands/unregister_queue.py
index 388a8f0588..335ce38dbc 100644
--- a/awx/main/management/commands/unregister_queue.py
+++ b/awx/main/management/commands/unregister_queue.py
@@ -30,3 +30,4 @@ class Command(BaseCommand):
ig = ig.first()
ig.delete()
print("Instance Group Removed")
+ print('(changed: True)')
diff --git a/awx/main/migrations/0043_v320_instancegroups.py b/awx/main/migrations/0043_v320_instancegroups.py
index bf0585acf6..3f7d279307 100644
--- a/awx/main/migrations/0043_v320_instancegroups.py
+++ b/awx/main/migrations/0043_v320_instancegroups.py
@@ -48,4 +48,9 @@ class Migration(migrations.Migration):
name='instance_group',
field=models.ManyToManyField(to='main.InstanceGroup', blank=True),
),
+ migrations.AddField(
+ model_name='instance',
+ name='last_isolated_check',
+ field=models.DateTimeField(auto_now_add=True, null=True),
+ ),
]
diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py
index 134646d2cb..600a72af27 100644
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -26,6 +26,11 @@ class Instance(models.Model):
hostname = models.CharField(max_length=250, unique=True)
created = models.DateTimeField(auto_now_add=True)
modified = models.DateTimeField(auto_now=True)
+ last_isolated_check = models.DateTimeField(
+ null=True,
+ editable=False,
+ auto_now_add=True
+ )
version = models.CharField(max_length=24, blank=True)
capacity = models.PositiveIntegerField(
default=100,
diff --git a/awx/main/tasks.py b/awx/main/tasks.py
index 8445e6ce19..19bd26bb1f 100644
--- a/awx/main/tasks.py
+++ b/awx/main/tasks.py
@@ -19,7 +19,6 @@ import traceback
import urlparse
import uuid
from distutils.version import LooseVersion as Version
-from datetime import timedelta
import yaml
import fcntl
try:
@@ -34,7 +33,7 @@ from celery.signals import celeryd_init, worker_process_init
# Django
from django.conf import settings
from django.db import transaction, DatabaseError, IntegrityError
-from django.utils.timezone import now
+from django.utils.timezone import now, timedelta
from django.utils.encoding import smart_str
from django.core.mail import send_mail
from django.contrib.auth.models import User
@@ -197,6 +196,29 @@ def cluster_node_heartbeat(self):
stop_local_services(['uwsgi', 'celery', 'beat', 'callback', 'fact'])
+@task(bind=True)
+def tower_isolated_heartbeat(self):
+ local_hostname = settings.CLUSTER_HOST_ID
+ logger.debug("Controlling node checking for any isolated management tasks.")
+ poll_interval = settings.AWX_ISOLATED_PERIODIC_CHECK
+ # Get isolated instances not checked since poll interval - some buffer
+ nowtime = now()
+ accept_before = nowtime - timedelta(seconds=(poll_interval - 10))
+ isolated_instance_qs = Instance.objects.filter(
+ rampart_groups__controller__instances__hostname=local_hostname,
+ last_isolated_check__lt=accept_before
+ )
+ # Fast pass of isolated instances, claiming the nodes to update
+ with transaction.atomic():
+ for isolated_instance in isolated_instance_qs:
+ isolated_instance.last_isolated_check = nowtime
+ # Prevent modified time from being changed, as in normal heartbeat
+ isolated_instance.save(update_fields=['last_isolated_check'])
+ # Slow pass looping over isolated IGs and their isolated instances
+ if len(isolated_instance_qs) > 0:
+ logger.debug("Managing isolated instances {}.".format(','.join([inst.hostname for inst in isolated_instance_qs])))
+ isolated_manager.IsolatedManager.health_check(isolated_instance_qs)
+
@task(bind=True, queue='tower')
def tower_periodic_scheduler(self):
diff --git a/awx/main/tests/functional/test_tasks.py b/awx/main/tests/functional/test_tasks.py
index f09847c2d2..065d979819 100644
--- a/awx/main/tests/functional/test_tasks.py
+++ b/awx/main/tests/functional/test_tasks.py
@@ -2,8 +2,17 @@ import pytest
import mock
import os
-from awx.main.tasks import RunProjectUpdate, RunInventoryUpdate
-from awx.main.models import ProjectUpdate, InventoryUpdate, InventorySource
+from django.utils.timezone import now, timedelta
+
+from awx.main.tasks import (
+ RunProjectUpdate, RunInventoryUpdate,
+ tower_isolated_heartbeat,
+ isolated_manager
+)
+from awx.main.models import (
+ ProjectUpdate, InventoryUpdate, InventorySource,
+ Instance, InstanceGroup
+)
@pytest.fixture
@@ -73,3 +82,56 @@ class TestDependentInventoryUpdate:
# Verify that it bails after 1st update, detecting a cancel
assert is2.inventory_updates.count() == 0
iu_run_mock.assert_called_once()
+
+
+
+class MockSettings:
+ AWX_ISOLATED_PERIODIC_CHECK = 60
+ CLUSTER_HOST_ID = 'tower_1'
+
+
+@pytest.mark.django_db
+class TestIsolatedManagementTask:
+
+ @pytest.fixture
+ def control_group(self):
+ return InstanceGroup.objects.create(name='alpha')
+
+ @pytest.fixture
+ def control_instance(self, control_group):
+ return control_group.instances.create(hostname='tower_1')
+
+ @pytest.fixture
+ def needs_updating(self, control_group):
+ ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group)
+ inst = ig.instances.create(hostname='isolated', capacity=103)
+ inst.last_isolated_check=now() - timedelta(seconds=MockSettings.AWX_ISOLATED_PERIODIC_CHECK)
+ inst.save()
+ return ig
+
+ @pytest.fixture
+ def just_updated(self, control_group):
+ ig = InstanceGroup.objects.create(name='thepentagon', controller=control_group)
+ inst = ig.instances.create(hostname='isolated', capacity=103)
+ inst.last_isolated_check=now()
+ inst.save()
+ return inst
+
+ def test_takes_action(self, control_instance, needs_updating):
+ original_isolated_instance = needs_updating.instances.all().first()
+ with mock.patch('awx.main.tasks.settings', MockSettings()):
+ with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
+ tower_isolated_heartbeat()
+ iso_instance = Instance.objects.get(hostname='isolated')
+ call_args, _ = check_mock.call_args
+ assert call_args[0][0] == iso_instance
+ assert iso_instance.last_isolated_check > original_isolated_instance.last_isolated_check
+ assert iso_instance.modified == original_isolated_instance.modified
+
+ def test_does_not_take_action(self, control_instance, just_updated):
+ with mock.patch('awx.main.tasks.settings', MockSettings()):
+ with mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
+ tower_isolated_heartbeat()
+ iso_instance = Instance.objects.get(hostname='isolated')
+ check_mock.assert_not_called()
+ assert iso_instance.capacity == 103
diff --git a/awx/main/utils/common.py b/awx/main/utils/common.py
index b345a93de3..5dea718faf 100644
--- a/awx/main/utils/common.py
+++ b/awx/main/utils/common.py
@@ -543,8 +543,11 @@ def get_system_task_capacity():
from django.conf import settings
if hasattr(settings, 'SYSTEM_TASK_CAPACITY'):
return settings.SYSTEM_TASK_CAPACITY
- proc = subprocess.Popen(['free', '-m'], stdout=subprocess.PIPE)
- out,err = proc.communicate()
+ try:
+ out = subprocess.check_output(['free', '-m'])
+ except subprocess.CalledProcessError:
+ logger.exception('Problem obtaining capacity from system.')
+ return 0
total_mem_value = out.split()[7]
if int(total_mem_value) <= 2048:
return 50
diff --git a/awx/playbooks/heartbeat_isolated.yml b/awx/playbooks/heartbeat_isolated.yml
new file mode 100644
index 0000000000..58b2f52b3c
--- /dev/null
+++ b/awx/playbooks/heartbeat_isolated.yml
@@ -0,0 +1,11 @@
+---
+- hosts: all
+ gather_facts: false
+
+ tasks:
+
+ - name: Get capacity of the instance
+ tower_capacity:
+
+ - name: Remove any stale temporary files
+ tower_isolated_cleanup:
diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py
index 6530331863..537f5f07b6 100644
--- a/awx/settings/defaults.py
+++ b/awx/settings/defaults.py
@@ -609,6 +609,8 @@ AWX_ISOLATED_CHECK_INTERVAL = 30
# The timeout (in seconds) for launching jobs on isolated nodes
AWX_ISOLATED_LAUNCH_TIMEOUT = 600
+# The interval (in seconds) between background isolated heartbeat status checks
+AWX_ISOLATED_PERIODIC_CHECK = 600
# Enable Pendo on the UI, possible values are 'off', 'anonymous', and 'detailed'
# Note: This setting may be overridden by database settings.
diff --git a/awx/settings/development.py b/awx/settings/development.py
index 414bcfb48f..0edf353f6a 100644
--- a/awx/settings/development.py
+++ b/awx/settings/development.py
@@ -114,6 +114,13 @@ except ImportError:
CLUSTER_HOST_ID = socket.gethostname()
CELERY_ROUTES['awx.main.tasks.cluster_node_heartbeat'] = {'queue': CLUSTER_HOST_ID, 'routing_key': CLUSTER_HOST_ID}
+# Production only runs this schedule on controlling nodes,
+# but development will just run it on all nodes.
+CELERYBEAT_SCHEDULE['isolated_heartbeat'] = {
+ 'task': 'awx.main.tasks.tower_isolated_heartbeat',
+ 'schedule': timedelta(seconds = AWX_ISOLATED_PERIODIC_CHECK),
+ 'options': {'expires': AWX_ISOLATED_PERIODIC_CHECK * 2,}
+}
# Supervisor service name dictionary used for programatic restart
SERVICE_NAME_DICT = {
diff --git a/docs/clustering.md b/docs/clustering.md
index 56fe723cf0..05dba666a1 100644
--- a/docs/clustering.md
+++ b/docs/clustering.md
@@ -112,6 +112,58 @@ rabbitmq_use_long_name=false
rabbitmq_enable_manager=false
```
+### Security Isolated Rampart Groups
+
+In Tower versions 3.2+, customers may optionally define isolated groups
+inside security-restricted networking zones from which to run jobs.
+Instances in these groups will _not_ have a full install of Tower, but will have a minimal
+set of utilities used to run jobs on them. These groups must be specified
+in the inventory file, prefixed with `isolated_group_`. An example inventory
+file is shown below.
+
+```
+[tower]
+towerA
+towerB
+towerC
+
+[instance_group_security]
+towerB
+towerC
+
+[isolated_group_govcloud]
+isolatedA
+isolatedB
+
+[isolated_group_govcloud:vars]
+controller=security
+```
+
+In this example, when a job runs inside the `govcloud` isolated group, a
+managing task runs simultaneously on one of the two instances in
+the `security` ordinary instance group.
+
+Networking security rules must allow
+connections to all nodes in an isolated group from all nodes in its controller
+group. The system is designed such that
+isolated instances never make requests to any of their controllers.
+The controlling instance for a particular job sends management commands to
+a daemon that runs the job, and retrieves job artifacts from it.
+
+Isolated groups are architected such that they may exist inside a VPC
+with security rules that _only_ permit the instances in their `controller`
+group to access them.
+
+Recommendations for system configuration with isolated groups:
+ - Do not put any isolated instances inside the `tower` group or other
+ ordinary instance groups.
+ - Define the `controller` variable as either a group var or as a hostvar
+ on all the instances in the isolated group, as shown in the example below.
+ Please _do not_ allow isolated instances in the same group to have different
+ values for this variable - the behavior in this case cannot be predicted.
+ - Do not put an isolated instance in more than one isolated group.
+
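+For example, the `controller` variable from the recommendations above may also
+be supplied per-host rather than as a group variable (the hostnames reuse the
+example inventory above):
+
+```
+[isolated_group_govcloud]
+isolatedA controller=security
+isolatedB controller=security
+```
+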
+
### Provisioning and Deprovisioning Instances and Groups
* Provisioning