mirror of
https://github.com/ansible/awx.git
synced 2026-03-09 05:29:26 -02:30
Merge pull request #6617 from ansible/isolated_setup
setup playbook and heartbeat for isolated deployments
This commit is contained in:
4
Makefile
4
Makefile
@@ -950,10 +950,10 @@ docker-isolated:
|
|||||||
TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml create
|
TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml create
|
||||||
docker start tools_tower_1
|
docker start tools_tower_1
|
||||||
docker start tools_isolated_1
|
docker start tools_isolated_1
|
||||||
if [ "`docker exec -i -t tools_isolated_1 cat /root/.ssh/authorized_keys`" != "" ]; then \
|
if [ "`docker exec -i -t tools_isolated_1 cat /root/.ssh/authorized_keys`" == "`docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub`" ]; then \
|
||||||
echo "SSH keys already copied to isolated instance"; \
|
echo "SSH keys already copied to isolated instance"; \
|
||||||
else \
|
else \
|
||||||
docker exec "tools_isolated_1" bash -c "mkdir -p /root/.ssh && echo $$(docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub) >> /root/.ssh/authorized_keys"; \
|
docker exec "tools_isolated_1" bash -c "mkdir -p /root/.ssh && rm -f /root/.ssh/authorized_keys && echo $$(docker exec -t tools_tower_1 cat /root/.ssh/id_rsa.pub) >> /root/.ssh/authorized_keys"; \
|
||||||
fi
|
fi
|
||||||
TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml up
|
TAG=$(COMPOSE_TAG) docker-compose -f tools/docker-compose.yml -f tools/docker-isolated-override.yml up
|
||||||
|
|
||||||
|
|||||||
43
awx/lib/management_modules/tower_capacity.py
Normal file
43
awx/lib/management_modules/tower_capacity.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Copyright (c) 2017 Ansible by Red Hat
|
||||||
|
#
|
||||||
|
# This file is part of Ansible Tower, but depends on code imported from Ansible.
|
||||||
|
#
|
||||||
|
# Ansible is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# Ansible is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from ansible.module_utils.basic import AnsibleModule
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def _capacity_from_total_mem(total_mem_mb):
    """Translate total system memory (in MB) into a task capacity number.

    Hosts with 2048 MB or less get a flat capacity of 50; above that, every
    whole GB beyond the first two adds 75.
    """
    if total_mem_mb <= 2048:
        return 50
    # Floor division keeps the whole-GB (Python 2 integer division) behaviour
    # of the original formula regardless of interpreter version.
    return 50 + ((total_mem_mb // 1024) - 2) * 75


def main():
    """Ansible module entry point: report this host's capacity.

    Runs ``free -m``, reads the total memory figure from its output and
    exits with ``capacity=<int>``.  The module never reports a change.
    Calls ``fail_json`` if ``free`` cannot be run or its output cannot be
    parsed.
    """
    module = AnsibleModule(
        argument_spec=dict()
    )
    # Duplicated with awx.main.utils.common.get_system_task_capacity
    try:
        out = subprocess.check_output(['free', '-m'])
    except (OSError, subprocess.CalledProcessError) as e:
        # OSError covers the `free` binary being absent altogether.
        module.fail_json(msg=str(e))
        return

    try:
        # The eighth whitespace-separated token is taken to be the total
        # of the "Mem:" row (same assumption as the original code).
        total_mem_value = int(out.split()[7])
    except (IndexError, ValueError) as e:
        module.fail_json(msg='Could not parse output of `free -m`: {}'.format(e))
        return

    # Previously the small-memory value of 50 was unconditionally overwritten
    # by the formula, which goes negative for hosts with less than 1 GB.
    cap = _capacity_from_total_mem(total_mem_value)

    # Module never results in a change
    module.exit_json(changed=False, capacity=cap)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
70
awx/lib/management_modules/tower_isolated_cleanup.py
Normal file
70
awx/lib/management_modules/tower_isolated_cleanup.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# Copyright (c) 2017 Ansible by Red Hat
|
||||||
|
#
|
||||||
|
# This file is part of Ansible Tower, but depends on code imported from Ansible.
|
||||||
|
#
|
||||||
|
# Ansible is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# Ansible is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from ansible.module_utils.basic import AnsibleModule
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import datetime
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Ansible module entry point: delete stale Tower job directories in /tmp.

    Directories matching the Tower temp-dir patterns are handled by age of
    their last modification:

    - newer than one hour: always kept
    - between one hour and seven days: job directories
      (``/tmp/ansible_tower_<digits>_*``) are kept only while
      ``tower-expect is-alive <path>`` exits with status 0; other matches
      are removed
    - older than seven days: always removed

    Exits with ``changed`` set when anything was removed and
    ``paths_removed`` listing the deleted paths.
    """
    module = AnsibleModule(
        argument_spec=dict()
    )
    paths_removed = set()

    current_time = datetime.datetime.now()
    # If a folder was last modified before this datetime, it will always be deleted
    folder_cutoff = current_time - datetime.timedelta(days=7)
    # If a folder does not have an associated job running and is older than
    # this datetime, then it will be deleted because its job has finished
    job_cutoff = current_time - datetime.timedelta(hours=1)

    for search_pattern in [
        '/tmp/ansible_tower_[0-9]*_*', '/tmp/ansible_tower_proot_*',
    ]:
        for path in glob.iglob(search_pattern):
            try:
                st = os.stat(path)
            except OSError:
                # Path disappeared between the glob and the stat; nothing
                # left for us to clean up.
                continue
            modtime = datetime.datetime.fromtimestamp(st.st_mtime)

            if modtime > job_cutoff:
                continue

            if modtime > folder_cutoff:
                if re.match(r'\/tmp\/ansible_tower_\d+_.+', path) is not None:
                    try:
                        # call(), not check_call(): a non-zero exit status is
                        # an expected answer here, and check_call() would
                        # raise instead of letting us compare the status.
                        job_alive = subprocess.call(['tower-expect', 'is-alive', path]) == 0
                    except OSError as e:
                        # Cannot determine liveness (e.g. tower-expect is not
                        # installed); leave the directory for a later run or
                        # for the seven day cutoff.
                        module.debug('Could not run tower-expect for {}: {}'.format(path, e))
                        continue
                    if job_alive:
                        continue
                    module.debug('Deleting path {} its job has completed.'.format(path))
            else:
                module.debug('Deleting path {} because modification date is too old.'.format(path))

            shutil.rmtree(path)
            paths_removed.add(path)

    module.exit_json(changed=bool(paths_removed), paths_removed=list(paths_removed))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -57,9 +57,7 @@ class IsolatedManager(object):
|
|||||||
self.cwd = cwd
|
self.cwd = cwd
|
||||||
self.env = env.copy()
|
self.env = env.copy()
|
||||||
# Do not use callbacks for controller's management jobs
|
# Do not use callbacks for controller's management jobs
|
||||||
self.env['ANSIBLE_CALLBACK_PLUGINS'] = ''
|
self.env.update(self._base_management_env())
|
||||||
self.env['CALLBACK_QUEUE'] = ''
|
|
||||||
self.env['CALLBACK_CONNECTION'] = ''
|
|
||||||
self.stdout_handle = stdout_handle
|
self.stdout_handle = stdout_handle
|
||||||
self.ssh_key_path = ssh_key_path
|
self.ssh_key_path = ssh_key_path
|
||||||
self.expect_passwords = {k.pattern: v for k, v in expect_passwords.items()}
|
self.expect_passwords = {k.pattern: v for k, v in expect_passwords.items()}
|
||||||
@@ -71,8 +69,18 @@ class IsolatedManager(object):
|
|||||||
self.proot_cmd = proot_cmd
|
self.proot_cmd = proot_cmd
|
||||||
self.started_at = None
|
self.started_at = None
|
||||||
|
|
||||||
@property
|
@staticmethod
|
||||||
def awx_playbook_path(self):
|
def _base_management_env():
|
||||||
|
return {
|
||||||
|
'ANSIBLE_CALLBACK_PLUGINS': '',
|
||||||
|
'CALLBACK_QUEUE': '',
|
||||||
|
'CALLBACK_CONNECTION': '',
|
||||||
|
'ANSIBLE_RETRY_FILES_ENABLED': 'False',
|
||||||
|
'ANSIBLE_HOST_KEY_CHECKING': 'False'
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def awx_playbook_path(cls):
|
||||||
return os.path.join(
|
return os.path.join(
|
||||||
os.path.dirname(awx.__file__),
|
os.path.dirname(awx.__file__),
|
||||||
'playbooks'
|
'playbooks'
|
||||||
@@ -134,7 +142,7 @@ class IsolatedManager(object):
|
|||||||
buff = StringIO.StringIO()
|
buff = StringIO.StringIO()
|
||||||
logger.debug('Starting job on isolated host with `run_isolated.yml` playbook.')
|
logger.debug('Starting job on isolated host with `run_isolated.yml` playbook.')
|
||||||
status, rc = run.run_pexpect(
|
status, rc = run.run_pexpect(
|
||||||
args, self.awx_playbook_path, self.env, buff,
|
args, self.awx_playbook_path(), self.env, buff,
|
||||||
expect_passwords={
|
expect_passwords={
|
||||||
re.compile(r'Secret:\s*?$', re.M): base64.b64encode(json.dumps(secrets))
|
re.compile(r'Secret:\s*?$', re.M): base64.b64encode(json.dumps(secrets))
|
||||||
},
|
},
|
||||||
@@ -244,7 +252,7 @@ class IsolatedManager(object):
|
|||||||
buff = cStringIO.StringIO()
|
buff = cStringIO.StringIO()
|
||||||
logger.debug('Checking job on isolated host with `check_isolated.yml` playbook.')
|
logger.debug('Checking job on isolated host with `check_isolated.yml` playbook.')
|
||||||
status, rc = run.run_pexpect(
|
status, rc = run.run_pexpect(
|
||||||
args, self.awx_playbook_path, self.env, buff,
|
args, self.awx_playbook_path(), self.env, buff,
|
||||||
cancelled_callback=self.cancelled_callback,
|
cancelled_callback=self.cancelled_callback,
|
||||||
idle_timeout=remaining,
|
idle_timeout=remaining,
|
||||||
job_timeout=remaining,
|
job_timeout=remaining,
|
||||||
@@ -295,7 +303,7 @@ class IsolatedManager(object):
|
|||||||
logger.debug('Cleaning up job on isolated host with `clean_isolated.yml` playbook.')
|
logger.debug('Cleaning up job on isolated host with `clean_isolated.yml` playbook.')
|
||||||
buff = cStringIO.StringIO()
|
buff = cStringIO.StringIO()
|
||||||
status, rc = run.run_pexpect(
|
status, rc = run.run_pexpect(
|
||||||
args, self.awx_playbook_path, self.env, buff,
|
args, self.awx_playbook_path(), self.env, buff,
|
||||||
idle_timeout=60, job_timeout=60,
|
idle_timeout=60, job_timeout=60,
|
||||||
pexpect_timeout=5
|
pexpect_timeout=5
|
||||||
)
|
)
|
||||||
@@ -304,6 +312,56 @@ class IsolatedManager(object):
|
|||||||
# stdout_handle is closed by this point so writing output to logs is our only option
|
# stdout_handle is closed by this point so writing output to logs is our only option
|
||||||
logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
|
logger.warning('Cleanup from isolated job encountered error, output:\n{}'.format(buff.getvalue()))
|
||||||
|
|
||||||
|
@classmethod
def health_check(cls, instance_qs):
    '''
    Run the `heartbeat_isolated.yml` playbook against isolated instances
    and store the capacity each one reports.

    :param instance_qs: Iterable of Django objects representing the
                        isolated instances to manage

    The playbook
      - shows whether each instance is reachable
      - reports the instance capacity
      - cleans up orphaned private files
    Each instance that reports a capacity is saved with only its
    `capacity` field updated; the others are logged and skipped.
    '''
    # Comma-terminated host list, passed to ansible as an inline inventory
    inventory = ''.join('{},'.format(node.hostname) for node in instance_qs)
    args = [
        'ansible-playbook',
        '-u', settings.AWX_ISOLATED_USERNAME,
        '-i', inventory,
        'heartbeat_isolated.yml',
    ]

    env = cls._base_management_env()
    env.update({
        'ANSIBLE_LIBRARY': os.path.join(os.path.dirname(awx.__file__), 'lib', 'management_modules'),
        # Machine-readable output so the per-host results can be parsed below
        'ANSIBLE_STDOUT_CALLBACK': 'json',
    })

    captured = cStringIO.StringIO()
    status, rc = run.run_pexpect(
        args, cls.awx_playbook_path(), env, captured,
        idle_timeout=60, job_timeout=60,
        pexpect_timeout=5
    )
    output = captured.getvalue()
    captured.close()

    try:
        result = json.loads(output)
        if not isinstance(result, dict):
            raise TypeError('Expected a dict but received {}.'.format(str(type(result))))
    except (ValueError, AssertionError, TypeError):
        logger.exception('Failed to read status from isolated instances, output:\n {}'.format(output))
        return

    for node in instance_qs:
        try:
            # First task of the first play is the capacity task
            task_result = result['plays'][0]['tasks'][0]['hosts'][node.hostname]
        except (KeyError, IndexError):
            logger.exception('Failed to read status from isolated instance {}.'.format(node.hostname))
            continue

        if 'capacity' not in task_result:
            logger.warning('Could not update capacity of {}, msg={}'.format(
                node.hostname, task_result.get('msg', 'unknown failure')))
            continue

        node.capacity = int(task_result['capacity'])
        node.save(update_fields=['capacity'])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def wrap_stdout_handle(instance, private_data_dir, stdout_handle):
|
def wrap_stdout_handle(instance, private_data_dir, stdout_handle):
|
||||||
dispatcher = CallbackQueueDispatcher()
|
dispatcher = CallbackQueueDispatcher()
|
||||||
|
|||||||
@@ -23,11 +23,13 @@ class Command(BaseCommand):
|
|||||||
instance = Instance.objects.filter(hostname=options.get('name'))
|
instance = Instance.objects.filter(hostname=options.get('name'))
|
||||||
if instance.exists():
|
if instance.exists():
|
||||||
instance.delete()
|
instance.delete()
|
||||||
|
print("Instance Removed")
|
||||||
result = subprocess.Popen("rabbitmqctl forget_cluster_node rabbitmq@{}".format(options.get('name')), shell=True).wait()
|
result = subprocess.Popen("rabbitmqctl forget_cluster_node rabbitmq@{}".format(options.get('name')), shell=True).wait()
|
||||||
if result != 0:
|
if result != 0:
|
||||||
print("Node deprovisioning may have failed when attempting to remove the RabbitMQ instance from the cluster")
|
print("Node deprovisioning may have failed when attempting to remove the RabbitMQ instance from the cluster")
|
||||||
else:
|
else:
|
||||||
print('Successfully deprovisioned {}'.format(options.get('name')))
|
print('Successfully deprovisioned {}'.format(options.get('name')))
|
||||||
|
print('(changed: True)')
|
||||||
else:
|
else:
|
||||||
print('No instance found matching name {}'.format(options.get('name')))
|
print('No instance found matching name {}'.format(options.get('name')))
|
||||||
|
|
||||||
|
|||||||
@@ -17,14 +17,32 @@ class Command(BaseCommand):
|
|||||||
option_list = BaseCommand.option_list + (
|
option_list = BaseCommand.option_list + (
|
||||||
make_option('--hostname', dest='hostname', type='string',
|
make_option('--hostname', dest='hostname', type='string',
|
||||||
help='Hostname used during provisioning'),
|
help='Hostname used during provisioning'),
|
||||||
|
make_option('--hostnames', dest='hostnames', type='string',
|
||||||
|
help='Alternatively hostnames can be provided with '
|
||||||
|
'this option as a comma-Delimited list'),
|
||||||
)
|
)
|
||||||
|
|
||||||
def handle(self, **options):
|
def _register_hostname(self, hostname):
|
||||||
uuid = settings.SYSTEM_UUID
|
if not hostname:
|
||||||
instance = Instance.objects.filter(hostname=options.get('hostname'))
|
return
|
||||||
|
instance = Instance.objects.filter(hostname=hostname)
|
||||||
if instance.exists():
|
if instance.exists():
|
||||||
print("Instance already registered {}".format(instance[0]))
|
print("Instance already registered {}".format(instance[0]))
|
||||||
return
|
return
|
||||||
instance = Instance(uuid=uuid, hostname=options.get('hostname'))
|
instance = Instance(uuid=self.uuid, hostname=hostname)
|
||||||
instance.save()
|
instance.save()
|
||||||
print('Successfully registered instance {}'.format(instance))
|
print('Successfully registered instance {}'.format(hostname))
|
||||||
|
self.changed = True
|
||||||
|
|
||||||
|
def handle(self, **options):
|
||||||
|
self.uuid = settings.SYSTEM_UUID
|
||||||
|
self.changed = False
|
||||||
|
self._register_hostname(options.get('hostname'))
|
||||||
|
hostname_list = []
|
||||||
|
if options.get('hostnames'):
|
||||||
|
hostname_list = options.get('hostnames').split(",")
|
||||||
|
instance_list = [x.strip() for x in hostname_list if x]
|
||||||
|
for inst_name in instance_list:
|
||||||
|
self._register_hostname(inst_name)
|
||||||
|
if self.changed:
|
||||||
|
print('(changed: True)')
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import sys
|
|||||||
from awx.main.models import Instance, InstanceGroup
|
from awx.main.models import Instance, InstanceGroup
|
||||||
|
|
||||||
from optparse import make_option
|
from optparse import make_option
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand, CommandError
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
@@ -20,34 +20,44 @@ class Command(BaseCommand):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def handle(self, **options):
|
def handle(self, **options):
|
||||||
|
if not options.get('queuename'):
|
||||||
|
raise CommandError("Specify `--queuename` to use this command.")
|
||||||
|
changed = False
|
||||||
ig = InstanceGroup.objects.filter(name=options.get('queuename'))
|
ig = InstanceGroup.objects.filter(name=options.get('queuename'))
|
||||||
control_ig = None
|
control_ig = None
|
||||||
if options.get('controller'):
|
if options.get('controller'):
|
||||||
control_ig = InstanceGroup.objects.filter(name=options.get('controller')).first()
|
control_ig = InstanceGroup.objects.filter(name=options.get('controller')).first()
|
||||||
if ig.exists():
|
if ig.exists():
|
||||||
print("Instance Group already registered {}".format(ig[0]))
|
print("Instance Group already registered {}".format(ig[0].name))
|
||||||
ig = ig[0]
|
ig = ig[0]
|
||||||
if control_ig and ig.controller_id != control_ig.pk:
|
if control_ig and ig.controller_id != control_ig.pk:
|
||||||
ig.controller = control_ig
|
ig.controller = control_ig
|
||||||
ig.save()
|
ig.save()
|
||||||
print("Set controller group {} on {}.".format(control_ig, ig))
|
print("Set controller group {} on {}.".format(control_ig.name, ig.name))
|
||||||
|
changed = True
|
||||||
else:
|
else:
|
||||||
print("Creating instance group {}".format(options.get('queuename')))
|
print("Creating instance group {}".format(options.get('queuename')))
|
||||||
ig = InstanceGroup(name=options.get('queuename'))
|
ig = InstanceGroup(name=options.get('queuename'))
|
||||||
if control_ig:
|
if control_ig:
|
||||||
ig.controller = control_ig
|
ig.controller = control_ig
|
||||||
ig.save()
|
ig.save()
|
||||||
|
changed = True
|
||||||
hostname_list = []
|
hostname_list = []
|
||||||
if options.get('hostnames'):
|
if options.get('hostnames'):
|
||||||
hostname_list = options.get('hostnames').split(",")
|
hostname_list = options.get('hostnames').split(",")
|
||||||
instance_list = [x.strip() for x in hostname_list]
|
instance_list = [x.strip() for x in hostname_list if x]
|
||||||
for inst_name in instance_list:
|
for inst_name in instance_list:
|
||||||
instance = Instance.objects.filter(hostname=inst_name)
|
instance = Instance.objects.filter(hostname=inst_name)
|
||||||
if instance.exists() and instance not in ig.instances.all():
|
if instance.exists() and instance[0] not in ig.instances.all():
|
||||||
ig.instances.add(instance[0])
|
ig.instances.add(instance[0])
|
||||||
print("Added instance {} to {}".format(instance[0], ig))
|
print("Added instance {} to {}".format(instance[0].hostname, ig.name))
|
||||||
|
changed = True
|
||||||
elif not instance.exists():
|
elif not instance.exists():
|
||||||
print("Instance does not exist: {}".format(inst_name))
|
print("Instance does not exist: {}".format(inst_name))
|
||||||
|
if changed:
|
||||||
|
print('(changed: True)')
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
print("Instance already registered {}".format(instance[0]))
|
print("Instance already registered {}".format(instance[0].hostname))
|
||||||
|
if changed:
|
||||||
|
print('(changed: True)')
|
||||||
|
|||||||
@@ -30,3 +30,4 @@ class Command(BaseCommand):
|
|||||||
ig = ig.first()
|
ig = ig.first()
|
||||||
ig.delete()
|
ig.delete()
|
||||||
print("Instance Group Removed")
|
print("Instance Group Removed")
|
||||||
|
print('(changed: True)')
|
||||||
|
|||||||
@@ -48,4 +48,9 @@ class Migration(migrations.Migration):
|
|||||||
name='instance_group',
|
name='instance_group',
|
||||||
field=models.ManyToManyField(to='main.InstanceGroup', blank=True),
|
field=models.ManyToManyField(to='main.InstanceGroup', blank=True),
|
||||||
),
|
),
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='instance',
|
||||||
|
name='last_isolated_check',
|
||||||
|
field=models.DateTimeField(auto_now_add=True, null=True),
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -26,6 +26,11 @@ class Instance(models.Model):
|
|||||||
hostname = models.CharField(max_length=250, unique=True)
|
hostname = models.CharField(max_length=250, unique=True)
|
||||||
created = models.DateTimeField(auto_now_add=True)
|
created = models.DateTimeField(auto_now_add=True)
|
||||||
modified = models.DateTimeField(auto_now=True)
|
modified = models.DateTimeField(auto_now=True)
|
||||||
|
last_isolated_check = models.DateTimeField(
|
||||||
|
null=True,
|
||||||
|
editable=False,
|
||||||
|
auto_now_add=True
|
||||||
|
)
|
||||||
version = models.CharField(max_length=24, blank=True)
|
version = models.CharField(max_length=24, blank=True)
|
||||||
capacity = models.PositiveIntegerField(
|
capacity = models.PositiveIntegerField(
|
||||||
default=100,
|
default=100,
|
||||||
|
|||||||
@@ -19,7 +19,6 @@ import traceback
|
|||||||
import urlparse
|
import urlparse
|
||||||
import uuid
|
import uuid
|
||||||
from distutils.version import LooseVersion as Version
|
from distutils.version import LooseVersion as Version
|
||||||
from datetime import timedelta
|
|
||||||
import yaml
|
import yaml
|
||||||
import fcntl
|
import fcntl
|
||||||
try:
|
try:
|
||||||
@@ -34,7 +33,7 @@ from celery.signals import celeryd_init, worker_process_init
|
|||||||
# Django
|
# Django
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.db import transaction, DatabaseError, IntegrityError
|
from django.db import transaction, DatabaseError, IntegrityError
|
||||||
from django.utils.timezone import now
|
from django.utils.timezone import now, timedelta
|
||||||
from django.utils.encoding import smart_str
|
from django.utils.encoding import smart_str
|
||||||
from django.core.mail import send_mail
|
from django.core.mail import send_mail
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
@@ -197,6 +196,29 @@ def cluster_node_heartbeat(self):
|
|||||||
stop_local_services(['uwsgi', 'celery', 'beat', 'callback', 'fact'])
|
stop_local_services(['uwsgi', 'celery', 'beat', 'callback', 'fact'])
|
||||||
|
|
||||||
|
|
||||||
|
@task(bind=True)
def tower_isolated_heartbeat(self):
    """Periodic task: health-check the isolated instances this node controls.

    Selects instances belonging to groups whose controller group contains
    this host and whose `last_isolated_check` is older than the poll
    interval (less a 10 second buffer), stamps them as checked, and then
    runs `IsolatedManager.health_check` on them.
    """
    local_hostname = settings.CLUSTER_HOST_ID
    logger.debug("Controlling node checking for any isolated management tasks.")
    poll_interval = settings.AWX_ISOLATED_PERIODIC_CHECK

    # Anything last checked before this moment is due for another check
    check_time = now()
    stale_before = check_time - timedelta(seconds=(poll_interval - 10))
    due_instances = Instance.objects.filter(
        rampart_groups__controller__instances__hostname=local_hostname,
        last_isolated_check__lt=stale_before
    )

    # Fast pass: claim the nodes by stamping the check time.
    # NOTE(review): the same queryset object is reused below on purpose -
    # presumably its results are cached by this loop, so the claimed rows
    # are still the ones handed to the health check; confirm before
    # replacing with a fresh query.
    with transaction.atomic():
        for node in due_instances:
            node.last_isolated_check = check_time
            # Prevent modified time from being changed, as in normal heartbeat
            node.save(update_fields=['last_isolated_check'])

    # Slow pass: run the management playbook against the claimed nodes
    if len(due_instances) == 0:
        return
    logger.debug("Managing isolated instances {}.".format(','.join([node.hostname for node in due_instances])))
    isolated_manager.IsolatedManager.health_check(due_instances)
|
||||||
|
|
||||||
|
|
||||||
@task(bind=True, queue='tower')
|
@task(bind=True, queue='tower')
|
||||||
def tower_periodic_scheduler(self):
|
def tower_periodic_scheduler(self):
|
||||||
|
|||||||
@@ -2,8 +2,17 @@ import pytest
|
|||||||
import mock
|
import mock
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from awx.main.tasks import RunProjectUpdate, RunInventoryUpdate
|
from django.utils.timezone import now, timedelta
|
||||||
from awx.main.models import ProjectUpdate, InventoryUpdate, InventorySource
|
|
||||||
|
from awx.main.tasks import (
|
||||||
|
RunProjectUpdate, RunInventoryUpdate,
|
||||||
|
tower_isolated_heartbeat,
|
||||||
|
isolated_manager
|
||||||
|
)
|
||||||
|
from awx.main.models import (
|
||||||
|
ProjectUpdate, InventoryUpdate, InventorySource,
|
||||||
|
Instance, InstanceGroup
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@@ -73,3 +82,56 @@ class TestDependentInventoryUpdate:
|
|||||||
# Verify that it bails after 1st update, detecting a cancel
|
# Verify that it bails after 1st update, detecting a cancel
|
||||||
assert is2.inventory_updates.count() == 0
|
assert is2.inventory_updates.count() == 0
|
||||||
iu_run_mock.assert_called_once()
|
iu_run_mock.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MockSettings:
|
||||||
|
AWX_ISOLATED_PERIODIC_CHECK = 60
|
||||||
|
CLUSTER_HOST_ID = 'tower_1'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.django_db
class TestIsolatedManagementTask:
    """Checks when `tower_isolated_heartbeat` does and does not run a health check."""

    @pytest.fixture
    def control_group(self):
        # Ordinary group that acts as controller of the isolated group
        return InstanceGroup.objects.create(name='alpha')

    @pytest.fixture
    def control_instance(self, control_group):
        # Hostname matches MockSettings.CLUSTER_HOST_ID
        return control_group.instances.create(hostname='tower_1')

    @pytest.fixture
    def needs_updating(self, control_group):
        # Isolated group whose instance was last checked a full interval ago
        isolated_group = InstanceGroup.objects.create(name='thepentagon', controller=control_group)
        node = isolated_group.instances.create(hostname='isolated', capacity=103)
        node.last_isolated_check = now() - timedelta(seconds=MockSettings.AWX_ISOLATED_PERIODIC_CHECK)
        node.save()
        return isolated_group

    @pytest.fixture
    def just_updated(self, control_group):
        # Isolated instance whose last check is "now", so it is not yet due
        isolated_group = InstanceGroup.objects.create(name='thepentagon', controller=control_group)
        node = isolated_group.instances.create(hostname='isolated', capacity=103)
        node.last_isolated_check = now()
        node.save()
        return node

    def test_takes_action(self, control_instance, needs_updating):
        before = needs_updating.instances.all().first()
        with mock.patch('awx.main.tasks.settings', MockSettings()), \
                mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
            tower_isolated_heartbeat()
            after = Instance.objects.get(hostname='isolated')
            positional_args, _ = check_mock.call_args
            # The stale instance was handed to the health check ...
            assert positional_args[0][0] == after
            # ... its check time moved forward, and `modified` was left alone
            assert after.last_isolated_check > before.last_isolated_check
            assert after.modified == before.modified

    def test_does_not_take_action(self, control_instance, just_updated):
        with mock.patch('awx.main.tasks.settings', MockSettings()), \
                mock.patch.object(isolated_manager.IsolatedManager, 'health_check') as check_mock:
            tower_isolated_heartbeat()
            after = Instance.objects.get(hostname='isolated')
            check_mock.assert_not_called()
            assert after.capacity == 103
|
||||||
|
|||||||
@@ -543,8 +543,11 @@ def get_system_task_capacity():
|
|||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
if hasattr(settings, 'SYSTEM_TASK_CAPACITY'):
|
if hasattr(settings, 'SYSTEM_TASK_CAPACITY'):
|
||||||
return settings.SYSTEM_TASK_CAPACITY
|
return settings.SYSTEM_TASK_CAPACITY
|
||||||
proc = subprocess.Popen(['free', '-m'], stdout=subprocess.PIPE)
|
try:
|
||||||
out,err = proc.communicate()
|
out = subprocess.check_output(['free', '-m'])
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
logger.exception('Problem obtaining capacity from system.')
|
||||||
|
return 0
|
||||||
total_mem_value = out.split()[7]
|
total_mem_value = out.split()[7]
|
||||||
if int(total_mem_value) <= 2048:
|
if int(total_mem_value) <= 2048:
|
||||||
return 50
|
return 50
|
||||||
|
|||||||
11
awx/playbooks/heartbeat_isolated.yml
Normal file
11
awx/playbooks/heartbeat_isolated.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
- hosts: all
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
|
||||||
|
- name: Get capacity of the instance
|
||||||
|
tower_capacity:
|
||||||
|
|
||||||
|
- name: Remove any stale temporary files
|
||||||
|
tower_isolated_cleanup:
|
||||||
@@ -609,6 +609,8 @@ AWX_ISOLATED_CHECK_INTERVAL = 30
|
|||||||
|
|
||||||
# The timeout (in seconds) for launching jobs on isolated nodes
|
# The timeout (in seconds) for launching jobs on isolated nodes
|
||||||
AWX_ISOLATED_LAUNCH_TIMEOUT = 600
|
AWX_ISOLATED_LAUNCH_TIMEOUT = 600
|
||||||
|
# The time between the background isolated heartbeat status check
|
||||||
|
AWX_ISOLATED_PERIODIC_CHECK = 600
|
||||||
|
|
||||||
# Enable Pendo on the UI, possible values are 'off', 'anonymous', and 'detailed'
|
# Enable Pendo on the UI, possible values are 'off', 'anonymous', and 'detailed'
|
||||||
# Note: This setting may be overridden by database settings.
|
# Note: This setting may be overridden by database settings.
|
||||||
|
|||||||
@@ -114,6 +114,13 @@ except ImportError:
|
|||||||
|
|
||||||
CLUSTER_HOST_ID = socket.gethostname()
|
CLUSTER_HOST_ID = socket.gethostname()
|
||||||
CELERY_ROUTES['awx.main.tasks.cluster_node_heartbeat'] = {'queue': CLUSTER_HOST_ID, 'routing_key': CLUSTER_HOST_ID}
|
CELERY_ROUTES['awx.main.tasks.cluster_node_heartbeat'] = {'queue': CLUSTER_HOST_ID, 'routing_key': CLUSTER_HOST_ID}
|
||||||
|
# Production only runs this schedule on controlling nodes
|
||||||
|
# but development will just run it on all nodes
|
||||||
|
CELERYBEAT_SCHEDULE['isolated_heartbeat'] = {
|
||||||
|
'task': 'awx.main.tasks.tower_isolated_heartbeat',
|
||||||
|
'schedule': timedelta(seconds = AWX_ISOLATED_PERIODIC_CHECK),
|
||||||
|
'options': {'expires': AWX_ISOLATED_PERIODIC_CHECK * 2,}
|
||||||
|
}
|
||||||
|
|
||||||
# Supervisor service name dictionary used for programatic restart
|
# Supervisor service name dictionary used for programatic restart
|
||||||
SERVICE_NAME_DICT = {
|
SERVICE_NAME_DICT = {
|
||||||
|
|||||||
@@ -112,6 +112,58 @@ rabbitmq_use_long_name=false
|
|||||||
rabbitmq_enable_manager=false
|
rabbitmq_enable_manager=false
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Security Isolated Rampart Groups
|
||||||
|
|
||||||
|
In Tower versions 3.2+ customers may optionally define isolated groups
|
||||||
|
inside security-restricted networking zones to run jobs from.
|
||||||
|
Instances in these groups will _not_ have a full install of Tower, but will have a minimal
|
||||||
|
set of utilities used to run jobs on them. These must be specified
|
||||||
|
in the inventory file prefixed with `isolated_group_`. An example inventory
|
||||||
|
file is shown below.
|
||||||
|
|
||||||
|
```
|
||||||
|
[tower]
|
||||||
|
towerA
|
||||||
|
towerB
|
||||||
|
towerC
|
||||||
|
|
||||||
|
[instance_group_security]
|
||||||
|
towerB
|
||||||
|
towerC
|
||||||
|
|
||||||
|
[isolated_group_govcloud]
|
||||||
|
isolatedA
|
||||||
|
isolatedB
|
||||||
|
|
||||||
|
[isolated_group_govcloud:vars]
|
||||||
|
controller=security
|
||||||
|
```
|
||||||
|
|
||||||
|
In this example, when a job runs inside of the `govcloud` isolated group, a
|
||||||
|
managing task runs simultaneously on either one of the two instances in
|
||||||
|
the `security` ordinary instance group.
|
||||||
|
|
||||||
|
Networking security rules must allow
|
||||||
|
connections to all nodes in an isolated group from all nodes in its controller
|
||||||
|
group. The system is designed such that
|
||||||
|
isolated instances never make requests to any of their controllers.
|
||||||
|
The controlling instance for a particular job will send management commands to
|
||||||
|
a daemon that runs the job, and will slurp job artifacts.
|
||||||
|
|
||||||
|
Isolated groups are architected such that they may exist inside of a VPC
|
||||||
|
with security rules that _only_ permit the instances in its `controller`
|
||||||
|
group to access them.
|
||||||
|
|
||||||
|
Recommendations for system configuration with isolated groups:
|
||||||
|
- Do not put any isolated instances inside the `tower` group or other
|
||||||
|
ordinary instance groups.
|
||||||
|
- Define the `controller` variable as either a group var or as a hostvar
|
||||||
|
on all the instances in the isolated group. Please _do not_ allow
|
||||||
|
isolated instances in the same group to have different values for this
|
||||||
|
variable - the behavior in this case cannot be predicted.
|
||||||
|
- Do not put an isolated instance in more than 1 isolated group.
|
||||||
|
|
||||||
|
|
||||||
### Provisioning and Deprovisioning Instances and Groups
|
### Provisioning and Deprovisioning Instances and Groups
|
||||||
|
|
||||||
* Provisioning
|
* Provisioning
|
||||||
|
|||||||
Reference in New Issue
Block a user