mirror of
https://github.com/ansible/awx.git
synced 2026-01-16 04:10:44 -03:30
Merge pull request #2801 from ryanpetrello/more-robust-isolated-capacity
collect isolated capacity using a cache plugin, not stdout parsing Reviewed-by: https://github.com/softwarefactory-project-zuul[bot]
This commit is contained in:
commit
aa8cda0001
@ -197,6 +197,18 @@ register(
|
||||
category_slug='jobs',
|
||||
)
|
||||
|
||||
register(
|
||||
'AWX_ISOLATED_VERBOSITY',
|
||||
field_class=fields.IntegerField,
|
||||
min_value=0,
|
||||
max_value=5,
|
||||
label=_('Verbosity level for isolated node management tasks'),
|
||||
help_text=_('This can be raised to aid in debugging connection issues for isolated task execution'),
|
||||
category=_('Jobs'),
|
||||
category_slug='jobs',
|
||||
default=0
|
||||
)
|
||||
|
||||
register(
|
||||
'AWX_ISOLATED_CHECK_INTERVAL',
|
||||
field_class=fields.IntegerField,
|
||||
|
||||
@ -101,6 +101,8 @@ class IsolatedManager(object):
|
||||
]
|
||||
if extra_vars:
|
||||
args.extend(['-e', json.dumps(extra_vars)])
|
||||
if settings.AWX_ISOLATED_VERBOSITY:
|
||||
args.append('-%s' % ('v' * min(5, settings.AWX_ISOLATED_VERBOSITY)))
|
||||
return args
|
||||
|
||||
@staticmethod
|
||||
@ -407,46 +409,52 @@ class IsolatedManager(object):
|
||||
args = cls._build_args('heartbeat_isolated.yml', hostname_string)
|
||||
args.extend(['--forks', str(len(instance_qs))])
|
||||
env = cls._base_management_env()
|
||||
env['ANSIBLE_STDOUT_CALLBACK'] = 'json'
|
||||
|
||||
buff = StringIO.StringIO()
|
||||
timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
|
||||
status, rc = IsolatedManager.run_pexpect(
|
||||
args, cls.awx_playbook_path(), env, buff,
|
||||
idle_timeout=timeout, job_timeout=timeout,
|
||||
pexpect_timeout=5
|
||||
)
|
||||
output = buff.getvalue().encode('utf-8')
|
||||
buff.close()
|
||||
|
||||
try:
|
||||
result = json.loads(output)
|
||||
if not isinstance(result, dict):
|
||||
raise TypeError('Expected a dict but received {}.'.format(str(type(result))))
|
||||
except (ValueError, AssertionError, TypeError):
|
||||
logger.exception('Failed to read status from isolated instances, output:\n {}'.format(output))
|
||||
return
|
||||
facts_path = tempfile.mkdtemp()
|
||||
env['ANSIBLE_CACHE_PLUGIN'] = 'jsonfile'
|
||||
env['ANSIBLE_CACHE_PLUGIN_CONNECTION'] = facts_path
|
||||
|
||||
for instance in instance_qs:
|
||||
try:
|
||||
task_result = result['plays'][0]['tasks'][0]['hosts'][instance.hostname]
|
||||
except (KeyError, IndexError):
|
||||
buff = StringIO.StringIO()
|
||||
timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
|
||||
status, rc = IsolatedManager.run_pexpect(
|
||||
args, cls.awx_playbook_path(), env, buff,
|
||||
idle_timeout=timeout, job_timeout=timeout,
|
||||
pexpect_timeout=5
|
||||
)
|
||||
heartbeat_stdout = buff.getvalue().encode('utf-8')
|
||||
buff.close()
|
||||
|
||||
for instance in instance_qs:
|
||||
output = heartbeat_stdout
|
||||
task_result = {}
|
||||
if 'capacity_cpu' in task_result and 'capacity_mem' in task_result:
|
||||
cls.update_capacity(instance, task_result, awx_application_version)
|
||||
logger.debug('Isolated instance {} successful heartbeat'.format(instance.hostname))
|
||||
elif instance.capacity == 0:
|
||||
logger.debug('Isolated instance {} previously marked as lost, could not re-join.'.format(
|
||||
instance.hostname))
|
||||
else:
|
||||
logger.warning('Could not update status of isolated instance {}, msg={}'.format(
|
||||
instance.hostname, task_result.get('msg', 'unknown failure')
|
||||
))
|
||||
if instance.is_lost(isolated=True):
|
||||
instance.capacity = 0
|
||||
instance.save(update_fields=['capacity'])
|
||||
logger.error('Isolated instance {} last checked in at {}, marked as lost.'.format(
|
||||
instance.hostname, instance.modified))
|
||||
try:
|
||||
with open(os.path.join(facts_path, instance.hostname), 'r') as facts_data:
|
||||
output = facts_data.read()
|
||||
task_result = json.loads(output)
|
||||
except Exception:
|
||||
logger.exception('Failed to read status from isolated instances, output:\n {}'.format(output))
|
||||
if 'awx_capacity_cpu' in task_result and 'awx_capacity_mem' in task_result:
|
||||
task_result = {
|
||||
'capacity_cpu': task_result['awx_capacity_cpu'],
|
||||
'capacity_mem': task_result['awx_capacity_mem'],
|
||||
'version': task_result['awx_capacity_version']
|
||||
}
|
||||
cls.update_capacity(instance, task_result, awx_application_version)
|
||||
logger.debug('Isolated instance {} successful heartbeat'.format(instance.hostname))
|
||||
elif instance.capacity == 0:
|
||||
logger.debug('Isolated instance {} previously marked as lost, could not re-join.'.format(
|
||||
instance.hostname))
|
||||
else:
|
||||
logger.warning('Could not update status of isolated instance {}'.format(instance.hostname))
|
||||
if instance.is_lost(isolated=True):
|
||||
instance.capacity = 0
|
||||
instance.save(update_fields=['capacity'])
|
||||
logger.error('Isolated instance {} last checked in at {}, marked as lost.'.format(
|
||||
instance.hostname, instance.modified))
|
||||
finally:
|
||||
if os.path.exists(facts_path):
|
||||
shutil.rmtree(facts_path)
|
||||
|
||||
@staticmethod
|
||||
def get_stdout_handle(instance, private_data_dir, event_data_key='job_id'):
|
||||
|
||||
@ -62,7 +62,12 @@ def main():
|
||||
|
||||
# Module never results in a change
|
||||
module.exit_json(changed=False, capacity_cpu=capacity_cpu,
|
||||
capacity_mem=capacity_mem, version=version)
|
||||
capacity_mem=capacity_mem, version=version,
|
||||
ansible_facts=dict(
|
||||
awx_capacity_cpu=capacity_cpu,
|
||||
awx_capacity_mem=capacity_mem,
|
||||
awx_capacity_version=version
|
||||
))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@ -430,6 +430,9 @@ AWX_ISOLATED_CONNECTION_TIMEOUT = 10
|
||||
# The time (in seconds) between the periodic isolated heartbeat status check
|
||||
AWX_ISOLATED_PERIODIC_CHECK = 600
|
||||
|
||||
# Verbosity level for isolated node management tasks
|
||||
AWX_ISOLATED_VERBOSITY = 0
|
||||
|
||||
# Memcached django cache configuration
|
||||
# CACHES = {
|
||||
# 'default': {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user