Merge pull request #2801 from ryanpetrello/more-robust-isolated-capacity

collect isolated capacity using a cache plugin, not stdout parsing

Reviewed-by: https://github.com/softwarefactory-project-zuul[bot]
This commit is contained in:
softwarefactory-project-zuul[bot]
2018-12-20 20:01:00 +00:00
committed by GitHub
4 changed files with 65 additions and 37 deletions

View File

@@ -197,6 +197,18 @@ register(
     category_slug='jobs',
 )
+register(
+    'AWX_ISOLATED_VERBOSITY',
+    field_class=fields.IntegerField,
+    min_value=0,
+    max_value=5,
+    label=_('Verbosity level for isolated node management tasks'),
+    help_text=_('This can be raised to aid in debugging connection issues for isolated task execution'),
+    category=_('Jobs'),
+    category_slug='jobs',
+    default=0
+)
 register(
     'AWX_ISOLATED_CHECK_INTERVAL',
     field_class=fields.IntegerField,

View File

@@ -101,6 +101,8 @@ class IsolatedManager(object):
         ]
         if extra_vars:
             args.extend(['-e', json.dumps(extra_vars)])
+        if settings.AWX_ISOLATED_VERBOSITY:
+            args.append('-%s' % ('v' * min(5, settings.AWX_ISOLATED_VERBOSITY)))
         return args

     @staticmethod
@@ -407,46 +409,52 @@ class IsolatedManager(object):
         args = cls._build_args('heartbeat_isolated.yml', hostname_string)
         args.extend(['--forks', str(len(instance_qs))])
         env = cls._base_management_env()
-        env['ANSIBLE_STDOUT_CALLBACK'] = 'json'
-        buff = StringIO.StringIO()
-        timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
-        status, rc = IsolatedManager.run_pexpect(
-            args, cls.awx_playbook_path(), env, buff,
-            idle_timeout=timeout, job_timeout=timeout,
-            pexpect_timeout=5
-        )
-        output = buff.getvalue().encode('utf-8')
-        buff.close()
         try:
-            result = json.loads(output)
-            if not isinstance(result, dict):
-                raise TypeError('Expected a dict but received {}.'.format(str(type(result))))
-        except (ValueError, AssertionError, TypeError):
-            logger.exception('Failed to read status from isolated instances, output:\n {}'.format(output))
-            return
-        for instance in instance_qs:
-            try:
-                task_result = result['plays'][0]['tasks'][0]['hosts'][instance.hostname]
-            except (KeyError, IndexError):
-                task_result = {}
-            if 'capacity_cpu' in task_result and 'capacity_mem' in task_result:
-                cls.update_capacity(instance, task_result, awx_application_version)
-                logger.debug('Isolated instance {} successful heartbeat'.format(instance.hostname))
-            elif instance.capacity == 0:
-                logger.debug('Isolated instance {} previously marked as lost, could not re-join.'.format(
-                    instance.hostname))
-            else:
-                logger.warning('Could not update status of isolated instance {}, msg={}'.format(
-                    instance.hostname, task_result.get('msg', 'unknown failure')
-                ))
-                if instance.is_lost(isolated=True):
-                    instance.capacity = 0
-                    instance.save(update_fields=['capacity'])
-                    logger.error('Isolated instance {} last checked in at {}, marked as lost.'.format(
-                        instance.hostname, instance.modified))
+            facts_path = tempfile.mkdtemp()
+            env['ANSIBLE_CACHE_PLUGIN'] = 'jsonfile'
+            env['ANSIBLE_CACHE_PLUGIN_CONNECTION'] = facts_path
+            buff = StringIO.StringIO()
+            timeout = max(60, 2 * settings.AWX_ISOLATED_CONNECTION_TIMEOUT)
+            status, rc = IsolatedManager.run_pexpect(
+                args, cls.awx_playbook_path(), env, buff,
+                idle_timeout=timeout, job_timeout=timeout,
+                pexpect_timeout=5
+            )
+            heartbeat_stdout = buff.getvalue().encode('utf-8')
+            buff.close()
+            for instance in instance_qs:
+                output = heartbeat_stdout
+                task_result = {}
+                try:
+                    with open(os.path.join(facts_path, instance.hostname), 'r') as facts_data:
+                        output = facts_data.read()
+                    task_result = json.loads(output)
+                except Exception:
+                    logger.exception('Failed to read status from isolated instances, output:\n {}'.format(output))
+                if 'awx_capacity_cpu' in task_result and 'awx_capacity_mem' in task_result:
+                    task_result = {
+                        'capacity_cpu': task_result['awx_capacity_cpu'],
+                        'capacity_mem': task_result['awx_capacity_mem'],
+                        'version': task_result['awx_capacity_version']
+                    }
+                    cls.update_capacity(instance, task_result, awx_application_version)
+                    logger.debug('Isolated instance {} successful heartbeat'.format(instance.hostname))
+                elif instance.capacity == 0:
+                    logger.debug('Isolated instance {} previously marked as lost, could not re-join.'.format(
+                        instance.hostname))
+                else:
+                    logger.warning('Could not update status of isolated instance {}'.format(instance.hostname))
+                    if instance.is_lost(isolated=True):
+                        instance.capacity = 0
+                        instance.save(update_fields=['capacity'])
+                        logger.error('Isolated instance {} last checked in at {}, marked as lost.'.format(
+                            instance.hostname, instance.modified))
+        finally:
+            if os.path.exists(facts_path):
+                shutil.rmtree(facts_path)

     @staticmethod
     def get_stdout_handle(instance, private_data_dir, event_data_key='job_id'):

View File

@@ -62,7 +62,12 @@ def main():
     # Module never results in a change
     module.exit_json(changed=False, capacity_cpu=capacity_cpu,
-                     capacity_mem=capacity_mem, version=version)
+                     capacity_mem=capacity_mem, version=version,
+                     ansible_facts=dict(
+                         awx_capacity_cpu=capacity_cpu,
+                         awx_capacity_mem=capacity_mem,
+                         awx_capacity_version=version
+                     ))

 if __name__ == '__main__':

View File

@@ -430,6 +430,9 @@ AWX_ISOLATED_CONNECTION_TIMEOUT = 10
 # The time (in seconds) between the periodic isolated heartbeat status check
 AWX_ISOLATED_PERIODIC_CHECK = 600

+# Verbosity level for isolated node management tasks
+AWX_ISOLATED_VERBOSITY = 0
+
 # Memcached django cache configuration
 # CACHES = {
 #     'default': {