properly handle import errors in the isolated capacity healthcheck

if the awx_capacity module runs on an isolated node with missing
libraries (e.g., psutil) or bad permissions, then the runner status will
be "failed"

in this scenario, we *still* want to react by recording a capacity of 0
This commit is contained in:
Ryan Petrello
2020-01-31 10:17:20 -05:00
parent b611164422
commit 326ed22efe

View File

@@ -370,33 +370,32 @@ class IsolatedManager(object):
private_data_dir private_data_dir
) )
if runner_obj.status == 'successful': for instance in instance_qs:
for instance in instance_qs: task_result = {}
task_result = {} try:
try: task_result = runner_obj.get_fact_cache(instance.hostname)
task_result = runner_obj.get_fact_cache(instance.hostname) except Exception:
except Exception: logger.exception('Failed to read status from isolated instances')
logger.exception('Failed to read status from isolated instances') if 'awx_capacity_cpu' in task_result and 'awx_capacity_mem' in task_result:
if 'awx_capacity_cpu' in task_result and 'awx_capacity_mem' in task_result: task_result = {
task_result = { 'cpu': task_result['awx_cpu'],
'cpu': task_result['awx_cpu'], 'mem': task_result['awx_mem'],
'mem': task_result['awx_mem'], 'capacity_cpu': task_result['awx_capacity_cpu'],
'capacity_cpu': task_result['awx_capacity_cpu'], 'capacity_mem': task_result['awx_capacity_mem'],
'capacity_mem': task_result['awx_capacity_mem'], 'version': task_result['awx_capacity_version']
'version': task_result['awx_capacity_version'] }
} IsolatedManager.update_capacity(instance, task_result)
IsolatedManager.update_capacity(instance, task_result) logger.debug('Isolated instance {} successful heartbeat'.format(instance.hostname))
logger.debug('Isolated instance {} successful heartbeat'.format(instance.hostname)) elif instance.capacity == 0:
elif instance.capacity == 0: logger.debug('Isolated instance {} previously marked as lost, could not re-join.'.format(
logger.debug('Isolated instance {} previously marked as lost, could not re-join.'.format( instance.hostname))
instance.hostname)) else:
else: logger.warning('Could not update status of isolated instance {}'.format(instance.hostname))
logger.warning('Could not update status of isolated instance {}'.format(instance.hostname)) if instance.is_lost(isolated=True):
if instance.is_lost(isolated=True): instance.capacity = 0
instance.capacity = 0 instance.save(update_fields=['capacity'])
instance.save(update_fields=['capacity']) logger.error('Isolated instance {} last checked in at {}, marked as lost.'.format(
logger.error('Isolated instance {} last checked in at {}, marked as lost.'.format( instance.hostname, instance.modified))
instance.hostname, instance.modified))
finally: finally:
if os.path.exists(private_data_dir): if os.path.exists(private_data_dir):
shutil.rmtree(private_data_dir) shutil.rmtree(private_data_dir)