Run instance health check in task container

awx-web container does not have access to receptor socket, and the
execution node health check requires receptorctl.

This change runs the health check asynchronously in the task container.
This commit is contained in:
Seth Foster 2022-09-02 20:38:01 -04:00 committed by Jeff Bradberry
parent 68a44529b6
commit eaa4f2483f
2 changed files with 11 additions and 34 deletions

View File

@ -460,41 +460,16 @@ class InstanceHealthCheck(GenericAPIView):
def post(self, request, *args, **kwargs):
obj = self.get_object()
# Note: hop nodes are already excluded by the get_queryset method
if obj.node_type == 'execution':
from awx.main.tasks.system import execution_node_health_check
runner_data = execution_node_health_check(obj.hostname)
obj.refresh_from_db()
data = self.get_serializer(data=request.data).to_representation(obj)
# Add in some extra unsaved fields
for extra_field in ('transmit_timing', 'run_timing'):
if extra_field in runner_data:
data[extra_field] = runner_data[extra_field]
execution_node_health_check.apply_async([obj.hostname])
else:
from awx.main.tasks.system import cluster_node_health_check
if settings.CLUSTER_HOST_ID == obj.hostname:
cluster_node_health_check(obj.hostname)
else:
cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
start_time = time.time()
prior_check_time = obj.last_health_check
while time.time() - start_time < 50.0:
obj.refresh_from_db(fields=['last_health_check'])
if obj.last_health_check != prior_check_time:
break
if time.time() - start_time < 1.0:
time.sleep(0.1)
else:
time.sleep(1.0)
else:
obj.mark_offline(errors=_('Health check initiated by user determined this instance to be unresponsive'))
obj.refresh_from_db()
data = self.get_serializer(data=request.data).to_representation(obj)
return Response(data, status=status.HTTP_200_OK)
cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
return Response(dict(msg=f"Health check is running for {obj.hostname}."), status=status.HTTP_200_OK)
class InstanceGroupList(ListCreateAPIView):

View File

@ -243,20 +243,21 @@ class Instance(HasPolicyEditsMixin, BaseModel):
def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
return
return []
if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
return
return []
self.node_state = Instance.States.UNAVAILABLE
self.cpu_capacity = self.mem_capacity = self.capacity = 0
self.errors = errors
if update_last_seen:
self.last_seen = now()
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
if update_last_seen:
update_fields += ['last_seen']
if perform_save:
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
if update_last_seen:
update_fields += ['last_seen']
self.save(update_fields=update_fields)
return update_fields
def set_capacity_value(self):
"""Sets capacity according to capacity adjustment rule (no save)"""
@ -314,7 +315,8 @@ class Instance(HasPolicyEditsMixin, BaseModel):
self.node_state = Instance.States.READY
update_fields.append('node_state')
else:
self.mark_offline(perform_save=False, errors=errors)
fields_to_update = self.mark_offline(perform_save=False, errors=errors)
update_fields.extend(fields_to_update)
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
# disabling activity stream will avoid extra queries, which is important for heatbeat actions