Run instance health check in task container

awx-web container does not have access to receptor socket, and the
execution node health check requires receptorctl.

This change runs the health check asynchronously in the task container.
This commit is contained in:
Seth Foster
2022-09-02 20:38:01 -04:00
committed by Jeff Bradberry
parent 68a44529b6
commit eaa4f2483f
2 changed files with 11 additions and 34 deletions

View File

@@ -460,41 +460,16 @@ class InstanceHealthCheck(GenericAPIView):
def post(self, request, *args, **kwargs): def post(self, request, *args, **kwargs):
obj = self.get_object() obj = self.get_object()
# Note: hop nodes are already excluded by the get_queryset method # Note: hop nodes are already excluded by the get_queryset method
if obj.node_type == 'execution': if obj.node_type == 'execution':
from awx.main.tasks.system import execution_node_health_check from awx.main.tasks.system import execution_node_health_check
runner_data = execution_node_health_check(obj.hostname) execution_node_health_check.apply_async([obj.hostname])
obj.refresh_from_db()
data = self.get_serializer(data=request.data).to_representation(obj)
# Add in some extra unsaved fields
for extra_field in ('transmit_timing', 'run_timing'):
if extra_field in runner_data:
data[extra_field] = runner_data[extra_field]
else: else:
from awx.main.tasks.system import cluster_node_health_check from awx.main.tasks.system import cluster_node_health_check
if settings.CLUSTER_HOST_ID == obj.hostname: cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
cluster_node_health_check(obj.hostname) return Response(dict(msg=f"Health check is running for {obj.hostname}."), status=status.HTTP_200_OK)
else:
cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
start_time = time.time()
prior_check_time = obj.last_health_check
while time.time() - start_time < 50.0:
obj.refresh_from_db(fields=['last_health_check'])
if obj.last_health_check != prior_check_time:
break
if time.time() - start_time < 1.0:
time.sleep(0.1)
else:
time.sleep(1.0)
else:
obj.mark_offline(errors=_('Health check initiated by user determined this instance to be unresponsive'))
obj.refresh_from_db()
data = self.get_serializer(data=request.data).to_representation(obj)
return Response(data, status=status.HTTP_200_OK)
class InstanceGroupList(ListCreateAPIView): class InstanceGroupList(ListCreateAPIView):

View File

@@ -243,20 +243,21 @@ class Instance(HasPolicyEditsMixin, BaseModel):
def mark_offline(self, update_last_seen=False, perform_save=True, errors=''): def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED): if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
return return []
if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen): if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
return return []
self.node_state = Instance.States.UNAVAILABLE self.node_state = Instance.States.UNAVAILABLE
self.cpu_capacity = self.mem_capacity = self.capacity = 0 self.cpu_capacity = self.mem_capacity = self.capacity = 0
self.errors = errors self.errors = errors
if update_last_seen: if update_last_seen:
self.last_seen = now() self.last_seen = now()
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
if update_last_seen:
update_fields += ['last_seen']
if perform_save: if perform_save:
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
if update_last_seen:
update_fields += ['last_seen']
self.save(update_fields=update_fields) self.save(update_fields=update_fields)
return update_fields
def set_capacity_value(self): def set_capacity_value(self):
"""Sets capacity according to capacity adjustment rule (no save)""" """Sets capacity according to capacity adjustment rule (no save)"""
@@ -314,7 +315,8 @@ class Instance(HasPolicyEditsMixin, BaseModel):
self.node_state = Instance.States.READY self.node_state = Instance.States.READY
update_fields.append('node_state') update_fields.append('node_state')
else: else:
self.mark_offline(perform_save=False, errors=errors) fields_to_update = self.mark_offline(perform_save=False, errors=errors)
update_fields.extend(fields_to_update)
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity']) update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
# disabling activity stream will avoid extra queries, which is important for heatbeat actions # disabling activity stream will avoid extra queries, which is important for heatbeat actions