mirror of
https://github.com/ansible/awx.git
synced 2026-01-17 12:41:19 -03:30
Run instance health check in task container
awx-web container does not have access to receptor socket, and the execution node health check requires receptorctl. This change runs the health check asynchronously in the task container.
This commit is contained in:
parent
68a44529b6
commit
eaa4f2483f
@ -460,41 +460,16 @@ class InstanceHealthCheck(GenericAPIView):
|
||||
|
||||
def post(self, request, *args, **kwargs):
|
||||
obj = self.get_object()
|
||||
|
||||
# Note: hop nodes are already excluded by the get_queryset method
|
||||
if obj.node_type == 'execution':
|
||||
from awx.main.tasks.system import execution_node_health_check
|
||||
|
||||
runner_data = execution_node_health_check(obj.hostname)
|
||||
obj.refresh_from_db()
|
||||
data = self.get_serializer(data=request.data).to_representation(obj)
|
||||
# Add in some extra unsaved fields
|
||||
for extra_field in ('transmit_timing', 'run_timing'):
|
||||
if extra_field in runner_data:
|
||||
data[extra_field] = runner_data[extra_field]
|
||||
execution_node_health_check.apply_async([obj.hostname])
|
||||
else:
|
||||
from awx.main.tasks.system import cluster_node_health_check
|
||||
|
||||
if settings.CLUSTER_HOST_ID == obj.hostname:
|
||||
cluster_node_health_check(obj.hostname)
|
||||
else:
|
||||
cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
|
||||
start_time = time.time()
|
||||
prior_check_time = obj.last_health_check
|
||||
while time.time() - start_time < 50.0:
|
||||
obj.refresh_from_db(fields=['last_health_check'])
|
||||
if obj.last_health_check != prior_check_time:
|
||||
break
|
||||
if time.time() - start_time < 1.0:
|
||||
time.sleep(0.1)
|
||||
else:
|
||||
time.sleep(1.0)
|
||||
else:
|
||||
obj.mark_offline(errors=_('Health check initiated by user determined this instance to be unresponsive'))
|
||||
obj.refresh_from_db()
|
||||
data = self.get_serializer(data=request.data).to_representation(obj)
|
||||
|
||||
return Response(data, status=status.HTTP_200_OK)
|
||||
cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
|
||||
return Response(dict(msg=f"Health check is running for {obj.hostname}."), status=status.HTTP_200_OK)
|
||||
|
||||
|
||||
class InstanceGroupList(ListCreateAPIView):
|
||||
|
||||
@ -243,20 +243,21 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
||||
|
||||
def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
|
||||
if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
|
||||
return
|
||||
return []
|
||||
if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
|
||||
return
|
||||
return []
|
||||
self.node_state = Instance.States.UNAVAILABLE
|
||||
self.cpu_capacity = self.mem_capacity = self.capacity = 0
|
||||
self.errors = errors
|
||||
if update_last_seen:
|
||||
self.last_seen = now()
|
||||
|
||||
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
|
||||
if update_last_seen:
|
||||
update_fields += ['last_seen']
|
||||
if perform_save:
|
||||
update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
|
||||
if update_last_seen:
|
||||
update_fields += ['last_seen']
|
||||
self.save(update_fields=update_fields)
|
||||
return update_fields
|
||||
|
||||
def set_capacity_value(self):
|
||||
"""Sets capacity according to capacity adjustment rule (no save)"""
|
||||
@ -314,7 +315,8 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
||||
self.node_state = Instance.States.READY
|
||||
update_fields.append('node_state')
|
||||
else:
|
||||
self.mark_offline(perform_save=False, errors=errors)
|
||||
fields_to_update = self.mark_offline(perform_save=False, errors=errors)
|
||||
update_fields.extend(fields_to_update)
|
||||
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
|
||||
|
||||
# disabling activity stream will avoid extra queries, which is important for heatbeat actions
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user