Add a new Instance.health_check_started field

This will enable us to provide more useful information for the user,
now that all user-triggered health checks are async.

Also, de-bounce the health check endpoint to not allow additional
health check tasks to be triggered when one is already in progress.
This commit is contained in:
Jeff Bradberry 2022-09-27 10:34:47 -04:00
parent 84fa19f2ad
commit 65179d9cd0
5 changed files with 46 additions and 4 deletions

View File

@ -4878,6 +4878,7 @@ class InstanceSerializer(BaseSerializer):
percent_capacity_remaining = serializers.SerializerMethodField()
jobs_running = serializers.IntegerField(help_text=_('Count of jobs in the running or waiting state that are targeted for this instance'), read_only=True)
jobs_total = serializers.IntegerField(help_text=_('Count of all jobs that target this instance'), read_only=True)
health_check_pending = serializers.SerializerMethodField()
class Meta:
model = Instance
@ -4893,6 +4894,8 @@ class InstanceSerializer(BaseSerializer):
'created',
'modified',
'last_seen',
'health_check_started',
'health_check_pending',
'last_health_check',
'errors',
'capacity_adjustment',
@ -4945,6 +4948,9 @@ class InstanceSerializer(BaseSerializer):
else:
return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100))
def get_health_check_pending(self, obj):
return obj.health_check_pending
def validate(self, data):
if self.instance:
if self.instance.node_type == Instance.Types.HOP:

View File

@ -451,8 +451,13 @@ class InstanceHealthCheck(GenericAPIView):
def post(self, request, *args, **kwargs):
obj = self.get_object()
if obj.health_check_pending:
return Response({'msg': f"Health check was already in progress for {obj.hostname}."}, status=status.HTTP_200_OK)
# Note: hop nodes are already excluded by the get_queryset method
if obj.node_type == 'execution':
obj.health_check_started = now()
obj.save(update_fields=['health_check_started'])
if obj.node_type == models.Instance.Types.EXECUTION:
from awx.main.tasks.system import execution_node_health_check
execution_node_health_check.apply_async([obj.hostname])
@ -460,7 +465,7 @@ class InstanceHealthCheck(GenericAPIView):
from awx.main.tasks.system import cluster_node_health_check
cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
return Response(dict(msg=f"Health check is running for {obj.hostname}."), status=status.HTTP_200_OK)
return Response({'msg': f"Health check is running for {obj.hostname}."}, status=status.HTTP_200_OK)
class InstanceGroupList(ListCreateAPIView):

View File

@ -0,0 +1,18 @@
# Generated by Django 3.2.13 on 2022-09-26 20:54
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('main', '0170_node_and_link_state'),
]
operations = [
migrations.AddField(
model_name='instance',
name='health_check_started',
field=models.DateTimeField(editable=False, help_text='The last time a health check was initiated on this instance.', null=True),
),
]

View File

@ -114,6 +114,11 @@ class Instance(HasPolicyEditsMixin, BaseModel):
editable=False,
help_text=_('Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.'),
)
health_check_started = models.DateTimeField(
null=True,
editable=False,
help_text=_("The last time a health check was initiated on this instance."),
)
last_health_check = models.DateTimeField(
null=True,
editable=False,
@ -207,6 +212,14 @@ class Instance(HasPolicyEditsMixin, BaseModel):
def jobs_total(self):
return UnifiedJob.objects.filter(execution_node=self.hostname).count()
@property
def health_check_pending(self):
if self.health_check_started is None:
return False
if self.last_health_check is None:
return True
return self.health_check_started > self.last_health_check
def get_cleanup_task_kwargs(self, **kwargs):
"""
Produce options to use for the command: ansible-runner worker cleanup

View File

@ -464,7 +464,7 @@ def inspect_execution_nodes(instance_list):
continue
# Control-plane nodes are dealt with via local_health_check instead.
if instance.node_type in ('control', 'hybrid'):
if instance.node_type in (Instance.Types.CONTROL, Instance.Types.HYBRID):
continue
last_seen = parse_date(ad['Time'])
@ -474,7 +474,7 @@ def inspect_execution_nodes(instance_list):
instance.save(update_fields=['last_seen'])
# Only execution nodes should be dealt with by execution_node_health_check
if instance.node_type == 'hop':
if instance.node_type == Instance.Types.HOP:
if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
instance.save_health_data(errors='')