NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Hunk line counts were re-checked against the @@ headers, but the
indentation of context lines is inferred from Python syntax and may not match
the target tree exactly. Run `git apply --check` first; if context fails to
match, retry with `git apply --ignore-whitespace`.

diff --git a/awx/api/serializers.py b/awx/api/serializers.py
index b90c32f750..26996f44ed 100644
--- a/awx/api/serializers.py
+++ b/awx/api/serializers.py
@@ -4878,6 +4878,7 @@ class InstanceSerializer(BaseSerializer):
     percent_capacity_remaining = serializers.SerializerMethodField()
     jobs_running = serializers.IntegerField(help_text=_('Count of jobs in the running or waiting state that are targeted for this instance'), read_only=True)
     jobs_total = serializers.IntegerField(help_text=_('Count of all jobs that target this instance'), read_only=True)
+    health_check_pending = serializers.SerializerMethodField()
 
     class Meta:
         model = Instance
@@ -4893,6 +4894,8 @@ class InstanceSerializer(BaseSerializer):
             'created',
             'modified',
             'last_seen',
+            'health_check_started',
+            'health_check_pending',
             'last_health_check',
             'errors',
             'capacity_adjustment',
@@ -4945,6 +4948,9 @@ class InstanceSerializer(BaseSerializer):
         else:
             return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100))
 
+    def get_health_check_pending(self, obj):
+        return obj.health_check_pending
+
     def validate(self, data):
         if self.instance:
             if self.instance.node_type == Instance.Types.HOP:
diff --git a/awx/api/views/__init__.py b/awx/api/views/__init__.py
index ee9f1021b5..ce25fab7c0 100644
--- a/awx/api/views/__init__.py
+++ b/awx/api/views/__init__.py
@@ -451,8 +451,13 @@ class InstanceHealthCheck(GenericAPIView):
     def post(self, request, *args, **kwargs):
         obj = self.get_object()
 
+        if obj.health_check_pending:
+            return Response({'msg': f"Health check was already in progress for {obj.hostname}."}, status=status.HTTP_200_OK)
+
         # Note: hop nodes are already excluded by the get_queryset method
-        if obj.node_type == 'execution':
+        obj.health_check_started = now()
+        obj.save(update_fields=['health_check_started'])
+        if obj.node_type == models.Instance.Types.EXECUTION:
             from awx.main.tasks.system import execution_node_health_check
 
             execution_node_health_check.apply_async([obj.hostname])
@@ -460,7 +465,7 @@ class InstanceHealthCheck(GenericAPIView):
             from awx.main.tasks.system import cluster_node_health_check
 
             cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
-        return Response(dict(msg=f"Health check is running for {obj.hostname}."), status=status.HTTP_200_OK)
+        return Response({'msg': f"Health check is running for {obj.hostname}."}, status=status.HTTP_200_OK)
 
 
 class InstanceGroupList(ListCreateAPIView):
diff --git a/awx/main/migrations/0171_add_health_check_started.py b/awx/main/migrations/0171_add_health_check_started.py
new file mode 100644
index 0000000000..65f285b5b1
--- /dev/null
+++ b/awx/main/migrations/0171_add_health_check_started.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.13 on 2022-09-26 20:54
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('main', '0170_node_and_link_state'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='instance',
+            name='health_check_started',
+            field=models.DateTimeField(editable=False, help_text='The last time a health check was initiated on this instance.', null=True),
+        ),
+    ]
diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py
index 38e8ac0068..dbeb81dcac 100644
--- a/awx/main/models/ha.py
+++ b/awx/main/models/ha.py
@@ -114,6 +114,11 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         editable=False,
         help_text=_('Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.'),
     )
+    health_check_started = models.DateTimeField(
+        null=True,
+        editable=False,
+        help_text=_("The last time a health check was initiated on this instance."),
+    )
     last_health_check = models.DateTimeField(
         null=True,
         editable=False,
@@ -207,6 +212,14 @@ class Instance(HasPolicyEditsMixin, BaseModel):
     def jobs_total(self):
         return UnifiedJob.objects.filter(execution_node=self.hostname).count()
 
+    @property
+    def health_check_pending(self):
+        if self.health_check_started is None:
+            return False
+        if self.last_health_check is None:
+            return True
+        return self.health_check_started > self.last_health_check
+
     def get_cleanup_task_kwargs(self, **kwargs):
         """
         Produce options to use for the command: ansible-runner worker cleanup
diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py
index 24dbd98b6e..ee3293beae 100644
--- a/awx/main/tasks/system.py
+++ b/awx/main/tasks/system.py
@@ -464,7 +464,7 @@ def inspect_execution_nodes(instance_list):
                 continue
 
             # Control-plane nodes are dealt with via local_health_check instead.
-            if instance.node_type in ('control', 'hybrid'):
+            if instance.node_type in (Instance.Types.CONTROL, Instance.Types.HYBRID):
                 continue
 
             last_seen = parse_date(ad['Time'])
@@ -474,7 +474,7 @@ def inspect_execution_nodes(instance_list):
                 instance.save(update_fields=['last_seen'])
 
             # Only execution nodes should be dealt with by execution_node_health_check
-            if instance.node_type == 'hop':
+            if instance.node_type == Instance.Types.HOP:
                 if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
                     logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
                     instance.save_health_data(errors='')