mirror of
https://github.com/ansible/awx.git
synced 2026-01-14 19:30:39 -03:30
Add a new Instance.health_check_started field
This lets us give the user more useful information now that all user-triggered health checks are asynchronous. Also, debounce the health check endpoint so that additional health check tasks cannot be triggered while one is already in progress.
This commit is contained in:
parent
84fa19f2ad
commit
65179d9cd0
@ -4878,6 +4878,7 @@ class InstanceSerializer(BaseSerializer):
|
||||
percent_capacity_remaining = serializers.SerializerMethodField()
|
||||
jobs_running = serializers.IntegerField(help_text=_('Count of jobs in the running or waiting state that are targeted for this instance'), read_only=True)
|
||||
jobs_total = serializers.IntegerField(help_text=_('Count of all jobs that target this instance'), read_only=True)
|
||||
health_check_pending = serializers.SerializerMethodField()
|
||||
|
||||
class Meta:
|
||||
model = Instance
|
||||
@ -4893,6 +4894,8 @@ class InstanceSerializer(BaseSerializer):
|
||||
'created',
|
||||
'modified',
|
||||
'last_seen',
|
||||
'health_check_started',
|
||||
'health_check_pending',
|
||||
'last_health_check',
|
||||
'errors',
|
||||
'capacity_adjustment',
|
||||
@ -4945,6 +4948,9 @@ class InstanceSerializer(BaseSerializer):
|
||||
else:
|
||||
return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100))
|
||||
|
||||
def get_health_check_pending(self, obj):
    """Return the ``health_check_pending`` value of the instance being serialized.

    Backs the ``health_check_pending`` serializer method field; the value is
    read straight from ``obj`` without modification.
    """
    pending = obj.health_check_pending
    return pending
|
||||
|
||||
def validate(self, data):
|
||||
if self.instance:
|
||||
if self.instance.node_type == Instance.Types.HOP:
|
||||
|
||||
@ -451,8 +451,13 @@ class InstanceHealthCheck(GenericAPIView):
|
||||
|
||||
def post(self, request, *args, **kwargs):
|
||||
obj = self.get_object()
|
||||
if obj.health_check_pending:
|
||||
return Response({'msg': f"Health check was already in progress for {obj.hostname}."}, status=status.HTTP_200_OK)
|
||||
|
||||
# Note: hop nodes are already excluded by the get_queryset method
|
||||
if obj.node_type == 'execution':
|
||||
obj.health_check_started = now()
|
||||
obj.save(update_fields=['health_check_started'])
|
||||
if obj.node_type == models.Instance.Types.EXECUTION:
|
||||
from awx.main.tasks.system import execution_node_health_check
|
||||
|
||||
execution_node_health_check.apply_async([obj.hostname])
|
||||
@ -460,7 +465,7 @@ class InstanceHealthCheck(GenericAPIView):
|
||||
from awx.main.tasks.system import cluster_node_health_check
|
||||
|
||||
cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
|
||||
return Response(dict(msg=f"Health check is running for {obj.hostname}."), status=status.HTTP_200_OK)
|
||||
return Response({'msg': f"Health check is running for {obj.hostname}."}, status=status.HTTP_200_OK)
|
||||
|
||||
|
||||
class InstanceGroupList(ListCreateAPIView):
|
||||
|
||||
18
awx/main/migrations/0171_add_health_check_started.py
Normal file
18
awx/main/migrations/0171_add_health_check_started.py
Normal file
@ -0,0 +1,18 @@
|
||||
# Generated by Django 3.2.13 on 2022-09-26 20:54
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the nullable, non-editable ``health_check_started`` field to ``instance``."""

    # Must be applied after migration 0170 of the ``main`` app.
    dependencies = [('main', '0170_node_and_link_state')]

    operations = [
        migrations.AddField(
            model_name='instance',
            name='health_check_started',
            field=models.DateTimeField(
                null=True,
                editable=False,
                help_text='The last time a health check was initiated on this instance.',
            ),
        ),
    ]
|
||||
@ -114,6 +114,11 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
||||
editable=False,
|
||||
help_text=_('Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.'),
|
||||
)
|
||||
health_check_started = models.DateTimeField(
|
||||
null=True,
|
||||
editable=False,
|
||||
help_text=_("The last time a health check was initiated on this instance."),
|
||||
)
|
||||
last_health_check = models.DateTimeField(
|
||||
null=True,
|
||||
editable=False,
|
||||
@ -207,6 +212,14 @@ class Instance(HasPolicyEditsMixin, BaseModel):
|
||||
def jobs_total(self):
|
||||
return UnifiedJob.objects.filter(execution_node=self.hostname).count()
|
||||
|
||||
@property
def health_check_pending(self):
    """Tell whether a health check was started more recently than the last recorded one.

    Returns ``False`` when ``health_check_started`` is unset, ``True`` when it
    is set but ``last_health_check`` is unset, and otherwise the result of
    ``health_check_started > last_health_check``.
    """
    # NOTE(review): nothing here expires a stale ``health_check_started``; if
    # ``last_health_check`` is never updated afterwards this stays True.
    # Confirm that is acceptable for callers that use it to debounce requests.
    started = self.health_check_started
    if started is None:
        return False

    last_check = self.last_health_check
    return last_check is None or started > last_check
|
||||
|
||||
def get_cleanup_task_kwargs(self, **kwargs):
|
||||
"""
|
||||
Produce options to use for the command: ansible-runner worker cleanup
|
||||
|
||||
@ -464,7 +464,7 @@ def inspect_execution_nodes(instance_list):
|
||||
continue
|
||||
|
||||
# Control-plane nodes are dealt with via local_health_check instead.
|
||||
if instance.node_type in ('control', 'hybrid'):
|
||||
if instance.node_type in (Instance.Types.CONTROL, Instance.Types.HYBRID):
|
||||
continue
|
||||
|
||||
last_seen = parse_date(ad['Time'])
|
||||
@ -474,7 +474,7 @@ def inspect_execution_nodes(instance_list):
|
||||
instance.save(update_fields=['last_seen'])
|
||||
|
||||
# Only execution nodes should be dealt with by execution_node_health_check
|
||||
if instance.node_type == 'hop':
|
||||
if instance.node_type == Instance.Types.HOP:
|
||||
if instance.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
|
||||
logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
|
||||
instance.save_health_data(errors='')
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user