mirror of
https://github.com/ansible/awx.git
synced 2026-03-09 13:39:27 -02:30
Allow manually running a health check, and make other adjustments to the health check trigger (#11002)
* Full finalize the planned work for health checks of execution nodes * Implementation of instance health_check endpoint * Also do version conditional to node_type * Do not use receptor mesh to check main cluster nodes health * Fix bugs from testing health check of cluster nodes, add doc * Add a few fields to health check serializer missed before * Light refactoring of error field processing * Fix errors clearing error, write more unit tests * Update health check info in docs * Bump migration of health check after rebase * Mark string for translation * Add related health_check link for system auditors too * Handle health_check cluster node timeout, add errors for peer judgement
This commit is contained in:
@@ -25,7 +25,7 @@ __all__ = [
|
||||
'ProjectUpdatePermission',
|
||||
'InventoryInventorySourcesUpdatePermission',
|
||||
'UserPermission',
|
||||
'IsSuperUser',
|
||||
'IsSystemAdminOrAuditor',
|
||||
'InstanceGroupTowerPermission',
|
||||
'WorkflowApprovalPermission',
|
||||
]
|
||||
@@ -236,13 +236,18 @@ class UserPermission(ModelAccessPermission):
|
||||
raise PermissionDenied()
|
||||
|
||||
|
||||
class IsSuperUser(permissions.BasePermission):
|
||||
class IsSystemAdminOrAuditor(permissions.BasePermission):
    """
    Allows write access only to system admin users.
    Allows read access only to system admin and system auditor users.
    """

    def has_permission(self, request, view):
        """Return a truthy value when the requesting user may use the view.

        GET requests are allowed for superusers and system auditors;
        every other method is allowed for superusers only.
        """
        user = request.user
        # An unauthenticated request carries a truthy AnonymousUser object, so
        # a plain truthiness test is not enough to reject it.
        if not (user and user.is_authenticated):
            return False
        if request.method == 'GET':
            return user.is_superuser or user.is_system_auditor
        return user.is_superuser
|
||||
|
||||
|
||||
class InstanceGroupTowerPermission(ModelAccessPermission):
|
||||
|
||||
@@ -4786,6 +4786,9 @@ class InstanceSerializer(BaseSerializer):
|
||||
"hostname",
|
||||
"created",
|
||||
"modified",
|
||||
"last_seen",
|
||||
"last_health_check",
|
||||
"errors",
|
||||
'capacity_adjustment',
|
||||
"version",
|
||||
"capacity",
|
||||
@@ -4806,6 +4809,8 @@ class InstanceSerializer(BaseSerializer):
|
||||
res = super(InstanceSerializer, self).get_related(obj)
|
||||
res['jobs'] = self.reverse('api:instance_unified_jobs_list', kwargs={'pk': obj.pk})
|
||||
res['instance_groups'] = self.reverse('api:instance_instance_groups_list', kwargs={'pk': obj.pk})
|
||||
if self.context['request'].user.is_superuser or self.context['request'].user.is_system_auditor:
|
||||
res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
|
||||
return res
|
||||
|
||||
def get_consumed_capacity(self, obj):
|
||||
@@ -4818,6 +4823,13 @@ class InstanceSerializer(BaseSerializer):
|
||||
return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100))
|
||||
|
||||
|
||||
class InstanceHealthCheckSerializer(BaseSerializer):
    # Exposes the Instance fields listed below; every one of them is
    # declared read-only, so this serializer never accepts writes.
    class Meta:
        model = Instance
        fields = (
            'uuid',
            'hostname',
            'version',
            'last_health_check',
            'errors',
            'cpu',
            'memory',
            'cpu_capacity',
            'mem_capacity',
            'capacity',
        )
        read_only_fields = fields
|
||||
|
||||
|
||||
class InstanceGroupSerializer(BaseSerializer):
|
||||
|
||||
show_capabilities = ['edit', 'delete']
|
||||
|
||||
33
awx/api/templates/api/instance_health_check.md
Normal file
33
awx/api/templates/api/instance_health_check.md
Normal file
@@ -0,0 +1,33 @@
|
||||
{% ifmeth GET %}
|
||||
# Health Check Data
|
||||
|
||||
Health checks are used to obtain important data about an instance.
|
||||
Instance fields affected by the health check are shown in this view.
|
||||
Fundamentally, health checks require running code on the machine in question.
|
||||
|
||||
- For instances with `node_type` of "control" or "hybrid", health checks are
|
||||
performed as part of a periodic task that runs in the background.
|
||||
- For instances with `node_type` of "execution", health checks are done by submitting
|
||||
a work unit through the receptor mesh.
|
||||
|
||||
If run through the receptor mesh, the invoked command is:
|
||||
|
||||
```
|
||||
ansible-runner worker --worker-info
|
||||
```
|
||||
|
||||
For execution nodes, these checks are _not_ performed on a regular basis.
|
||||
Health checks against functional nodes will be run when the node is first discovered.
|
||||
Health checks against nodes with errors will be repeated at a reduced frequency.
|
||||
|
||||
{% endifmeth %}
|
||||
|
||||
{% ifmeth POST %}
|
||||
# Manually Initiate a Health Check
|
||||
For purposes of error remediation or debugging, a health check can be
|
||||
manually initiated by making a POST request to this endpoint.
|
||||
|
||||
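For instances with `node_type` of "execution", this will submit the work unit to the target node through the receptor mesh and wait for it to finish.
For all other instances, the health check task is run on the node itself, and the request waits (up to a timeout) for the result.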
|
||||
The model will be updated with the result.
|
||||
Up-to-date values of the fields will be returned in the response data.
|
||||
{% endifmeth %}
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
from django.conf.urls import url
|
||||
|
||||
from awx.api.views import InstanceList, InstanceDetail, InstanceUnifiedJobsList, InstanceInstanceGroupsList
|
||||
from awx.api.views import InstanceList, InstanceDetail, InstanceUnifiedJobsList, InstanceInstanceGroupsList, InstanceHealthCheck
|
||||
|
||||
|
||||
urls = [
|
||||
@@ -11,6 +11,7 @@ urls = [
|
||||
url(r'^(?P<pk>[0-9]+)/$', InstanceDetail.as_view(), name='instance_detail'),
|
||||
url(r'^(?P<pk>[0-9]+)/jobs/$', InstanceUnifiedJobsList.as_view(), name='instance_unified_jobs_list'),
|
||||
url(r'^(?P<pk>[0-9]+)/instance_groups/$', InstanceInstanceGroupsList.as_view(), name='instance_instance_groups_list'),
|
||||
url(r'^(?P<pk>[0-9]+)/health_check/$', InstanceHealthCheck.as_view(), name='instance_health_check'),
|
||||
]
|
||||
|
||||
__all__ = ['urls']
|
||||
|
||||
@@ -108,6 +108,7 @@ from awx.api.permissions import (
|
||||
InstanceGroupTowerPermission,
|
||||
VariableDataPermission,
|
||||
WorkflowApprovalPermission,
|
||||
IsSystemAdminOrAuditor,
|
||||
)
|
||||
from awx.api import renderers
|
||||
from awx.api import serializers
|
||||
@@ -408,6 +409,56 @@ class InstanceInstanceGroupsList(InstanceGroupMembershipMixin, SubListCreateAtta
|
||||
return None
|
||||
|
||||
|
||||
class InstanceHealthCheck(GenericAPIView):

    name = _('Instance Health Check')
    model = models.Instance
    serializer_class = serializers.InstanceHealthCheckSerializer
    permission_classes = (IsSystemAdminOrAuditor,)

    # Polling parameters (seconds) used while waiting for another cluster node
    # to report a fresh health check. Defaults match the previous hard-coded
    # values; subclasses may override them.
    health_check_timeout = 50.0
    fast_poll_window = 1.0
    fast_poll_interval = 0.1
    slow_poll_interval = 1.0

    def _serialize(self, request, obj):
        """Return the health check representation of ``obj``."""
        return self.get_serializer(data=request.data).to_representation(obj)

    def _wait_for_new_health_check(self, obj, prior_check_time):
        """Poll ``obj.last_health_check`` until it differs from ``prior_check_time``.

        Polls every ``fast_poll_interval`` seconds during the first
        ``fast_poll_window`` seconds, then every ``slow_poll_interval`` seconds.

        Returns True as soon as a new value is seen, or False once
        ``health_check_timeout`` seconds have elapsed without a change.
        """
        start_time = time.time()
        while time.time() - start_time < self.health_check_timeout:
            obj.refresh_from_db(fields=['last_health_check'])
            if obj.last_health_check != prior_check_time:
                return True
            if time.time() - start_time < self.fast_poll_window:
                time.sleep(self.fast_poll_interval)
            else:
                time.sleep(self.slow_poll_interval)
        return False

    def get(self, request, *args, **kwargs):
        """Return the currently stored health check fields of the instance."""
        obj = self.get_object()
        return Response(self._serialize(request, obj), status=status.HTTP_200_OK)

    def post(self, request, *args, **kwargs):
        """Run a health check for the instance and return the refreshed fields."""
        obj = self.get_object()

        if obj.node_type == 'execution':
            # Imported here rather than at module level, as in the original code
            from awx.main.tasks import execution_node_health_check

            runner_data = execution_node_health_check(obj.hostname)
            obj.refresh_from_db()
            data = self._serialize(request, obj)
            # Add in some extra unsaved fields
            for extra_field in ('transmit_timing', 'run_timing'):
                if extra_field in runner_data:
                    data[extra_field] = runner_data[extra_field]
        else:
            from awx.main.tasks import cluster_node_health_check

            if settings.CLUSTER_HOST_ID == obj.hostname:
                # The target is the node serving this request: run the check inline
                cluster_node_health_check(obj.hostname)
            else:
                # Remember the value held before dispatching so that a change
                # written by the other node can be detected
                prior_check_time = obj.last_health_check
                cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
                if not self._wait_for_new_health_check(obj, prior_check_time):
                    obj.mark_offline(errors=_('Health check initiated by user determined this instance to be unresponsive'))
            obj.refresh_from_db()
            data = self._serialize(request, obj)

        return Response(data, status=status.HTTP_200_OK)
|
||||
|
||||
|
||||
class InstanceGroupList(ListCreateAPIView):
|
||||
|
||||
name = _("Instance Groups")
|
||||
|
||||
Reference in New Issue
Block a user