Allow manually running a health check, and make other adjustments to the health check trigger (#11002)

* Fully finalize the planned work for health checks of execution nodes

* Implementation of instance health_check endpoint

* Also do version conditional to node_type

* Do not use receptor mesh to check main cluster nodes health

* Fix bugs from testing health check of cluster nodes, add doc

* Add a few fields to health check serializer missed before

* Light refactoring of error field processing

* Fix errors clearing error, write more unit tests

* Update health check info in docs

* Bump migration of health check after rebase

* Mark string for translation

* Add related health_check link for system auditors too

* Handle health_check cluster node timeout, add errors for peer judgement
This commit is contained in:
Alan Rominger
2021-09-03 16:37:37 -04:00
committed by GitHub
parent 169c0f6642
commit 6a17e5b65b
15 changed files with 285 additions and 53 deletions

View File

@@ -25,7 +25,7 @@ __all__ = [
'ProjectUpdatePermission',
'InventoryInventorySourcesUpdatePermission',
'UserPermission',
'IsSuperUser',
'IsSystemAdminOrAuditor',
'InstanceGroupTowerPermission',
'WorkflowApprovalPermission',
]
@@ -236,13 +236,18 @@ class UserPermission(ModelAccessPermission):
raise PermissionDenied()
class IsSystemAdminOrAuditor(permissions.BasePermission):
    """
    Allows write access only to system admin users.
    Allows read access to system admin and system auditor users.
    """

    def has_permission(self, request, view):
        """Decide whether the requesting user may use the view.

        GET requests are allowed for superusers and system auditors; every
        other HTTP method is allowed for superusers only. Requests without an
        authenticated user are always rejected.
        """
        user = request.user
        # An unauthenticated request normally carries an AnonymousUser object,
        # which is truthy, so a bare truthiness test does not reject it.
        # NOTE(review): is_system_auditor is presumably only available on real
        # user objects, so anonymous users must be turned away before the
        # attribute is read -- confirm against the User model.
        if not (user and user.is_authenticated):
            return False
        if request.method == 'GET':
            return user.is_superuser or user.is_system_auditor
        return user.is_superuser
class InstanceGroupTowerPermission(ModelAccessPermission):

View File

@@ -4786,6 +4786,9 @@ class InstanceSerializer(BaseSerializer):
"hostname",
"created",
"modified",
"last_seen",
"last_health_check",
"errors",
'capacity_adjustment',
"version",
"capacity",
@@ -4806,6 +4809,8 @@ class InstanceSerializer(BaseSerializer):
res = super(InstanceSerializer, self).get_related(obj)
res['jobs'] = self.reverse('api:instance_unified_jobs_list', kwargs={'pk': obj.pk})
res['instance_groups'] = self.reverse('api:instance_instance_groups_list', kwargs={'pk': obj.pk})
if self.context['request'].user.is_superuser or self.context['request'].user.is_system_auditor:
res['health_check'] = self.reverse('api:instance_health_check', kwargs={'pk': obj.pk})
return res
def get_consumed_capacity(self, obj):
@@ -4818,6 +4823,13 @@ class InstanceSerializer(BaseSerializer):
return float("{0:.2f}".format(((float(obj.capacity) - float(obj.consumed_capacity)) / (float(obj.capacity))) * 100))
class InstanceHealthCheckSerializer(BaseSerializer):
    # Serializes the subset of Instance fields exposed by the health check
    # endpoint. Nothing here is writable through the API.
    class Meta:
        model = Instance
        fields = (
            'uuid',
            'hostname',
            'version',
            'last_health_check',
            'errors',
            'cpu',
            'memory',
            'cpu_capacity',
            'mem_capacity',
            'capacity',
        )
        # Every exposed field is read-only, so both options share one tuple.
        read_only_fields = fields
class InstanceGroupSerializer(BaseSerializer):
show_capabilities = ['edit', 'delete']

View File

@@ -0,0 +1,33 @@
{% ifmeth GET %}
# Health Check Data
Health checks are used to obtain important data about an instance.
Instance fields affected by the health check are shown in this view.
Fundamentally, health checks require running code on the machine in question.
- For instances with `node_type` of "control" or "hybrid", health checks are
performed as part of a periodic task that runs in the background.
- For instances with `node_type` of "execution", health checks are done by submitting
a work unit through the receptor mesh.
If run through the receptor mesh, the invoked command is:
```
ansible-runner worker --worker-info
```
For execution nodes, these checks are _not_ performed on a regular basis.
Health checks against functional nodes will be run when the node is first discovered.
Health checks against nodes with errors will be repeated at a reduced frequency.
{% endifmeth %}
{% ifmeth POST %}
# Manually Initiate a Health Check
For purposes of error remediation or debugging, a health check can be
manually initiated by making a POST request to this endpoint.
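For instances with `node_type` of "execution", this will submit the work unit to the target node through the receptor mesh and wait for it to finish. For other instances, the health check is run by the target node itself rather than through the receptor mesh, and the request waits for that node to report its result.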
The model will be updated with the result.
Up-to-date values of the fields will be returned in the response data.
{% endifmeth %}

View File

@@ -3,7 +3,7 @@
from django.conf.urls import url
from awx.api.views import InstanceList, InstanceDetail, InstanceUnifiedJobsList, InstanceInstanceGroupsList
from awx.api.views import InstanceList, InstanceDetail, InstanceUnifiedJobsList, InstanceInstanceGroupsList, InstanceHealthCheck
urls = [
@@ -11,6 +11,7 @@ urls = [
url(r'^(?P<pk>[0-9]+)/$', InstanceDetail.as_view(), name='instance_detail'),
url(r'^(?P<pk>[0-9]+)/jobs/$', InstanceUnifiedJobsList.as_view(), name='instance_unified_jobs_list'),
url(r'^(?P<pk>[0-9]+)/instance_groups/$', InstanceInstanceGroupsList.as_view(), name='instance_instance_groups_list'),
url(r'^(?P<pk>[0-9]+)/health_check/$', InstanceHealthCheck.as_view(), name='instance_health_check'),
]
__all__ = ['urls']

View File

@@ -108,6 +108,7 @@ from awx.api.permissions import (
InstanceGroupTowerPermission,
VariableDataPermission,
WorkflowApprovalPermission,
IsSystemAdminOrAuditor,
)
from awx.api import renderers
from awx.api import serializers
@@ -408,6 +409,56 @@ class InstanceInstanceGroupsList(InstanceGroupMembershipMixin, SubListCreateAtta
return None
class InstanceHealthCheck(GenericAPIView):
    # Endpoint exposing the health check data of a single Instance.
    #   GET  returns the stored health check fields without running anything.
    #   POST runs a health check now and returns the refreshed fields.
    # (Kept as comments rather than a class docstring so that any description
    # derived from the view's docstring is unaffected.)

    name = _('Instance Health Check')
    model = models.Instance
    serializer_class = serializers.InstanceHealthCheckSerializer
    permission_classes = (IsSystemAdminOrAuditor,)

    # Longest time, in seconds, a POST waits for a remote cluster node to
    # record a new health check before the node is marked offline.
    health_check_timeout = 50.0
    # Poll every `fast_poll_interval` seconds during the first
    # `fast_poll_window` seconds, then back off to `slow_poll_interval`.
    fast_poll_window = 1.0
    fast_poll_interval = 0.1
    slow_poll_interval = 1.0

    def get(self, request, *args, **kwargs):
        """Return the instance's current health check fields."""
        obj = self.get_object()
        data = self.get_serializer(data=request.data).to_representation(obj)
        return Response(data, status=status.HTTP_200_OK)

    def _wait_for_health_check(self, obj, prior_check_time):
        """Poll the database until ``obj.last_health_check`` changes.

        Returns True as soon as the stored value differs from
        ``prior_check_time`` and False if ``health_check_timeout`` seconds
        pass without a change.
        """
        # A monotonic clock keeps the timeout and the polling cadence immune
        # to wall-clock adjustments that happen while the request is waiting.
        started = time.monotonic()
        while time.monotonic() - started < self.health_check_timeout:
            obj.refresh_from_db(fields=['last_health_check'])
            if obj.last_health_check != prior_check_time:
                return True
            if time.monotonic() - started < self.fast_poll_window:
                time.sleep(self.fast_poll_interval)
            else:
                time.sleep(self.slow_poll_interval)
        return False

    def post(self, request, *args, **kwargs):
        """Run a health check for the instance and return the updated fields.

        Execution nodes are checked by calling ``execution_node_health_check``
        directly; its ``transmit_timing`` and ``run_timing`` values, when
        present, are added to the response even though they are not model
        fields. Other nodes are checked with ``cluster_node_health_check``:
        in-process when the target is this host, otherwise dispatched to the
        target's queue and awaited. A remote node that records no new health
        check before the timeout is marked offline.
        """
        obj = self.get_object()

        if obj.node_type == 'execution':
            # Imported lazily, as in the rest of this view.
            from awx.main.tasks import execution_node_health_check

            runner_data = execution_node_health_check(obj.hostname)
            obj.refresh_from_db()
            data = self.get_serializer(data=request.data).to_representation(obj)
            # Add in some extra unsaved fields
            for extra_field in ('transmit_timing', 'run_timing'):
                if extra_field in runner_data:
                    data[extra_field] = runner_data[extra_field]
        else:
            from awx.main.tasks import cluster_node_health_check

            if settings.CLUSTER_HOST_ID == obj.hostname:
                # The target is this very node: run the check synchronously.
                cluster_node_health_check(obj.hostname)
            else:
                # Remember the timestamp we loaded so a new check can be detected.
                prior_check_time = obj.last_health_check
                cluster_node_health_check.apply_async([obj.hostname], queue=obj.hostname)
                if not self._wait_for_health_check(obj, prior_check_time):
                    obj.mark_offline(errors=_('Health check initiated by user determined this instance to be unresponsive'))
            obj.refresh_from_db()
            data = self.get_serializer(data=request.data).to_representation(obj)

        return Response(data, status=status.HTTP_200_OK)
class InstanceGroupList(ListCreateAPIView):
name = _("Instance Groups")