Allow manually running a health check, and make other adjustments to the health check trigger (#11002)

* Full finalize the planned work for health checks of execution nodes

* Implementation of instance health_check endpoint

* Also do version conditional to node_type

* Do not use receptor mesh to check main cluster nodes health

* Fix bugs from testing health check of cluster nodes, add doc

* Add a few fields to health check serializer missed before

* Light refactoring of error field processing

* Fix errors clearing error, write more unit tests

* Update health check info in docs

* Bump migration of health check after rebase

* Mark string for translation

* Add related health_check link for system auditors too

* Handle health_check cluster node timeout, add errors for peer judgement
This commit is contained in:
Alan Rominger
2021-09-03 16:37:37 -04:00
committed by GitHub
parent 169c0f6642
commit 6a17e5b65b
15 changed files with 285 additions and 53 deletions

View File

@@ -82,11 +82,22 @@ class Instance(HasPolicyEditsMixin, BaseModel):
editable=False,
help_text=_('Total system memory of this instance in bytes.'),
)
errors = models.TextField(
default='',
blank=True,
editable=False,
help_text=_('Any error details from the last health check.'),
)
last_seen = models.DateTimeField(
null=True,
editable=False,
help_text=_('Last time instance ran its heartbeat task for main cluster nodes. Last known connection to receptor mesh for execution nodes.'),
)
last_health_check = models.DateTimeField(
null=True,
editable=False,
help_text=_('Last time a health check was ran on this instance to refresh cpu, memory, and capacity.'),
)
# Capacity management
capacity = models.PositiveIntegerField(
default=100,
@@ -152,15 +163,16 @@ class Instance(HasPolicyEditsMixin, BaseModel):
grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
return self.last_seen < ref_time - timedelta(seconds=grace_period)
def mark_offline(self, update_last_seen=False, perform_save=True):
if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and (not update_last_seen):
def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (not update_last_seen):
return
self.cpu_capacity = self.mem_capacity = self.capacity = 0
self.errors = errors
if update_last_seen:
self.last_seen = now()
if perform_save:
update_fields = ['capacity', 'cpu_capacity', 'mem_capacity']
update_fields = ['capacity', 'cpu_capacity', 'mem_capacity', 'errors']
if update_last_seen:
update_fields += ['last_seen']
self.save(update_fields=update_fields)
@@ -180,11 +192,12 @@ class Instance(HasPolicyEditsMixin, BaseModel):
self.mem_capacity = get_mem_effective_capacity(self.memory)
self.set_capacity_value()
def save_health_data(self, version, cpu, memory, uuid=None, last_seen=None, has_error=False):
update_fields = []
def save_health_data(self, version, cpu, memory, uuid=None, update_last_seen=False, errors=''):
self.last_health_check = now()
update_fields = ['last_health_check']
if last_seen is not None and self.last_seen != last_seen:
self.last_seen = last_seen
if update_last_seen:
self.last_seen = self.last_health_check
update_fields.append('last_seen')
if uuid is not None and self.uuid != uuid:
@@ -207,25 +220,26 @@ class Instance(HasPolicyEditsMixin, BaseModel):
self.memory = new_memory
update_fields.append('memory')
if not has_error:
if not errors:
self.refresh_capacity_fields()
self.errors = ''
else:
self.mark_offline(perform_save=False)
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
self.mark_offline(perform_save=False, errors=errors)
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity', 'errors'])
self.save(update_fields=update_fields)
def local_health_check(self):
"""Only call this method on the instance that this record represents"""
has_error = False
errors = None
try:
# if redis is down for some reason, that means we can't persist
# playbook event data; we should consider this a zero capacity event
redis.Redis.from_url(settings.BROKER_URL).ping()
except redis.ConnectionError:
has_error = True
errors = _('Failed to connect ot Redis')
self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), last_seen=now(), has_error=has_error)
self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors)
class InstanceGroup(HasPolicyEditsMixin, BaseModel, RelatedJobsMixin):