Make sure that the health checks handle the state transitions properly
- nodes in the Provisioning, Provisioning Fail, Deprovisioning, and Deprovisioning Fail states should bypass health checks and should never be transitioned by this machinery
- nodes in the Unavailable or Installed state can transition to Ready if they check out as healthy
- nodes in the Ready state should transition to Unavailable if they fail a check
commit 03c70077f9
parent dab8c3ef55
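
In short, only the Ready, Unavailable, and Installed states participate in health checks. A minimal sketch of the intended transitions, assuming the state values mirror Instance.States; the next_state helper is illustrative and not part of the commit:

# Illustrative distillation of the transition rules this commit enforces.
CHECKABLE = ('ready', 'unavailable', 'installed')  # assumed Instance.States values

def next_state(current: str, healthy: bool) -> str:
    """Return the node_state a health check should leave a node in."""
    if current not in CHECKABLE:
        return current        # (de)provisioning states bypass health checks entirely
    if healthy:
        return 'ready'        # unavailable/installed nodes that check out healthy become ready
    return 'unavailable'      # ready nodes that fail a check become unavailable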
@@ -440,6 +440,7 @@ class InstanceHealthCheck(GenericAPIView):

     def post(self, request, *args, **kwargs):
         obj = self.get_object()
+        # Note: hop nodes are already excluded by the get_queryset method
         if obj.node_type == 'execution':
             from awx.main.tasks.system import execution_node_health_check
@@ -238,15 +238,18 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         return self.last_seen < ref_time - timedelta(seconds=grace_period)

     def mark_offline(self, update_last_seen=False, perform_save=True, errors=''):
-        if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (not update_last_seen):
+        if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
             return
+        if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen):
+            return
+        self.node_state = Instance.States.UNAVAILABLE
         self.cpu_capacity = self.mem_capacity = self.capacity = 0
         self.errors = errors
         if update_last_seen:
             self.last_seen = now()

         if perform_save:
-            update_fields = ['capacity', 'cpu_capacity', 'mem_capacity', 'errors']
+            update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors']
             if update_last_seen:
                 update_fields += ['last_seen']
             self.save(update_fields=update_fields)
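
The effect of the new guards in mark_offline, as a hedged usage sketch (the hostname and the PROVISIONING enum member are assumed for illustration):

# Illustrative only; assumes an Instance fetched from the database.
inst = Instance.objects.get(hostname='exec1')   # hypothetical hostname

inst.node_state = Instance.States.PROVISIONING  # assumed enum member
inst.mark_offline(errors='unreachable')         # first guard: early return, nothing changes

inst.node_state = Instance.States.READY
inst.mark_offline(errors='unreachable')         # transitions to UNAVAILABLE, zeroes capacity

inst.mark_offline(errors='unreachable')         # second guard: already UNAVAILABLE with the
                                                # same errors, so this call is a no-op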
@@ -303,6 +306,9 @@ class Instance(HasPolicyEditsMixin, BaseModel):
         if not errors:
             self.refresh_capacity_fields()
             self.errors = ''
+            if self.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+                self.node_state = Instance.States.READY
+                update_fields.append('node_state')
         else:
             self.mark_offline(perform_save=False, errors=errors)
         update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity'])
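
Together with mark_offline, this closes the loop in save_health_data: a healthy check-in restores Ready. A sketch under the same assumptions (the version string and capacity numbers are placeholders):

# Illustrative recovery path for a node previously marked offline.
inst.node_state = Instance.States.UNAVAILABLE
inst.save_health_data('x.y.z', 4, 8 * 2**30, errors='')  # placeholder version/cpu/mem
assert inst.node_state == Instance.States.READY          # healthy check-in -> Ready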
@@ -321,7 +327,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
             # playbook event data; we should consider this a zero capacity event
             redis.Redis.from_url(settings.BROKER_URL).ping()
         except redis.ConnectionError:
-            errors = _('Failed to connect ot Redis')
+            errors = _('Failed to connect to Redis')

         self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors)
@@ -114,7 +114,7 @@ def inform_cluster_of_shutdown():
     try:
         this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID)
         this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal'))
-        logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname))
+        logger.warning('Normal shutdown signal for instance {}, removed self from capacity pool.'.format(this_inst.hostname))
     except Exception:
         logger.exception('Encountered problem with normal shutdown signal.')
@@ -399,6 +399,9 @@ def execution_node_health_check(node):
     if instance.node_type != 'execution':
         raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}')

+    if instance.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED):
+        raise RuntimeError(f"Execution node health check ran against node {instance.hostname} in state {instance.node_state}")
+
     data = worker_info(node)

     prior_capacity = instance.capacity
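
With this guard, a health check dispatched against a node that is mid-(de)provision fails loudly instead of mutating its state; a hedged illustration (hostname and enum member assumed):

# Illustrative: the task refuses nodes outside the three checkable states.
inst = Instance.objects.get(hostname='exec2')      # hypothetical hostname
inst.node_state = Instance.States.DEPROVISIONING   # assumed enum member
inst.save(update_fields=['node_state'])
execution_node_health_check('exec2')               # raises RuntimeError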
@@ -455,7 +458,7 @@ def inspect_execution_nodes(instance_list):

         # Only execution nodes should be dealt with by execution_node_health_check
         if instance.node_type == 'hop':
-            if was_lost and (not instance.is_lost(ref_time=nowtime)):
+            if was_lost:
                 logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh')
                 instance.save_health_data(errors='')
             continue
@@ -479,7 +482,7 @@ def inspect_execution_nodes(instance_list):
 def cluster_node_heartbeat():
     logger.debug("Cluster node heartbeat task.")
     nowtime = now()
-    instance_list = list(Instance.objects.all())
+    instance_list = list(Instance.objects.filter(node_state__in=(Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED)))
     this_inst = None
     lost_instances = []
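
The queryset change means nodes that are mid-(de)provision are never marked lost or auto-deprovisioned by the heartbeat; an equivalent, more readable spelling (illustrative, not from the commit):

# Illustrative: the same queryset as the diff, with the states named once.
HEALTH_CHECKABLE_STATES = (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED)
instance_list = list(Instance.objects.filter(node_state__in=HEALTH_CHECKABLE_STATES))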
@@ -530,9 +533,9 @@ def cluster_node_heartbeat():
         try:
             if settings.AWX_AUTO_DEPROVISION_INSTANCES:
                 deprovision_hostname = other_inst.hostname
-                other_inst.delete()
+                other_inst.delete()  # FIXME: what about associated inbound links?
                 logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname))
-            elif other_inst.capacity != 0 or (not other_inst.errors):
+            elif other_inst.node_state == Instance.States.READY:
                 other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive'))
                 logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))