diff --git a/awx/api/views/__init__.py b/awx/api/views/__init__.py index 0d46c05834..307327522a 100644 --- a/awx/api/views/__init__.py +++ b/awx/api/views/__init__.py @@ -440,6 +440,7 @@ class InstanceHealthCheck(GenericAPIView): def post(self, request, *args, **kwargs): obj = self.get_object() + # Note: hop nodes are already excluded by the get_queryset method if obj.node_type == 'execution': from awx.main.tasks.system import execution_node_health_check diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 09b0e4aabb..6bc711503f 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -238,15 +238,18 @@ class Instance(HasPolicyEditsMixin, BaseModel): return self.last_seen < ref_time - timedelta(seconds=grace_period) def mark_offline(self, update_last_seen=False, perform_save=True, errors=''): - if self.cpu_capacity == 0 and self.mem_capacity == 0 and self.capacity == 0 and self.errors == errors and (not update_last_seen): + if self.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED): return + if self.node_state == Instance.States.UNAVAILABLE and self.errors == errors and (not update_last_seen): + return + self.node_state = Instance.States.UNAVAILABLE self.cpu_capacity = self.mem_capacity = self.capacity = 0 self.errors = errors if update_last_seen: self.last_seen = now() if perform_save: - update_fields = ['capacity', 'cpu_capacity', 'mem_capacity', 'errors'] + update_fields = ['node_state', 'capacity', 'cpu_capacity', 'mem_capacity', 'errors'] if update_last_seen: update_fields += ['last_seen'] self.save(update_fields=update_fields) @@ -303,6 +306,9 @@ class Instance(HasPolicyEditsMixin, BaseModel): if not errors: self.refresh_capacity_fields() self.errors = '' + if self.node_state in (Instance.States.UNAVAILABLE, Instance.States.INSTALLED): + self.node_state = Instance.States.READY + update_fields.append('node_state') else: self.mark_offline(perform_save=False, errors=errors) 
update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity']) @@ -321,7 +327,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): # playbook event data; we should consider this a zero capacity event redis.Redis.from_url(settings.BROKER_URL).ping() except redis.ConnectionError: - errors = _('Failed to connect ot Redis') + errors = _('Failed to connect to Redis') self.save_health_data(awx_application_version, get_cpu_count(), get_mem_in_bytes(), update_last_seen=True, errors=errors) diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index b828326339..90ee318106 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -114,7 +114,7 @@ def inform_cluster_of_shutdown(): try: this_inst = Instance.objects.get(hostname=settings.CLUSTER_HOST_ID) this_inst.mark_offline(update_last_seen=True, errors=_('Instance received normal shutdown signal')) - logger.warning('Normal shutdown signal for instance {}, ' 'removed self from capacity pool.'.format(this_inst.hostname)) + logger.warning('Normal shutdown signal for instance {}, removed self from capacity pool.'.format(this_inst.hostname)) except Exception: logger.exception('Encountered problem with normal shutdown signal.') @@ -399,6 +399,9 @@ def execution_node_health_check(node): if instance.node_type != 'execution': raise RuntimeError(f'Execution node health check ran against {instance.node_type} node {instance.hostname}') + if instance.node_state not in (Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED): + raise RuntimeError(f"Execution node health check ran against node {instance.hostname} in state {instance.node_state}") + data = worker_info(node) prior_capacity = instance.capacity @@ -455,7 +458,7 @@ def inspect_execution_nodes(instance_list): # Only execution nodes should be dealt with by execution_node_health_check if instance.node_type == 'hop': - if was_lost and (not instance.is_lost(ref_time=nowtime)): + if was_lost:
logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh') instance.save_health_data(errors='') continue @@ -479,7 +482,7 @@ def inspect_execution_nodes(instance_list): def cluster_node_heartbeat(): logger.debug("Cluster node heartbeat task.") nowtime = now() - instance_list = list(Instance.objects.all()) + instance_list = list(Instance.objects.filter(node_state__in=(Instance.States.READY, Instance.States.UNAVAILABLE, Instance.States.INSTALLED))) this_inst = None lost_instances = [] @@ -530,9 +533,9 @@ def cluster_node_heartbeat(): try: if settings.AWX_AUTO_DEPROVISION_INSTANCES: deprovision_hostname = other_inst.hostname - other_inst.delete() + other_inst.delete() # FIXME: what about associated inbound links? logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname)) - elif other_inst.capacity != 0 or (not other_inst.errors): + elif other_inst.node_state == Instance.States.READY: other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive')) logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen))