diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index add2564015..08d95bf86a 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -233,13 +233,19 @@ class Instance(HasPolicyEditsMixin, BaseModel): def refresh_capacity_fields(self): """Update derived capacity fields from cpu and memory (no save)""" - self.cpu_capacity = get_cpu_effective_capacity(self.cpu) - self.mem_capacity = get_mem_effective_capacity(self.memory) + if self.node_type == 'hop': + self.cpu_capacity = 0 + self.mem_capacity = 0 # formula has a non-zero offset, so we make sure it is 0 for hop nodes + else: + self.cpu_capacity = get_cpu_effective_capacity(self.cpu) + self.mem_capacity = get_mem_effective_capacity(self.memory) self.set_capacity_value() - def save_health_data(self, version, cpu, memory, uuid=None, update_last_seen=False, errors=''): - self.last_health_check = now() - update_fields = ['last_health_check'] + def save_health_data(self, version=None, cpu=0, memory=0, uuid=None, update_last_seen=False, errors=''): + update_fields = ['errors'] + if self.node_type != 'hop': + self.last_health_check = now() + update_fields.append('last_health_check') if update_last_seen: self.last_seen = self.last_health_check @@ -251,7 +257,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): self.uuid = uuid update_fields.append('uuid') - if self.version != version: + if version is not None and self.version != version: self.version = version update_fields.append('version') @@ -270,7 +276,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): self.errors = '' else: self.mark_offline(perform_save=False, errors=errors) - update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity', 'errors']) + update_fields.extend(['cpu_capacity', 'mem_capacity', 'capacity']) # disabling activity stream will avoid extra queries, which is important for heatbeat actions from awx.main.signals import disable_activity_stream diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index 43ac6c2b26..af02d3de55 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -436,7 +436,6 @@ def inspect_execution_nodes(instance_list): workers = mesh_status['Advertisements'] for ad in workers: hostname = ad['NodeID'] - changed = False if hostname in node_lookup: instance = node_lookup[hostname] @@ -458,11 +457,11 @@ def inspect_execution_nodes(instance_list): # Only execution nodes should be dealt with by execution_node_health_check if instance.node_type == 'hop': + logger.warning(f'Hop node {hostname}, has rejoined the receptor mesh') + instance.save_health_data(errors='') continue - if changed: - execution_node_health_check.apply_async([hostname]) - elif was_lost: + if was_lost: # if the instance *was* lost, but has appeared again, # attempt to re-establish the initial capacity and version # check @@ -534,20 +533,14 @@ def cluster_node_heartbeat(): except Exception: logger.exception('failed to reap jobs for {}'.format(other_inst.hostname)) try: - # Capacity could already be 0 because: - # * It's a new node and it never had a heartbeat - # * It was set to 0 by another tower node running this method - # * It was set to 0 by this node, but auto deprovisioning is off - # - # If auto deprovisioning is on, don't bother setting the capacity to 0 - # since we will delete the node anyway. - if other_inst.capacity != 0 and not settings.AWX_AUTO_DEPROVISION_INSTANCES: - other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive')) - logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen)) - elif settings.AWX_AUTO_DEPROVISION_INSTANCES: + if settings.AWX_AUTO_DEPROVISION_INSTANCES: deprovision_hostname = other_inst.hostname other_inst.delete() logger.info("Host {} Automatically Deprovisioned.".format(deprovision_hostname)) + elif other_inst.capacity != 0 or (not other_inst.errors): + other_inst.mark_offline(errors=_('Another cluster node has determined this instance to be unresponsive')) + logger.error("Host {} last checked in at {}, marked as lost.".format(other_inst.hostname, other_inst.last_seen)) + except DatabaseError as e: if 'did not affect any rows' in str(e): logger.debug('Another instance has marked {} as lost'.format(other_inst.hostname)) diff --git a/awx/main/tests/functional/test_instances.py b/awx/main/tests/functional/test_instances.py index 81771a7253..39afa7dd32 100644 --- a/awx/main/tests/functional/test_instances.py +++ b/awx/main/tests/functional/test_instances.py @@ -363,6 +363,23 @@ def test_health_check_oh_no(): assert instance.errors == 'This it not a real instance!' +@pytest.mark.django_db +def test_errors_field_alone(): + instance = Instance.objects.create(hostname='foo-1', enabled=True, node_type='hop') + + instance.save_health_data(errors='Node went missing!') + assert instance.errors == 'Node went missing!' + assert instance.capacity == 0 + assert instance.memory == instance.mem_capacity == 0 + assert instance.cpu == instance.cpu_capacity == 0 + + instance.save_health_data(errors='') + assert not instance.errors + assert instance.capacity == 0 + assert instance.memory == instance.mem_capacity == 0 + assert instance.cpu == instance.cpu_capacity == 0 + + @pytest.mark.django_db class TestInstanceGroupOrdering: def test_ad_hoc_instance_groups(self, instance_group_factory, inventory, default_instance_group):