From 77076dbd6762caa7df19b7331c354270e47306ac Mon Sep 17 00:00:00 2001 From: Alan Rominger Date: Tue, 19 Oct 2021 14:44:03 -0400 Subject: [PATCH] Reduce the number of triggers for execution node health checks --- awx/main/tasks.py | 5 +---- awx/settings/defaults.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/awx/main/tasks.py b/awx/main/tasks.py index 94d348400e..0ff7820ea5 100644 --- a/awx/main/tasks.py +++ b/awx/main/tasks.py @@ -534,7 +534,7 @@ def inspect_execution_nodes(instance_list): # check logger.warn(f'Execution node attempting to rejoin as instance {hostname}.') execution_node_health_check.apply_async([hostname]) - elif instance.capacity == 0: + elif instance.capacity == 0 and instance.enabled: # nodes with proven connection but need remediation run health checks are reduced frequency if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS: # Periodically re-run the health check of errored nodes, in case someone fixed it @@ -3069,9 +3069,6 @@ class AWXReceptorJob: # Make sure to always release the work unit if we established it if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK: receptor_ctl.simple_command(f"work release {self.unit_id}") - # If an error occured without the job itself failing, it could be a broken instance - if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)): - execution_node_health_check.delay(self.task.instance.execution_node) @property def sign_work(self): diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 3fda6efff9..0b5bd0b4b6 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -425,7 +425,7 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199') # heartbeat period can factor into some forms of logic, so it is maintained as a setting here CLUSTER_NODE_HEARTBEAT_PERIOD = 60 
RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34 -EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 10 # once every 10 minutes check if an execution node errors have been resolved +EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes, check whether an execution node's errors have been resolved BROKER_URL = 'unix:///var/run/redis/redis.sock' CELERYBEAT_SCHEDULE = {