Merge pull request #5367 from ansible/fewer_health_checks

Reduce the number of triggers for execution node health checks
This commit is contained in:
Alan Rominger
2021-10-19 17:18:24 -04:00
committed by GitHub
2 changed files with 2 additions and 5 deletions

View File

@@ -533,7 +533,7 @@ def inspect_execution_nodes(instance_list):
# check
logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
execution_node_health_check.apply_async([hostname])
elif instance.capacity == 0:
elif instance.capacity == 0 and instance.enabled:
# nodes with a proven connection that need remediation run health checks at a reduced frequency
if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS:
# Periodically re-run the health check of errored nodes, in case someone fixed it
@@ -3064,9 +3064,6 @@ class AWXReceptorJob:
# Make sure to always release the work unit if we established it
if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK:
receptor_ctl.simple_command(f"work release {self.unit_id}")
# If an error occurred without the job itself failing, it could be a broken instance
if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)):
execution_node_health_check.delay(self.task.instance.execution_node)
@property
def sign_work(self):

View File

@@ -425,7 +425,7 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199')
# heartbeat period can factor into some forms of logic, so it is maintained as a setting here
CLUSTER_NODE_HEARTBEAT_PERIOD = 60
RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34
EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 10 # once every 10 minutes, check whether an execution node's errors have been resolved
EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes, check whether an execution node's errors have been resolved
BROKER_URL = 'unix:///var/run/redis/redis.sock'
CELERYBEAT_SCHEDULE = {