Reduce the number of triggers for execution node health checks

This commit is contained in:
Alan Rominger 2021-10-19 14:44:03 -04:00 committed by Shane McDonald
parent 6f20a798ab
commit 77076dbd67
No known key found for this signature in database
GPG Key ID: 6F374AF6E9EB9374
2 changed files with 2 additions and 5 deletions

View File

@ -534,7 +534,7 @@ def inspect_execution_nodes(instance_list):
# check
logger.warn(f'Execution node attempting to rejoin as instance {hostname}.')
execution_node_health_check.apply_async([hostname])
elif instance.capacity == 0:
elif instance.capacity == 0 and instance.enabled:
# nodes with proven connection but need remediation run health checks are reduced frequency
if not instance.last_health_check or (nowtime - instance.last_health_check).total_seconds() >= settings.EXECUTION_NODE_REMEDIATION_CHECKS:
# Periodically re-run the health check of errored nodes, in case someone fixed it
@ -3069,9 +3069,6 @@ class AWXReceptorJob:
# Make sure to always release the work unit if we established it
if self.unit_id is not None and settings.RECEPTOR_RELEASE_WORK:
receptor_ctl.simple_command(f"work release {self.unit_id}")
# If an error occured without the job itself failing, it could be a broken instance
if self.work_type == 'ansible-runner' and ((res is None) or (getattr(res, 'rc', None) is None)):
execution_node_health_check.delay(self.task.instance.execution_node)
@property
def sign_work(self):

View File

@ -425,7 +425,7 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199')
# heartbeat period can factor into some forms of logic, so it is maintained as a setting here
CLUSTER_NODE_HEARTBEAT_PERIOD = 60
RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34
EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 10 # once every 10 minutes check if an execution node errors have been resolved
EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes check if an execution node errors have been resolved
BROKER_URL = 'unix:///var/run/redis/redis.sock'
CELERYBEAT_SCHEDULE = {