diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 3a6b7740a2..5f9588f627 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -207,7 +207,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): return True if ref_time is None: ref_time = now() - grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2 + grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * settings.CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE if self.node_type in ('execution', 'hop'): grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD return self.last_seen < ref_time - timedelta(seconds=grace_period) diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 499225a17d..a24fe6f090 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -432,6 +432,10 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199') # heartbeat period can factor into some forms of logic, so it is maintained as a setting here CLUSTER_NODE_HEARTBEAT_PERIOD = 60 + +# Number of consecutive heartbeat periods a node may miss before it is marked as lost (grace period = CLUSTER_NODE_HEARTBEAT_PERIOD * this value) +CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE = 2 + RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34 EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes check if an execution node errors have been resolved