From c5976e2584994f9a364ce1e1e3126714b33f89f9 Mon Sep 17 00:00:00 2001 From: Shane McDonald Date: Sat, 23 Jul 2022 13:22:26 -0400 Subject: [PATCH] Add setting for missed heartbeats before marking node offline --- awx/main/models/ha.py | 2 +- awx/settings/defaults.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/awx/main/models/ha.py b/awx/main/models/ha.py index 3a6b7740a2..5f9588f627 100644 --- a/awx/main/models/ha.py +++ b/awx/main/models/ha.py @@ -207,7 +207,7 @@ class Instance(HasPolicyEditsMixin, BaseModel): return True if ref_time is None: ref_time = now() - grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2 + grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * settings.CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE if self.node_type in ('execution', 'hop'): grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD return self.last_seen < ref_time - timedelta(seconds=grace_period) diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index 499225a17d..a24fe6f090 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -432,6 +432,10 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199') # heartbeat period can factor into some forms of logic, so it is maintained as a setting here CLUSTER_NODE_HEARTBEAT_PERIOD = 60 + +# Number of missed heartbeats until a node gets marked as lost +CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE = 2 + RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34 EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes check if an execution node errors have been resolved