Add setting for missed heartbeats before marking node offline

This commit is contained in:
Shane McDonald 2022-07-23 13:22:26 -04:00 committed by Alan Rominger
parent 3c51cb130f
commit c5976e2584
No known key found for this signature in database
GPG Key ID: C2D7EAAA12B63559
2 changed files with 5 additions and 1 deletions

View File

@ -207,7 +207,7 @@ class Instance(HasPolicyEditsMixin, BaseModel):
return True
if ref_time is None:
ref_time = now()
grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * 2
grace_period = settings.CLUSTER_NODE_HEARTBEAT_PERIOD * settings.CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE
if self.node_type in ('execution', 'hop'):
grace_period += settings.RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD
return self.last_seen < ref_time - timedelta(seconds=grace_period)

View File

@ -432,6 +432,10 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199')
# heartbeat period can factor into some forms of logic, so it is maintained as a setting here
CLUSTER_NODE_HEARTBEAT_PERIOD = 60
# Number of missed heartbeats until a node gets marked as lost
CLUSTER_NODE_MISSED_HEARTBEAT_TOLERANCE = 2
RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60 # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34
EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 30 # once every 30 minutes check if an execution node errors have been resolved