Allow manually running a health check, and make other adjustments to the health check trigger (#11002)

* Full finalize the planned work for health checks of execution nodes * Implementation of instance health_check endpoint * Also do version conditional to node_type * Do not use receptor mesh to check main cluster nodes health * Fix bugs from testing health check of cluster nodes, add doc * Add a few fields to health check serializer missed before * Light refactoring of error field processing * Fix errors clearing error, write more unit tests * Update health check info in docs * Bump migration of health check after rebase * Mark string for translation * Add related health_check link for system auditors too * Handle health_check cluster node timeout, add errors for peer judgement
2026-07-28 08:29:55 -02:30 · 2021-09-03 16:37:37 -04:00
parent 169c0f6642
commit 6a17e5b65b
15 changed files with 285 additions and 53 deletions
--- a/awx/settings/defaults.py
+++ b/awx/settings/defaults.py
@@ -422,6 +422,7 @@ os.environ.setdefault('DJANGO_LIVE_TEST_SERVER_ADDRESS', 'localhost:9013-9199')
 # heartbeat period can factor into some forms of logic, so it is maintained as a setting here
 CLUSTER_NODE_HEARTBEAT_PERIOD = 60
 RECEPTOR_SERVICE_ADVERTISEMENT_PERIOD = 60  # https://github.com/ansible/receptor/blob/aa1d589e154d8a0cb99a220aff8f98faf2273be6/pkg/netceptor/netceptor.go#L34
+EXECUTION_NODE_REMEDIATION_CHECKS = 60 * 10  # once every 10 minutes check if an execution node errors have been resolved

 BROKER_URL = 'unix:///var/run/redis/redis.sock'
 CELERYBEAT_SCHEDULE = {