Register system again if deleted by another pod

Avoid cases where a missing instance would throw an error on startup; this gives the heartbeat time to register it.
2026-05-10 10:57:35 -02:30 · 2022-07-08 12:06:21 -04:00
parent 2c9a0444e6
commit 585d3f4e2a
4 changed files with 19 additions and 5 deletions
--- a/awx/main/analytics/subsystem_metrics.py
+++ b/awx/main/analytics/subsystem_metrics.py
@@ -166,7 +166,11 @@ class Metrics:
        elif settings.IS_TESTING():
            self.instance_name = "awx_testing"
        else:
-            self.instance_name = Instance.objects.me().hostname
+            try:
+                self.instance_name = Instance.objects.me().hostname
+            except Exception as e:
+                self.instance_name = settings.CLUSTER_HOST_ID
+                logger.info(f'Instance {self.instance_name} seems to be unregistered, error: {e}')
        # metric name, help_text
        METRICSLIST = [
--- a/awx/main/dispatch/reaper.py
+++ b/awx/main/dispatch/reaper.py
@@ -15,7 +15,11 @@ def startup_reaping():
    If this particular instance is starting, then we know that any running jobs are invalid
    so we will reap those jobs as a special action here
    """
-    me = Instance.objects.me()
+    try:
+        me = Instance.objects.me()
+    except RuntimeError as e:
+        logger.warning(f'Local instance is not registered, not running startup reaper: {e}')
+        return
    jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname)
    job_ids = []
    for j in jobs:
--- a/awx/main/management/commands/run_dispatcher.py
+++ b/awx/main/management/commands/run_dispatcher.py
@@ -7,7 +7,7 @@ from django.core.cache import cache as django_cache
 from django.core.management.base import BaseCommand
 from django.db import connection as django_connection
-from awx.main.dispatch import get_local_queuename, reaper
+from awx.main.dispatch import get_local_queuename
 from awx.main.dispatch.control import Control
 from awx.main.dispatch.pool import AutoscalePool
 from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
@@ -53,7 +53,6 @@ class Command(BaseCommand):
        # (like the node heartbeat)
        periodic.run_continuously()
-        reaper.startup_reaping()
        consumer = None
        try:
--- a/awx/main/tasks/system.py
+++ b/awx/main/tasks/system.py
@@ -103,6 +103,7 @@ def dispatch_startup():
    #
    apply_cluster_membership_policies()
    cluster_node_heartbeat()
+    reaper.startup_reaping()
    m = Metrics()
    m.reset_values()
@@ -504,7 +505,13 @@ def cluster_node_heartbeat():
            logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
            return
    else:
-        raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
+        if settings.AWX_AUTO_DEPROVISION_INSTANCES:
+            (changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID)
+            if changed:
+                logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
+            this_inst.local_health_check()
+        else:
+            raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
    # IFF any node has a greater version than we do, then we'll shutdown services
    for other_inst in instance_list:
        if other_inst.node_type in ('execution', 'hop'):