diff --git a/awx/main/analytics/subsystem_metrics.py b/awx/main/analytics/subsystem_metrics.py index 1b5e3d1cc4..b0fd81172b 100644 --- a/awx/main/analytics/subsystem_metrics.py +++ b/awx/main/analytics/subsystem_metrics.py @@ -166,7 +166,11 @@ class Metrics: elif settings.IS_TESTING(): self.instance_name = "awx_testing" else: - self.instance_name = Instance.objects.me().hostname + try: + self.instance_name = Instance.objects.me().hostname + except Exception as e: + self.instance_name = settings.CLUSTER_HOST_ID + logger.info(f'Instance {self.instance_name} seems to be unregistered, error: {e}') # metric name, help_text METRICSLIST = [ diff --git a/awx/main/dispatch/reaper.py b/awx/main/dispatch/reaper.py index a86664b80c..ba752f4fe6 100644 --- a/awx/main/dispatch/reaper.py +++ b/awx/main/dispatch/reaper.py @@ -15,7 +15,11 @@ def startup_reaping(): If this particular instance is starting, then we know that any running jobs are invalid so we will reap those jobs as a special action here """ - me = Instance.objects.me() + try: + me = Instance.objects.me() + except RuntimeError as e: + logger.warning(f'Local instance is not registered, not running startup reaper: {e}') + return jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname) job_ids = [] for j in jobs: diff --git a/awx/main/management/commands/run_dispatcher.py b/awx/main/management/commands/run_dispatcher.py index e4d17f2aed..2fc35a75d2 100644 --- a/awx/main/management/commands/run_dispatcher.py +++ b/awx/main/management/commands/run_dispatcher.py @@ -7,7 +7,7 @@ from django.core.cache import cache as django_cache from django.core.management.base import BaseCommand from django.db import connection as django_connection -from awx.main.dispatch import get_local_queuename, reaper +from awx.main.dispatch import get_local_queuename from awx.main.dispatch.control import Control from awx.main.dispatch.pool import AutoscalePool from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker @@ -53,7 +53,6 @@ class Command(BaseCommand): # (like the node heartbeat) periodic.run_continuously() - reaper.startup_reaping() consumer = None try: diff --git a/awx/main/tasks/system.py b/awx/main/tasks/system.py index b828326339..2c8daa8310 100644 --- a/awx/main/tasks/system.py +++ b/awx/main/tasks/system.py @@ -103,6 +103,7 @@ def dispatch_startup(): # apply_cluster_membership_policies() cluster_node_heartbeat() + reaper.startup_reaping() m = Metrics() m.reset_values() @@ -504,7 +505,13 @@ def cluster_node_heartbeat(): logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname)) return else: - raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) + if settings.AWX_AUTO_DEPROVISION_INSTANCES: + (changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID) + if changed: + logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal') + this_inst.local_health_check() + else: + raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) # IFF any node has a greater version than we do, then we'll shutdown services for other_inst in instance_list: if other_inst.node_type in ('execution', 'hop'):