Register system again if deleted by another pod

Avoid cases where missing instance
  would throw error on startup
  this gives time for heartbeat to register it
This commit is contained in:
Alan Rominger 2022-07-08 12:06:21 -04:00
parent 2c9a0444e6
commit 585d3f4e2a
No known key found for this signature in database
GPG Key ID: C2D7EAAA12B63559
4 changed files with 19 additions and 5 deletions

View File

@ -166,7 +166,11 @@ class Metrics:
elif settings.IS_TESTING():
self.instance_name = "awx_testing"
else:
self.instance_name = Instance.objects.me().hostname
try:
self.instance_name = Instance.objects.me().hostname
except Exception as e:
self.instance_name = settings.CLUSTER_HOST_ID
logger.info(f'Instance {self.instance_name} seems to be unregistered, error: {e}')
# metric name, help_text
METRICSLIST = [

View File

@ -15,7 +15,11 @@ def startup_reaping():
If this particular instance is starting, then we know that any running jobs are invalid
so we will reap those jobs as a special action here
"""
me = Instance.objects.me()
try:
me = Instance.objects.me()
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running startup reaper: {e}')
return
jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname)
job_ids = []
for j in jobs:

View File

@ -7,7 +7,7 @@ from django.core.cache import cache as django_cache
from django.core.management.base import BaseCommand
from django.db import connection as django_connection
from awx.main.dispatch import get_local_queuename, reaper
from awx.main.dispatch import get_local_queuename
from awx.main.dispatch.control import Control
from awx.main.dispatch.pool import AutoscalePool
from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
@ -53,7 +53,6 @@ class Command(BaseCommand):
# (like the node heartbeat)
periodic.run_continuously()
reaper.startup_reaping()
consumer = None
try:

View File

@ -103,6 +103,7 @@ def dispatch_startup():
#
apply_cluster_membership_policies()
cluster_node_heartbeat()
reaper.startup_reaping()
m = Metrics()
m.reset_values()
@ -504,7 +505,13 @@ def cluster_node_heartbeat():
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
return
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
if settings.AWX_AUTO_DEPROVISION_INSTANCES:
(changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID)
if changed:
logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
this_inst.local_health_check()
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
# IFF any node has a greater version than we do, then we'll shutdown services
for other_inst in instance_list:
if other_inst.node_type in ('execution', 'hop'):