Merge pull request #12494 from AlanCoding/revival

Register system again if deleted by another pod
This commit is contained in:
Alan Rominger 2022-08-17 10:12:39 -04:00 committed by GitHub
commit 9e8ba6ca09
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 19 additions and 5 deletions

View File

@ -166,7 +166,11 @@ class Metrics:
elif settings.IS_TESTING():
self.instance_name = "awx_testing"
else:
self.instance_name = Instance.objects.me().hostname
try:
self.instance_name = Instance.objects.me().hostname
except Exception as e:
self.instance_name = settings.CLUSTER_HOST_ID
logger.info(f'Instance {self.instance_name} seems to be unregistered, error: {e}')
# metric name, help_text
METRICSLIST = [

View File

@ -15,7 +15,11 @@ def startup_reaping():
If this particular instance is starting, then we know that any running jobs are invalid
so we will reap those jobs as a special action here
"""
me = Instance.objects.me()
try:
me = Instance.objects.me()
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running startup reaper: {e}')
return
jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname)
job_ids = []
for j in jobs:

View File

@ -7,7 +7,7 @@ from django.core.cache import cache as django_cache
from django.core.management.base import BaseCommand
from django.db import connection as django_connection
from awx.main.dispatch import get_local_queuename, reaper
from awx.main.dispatch import get_local_queuename
from awx.main.dispatch.control import Control
from awx.main.dispatch.pool import AutoscalePool
from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
@ -53,7 +53,6 @@ class Command(BaseCommand):
# (like the node heartbeat)
periodic.run_continuously()
reaper.startup_reaping()
consumer = None
try:

View File

@ -104,6 +104,7 @@ def dispatch_startup():
#
apply_cluster_membership_policies()
cluster_node_heartbeat()
reaper.startup_reaping()
m = Metrics()
m.reset_values()
@ -505,7 +506,13 @@ def cluster_node_heartbeat():
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
return
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
if settings.AWX_AUTO_DEPROVISION_INSTANCES:
(changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID)
if changed:
logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
this_inst.local_health_check()
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
# IFF any node has a greater version than we do, then we'll shutdown services
for other_inst in instance_list:
if other_inst.node_type in ('execution', 'hop'):