Register system again if deleted by another pod

Avoid cases where a missing instance
  would throw an error on startup;
  this gives the heartbeat time to register it
This commit is contained in:
Alan Rominger
2022-07-08 12:06:21 -04:00
parent 2c9a0444e6
commit 585d3f4e2a
4 changed files with 19 additions and 5 deletions

View File

@@ -166,7 +166,11 @@ class Metrics:
elif settings.IS_TESTING(): elif settings.IS_TESTING():
self.instance_name = "awx_testing" self.instance_name = "awx_testing"
else: else:
self.instance_name = Instance.objects.me().hostname try:
self.instance_name = Instance.objects.me().hostname
except Exception as e:
self.instance_name = settings.CLUSTER_HOST_ID
logger.info(f'Instance {self.instance_name} seems to be unregistered, error: {e}')
# metric name, help_text # metric name, help_text
METRICSLIST = [ METRICSLIST = [

View File

@@ -15,7 +15,11 @@ def startup_reaping():
If this particular instance is starting, then we know that any running jobs are invalid If this particular instance is starting, then we know that any running jobs are invalid
so we will reap those jobs as a special action here so we will reap those jobs as a special action here
""" """
me = Instance.objects.me() try:
me = Instance.objects.me()
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running startup reaper: {e}')
return
jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname) jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname)
job_ids = [] job_ids = []
for j in jobs: for j in jobs:

View File

@@ -7,7 +7,7 @@ from django.core.cache import cache as django_cache
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.db import connection as django_connection from django.db import connection as django_connection
from awx.main.dispatch import get_local_queuename, reaper from awx.main.dispatch import get_local_queuename
from awx.main.dispatch.control import Control from awx.main.dispatch.control import Control
from awx.main.dispatch.pool import AutoscalePool from awx.main.dispatch.pool import AutoscalePool
from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
@@ -53,7 +53,6 @@ class Command(BaseCommand):
# (like the node heartbeat) # (like the node heartbeat)
periodic.run_continuously() periodic.run_continuously()
reaper.startup_reaping()
consumer = None consumer = None
try: try:

View File

@@ -103,6 +103,7 @@ def dispatch_startup():
# #
apply_cluster_membership_policies() apply_cluster_membership_policies()
cluster_node_heartbeat() cluster_node_heartbeat()
reaper.startup_reaping()
m = Metrics() m = Metrics()
m.reset_values() m.reset_values()
@@ -504,7 +505,13 @@ def cluster_node_heartbeat():
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname)) logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
return return
else: else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID)) if settings.AWX_AUTO_DEPROVISION_INSTANCES:
(changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID)
if changed:
logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
this_inst.local_health_check()
else:
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
# IFF any node has a greater version than we do, then we'll shutdown services # IFF any node has a greater version than we do, then we'll shutdown services
for other_inst in instance_list: for other_inst in instance_list:
if other_inst.node_type in ('execution', 'hop'): if other_inst.node_type in ('execution', 'hop'):