mirror of
https://github.com/ansible/awx.git
synced 2026-01-12 18:40:01 -03:30
Register system again if deleted by another pod
Avoid cases where a missing instance would throw an error on startup; this gives time for the heartbeat to register it.
This commit is contained in:
parent
2c9a0444e6
commit
585d3f4e2a
@ -166,7 +166,11 @@ class Metrics:
|
||||
elif settings.IS_TESTING():
|
||||
self.instance_name = "awx_testing"
|
||||
else:
|
||||
self.instance_name = Instance.objects.me().hostname
|
||||
try:
|
||||
self.instance_name = Instance.objects.me().hostname
|
||||
except Exception as e:
|
||||
self.instance_name = settings.CLUSTER_HOST_ID
|
||||
logger.info(f'Instance {self.instance_name} seems to be unregistered, error: {e}')
|
||||
|
||||
# metric name, help_text
|
||||
METRICSLIST = [
|
||||
|
||||
@ -15,7 +15,11 @@ def startup_reaping():
|
||||
If this particular instance is starting, then we know that any running jobs are invalid
|
||||
so we will reap those jobs as a special action here
|
||||
"""
|
||||
me = Instance.objects.me()
|
||||
try:
|
||||
me = Instance.objects.me()
|
||||
except RuntimeError as e:
|
||||
logger.warning(f'Local instance is not registered, not running startup reaper: {e}')
|
||||
return
|
||||
jobs = UnifiedJob.objects.filter(status='running', controller_node=me.hostname)
|
||||
job_ids = []
|
||||
for j in jobs:
|
||||
|
||||
@ -7,7 +7,7 @@ from django.core.cache import cache as django_cache
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db import connection as django_connection
|
||||
|
||||
from awx.main.dispatch import get_local_queuename, reaper
|
||||
from awx.main.dispatch import get_local_queuename
|
||||
from awx.main.dispatch.control import Control
|
||||
from awx.main.dispatch.pool import AutoscalePool
|
||||
from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
|
||||
@ -53,7 +53,6 @@ class Command(BaseCommand):
|
||||
# (like the node heartbeat)
|
||||
periodic.run_continuously()
|
||||
|
||||
reaper.startup_reaping()
|
||||
consumer = None
|
||||
|
||||
try:
|
||||
|
||||
@ -103,6 +103,7 @@ def dispatch_startup():
|
||||
#
|
||||
apply_cluster_membership_policies()
|
||||
cluster_node_heartbeat()
|
||||
reaper.startup_reaping()
|
||||
m = Metrics()
|
||||
m.reset_values()
|
||||
|
||||
@ -504,7 +505,13 @@ def cluster_node_heartbeat():
|
||||
logger.warning('Rejoining the cluster as instance {}.'.format(this_inst.hostname))
|
||||
return
|
||||
else:
|
||||
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
|
||||
if settings.AWX_AUTO_DEPROVISION_INSTANCES:
|
||||
(changed, this_inst) = Instance.objects.register(ip_address=os.environ.get('MY_POD_IP'), node_type='control', uuid=settings.SYSTEM_UUID)
|
||||
if changed:
|
||||
logger.warning(f'Recreated instance record {this_inst.hostname} after unexpected removal')
|
||||
this_inst.local_health_check()
|
||||
else:
|
||||
raise RuntimeError("Cluster Host Not Found: {}".format(settings.CLUSTER_HOST_ID))
|
||||
# IFF any node has a greater version than we do, then we'll shutdown services
|
||||
for other_inst in instance_list:
|
||||
if other_inst.node_type in ('execution', 'hop'):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user