Merge pull request #11955 from shanemcd/fail-better

Increase resiliency when the application crashes
This commit is contained in:
Shane McDonald
2022-03-30 08:58:26 -04:00
committed by GitHub
11 changed files with 110 additions and 108 deletions

View File

@@ -35,9 +35,11 @@ def reap(instance=None, status='failed', excluded_uuids=[]):
"""
me = instance
if me is None:
(changed, me) = Instance.objects.get_or_register()
if changed:
logger.info("Registered node '{}'".format(me.hostname))
try:
me = Instance.objects.me()
except RuntimeError as e:
logger.warning(f'Local instance is not registered, not running reaper: {e}')
return
now = tz_now()
workflow_ctype_id = ContentType.objects.get_for_model(WorkflowJob).id
jobs = UnifiedJob.objects.filter(

View File

@@ -3,6 +3,7 @@
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.conf import settings
from awx.main.models import Instance
@@ -13,7 +14,7 @@ class Command(BaseCommand):
Register this instance with the database for HA tracking.
"""
help = "Add instance to the database. Specify `--hostname` to use this command."
help = "Add instance to the database. When no options are provided, the hostname of the current system will be used. Override with `--hostname`."
def add_arguments(self, parser):
parser.add_argument('--hostname', dest='hostname', type=str, help="Hostname used during provisioning")
@@ -22,8 +23,11 @@ class Command(BaseCommand):
def _register_hostname(self, hostname, node_type, uuid):
if not hostname:
return
(changed, instance) = Instance.objects.register(hostname=hostname, node_type=node_type, uuid=uuid)
if not settings.AWX_AUTO_DEPROVISION_INSTANCES:
raise CommandError('Registering with values from settings only intended for use in K8s installs')
(changed, instance) = Instance.objects.get_or_register()
else:
(changed, instance) = Instance.objects.register(hostname=hostname, node_type=node_type, uuid=uuid)
if changed:
print("Successfully registered instance {}".format(hostname))
else:
@@ -32,8 +36,6 @@ class Command(BaseCommand):
@transaction.atomic
def handle(self, **options):
if not options.get('hostname'):
raise CommandError("Specify `--hostname` to use this command.")
self.changed = False
self._register_hostname(options.get('hostname'), options.get('node_type'), options.get('uuid'))
if self.changed:

View File

@@ -490,10 +490,6 @@ def cluster_node_heartbeat():
if inst.hostname == settings.CLUSTER_HOST_ID:
this_inst = inst
break
else:
(changed, this_inst) = Instance.objects.get_or_register()
if changed:
logger.info("Registered tower control node '{}'".format(this_inst.hostname))
inspect_execution_nodes(instance_list)